def encode_document(doc_id, text):
    """Serialize one document to the JSON form MRNgramIDFUtility reads.

    The result is a JSON dictionary with the keys 'doc_id' and 'text',
    encoded by JSONValueProtocol with a null key. No trailing newline is
    appended.

    Args:
    doc_id -- identifier stored under 'doc_id'. It is meant to identify a
              business/product/entity, not an individual review.
    text -- the document text, stored under 'text' exactly as given
    """
    # text is deliberately NOT coerced with unicode(text): some amazon
    # reviews won't encode.
    payload = {'doc_id': doc_id, 'text': text}
    return JSONValueProtocol.write(None, payload)
def root_to_json(root_dir, output_file):
    """Write every email found under `root_dir` to `output_file`, one JSON
    value per line.

    Each item yielded by EmailWalker(root_dir) is encoded with
    JSONValueProtocol (null key) and followed by a newline.

    Args:
    root_dir -- location handed to EmailWalker
    output_file -- path of the file to write; it is opened in "w" mode, so
                   any existing content is replaced

    Note: each email's 'date' entry is overwritten in place with its str()
    form before encoding (presumably because the raw date object is not
    JSON-serializable -- confirm against EmailWalker).
    """
    walker = EmailWalker(root_dir)
    output = open(output_file, "w")
    # try/finally guarantees the handle is closed even when walking,
    # str() conversion or JSON encoding raises part-way through.
    try:
        for email in walker:
            email['date'] = str(email['date'])
            output.write(JSONValueProtocol.write(None, email) + '\n')
    finally:
        output.close()
def root_to_json(root_dir, output_file):
    """Write every email found under `root_dir` to `output_file`, one JSON
    value per line.

    Each item yielded by EmailWalker(root_dir) is encoded with
    JSONValueProtocol (null key) and followed by a newline.

    Args:
    root_dir -- location handed to EmailWalker
    output_file -- path of the file to write; it is opened in "w" mode, so
                   any existing content is replaced

    Note: each email's 'date' entry is overwritten in place with its str()
    form before encoding (presumably because the raw date object is not
    JSON-serializable -- confirm against EmailWalker).
    """
    walker = EmailWalker(root_dir)
    output = open(output_file, "w")
    # try/finally guarantees the handle is closed even when walking,
    # str() conversion or JSON encoding raises part-way through.
    try:
        for email in walker:
            email['date'] = str(email['date'])
            output.write(JSONValueProtocol.write(None, email) + '\n')
    finally:
        output.close()
def encode_document(text, cats=None, id=None):
    """Encode a document as a line of JSON so that MRTextClassifier can
    read it.

    The output is a JSON dictionary with the keys "document", "cats",
    "docid" and "type" (always "document"), followed by a newline.

    Args:
    text -- the text of the document. A unicode is used as-is; a byte
            string is decoded with the default codec, silently dropping
            bytes that cannot be decoded.
    cats -- a dictionary mapping a category name (e.g. 'sports') to True if
            the document is in the category, and False if it's not. None
            (or an empty dict) indicates that we have no information about
            this document's categories. Names are coerced to unicode and
            values to bool.
    id -- a unique ID for the document (any kind of JSON-able value should
          work). It is stored under "docid" unchanged; when omitted, null
          is written. NOTE(review): earlier docs said an ID is
          auto-generated when unspecified -- that does not happen in this
          function; confirm whether the consumer does it.
    """
    # unicode(obj, errors=...) raises TypeError ("decoding Unicode is not
    # supported") when obj is already a unicode, which is exactly the type
    # callers are told to pass -- so only decode when it is not one yet.
    if not isinstance(text, unicode):
        text = unicode(text, errors="ignore")

    cats = dict((unicode(cat), bool(is_in_cat))
                for cat, is_in_cat in (cats or {}).iteritems())

    return JSONValueProtocol.write(
        None,
        {"document": text, "cats": cats, "docid": id,
         "type": "document"}) + "\n"
def encode_document(text, cats=None, id=None):
    """Serialize one document to the JSON line MRTextClassifier reads.

    The output is a JSON dictionary with the keys 'text', 'cats' and 'id',
    followed by a newline.

    Args:
    text -- the document text; coerced with unicode(text)
    cats -- dictionary mapping a category name (e.g. 'sports') to a truthy
            value when the document belongs to that category and a falsy
            one when it does not. None means nothing is known about the
            document's categories and yields an empty mapping.
    id -- a unique ID for the document (any JSON-able value should work),
          passed through unchanged. NOTE(review): earlier docs said an ID
          is auto-generated when unspecified -- this function just emits
          null; confirm whether the consumer generates one.
    """
    text = unicode(text)

    # Normalize category names to unicode and membership flags to bool.
    normalized_cats = {}
    for cat, is_in_cat in (cats or {}).iteritems():
        normalized_cats[unicode(cat)] = bool(is_in_cat)

    record = {'text': text, 'cats': normalized_cats, 'id': id}
    return JSONValueProtocol.write(None, record) + '\n'
def test_numerical_keys_become_strs(self):
    # JSON object keys are always strings, so an int dict key should
    # come back as a str after a write/read round trip.
    encoded = JSONValueProtocol.write(None, {3: 4})
    round_tripped = JSONValueProtocol.read(encoded)
    self.assertEqual((None, {'3': 4}), round_tripped)
def test_tuples_become_lists(self):
    # JSON has no tuple type, so a tuple should come back as a list
    # after a write/read round trip.
    encoded = JSONValueProtocol.write(None, (3, 4))
    round_tripped = JSONValueProtocol.read(encoded)
    self.assertEqual((None, [3, 4]), round_tripped)
def test_uses_json_format(self):
    # A value and the exact JSON text expected to represent it.
    json_text = '{"foo": {"bar": 3}, "baz": null, "quz": ["a", 1]}'
    python_value = {'foo': {'bar': 3}, 'baz': None, 'quz': ['a', 1]}

    # Reading yields a (key, value) pair with a null key ...
    decoded = JSONValueProtocol.read(json_text)
    self.assertEqual((None, python_value), decoded)

    # ... and writing produces exactly the expected JSON text.
    encoded = JSONValueProtocol.write(None, python_value)
    self.assertEqual(json_text, encoded)
def test_numerical_keys_become_strs(self):
    # JSON object keys are always strings, so an int dict key should
    # come back as a str after a write/read round trip.
    encoded = JSONValueProtocol.write(None, {3: 4})
    round_tripped = JSONValueProtocol.read(encoded)
    self.assertEqual((None, {'3': 4}), round_tripped)
def test_tuples_become_lists(self):
    # JSON has no tuple type, so a tuple should come back as a list
    # after a write/read round trip.
    encoded = JSONValueProtocol.write(None, (3, 4))
    round_tripped = JSONValueProtocol.read(encoded)
    self.assertEqual((None, [3, 4]), round_tripped)
def test_uses_json_format(self):
    # A value and the exact JSON text expected to represent it.
    json_text = '{"foo": {"bar": 3}, "baz": null, "quz": ["a", 1]}'
    python_value = {'foo': {'bar': 3}, 'baz': None, 'quz': ['a', 1]}

    # Reading yields a (key, value) pair with a null key ...
    decoded = JSONValueProtocol.read(json_text)
    self.assertEqual((None, python_value), decoded)

    # ... and writing produces exactly the expected JSON text.
    encoded = JSONValueProtocol.write(None, python_value)
    self.assertEqual(json_text, encoded)