def test_numerical_keys_become_strs(self):
    """JSON object keys are always strings, so numeric dict keys round-trip as strs."""
    protocol = JSONValueProtocol()
    encoded = protocol.write(None, {3: 4})
    self.assertEqual((None, {'3': 4}), protocol.read(encoded))
def test_bad_keys_and_values(self):
    """Values that JSON cannot represent should fail to encode."""
    protocol = JSONValueProtocol()
    # JSON object keys must be strings; a tuple key cannot be encoded
    self.assertCantEncode(protocol, None, {(1, 2): 3})
    # only unicode strings (or bytes that decode as utf-8) are allowed
    self.assertCantEncode(protocol, None, '\xe9')
    # JSON has no set type
    self.assertCantEncode(protocol, None, set())
    # the Point class has no representation in JSON
    self.assertCantEncode(protocol, None, Point(1, 4))
def reducer_init(self):
    """Load term -> IDF mappings from every JSON file in DIRECTORY.

    Populates self.idfs so the reducer can look up each term's
    inverse document frequency.
    """
    self.idfs = {}
    # read() is an instance method on mrjob protocols (see the sibling
    # reducer_init that instantiates), so build one protocol up front
    protocol = JSONValueProtocol()
    for fname in os.listdir(DIRECTORY):  # look through files in the directory
        # with-block closes each file instead of leaking the handle
        with open(os.path.join(DIRECTORY, fname)) as f:
            for line in f:  # read each line in the json file
                # read() returns a (key, value) pair; the value is the JSON object
                term_idf = protocol.read(line)[1]
                self.idfs[term_idf['term']] = term_idf['idf']
def reducer_init(self):
    """Load term -> IDF mappings from every JSON file in DIRECTORY.

    Populates self.idfs so the reducer can look up each term's
    inverse document frequency.
    """
    self.idfs = {}
    # hoist: one stateless protocol instance instead of one per line
    protocol = JSONValueProtocol()
    for fname in os.listdir(DIRECTORY):  # look through files in the directory
        # with-block closes each file instead of leaking the handle
        with open(os.path.join(DIRECTORY, fname)) as f:
            for line in f:  # read each line in the json file
                # read() returns a (key, value) pair; the value is the JSON object
                term_idf = protocol.read(line)[1]
                self.idfs[term_idf['term']] = term_idf['idf']
def encode_document(doc_id, text):
    """Encode a document as a JSON dictionary so that MRNgramIDFUtility
    can read it.

    We intend to use `doc_id` as a business/product/entity ID rather than
    the ID of an individual review.
    """
    # write() is an instance method on mrjob protocols, so instantiate
    # the protocol rather than calling through the class
    return JSONValueProtocol().write(
        None, {'doc_id': doc_id, 'text': text})
def root_to_json(root_dir, output_file):
    """Walk the email tree under root_dir and write one JSON value per
    line to output_file.
    """
    walker = EmailWalker(root_dir)
    # write() is an instance method on mrjob protocols
    protocol = JSONValueProtocol()
    # context manager guarantees the file is closed even if a write fails
    with open(output_file, "w") as output:
        for email in walker:
            # datetime objects aren't JSON-able; stringify before encoding
            email['date'] = str(email['date'])
            output.write(protocol.write(None, email) + '\n')
def parse_file(filename): words = defaultdict(lambda: 0) with open(filename) as input: for line in input: email = JSONValueProtocol.read(line)[1] for term in get_terms(email['text']): words[term] += 1 for word, count in words.items(): print word, count
def data(self, minimum=1, **kw):
    """Run the word-frequency job over TEXT and return the counts that
    are at least `minimum`, as {'data': [[word, count], ...]}.
    """
    job = MRWordFreqJSON()
    protocol = JSONValueProtocol()
    job.stdin = [protocol.write(None, line) for line in TEXT]
    rows = []
    with job.make_runner() as runner:
        runner.run()
        for output_line in runner.stream_output():
            word, count = job.parse_output_line(output_line)
            if int(count) >= int(minimum):
                rows.append([word, count])
    return dict(data=rows)
def encode_document(text, cats=None, id=None):
    """Encode a document as a JSON so that MRTextClassifier can read it.

    Args:
    text -- the text of the document (as a unicode)
    cats -- a dictionary mapping a category name (e.g. 'sports') to True
        if the document is in the category, and False if it's not. None
        indicates that we have no information about this document's
        categories
    id -- a unique ID for the document (any kind of JSON-able value
        should work). If not specified, we'll auto-generate one.
    """
    text = unicode(text, errors="ignore")
    cats = dict((unicode(cat), bool(is_in_cat))
                for cat, is_in_cat in (cats or {}).iteritems())
    # write() is an instance method on mrjob protocols, so instantiate
    # the protocol rather than calling through the class
    return JSONValueProtocol().write(
        None,
        {"document": text, "cats": cats, "docid": id,
         "type": "document"}) + "\n"
def reducer_init(self):
    """Load term -> IDF mappings from the S3 location in --idf-loc.

    Populates self.idfs so the reducer can look up each term's
    inverse document frequency.
    """
    self.idfs = {}
    # Iterate through the files in the bucket provided by the user
    if self.options.aws_access_key_id and self.options.aws_secret_access_key:
        emr = EMRJobRunner(
            aws_access_key_id=self.options.aws_access_key_id,
            aws_secret_access_key=self.options.aws_secret_access_key)
    else:
        # fall back to boto's normal credential discovery
        emr = EMRJobRunner()
    # read() is an instance method on mrjob protocols
    protocol = JSONValueProtocol()
    for key in emr.get_s3_keys("s3://" + self.options.idf_loc):
        # Load the whole file first, then read it line-by-line: otherwise,
        # chunks may not be even lines
        for line in StringIO(key.get_contents_as_string()):
            term_idf = protocol.read(line)[1]  # parse the line as JSON
            self.idfs[term_idf['term']] = term_idf['idf']
def reducer_init(self):
    """Load term -> IDF mappings from the S3 location in --idf-loc.

    Populates self.idfs so the reducer can look up each term's
    inverse document frequency.
    """
    self.idfs = {}
    # Iterate through the files in the bucket provided by the user
    if self.options.aws_access_key_id and self.options.aws_secret_access_key:
        emr = EMRJobRunner(
            aws_access_key_id=self.options.aws_access_key_id,
            aws_secret_access_key=self.options.aws_secret_access_key)
    else:
        # fall back to boto's normal credential discovery
        emr = EMRJobRunner()
    # read() is an instance method on mrjob protocols
    protocol = JSONValueProtocol()
    for key in emr.get_s3_keys("s3://" + self.options.idf_loc):
        # Load the whole file first, then read it line-by-line: otherwise,
        # chunks may not be even lines
        for line in StringIO(key.get_contents_as_string()):
            term_idf = protocol.read(line)[1]  # parse the line as JSON
            self.idfs[term_idf['term']] = term_idf['idf']
def encode_document(text, cats=None, id=None):
    """Encode a document as a JSON so that MRTextClassifier can read it.

    Args:
    text -- the text of the document (as a unicode)
    cats -- a dictionary mapping a category name (e.g. 'sports') to True
        if the document is in the category, and False if it's not. None
        indicates that we have no information about this document's
        categories
    id -- a unique ID for the document (any kind of JSON-able value
        should work). If not specified, we'll auto-generate one.
    """
    text = unicode(text)
    cats = dict((unicode(cat), bool(is_in_cat))
                for cat, is_in_cat in (cats or {}).iteritems())
    # write() is an instance method on mrjob protocols, so instantiate
    # the protocol rather than calling through the class
    return JSONValueProtocol().write(
        None, {'text': text, 'cats': cats, 'id': id}) + '\n'
def test_numerical_keys_become_strs(self):
    # JSON should convert numbers to strings when they are dict keys.
    # read()/write() are instance methods on mrjob protocols, so use an
    # instance (consistent with the other protocol tests).
    protocol = JSONValueProtocol()
    self.assertEqual(
        (None, {'3': 4}),
        protocol.read(protocol.write(None, {3: 4})))
def test_tuples_become_lists(self):
    # JSON should convert tuples into lists.
    # read()/write() are instance methods on mrjob protocols, so use an
    # instance (consistent with the other protocol tests).
    protocol = JSONValueProtocol()
    self.assertEqual(
        (None, [3, 4]),
        protocol.read(protocol.write(None, (3, 4))))
def test_uses_json_format(self):
    # A value round-trips through the documented JSON encoding.
    # read()/write() are instance methods on mrjob protocols, so use an
    # instance (consistent with the other protocol tests).
    VALUE = {'foo': {'bar': 3}, 'baz': None, 'quz': ['a', 1]}
    ENCODED = '{"foo": {"bar": 3}, "baz": null, "quz": ["a", 1]}'
    protocol = JSONValueProtocol()
    self.assertEqual((None, VALUE), protocol.read(ENCODED))
    self.assertEqual(ENCODED, protocol.write(None, VALUE))
#s3_input_path = "s3://joeloren//iceval_out//input//datasets//"
tmp_dir_out = "s3://joeloren/interim_out/"
tmp_dir_in = "s3://joeloren/interim_in/"
tmp_dir_in_relative = "interim_in/"
tmp_dir_out_relative = "interim_out/"

from mrjob.protocol import JSONValueProtocol, JSONProtocol

jvp = JSONValueProtocol()
jp = JSONProtocol()

from boto.s3.connection import S3Connection
import sys

# SECURITY: an AWS access key/secret pair used to be hard-coded here.
# Credentials must never be committed to source; with no arguments,
# S3Connection falls back to boto's standard credential discovery
# (environment variables or ~/.boto config).
c = S3Connection()
bucket = c.get_bucket("joeloren")
datasets_bucket = c.get_bucket('joel_datasets')
def test_uses_json_format(self):
    """A (key, value) pair round-trips through the documented JSON bytes."""
    decoded = (None, {'foo': 'bar'})
    raw = b'{"foo": "bar"}'
    protocol = JSONValueProtocol()
    self.assertEqual(decoded, protocol.read(raw))
    self.assertEqual(raw, protocol.write(None, {'foo': 'bar'}))
def test_bad_data(self):
    """Garbage that is not valid JSON must fail to decode."""
    protocol = JSONValueProtocol()
    self.assertCantDecode(protocol, '{@#$@#!^&*$%^')
def test_round_trip_with_trailing_tab(self):
    """Every sample value survives a round trip with a trailing tab."""
    protocol = JSONValueProtocol()
    for _, value in JSON_KEYS_AND_VALUES:
        self.assertRoundTripWithTrailingTabOK(protocol, None, value)
def test_round_trip(self):
    """Every sample value survives an encode/decode round trip."""
    protocol = JSONValueProtocol()
    for _, value in JSON_KEYS_AND_VALUES:
        self.assertRoundTripOK(protocol, None, value)
import re import sys from collections import defaultdict from mrjob.protocol import JSONValueProtocol from term_tools import get_terms input = open(sys.argv[1]) words = defaultdict(lambda: 0) for line in input: email = JSONValueProtocol.read(line)[1] for term in get_terms(email['text']): words[term] += 1 for word, count in words.items(): print word, count