def reducer_init(self):
    """Load the term -> IDF table from every file in ``DIRECTORY``.

    Each line of each file is parsed with ``JSONValueProtocol.read`` and the
    decoded value is expected to carry a ``'term'`` and an ``'idf'`` key.
    The result is stored on ``self.idfs`` as ``{term: idf}``; when the same
    term appears more than once, the last occurrence read wins.

    NOTE(review): presumably invoked by mrjob once before the reducer runs
    -- confirm against the enclosing job class.
    """
    self.idfs = {}
    for fname in os.listdir(DIRECTORY):
        # Use a context manager so every handle is closed as soon as the
        # file has been consumed, even if a line fails to parse.
        with open(os.path.join(DIRECTORY, fname)) as idf_file:
            for line in idf_file:
                # read() returns a (key, value) pair; only the value is used.
                term_idf = JSONValueProtocol.read(line)[1]
                self.idfs[term_idf['term']] = term_idf['idf']
def parse_file(filename):
    """Count term occurrences in a file of JSON-encoded emails and print them.

    Every line of ``filename`` is decoded with ``JSONValueProtocol.read``;
    the decoded value's ``'text'`` entry is passed to ``get_terms`` and each
    returned term is tallied. One ``"<term> <count>"`` line is printed per
    distinct term, in dictionary iteration order.

    Args:
        filename: Path of the file to read, one JSON value per line.
    """
    words = defaultdict(int)
    # The handle is named ``email_file`` so the ``input`` builtin is not
    # shadowed.
    with open(filename) as email_file:
        for line in email_file:
            # read() returns a (key, value) pair; only the value is used.
            email = JSONValueProtocol.read(line)[1]
            for term in get_terms(email['text']):
                words[term] += 1
    for word, count in words.items():
        # A single pre-formatted argument prints the same text under both
        # the Python 2 print statement and the Python 3 print function.
        print("%s %s" % (word, count))
def reducer_init(self):
    """Fill ``self.idfs`` with term -> IDF values read from S3.

    An ``EMRJobRunner`` is built with the AWS credentials from
    ``self.options`` when both are set, and with no arguments otherwise.
    Every key listed under ``"s3://" + self.options.idf_loc`` is downloaded
    and each of its lines is decoded with ``JSONValueProtocol.read``; the
    decoded value's ``'term'`` maps to its ``'idf'``.
    """
    self.idfs = {}

    opts = self.options
    if opts.aws_access_key_id and opts.aws_secret_access_key:
        runner = EMRJobRunner(
            aws_access_key_id=opts.aws_access_key_id,
            aws_secret_access_key=opts.aws_secret_access_key)
    else:
        runner = EMRJobRunner()

    bucket_uri = "s3://" + opts.idf_loc
    for s3_key in runner.get_s3_keys(bucket_uri):
        # Fetch the complete object before splitting it into lines;
        # reading it in chunks could cut a line in the middle.
        contents = s3_key.get_contents_as_string()
        for raw_line in StringIO(contents):
            entry = JSONValueProtocol.read(raw_line)[1]
            self.idfs[entry['term']] = entry['idf']
def reducer_init(self):
    """Build the ``self.idfs`` lookup (term -> idf) from files stored on S3.

    Credentials from ``self.options`` are forwarded to ``EMRJobRunner`` only
    when both the access key id and the secret key are set; otherwise the
    runner is created without arguments. Each key under
    ``"s3://" + self.options.idf_loc`` is read in full and parsed line by
    line with ``JSONValueProtocol.read``.
    """
    self.idfs = {}

    # Collect the optional credentials as keyword arguments; an empty dict
    # is equivalent to calling EMRJobRunner() with no arguments.
    credentials = {}
    if self.options.aws_access_key_id and self.options.aws_secret_access_key:
        credentials = {
            'aws_access_key_id': self.options.aws_access_key_id,
            'aws_secret_access_key': self.options.aws_secret_access_key,
        }
    emr = EMRJobRunner(**credentials)

    for key in emr.get_s3_keys("s3://" + self.options.idf_loc):
        # The whole object is pulled into memory first so that iteration
        # always yields complete lines rather than arbitrary chunks.
        whole_file = StringIO(key.get_contents_as_string())
        for json_line in whole_file:
            parsed = JSONValueProtocol.read(json_line)
            term_idf = parsed[1]  # the value half of the (key, value) pair
            self.idfs[term_idf['term']] = term_idf['idf']
def test_numerical_keys_become_strs(self):
    """A numeric dict key comes back as a string after a write/read cycle."""
    # JSON object keys are always strings, so 3 is expected back as '3'.
    encoded = JSONValueProtocol.write(None, {3: 4})
    decoded = JSONValueProtocol.read(encoded)
    expected = (None, {'3': 4})
    self.assertEqual(expected, decoded)
def test_tuples_become_lists(self):
    """A tuple value comes back as a list after a write/read cycle."""
    # JSON has no tuple type, so (3, 4) is expected back as [3, 4].
    encoded = JSONValueProtocol.write(None, (3, 4))
    decoded = JSONValueProtocol.read(encoded)
    expected = (None, [3, 4])
    self.assertEqual(expected, decoded)
def test_uses_json_format(self):
    """The protocol reads and writes plain JSON text for the value."""
    value = {'foo': {'bar': 3}, 'baz': None, 'quz': ['a', 1]}
    encoded = '{"foo": {"bar": 3}, "baz": null, "quz": ["a", 1]}'

    # Decoding the JSON text yields the value paired with a None key.
    decoded = JSONValueProtocol.read(encoded)
    self.assertEqual((None, value), decoded)

    # Encoding the value (with a None key) yields exactly the JSON text.
    written = JSONValueProtocol.write(None, value)
    self.assertEqual(encoded, written)
"""Print how often each term occurs in a file of JSON-encoded emails.

The path of the input file is taken from the first command-line argument.
Each line is decoded with ``JSONValueProtocol.read``; the decoded value's
``'text'`` entry is passed to ``get_terms`` and every returned term is
counted. One ``"<term> <count>"`` line is printed per distinct term.
"""
import re
import sys
from collections import defaultdict

from mrjob.protocol import JSONValueProtocol

from term_tools import get_terms

words = defaultdict(int)

# Open the input with a context manager so the handle is always closed;
# naming it ``email_file`` also avoids shadowing the ``input`` builtin.
with open(sys.argv[1]) as email_file:
    for line in email_file:
        # read() returns a (key, value) pair; only the value is used.
        email = JSONValueProtocol.read(line)[1]
        for term in get_terms(email['text']):
            words[term] += 1

for word, count in words.items():
    # A single pre-formatted argument prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print("%s %s" % (word, count))