def test_uses_json_format(self):
    """A key/value pair round-trips through JSONProtocol as
    tab-separated JSON (bytes encoding)."""
    key = ['a', 1]
    value = {'foo': 'bar'}
    encoded = b'["a", 1]\t{"foo": "bar"}'

    protocol = JSONProtocol()
    self.assertEqual((key, value), protocol.read(encoded))
    self.assertEqual(encoded, protocol.write(key, value))
def test_uses_json_format(self):
    """Nested dicts and None round-trip through JSONProtocol as
    tab-separated JSON (str encoding; None maps to JSON null)."""
    key = ['a', 1]
    value = {'foo': {'bar': 3}, 'baz': None}
    encoded = '["a", 1]\t{"foo": {"bar": 3}, "baz": null}'

    protocol = JSONProtocol()
    self.assertEqual((key, value), protocol.read(encoded))
    self.assertEqual(encoded, protocol.write(key, value))
def test_numerical_keys_become_strs(self):
    """JSON object keys are always strings, so int dict keys
    come back as str after a write/read round trip."""
    protocol = JSONProtocol()
    encoded = protocol.write({1: 2}, {3: 4})
    self.assertEqual(({'1': 2}, {'3': 4}), protocol.read(encoded))
def test_bad_keys_and_values(self):
    """Keys/values with no JSON representation must fail to encode."""
    protocol = JSONProtocol()
    # JSON object keys must be strings; a tuple key has no encoding
    self.assertCantEncode(protocol, {(1, 2): 3}, None)
    # only unicode strings (or bytes in utf-8) are allowed
    self.assertCantEncode(protocol, '0\xa2', '\xe9')
    # JSON has no set type
    self.assertCantEncode(protocol, set([1]), set())
    # arbitrary objects like Point have no JSON representation
    self.assertCantEncode(protocol, Point(2, 3), Point(1, 4))
def test_encode(self):
    """An encoded LinearRegression must be serializable by mrjob's
    JSONProtocol (used as the internal protocol between steps)."""
    factory = LinearRegressionFactory(11)
    model = factory.get_instance()
    encoded = factory.encode(model)
    protocol = JSONProtocol()
    # write() raises if `encoded` is not JSON-serializable, which is the
    # real check here. NOTE: this used the Python 2 print statement
    # (`print x`), a SyntaxError under Python 3; print() works in both.
    print(protocol.write(0, encoded))
def encode_node(node_id, links=None, score=1):
    """Serialize one graph node as a JSONProtocol line.

    :param node_id: key written for this node
    :param links: optional mapping of neighbor -> weight; emitted as a
                  sorted list of pairs, and omitted entirely when empty
    :param score: node score (defaults to 1)
    :return: the encoded record with a trailing newline
    """
    payload = {}
    # insertion order matters for the serialized output: links first,
    # then score, matching the existing on-disk format
    if links:
        payload['links'] = sorted(links.items())
    payload['score'] = score
    return JSONProtocol().write(node_id, payload) + '\n'
def parse_output(self, protocol=None):
    """.. deprecated:: 0.4.2

    Parse the output from the given sandboxed job's ``self.stdout``.

    This was only useful for testing individual mappers/reducers
    without using a runner; normally you'd just use
    :py:meth:`runner.stream_output()
    <mrjob.runner.MRJobRunner.stream_output()>`

    :type protocol: protocol
    :param protocol: A protocol instance to use.  Defaults to
                     ``JSONProtocol()``.
    """
    # only callable on a sandboxed job, where stdout is a StringIO
    if self.stdout == sys.stdout:
        raise AssertionError('You must call sandbox() first;'
                             ' parse_output() is for testing only.')

    log.warning(
        'parse_output() is deprecated and will be removed in v0.5.0')

    reader = JSONProtocol() if protocol is None else protocol
    captured = StringIO(self.stdout.getvalue())
    return [reader.read(line) for line in captured]
def test_encode(self):
    """Check that an encoded network can be serialized by mrjob's
    JSONProtocol (used as the internal protocol between steps)."""
    layer_sizes = [3, 2, 1]
    factory = PredictionNNFactory(layer_sizes)
    network = factory.get_instance()

    encoded = factory.encode(network)

    # write() raises if `encoded` is not JSON-serializable
    JSONProtocol().write("test_decode", encoded)
def test_decode(self):
    """Round-trip a LinearRegression through
    encode -> JSONProtocol.write -> JSONProtocol.read -> decode."""
    factory = LinearRegressionFactory(11)
    model = factory.get_instance()
    obj_encoded = factory.encode(model)

    protocol = JSONProtocol()
    json_encoded = protocol.write(0, obj_encoded)
    key_value = protocol.read(json_encoded)  # (key, value) pair

    decoded = factory.decode([key_value[1]])
    # isinstance() instead of `type(x) == T`: idiomatic and subclass-safe
    assert isinstance(decoded, list), "decoded not as a list"
    assert isinstance(decoded[0], LinearRegression), \
        "decoded not as LinearRegression"
def test_decode(self):
    """Round-trip a network through
    encode -> JSONProtocol.write -> JSONProtocol.read -> decode."""
    layer_sizes = [3, 2, 1]
    factory = PredictionNNFactory(layer_sizes)
    network = factory.get_instance()
    obj_encoded = factory.encode(network)

    protocol = JSONProtocol()
    json_encoded = protocol.write("test_decode", obj_encoded)
    key_value = protocol.read(json_encoded)  # (key, value) pair

    decoded = factory.decode([key_value[1]])
    # isinstance() instead of `type(x) == T`: idiomatic and subclass-safe
    assert isinstance(decoded, list), "decoded not as a list"
    # message previously said "LinearRegression" (copy-pasted from the
    # linear-regression test) even though the check is MultilayerPerceptron
    assert isinstance(decoded[0], MultilayerPerceptron), \
        "decoded not as MultilayerPerceptron"
# S3 locations for intermediate job input/output.
#s3_input_path = "s3://joeloren//iceval_out//input//datasets//"
tmp_dir_out = "s3://joeloren/interim_out/"
tmp_dir_in = "s3://joeloren/interim_in/"
tmp_dir_in_relative = "interim_in/"
tmp_dir_out_relative = "interim_out/"

import os
import sys

from mrjob.protocol import JSONValueProtocol, JSONProtocol

jvp = JSONValueProtocol()
jp = JSONProtocol()

from boto.s3.connection import S3Connection

# SECURITY: an AWS access key and secret were previously hard-coded on
# this line (and are therefore compromised -- rotate them). Read the
# credentials from the environment instead; when both are None, boto
# falls back to its own config/environment lookup.
c = S3Connection(os.environ.get('AWS_ACCESS_KEY_ID'),
                 os.environ.get('AWS_SECRET_ACCESS_KEY'))
bucket = c.get_bucket("joeloren")
datasets_bucket = c.get_bucket('joel_datasets')
def input_protocol(self):
    """Select the input protocol for the job being run.

    The 'stats' job consumes the JSON output of the count job;
    every other job reads raw text lines from cdx files.
    """
    if self.options.job_to_run == 'stats':
        LOG.debug('Reading JSON input from count job')
        return JSONProtocol()
    LOG.debug('Reading text input from cdx files')
    return RawValueProtocol()
["Ut", "pulvinar", "lectus", "quis", "feugiat", "adipiscing"], ["Nunc", "vulputate", "mauris", "congue", "diam", "ultrices", "aliquet"], ["Nulla", "pharetra", "laoreet", "est", "quis", "vestibulum"], ["Quisque", "feugiat", "pharetra", "sagittis"], ["Phasellus", "nulla", "massa", "sodales", "a", "suscipit", "blandit", "facilisis", "eu", "augue"], ["Cras", "mi", "massa", "ullamcorper", "nec", "tristique", "at", "convallis", "quis", "eros"], ["Mauris", "non", "fermentum", "lacus", "vitae", "tristique", "tellus"], ["In", "volutpat", "metus", "augue", "nec", "laoreet", "ante", "hendrerit", "vitae"], ["Vivamus", "id", "lacus", "nec", "orci", "tristique", "vulputate"] ] logging.basicConfig(level=logging.INFO) mr_job = MRWordCounter() ## JSONValueProtocol doesn't need a key #mr_job.stdin = [JSONValueProtocol().write(None, line) for line in TEXT] ## JSONProtocol wants also a key mr_job.stdin = [JSONProtocol().write(linenum, line) for linenum, line in enumerate(TEXT)] result = {} with mr_job.make_runner() as runner: runner.run() for line in runner.stream_output(): key, value = mr_job.parse_output_line(line) #print "Line: ", key, " Count: ", value result[key] = value # Print the output in JSON print json.dumps(result)
def test_round_trip(self):
    """Every JSON-safe key/value pair survives write() then read()."""
    protocol = JSONProtocol()
    for key, value in JSON_KEYS_AND_VALUES:
        self.assertRoundTripOK(protocol, key, value)
NUMBER_RE = re.compile(r"[-?\d']+")

input_file = 'sample_input.txt'

# Read the edge list: each line is "<node_id>" (dangling node) or
# "<node_id> <adjacent_id>". NOTE: the input handle was previously
# named `out_file`, shadowing the real output handle below.
with open(input_file, 'r') as in_file:
    data = [x.split() for x in in_file.read().splitlines()]

# adjacency map: node id -> list of successor ids ([] for dangling
# nodes, which JSONProtocol serializes as an empty list)
nodes = {}
for line in data:
    nodes[int(line[0])] = []
for line in data:
    if line[1:]:
        # one edge per line; fields beyond the second are ignored
        # (line[1:][0] simplified to line[1] -- same element)
        nodes[int(line[0])].append(int(line[1]))
    else:
        # dangling node: reset to empty, matching original behavior
        # when the same id appears on several lines
        nodes[int(line[0])] = []

unique_node_count = len(nodes.keys())
initial_pagerank = 1 / unique_node_count

# Emit one JSONProtocol record per node:
# key = node id, value = (adjacency list, initial rank).
# The protocol was previously instantiated twice; once is enough.
j = JSONProtocol()
with open("preprocessed_" + input_file, "wb+") as out_file:
    for _id, adj in nodes.items():
        out_file.write(j.write(_id, (adj, initial_pagerank)))
        out_file.write('\n'.encode('utf-8'))
def test_bad_data(self):
    """Malformed JSON input must fail to decode."""
    garbage = '{@#$@#!^&*$%^'
    self.assertCantDecode(JSONProtocol(), garbage)
def INTERNAL_PROTOCOL(self):
    """Pass key/value pairs between steps as tab-separated JSON."""
    protocol = JSONProtocol()
    return protocol
def test_round_trip_with_trailing_tab(self):
    """Encoded lines with a trailing tab still round-trip cleanly."""
    protocol = JSONProtocol()
    for key, value in JSON_KEYS_AND_VALUES:
        self.assertRoundTripWithTrailingTabOK(protocol, key, value)
def INPUT_PROTOCOL(self):
    """Read job input as tab-separated JSON key/value pairs."""
    protocol = JSONProtocol()
    return protocol
def test_tuples_become_lists(self):
    """JSON has no tuple type, so tuples decode back as lists."""
    protocol = JSONProtocol()
    encoded = protocol.write((1, 2), (3, 4))
    self.assertEqual(([1, 2], [3, 4]), protocol.read(encoded))