def test_jsonline_output(self):
    """Check that a JSON-lines map-reduce run reports every input file.

    Runs ``self.run_mr`` over ten ``'hello'`` inputs using ``KeyMapper``,
    ``FilenameInput`` and ``JSONLineOutput``, then asserts that the path of
    each of the ten expected input files is contained in the results.
    """
    work_dir = '/tmp/test-jsonline-output/'
    inputs = ['hello'] * 10

    results = self.run_mr(
        work_dir,
        inputs,
        mapper=KeyMapper(),
        input_format=parallel.FilenameInput(),
        output_format=parallel.JSONLineOutput(),
    )

    # One membership check per expected input file, failing on the first
    # path that is missing from the results.
    for index in range(10):
        expected_key = '/tmp/test-jsonline-output/input/%d.txt' % index
        assert expected_key in results
def run(self):
    """Join the input databases and write the result as JSON lines.

    Builds an in-memory index from the 'setid' sharded DB, then runs a
    map-reduce over the 'ndc', 'spl', 'unii', 'rxnorm' and 'upc' inputs
    with ``JoinAllMapper`` / ``JoinAllReducer``, writing to this task's
    output path.
    """
    setid_path = self.input()['setid'].path
    logging.info('Joining data from setid DB: %s', setid_path)
    setid_db = parallel.ShardedDB.open(setid_path)

    # Only one entry per key is needed from the index, so it is pruned here
    # to keep the mapper simple. The index is also used to find the latest
    # version of SPL, which makes it a bit clunky for this particular case.
    index_db = {
        key: entries[0][1]
        for key, entries in setid_db.range_iter(None, None)
    }

    # Collect the on-disk path of every database that takes part in the join.
    db_list = []
    for source in ('ndc', 'spl', 'unii', 'rxnorm', 'upc'):
        db_list.append(self.input()[source].path)
    logging.info('DB %s', db_list)

    collection = parallel.Collection.from_sharded_list(db_list)
    join_mapper = JoinAllMapper(index_db=index_db)
    join_reducer = JoinAllReducer()
    parallel.mapreduce(
        collection,
        mapper=join_mapper,
        reducer=join_reducer,
        output_prefix=self.output().path,
        output_format=parallel.JSONLineOutput(),
    )
def output_format(self):
    """Return a new ``parallel.JSONLineOutput`` instance as the output format."""
    json_line_format = parallel.JSONLineOutput()
    return json_line_format