Exemplo n.º 1
0
    def test_jsonline_output(self):
        """Check that a JSON-line output run exposes every input file as a key.

        Runs ``self.run_mr`` over ten ``'hello'`` inputs with ``KeyMapper``,
        ``parallel.FilenameInput`` and ``parallel.JSONLineOutput``, then
        asserts that each of the ten expected input paths is present in the
        returned results.
        """
        results = self.run_mr(
            '/tmp/test-jsonline-output/',
            ['hello'] * 10,
            mapper=KeyMapper(),
            input_format=parallel.FilenameInput(),
            output_format=parallel.JSONLineOutput(),
        )

        # One expected key per input, numbered 0..9.
        expected_keys = [
            '/tmp/test-jsonline-output/input/%d.txt' % index
            for index in range(10)
        ]
        for expected in expected_keys:
            assert expected in results
Exemplo n.º 2
0
    def run(self):
        """Join the input databases into JSON-line output via map-reduce.

        Loads a pruned in-memory index from the ``'setid'`` input, collects
        the paths of the ``'ndc'``, ``'spl'``, ``'unii'``, ``'rxnorm'`` and
        ``'upc'`` inputs, and runs ``parallel.mapreduce`` over them with
        ``JoinAllMapper`` / ``JoinAllReducer``, writing to
        ``self.output().path``.
        """
        setid_path = self.input()['setid'].path
        logging.info('Joining data from setid DB: %s', setid_path)
        setid_store = parallel.ShardedDB.open(setid_path)

        # Only one entry per key is needed from the index, so it is pruned
        # here to keep the mapper simple. The index is also used to find the
        # latest version of SPL, which makes it a bit clunky for this case.
        index_db = {
            db_key: entries[0][1]
            for db_key, entries in setid_store.range_iter(None, None)
        }

        source_names = ('ndc', 'spl', 'unii', 'rxnorm', 'upc')
        db_list = [self.input()[name].path for name in source_names]
        logging.info('DB %s', db_list)

        parallel.mapreduce(
            parallel.Collection.from_sharded_list(db_list),
            mapper=JoinAllMapper(index_db=index_db),
            reducer=JoinAllReducer(),
            output_prefix=self.output().path,
            output_format=parallel.JSONLineOutput(),
        )
Exemplo n.º 3
0
 def output_format(self):
     """Return a newly created ``parallel.JSONLineOutput`` instance."""
     json_line_format = parallel.JSONLineOutput()
     return json_line_format