예제 #1
0
 def run(self):
   """Run a map-reduce from this task's input to its output prefix.

   The input collection is built from ``self.input().path`` with
   ``parallel.JSONLineInput``; every record goes through
   ``SubstanceData2JSONMapper`` and ``parallel.IdentityReducer``, and the
   result is written using ``self.output().path`` as the output prefix.
   """
   source = parallel.Collection.from_glob(
       self.input().path, parallel.JSONLineInput())
   to_json = SubstanceData2JSONMapper()
   passthrough = parallel.IdentityReducer()
   parallel.mapreduce(source,
                      mapper=to_json,
                      reducer=passthrough,
                      output_prefix=self.output().path)
예제 #2
0
 def run(self):
     """Copy the input records into a single output shard.

     Builds a collection from ``self.input().path`` (read with
     ``parallel.JSONLineInput``) and runs it through the identity mapper
     and identity reducer, requesting ``num_shards=1`` under the
     ``self.output().path`` prefix.
     """
     source = parallel.Collection.from_glob(
         self.input().path, parallel.JSONLineInput())
     # Keyword options are collected in call order so that argument
     # evaluation happens in the same sequence as a direct call.
     options = {
         'mapper': parallel.IdentityMapper(),
         'reducer': parallel.IdentityReducer(),
         'output_prefix': self.output().path,
         'num_shards': 1,
     }
     parallel.mapreduce(source, **options)
예제 #3
0
 def _run(self):
     """Run ``ParallelExportMapper`` over every nested JSON input file.

     Files matching ``<input path>/*/*.json`` are collected and handed to
     ``parallel.mapreduce`` with a ``NullReducer`` and ``NullOutput``; the
     mapper receives ``self.output().path`` as its ``output_dir``.
     NOTE(review): presumably the mapper writes the export files itself,
     which would explain the null reducer/output -- confirm against
     ``ParallelExportMapper``.
     """
     pattern = self.input().path + '/*/*.json'
     files = glob.glob(pattern)
     # NOTE(review): a list (not a pattern string) is passed to from_glob;
     # assumes that helper accepts a list of file names -- confirm.
     source = parallel.Collection.from_glob(files, parallel.JSONLineInput())
     exporter = ParallelExportMapper(output_dir=self.output().path)
     parallel.mapreduce(source,
                        mapper=exporter,
                        reducer=parallel.NullReducer(),
                        output_prefix=join(BASE_DIR, 'tmp'),
                        output_format=parallel.NullOutput(),
                        map_workers=12)
예제 #4
0
 def run(self):
     """Annotate JSON records using the harmonized database.

     ``self.input()[0].path`` is opened as a ``parallel.ShardedDB`` and
     converted with ``as_dict()``; the ``*.json`` files directly under
     ``self.input()[1].path`` form the input collection. Records are
     mapped by ``MaudeAnnotationMapper`` (given the harmonized dict),
     identity-reduced, and written under ``self.output().path`` with
     ``num_shards=10`` and ``map_workers=5``.
     """
     harmonized_input, json_input = self.input()[0], self.input()[1]
     # The lookup table is loaded before the input files are globbed.
     lookup = parallel.ShardedDB.open(harmonized_input.path).as_dict()
     json_files = glob.glob(json_input.path + '/*.json')
     source = parallel.Collection.from_glob(
         json_files, parallel.JSONLineInput())
     annotator = MaudeAnnotationMapper(harmonized_db=lookup)
     parallel.mapreduce(source,
                        mapper=annotator,
                        reducer=parallel.IdentityReducer(),
                        output_prefix=self.output().path,
                        num_shards=10,
                        map_workers=5)
예제 #5
0
 def _run(self):
     """Run one map-reduce per JSON file found in the 'data' input.

     For each ``*.json`` file under ``self.input()['data'].path`` the file
     name is logged and a separate ``parallel.mapreduce`` is started with
     an ``index_util.ReloadJSONMapper`` built from ``config.es_host()``,
     ``self.index_name`` and ``'maude'``. Output uses ``NullOutput`` with
     the prefix ``'/tmp/loadjson.' + self.index_name``.
     """
     data_dir = self.input()['data'].path
     for file_name in glob.glob(data_dir + '/*.json'):
         logging.info('Running file %s', file_name)
         source = parallel.Collection.from_glob(
             file_name, parallel.JSONLineInput())
         # A fresh mapper is built for every file, as in a direct call.
         loader = index_util.ReloadJSONMapper(
             config.es_host(), self.index_name, 'maude')
         parallel.mapreduce(
             source,
             mapper=loader,
             reducer=parallel.IdentityReducer(),
             output_format=parallel.NullOutput(),
             output_prefix='/tmp/loadjson.' + self.index_name)