def run(self):
    """Transform substance data records to JSON with a map/reduce pass."""
    # Inputs are JSON-line files; the mapper performs the record
    # conversion while the reducer passes results straight through.
    source = parallel.Collection.from_glob(
        self.input().path, parallel.JSONLineInput())
    parallel.mapreduce(
        source,
        mapper=SubstanceData2JSONMapper(),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path)
def run(self):
    """Re-shard the upstream JSON-line output into a single shard."""
    # Identity mapper and reducer: no records are changed; the pass
    # exists only to collapse everything into one file (num_shards=1).
    source = parallel.Collection.from_glob(
        self.input().path, parallel.JSONLineInput())
    parallel.mapreduce(
        source,
        mapper=parallel.IdentityMapper(),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path,
        num_shards=1)
def _run(self):
    """Export input JSON files in parallel to the output directory."""
    # NOTE(review): the mapper writes directly into output_dir; the
    # map/reduce result itself is discarded (NullReducer + NullOutput
    # into a temporary prefix).
    json_files = glob.glob(self.input().path + '/*/*.json')
    parallel.mapreduce(
        parallel.Collection.from_glob(json_files, parallel.JSONLineInput()),
        mapper=ParallelExportMapper(output_dir=self.output().path),
        reducer=parallel.NullReducer(),
        output_prefix=join(BASE_DIR, 'tmp'),
        output_format=parallel.NullOutput(),
        map_workers=12)
def run(self):
    """Annotate MAUDE records against the harmonized database."""
    # input()[0] is a sharded DB loaded fully into memory as a dict;
    # input()[1] is a directory of JSON-line files to annotate.
    harmonized_db = parallel.ShardedDB.open(self.input()[0].path).as_dict()
    source_files = glob.glob(self.input()[1].path + '/*.json')
    parallel.mapreduce(
        parallel.Collection.from_glob(source_files, parallel.JSONLineInput()),
        mapper=MaudeAnnotationMapper(harmonized_db=harmonized_db),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path,
        num_shards=10,
        map_workers=5)
def _run(self):
    """Reload each JSON file into the 'maude' Elasticsearch index."""
    json_dir = self.input()['data'].path
    for path in glob.glob(json_dir + '/*.json'):
        logging.info('Running file %s', path)
        # ReloadJSONMapper pushes documents to ES as a side effect; the
        # reduce output is unused, so it goes to a throwaway prefix via
        # NullOutput.
        parallel.mapreduce(
            parallel.Collection.from_glob(path, parallel.JSONLineInput()),
            mapper=index_util.ReloadJSONMapper(
                config.es_host(), self.index_name, 'maude'),
            reducer=parallel.IdentityReducer(),
            output_format=parallel.NullOutput(),
            output_prefix='/tmp/loadjson.' + self.index_name)