def _run(self):
    """Bulk-load sharded JSON output into Elasticsearch and stamp run metadata.

    Reads the sharded JSON directory produced by the upstream 'data' task,
    streams it into ES via a parallel mapreduce, then records the run date in
    the metadata index (twice, deliberately — see note below) and optionally
    optimizes the index.
    """
    json_dir = self.input()['data'].path

    json_loader = LoadJSONMapper(
        config.es_host(),
        index_name=self.index_name,
        type_name=self.type_name,
        docid_key=self.docid_key,
        incremental=self.use_checksum)

    parallel.mapreduce(
        parallel.Collection.from_sharded(json_dir),
        mapper=json_loader,
        reducer=parallel.IdentityReducer(),
        output_format=parallel.NullOutput(),
        map_workers=self.load_json_workers,
        num_shards=1,
        output_prefix=config.tmp_dir('%s/load-json' % self.index_name))

    # update metadata index
    elasticsearch_requests.update_process_datetime(
        config.es_client(), self.index_name,
        arrow.utcnow().format('YYYY-MM-DD'))

    # optimize index, if requested
    if self.optimize_index:
        optimize_index(self.index_name, wait_for_merge=False)

    # update metadata index again. Trying to solve mystery of missing
    # "last_update_date" entries...
    elasticsearch_requests.update_process_datetime(
        config.es_client(), self.index_name,
        arrow.utcnow().format('YYYY-MM-DD'))
def _run(self):
    """Bulk-load sharded JSON into Elasticsearch, record metadata, and refresh.

    After the parallel load, writes both the run date and the data's
    last-update date to the metadata index, refreshes the target index so new
    documents are searchable, and optionally optimizes it.
    """
    json_dir = self.input()['data'].path

    es_mapper = LoadJSONMapper(
        config.es_host(),
        index_name=self.index_name,
        type_name=self.type_name,
        docid_key=self.docid_key,
        incremental=self.use_checksum)

    parallel.mapreduce(
        parallel.Collection.from_sharded(json_dir),
        mapper=es_mapper,
        reducer=parallel.IdentityReducer(),
        output_format=parallel.NullOutput(),
        map_workers=self.load_json_workers,
        num_shards=1,
        output_prefix=config.tmp_dir('%s/load-json' % self.index_name))

    # update metadata index
    # last_update_date may be either a plain value or a callable on the task.
    update_date = (self.last_update_date()
                   if callable(self.last_update_date)
                   else self.last_update_date)
    elasticsearch_requests.update_process_datetime(
        config.es_client(), self.index_name,
        last_run_date=arrow.utcnow().format('YYYY-MM-DD'),
        last_update_date=update_date)

    # Refresh the index to make the documents visible to searches.
    refresh_index(self.index_name)

    # optimize index, if requested
    if self.optimize_index:
        optimize_index(self.index_name, wait_for_merge=False)
def _run(self):
    """Export every per-shard JSON file to the output directory in parallel.

    Globs two levels deep under the input path for JSON line files and fans
    them out to ParallelExportMapper workers; the reduce/output stages are
    no-ops because the mapper writes results itself.
    """
    source_files = glob.glob(self.input().path + '/*/*.json')
    parallel.mapreduce(
        parallel.Collection.from_glob(source_files, parallel.JSONLineInput()),
        mapper=ParallelExportMapper(output_dir=self.output().path),
        reducer=parallel.NullReducer(),
        output_format=parallel.NullOutput(),
        output_prefix=join(BASE_DIR, 'tmp'),
        map_workers=12)
def _run(self):
    """Reload each JSON file from the upstream 'data' output into the
    'maude' type of the target Elasticsearch index.

    Files are processed one at a time: each gets its own mapreduce pass
    through ReloadJSONMapper, with identity reduce and no output stage
    (the mapper performs the ES writes itself).

    NOTE(review): every iteration reuses the same output_prefix; this looks
    intentional (NullOutput writes nothing), but confirm the per-file runs
    cannot collide on temp state.
    """
    json_dir = self.input()['data'].path
    input_glob = glob.glob(json_dir + '/*.json')
    for file_name in input_glob:
        logging.info('Running file %s', file_name)
        parallel.mapreduce(
            parallel.Collection.from_glob(file_name, parallel.JSONLineInput()),
            mapper=index_util.ReloadJSONMapper(
                config.es_host(), self.index_name, 'maude'),
            reducer=parallel.IdentityReducer(),
            output_format=parallel.NullOutput(),
            # Consistency fix: route temp output through config.tmp_dir()
            # like the other load tasks in this file, instead of a
            # hard-coded '/tmp/loadjson.<index>' path that ignores the
            # configured temp location.
            output_prefix=config.tmp_dir('%s/load-json' % self.index_name))
def _run(self):
    """Bulk-load sharded JSON into Elasticsearch and record the run date.

    Same load path as the other LoadJSON tasks in this file, but without a
    num_shards override on the mapreduce (presumably the default sharding is
    desired here — confirm against the other variants) and with a single
    metadata-index update afterwards.
    """
    json_dir = self.input()['data'].path

    json_loader = LoadJSONMapper(
        config.es_host(),
        index_name=self.index_name,
        type_name=self.type_name,
        docid_key=self.docid_key,
        incremental=self.use_checksum)

    parallel.mapreduce(
        parallel.Collection.from_sharded(json_dir),
        mapper=json_loader,
        reducer=parallel.IdentityReducer(),
        output_format=parallel.NullOutput(),
        map_workers=self.load_json_workers,
        output_prefix=config.tmp_dir('%s/load-json' % self.index_name))

    # update metadata index
    elasticsearch_requests.update_process_datetime(
        config.es_client(), self.index_name,
        arrow.utcnow().format('YYYY-MM-DD'))