Example #1
0
    def _run(self):
        """Bulk-load sharded JSON into Elasticsearch, then stamp run metadata.

        Reads the sharded JSON directory produced by the upstream 'data'
        task, indexes every document via a map/reduce pass, and records the
        processing date in the metadata index (twice, bracketing an optional
        index optimization).
        """
        # Directory of sharded JSON emitted by the upstream task.
        shard_dir = self.input()['data'].path

        json_mapper = LoadJSONMapper(
            config.es_host(),
            index_name=self.index_name,
            type_name=self.type_name,
            docid_key=self.docid_key,
            incremental=self.use_checksum)

        scratch_prefix = config.tmp_dir('%s/load-json' % self.index_name)
        parallel.mapreduce(
            parallel.Collection.from_sharded(shard_dir),
            mapper=json_mapper,
            reducer=parallel.IdentityReducer(),
            output_format=parallel.NullOutput(),
            map_workers=self.load_json_workers,
            num_shards=1,
            output_prefix=scratch_prefix)

        # Record today's date in the metadata index.
        elasticsearch_requests.update_process_datetime(
            config.es_client(), self.index_name,
            arrow.utcnow().format('YYYY-MM-DD'))

        # Optionally force-merge the index after loading.
        if self.optimize_index:
            optimize_index(self.index_name, wait_for_merge=False)

        # Deliberate second metadata write: the original author was chasing
        # missing "last_update_date" entries, so the stamp is repeated after
        # the optimize step.
        elasticsearch_requests.update_process_datetime(
            config.es_client(), self.index_name,
            arrow.utcnow().format('YYYY-MM-DD'))
Example #2
0
    def _run(self):
        """Load sharded JSON into Elasticsearch and publish run metadata.

        After the bulk load, the metadata index is updated with both the
        run date and the dataset's last-update date, the index is refreshed
        so new documents become searchable, and an optional force-merge is
        performed.
        """
        # Sharded JSON directory from the upstream 'data' task.
        shard_dir = self.input()['data'].path

        json_mapper = LoadJSONMapper(
            config.es_host(),
            index_name=self.index_name,
            type_name=self.type_name,
            docid_key=self.docid_key,
            incremental=self.use_checksum)

        parallel.mapreduce(
            parallel.Collection.from_sharded(shard_dir),
            mapper=json_mapper,
            reducer=parallel.IdentityReducer(),
            output_format=parallel.NullOutput(),
            map_workers=self.load_json_workers,
            num_shards=1,
            output_prefix=config.tmp_dir('%s/load-json' % self.index_name))

        # last_update_date may be either a plain value or a callable
        # returning one; normalize before passing it along.
        update_date = self.last_update_date
        if callable(update_date):
            update_date = update_date()

        # Record run date and dataset freshness in the metadata index.
        elasticsearch_requests.update_process_datetime(
            config.es_client(),
            self.index_name,
            last_run_date=arrow.utcnow().format('YYYY-MM-DD'),
            last_update_date=update_date)

        # Make the newly indexed documents visible to searches.
        refresh_index(self.index_name)

        # Optionally force-merge the index.
        if self.optimize_index:
            optimize_index(self.index_name, wait_for_merge=False)
Example #3
0
 def _run(self):
     """Export every per-shard JSON-lines file under the input directory.

     Each line of each */*.json file becomes one input record; the export
     mapper writes results under this task's output path.
     """
     source_files = glob.glob(self.input().path + '/*/*.json')
     record_stream = parallel.Collection.from_glob(
         source_files, parallel.JSONLineInput())
     parallel.mapreduce(
         record_stream,
         mapper=ParallelExportMapper(output_dir=self.output().path),
         reducer=parallel.NullReducer(),
         output_prefix=join(BASE_DIR, 'tmp'),
         output_format=parallel.NullOutput(),
         map_workers=12)
Example #4
0
 def _run(self):
     """Reload each JSON file in the data directory into Elasticsearch.

     Files are processed one at a time, each through its own map/reduce
     pass that re-indexes every JSON line into the 'maude' type.
     """
     data_dir = self.input()['data'].path
     for json_path in glob.glob(data_dir + '/*.json'):
         logging.info('Running file %s', json_path)
         reload_mapper = index_util.ReloadJSONMapper(
             config.es_host(), self.index_name, 'maude')
         parallel.mapreduce(
             parallel.Collection.from_glob(json_path,
                                           parallel.JSONLineInput()),
             mapper=reload_mapper,
             reducer=parallel.IdentityReducer(),
             output_format=parallel.NullOutput(),
             output_prefix='/tmp/loadjson.' + self.index_name)
Example #5
0
    def _run(self):
        """Bulk-load sharded JSON into Elasticsearch and stamp the run date.

        Indexes the upstream task's sharded JSON output via a map/reduce
        pass, then records today's date in the metadata index.
        """
        # Sharded JSON directory produced by the upstream 'data' task.
        shard_dir = self.input()['data'].path

        json_mapper = LoadJSONMapper(
            config.es_host(),
            index_name=self.index_name,
            type_name=self.type_name,
            docid_key=self.docid_key,
            incremental=self.use_checksum)

        scratch_prefix = config.tmp_dir('%s/load-json' % self.index_name)
        parallel.mapreduce(
            parallel.Collection.from_sharded(shard_dir),
            mapper=json_mapper,
            reducer=parallel.IdentityReducer(),
            output_format=parallel.NullOutput(),
            map_workers=self.load_json_workers,
            output_prefix=scratch_prefix)

        # Record today's date in the metadata index.
        elasticsearch_requests.update_process_datetime(
            config.es_client(), self.index_name,
            arrow.utcnow().format('YYYY-MM-DD'))