def run(self):
  schema_file = self.get_schemafile()
  assert os.path.exists(schema_file), \
    'No schema file available for index %s' % self.index_name
  es_client = elasticsearch.Elasticsearch(config.es_host())

  # Get all of the endpoints served by this index
  endpoints = self.get_endpoints()

  # Create an `EndpointExport` object for each endpoint in order to export
  # each endpoint properly.
  #
  # Endpoint exports can be:
  #   date range based (quarterly output)
  #   filter based (index serves many endpoints)
  #   vanilla (endpoint is 1 to 1 with index and it is exported all at once)
  endpoint_batches = []
  for endpoint in endpoints:
    chunks = CUSTOM_CHUNKS.get(endpoint, DEFAULT_CHUNKS)
    if endpoint in RANGE_ENDPOINT_MAP:
      params = RANGE_ENDPOINT_MAP[endpoint]
      params['chunks'] = chunks
      endpoint_batches = _make_date_range_endpoint_batch(endpoint, params)
    elif endpoint in FILTERED_ENPOINT_MAP:
      params = FILTERED_ENPOINT_MAP[endpoint]
      query = EndpointExport.build_term_filter(**params)
      endpoint_batches.append(
        EndpointExport(endpoint, query=query, chunks=chunks))
    else:
      endpoint_batches.append(EndpointExport(endpoint, chunks=chunks))

  # Dump each of the `EndpointExport` objects in the list
  for ep in endpoint_batches:
    # The output_dir will be the same for all outputs once you factor out the
    # endpoint, so we can safely look at the first one only.
    output_dir = dirname(dirname(self.output()[0].path))
    endpoint_dir = join(output_dir, ep.endpoint[1:])
    index_util.dump_index(es_client,
                          ep.index_name,
                          ep.endpoint,
                          join(endpoint_dir, ep.partition),
                          cleaner=omit_internal_keys,
                          query=ep.query,
                          chunks=ep.chunks)
    common.shell_cmd('cp %s %s', schema_file, endpoint_dir)
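# A hedged, standalone illustration of the module-level maps that run() uses to
# pick an export style. The endpoint names, field names, and chunk sizes below
# are hypothetical placeholders; the real CUSTOM_CHUNKS, RANGE_ENDPOINT_MAP and
# FILTERED_ENPOINT_MAP values are defined elsewhere in the pipeline and may differ.
#
#   DEFAULT_CHUNKS = 250000
#   CUSTOM_CHUNKS = {'/drug/event.json': 100000}
#   RANGE_ENDPOINT_MAP = {'/drug/event.json': {'date_field': 'receiptdate'}}
#   FILTERED_ENPOINT_MAP = {'/device/510k.json': {'field': 'doc_type', 'value': '510k'}}
#
# With maps shaped like these, '/drug/event.json' would be split into quarterly
# date-range batches by _make_date_range_endpoint_batch(), '/device/510k.json'
# would be exported with the term filter built from its params, and any endpoint
# appearing in neither map falls through to the vanilla whole-index export.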
def map(self, key, value, output):
  es_client = elasticsearch.Elasticsearch(config.es_host())
  ep = common.ObjectDict(value)
  schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json')
  endpoint_dir = join(self.output_dir, ep.endpoint[1:])
  target_dir = join(endpoint_dir, ep.partition)
  common.shell_cmd('mkdir -p %s', target_dir)
  index_util.dump_index(es_client,
                        ep.index_name,
                        ep.endpoint,
                        target_dir,
                        cleaner=omit_internal_keys,
                        query=ep.query,
                        chunks=ep.chunks)
  # Copy the current JSON schema to the zip location so that it is included
  # in the sync to S3
  common.shell_cmd('cp %s %s', schema_file, endpoint_dir)
def map(self, key, value, output):
  es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=120)
  ep = common.ObjectDict(value)
  schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json')
  endpoint_dir = join(self.output_dir, ep.endpoint[1:])
  target_dir = join(endpoint_dir, ep.partition)
  common.shell_cmd('mkdir -p %s', target_dir)
  index_util.dump_index(es_client,
                        ep.index_name,
                        ep.endpoint,
                        target_dir,
                        cleaner=omit_internal_keys,
                        query=ep.query,
                        chunks=ep.chunks)
  # Copy the current JSON schema to the zip location so that it is included
  # in the sync to S3
  common.shell_cmd('cp %s %s', schema_file, endpoint_dir)
def map(self, key, value, output):
  es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=120)
  ep = common.ObjectDict(value)
  schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json')
  endpoint_dir = join(self.output_dir, ep.endpoint[1:])
  target_dir = join(endpoint_dir, ep.partition)
  common.shell_cmd('mkdir -p %s', target_dir)
  index_util.dump_index(es_client,
                        ep.index_name,
                        ep.endpoint,
                        target_dir,
                        cleaner=omit_internal_keys,
                        query=ep.query,
                        chunks=ep.chunks)
  # Copy the current JSON schema to the zip location so that it is included in
  # the sync to S3. flock is required to avoid a race condition when copying
  # the schema file.
  common.shell_cmd_quiet('flock --verbose %s cp %s %s',
                         schema_file, schema_file, endpoint_dir)
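# A minimal sketch of the record that reaches map() as `value`; the field names
# come from the attribute accesses above, while the concrete values (and the
# shape of `query`) are hypothetical.
#
#   value = {
#     'index_name': 'deviceclearance',
#     'endpoint': '/device/510k.json',
#     'partition': 'all_json',
#     'query': None,      # filtered/date-range exports carry an Elasticsearch query dict here
#     'chunks': 250000
#   }
#
# common.ObjectDict(value) exposes those keys as attributes, so ep.endpoint[1:]
# strips the leading '/' to form the on-disk directory, e.g. 'device/510k.json',
# and ep.partition becomes the subdirectory that dump_index() writes into.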