Example #1
    def run(self):
        schema_file = self.get_schemafile()
        assert os.path.exists(
            schema_file
        ), 'No schema file available for index %s' % self.index_name

        es_client = elasticsearch.Elasticsearch(config.es_host())

        # Get all of the endpoints served by this index and create an
        # `EndpointExport` object for each one so that it is exported properly.
        #
        # Endpoint exports can be:
        #   date range based (quarterly output)
        #   filter based (index serves many endpoints)
        #   vanilla (endpoint is 1-to-1 with index and is exported all at once)
        endpoints = self.get_endpoints()
        endpoint_batches = []
        for endpoint in endpoints:
            chunks = CUSTOM_CHUNKS.get(endpoint, DEFAULT_CHUNKS)
            if endpoint in RANGE_ENDPOINT_MAP:
                params = RANGE_ENDPOINT_MAP[endpoint]
                params['chunks'] = chunks
                # Extend rather than reassign so batches built for earlier
                # endpoints are not discarded.
                endpoint_batches.extend(_make_date_range_endpoint_batch(
                    endpoint, params))
            elif endpoint in FILTERED_ENPOINT_MAP:
                params = FILTERED_ENPOINT_MAP[endpoint]
                query = EndpointExport.build_term_filter(**params)
                endpoint_batches.append(
                    EndpointExport(endpoint, query=query, chunks=chunks))
            else:
                endpoint_batches.append(EndpointExport(endpoint,
                                                       chunks=chunks))

        # Dump each of the `EndpointExport` objects in the list
        for ep in endpoint_batches:
            # The output_dir will be the same for all outputs, once you factor out
            # the endpoint, so we can safely look at the first one only.
            output_dir = dirname(dirname(self.output()[0].path))
            endpoint_dir = join(output_dir, ep.endpoint[1:])
            index_util.dump_index(es_client,
                                  ep.index_name,
                                  ep.endpoint,
                                  join(endpoint_dir, ep.partition),
                                  cleaner=omit_internal_keys,
                                  query=ep.query,
                                  chunks=ep.chunks)
            # Copy the current JSON schema alongside the dumped data so that it
            # is included in the sync to S3.
            common.shell_cmd('cp %s %s', schema_file, endpoint_dir)
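This snippet assumes the surrounding module's imports (os, elasticsearch, dirname/join from os.path, and openfda's config, common, and index_util helpers). The dump loop only touches a handful of `EndpointExport` attributes; the minimal stand-in below is a hypothetical sketch for illustration, not openfda's actual class:

class EndpointExport(object):
    # Hypothetical stand-in documenting the attributes the dump loop reads;
    # the real class ships with openfda's index_util module.
    def __init__(self, endpoint, query=None, chunks=None, partition=''):
        self.endpoint = endpoint      # e.g. '/drug/event'
        self.index_name = endpoint[1:].replace('/', '')  # assumption: derived from the endpoint path
        self.query = query            # None means a vanilla full-index dump
        self.chunks = chunks          # records per output chunk
        self.partition = partition    # e.g. a quarter label for date-range exports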
Example #2
 def map(self, key, value, output):
     es_client = elasticsearch.Elasticsearch(config.es_host())
     ep = common.ObjectDict(value)
     schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json')
     endpoint_dir = join(self.output_dir, ep.endpoint[1:])
     target_dir = join(endpoint_dir, ep.partition)
     common.shell_cmd('mkdir -p %s', target_dir)
     index_util.dump_index(es_client,
                           ep.index_name,
                           ep.endpoint,
                           target_dir,
                           cleaner=omit_internal_keys,
                           query=ep.query,
                           chunks=ep.chunks)
     # Copy the current JSON schema to the zip location so that it is
     # included in the sync to S3.
     common.shell_cmd('cp %s %s', schema_file, endpoint_dir)
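Examples #2 through #4 run the same dump as a mapper task: each map call receives one serialized `EndpointExport` as `value`, and `common.ObjectDict(value)` restores attribute access. How the (key, value) pairs are produced is not shown here; a hedged sketch of such a driver (an assumption, not openfda's actual mapreduce wiring) could be:

def to_map_input(endpoint_batches):
    # Hypothetical driver wiring (assumption): flatten each EndpointExport
    # into a plain dict so it can travel through the framework as `value`.
    return [(str(i), vars(ep)) for i, ep in enumerate(endpoint_batches)]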
Example #3
File: pipeline.py Project: FDA/openfda
 def map(self, key, value, output):
   es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=120)
   ep = common.ObjectDict(value)
   schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json')
   endpoint_dir = join(self.output_dir, ep.endpoint[1:])
   target_dir = join(endpoint_dir, ep.partition)
   common.shell_cmd('mkdir -p %s', target_dir)
   index_util.dump_index(es_client,
                         ep.index_name,
                         ep.endpoint,
                         target_dir,
                         cleaner=omit_internal_keys,
                         query=ep.query,
                         chunks=ep.chunks)
   # Copy the current JSON schema to the zip location so that it is
   # included in the sync to S3.
   common.shell_cmd('cp %s %s', schema_file, endpoint_dir)
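The only change from Example #2 is the explicit timeout=120 when constructing the client; elasticsearch-py's default request timeout is 10 seconds, which a large index dump can easily exceed:

# Raise the per-request timeout (elasticsearch-py defaults to 10s) so that
# long-running dump/scroll requests against a large index do not fail.
es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=120)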
Example #4
 def map(self, key, value, output):
     es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=120)
     ep = common.ObjectDict(value)
     schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json')
     endpoint_dir = join(self.output_dir, ep.endpoint[1:])
     target_dir = join(endpoint_dir, ep.partition)
     common.shell_cmd('mkdir -p %s', target_dir)
     index_util.dump_index(es_client,
                           ep.index_name,
                           ep.endpoint,
                           target_dir,
                           cleaner=omit_internal_keys,
                           query=ep.query,
                           chunks=ep.chunks)
     # Copy the current JSON schema to the zip location so that it is included
     # in the sync to S3. flock is required to avoid a race condition when
     # concurrent map tasks copy the schema file.
     common.shell_cmd_quiet('flock --verbose %s cp %s %s', schema_file,
                            schema_file, endpoint_dir)
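Shelling out to flock(1) serializes the copy across tasks. A pure-Python equivalent sketch using fcntl (assuming a POSIX host, as the shell version already does):

import fcntl
import shutil

# Hold an exclusive advisory lock on the schema file for the duration of the
# copy so that concurrent map tasks do not interleave their copies.
with open(schema_file, 'rb') as src:
    fcntl.flock(src, fcntl.LOCK_EX)
    try:
        shutil.copy(schema_file, endpoint_dir)
    finally:
        fcntl.flock(src, fcntl.LOCK_UN)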