Exemplo n.º 1
0
def create_neo4j_es_job():

    tmp_folder = '/var/tmp/amundsen/metric/metric_search_data.json'

    task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                       extractor=Neo4jMetricSearchDataExtractor())

    elasticsearch_client = es
    elasticsearch_new_index_key = 'metrics'
    elasticsearch_new_index_key_type = 'metric'
    elasticsearch_index_alias = 'metric_search_index'

    rand = str(random.randint(0, 1000))

    job_config = ConfigFactory.from_dict({
        'extractor.dashboard_search_data.extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY):
        neo4j_endpoint,
        'extractor.dashboard_search_data.extractor.neo4j.{}'.format(Neo4jExtractor.MODEL_CLASS_CONFIG_KEY):
        'databuilder.models.metric_elasticsearch_document.MetricESDocument',
        'extractor.dashboard_search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER):
        neo4j_user,
        'extractor.dashboard_search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW):
        neo4j_password,
        'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY):
        tmp_folder,
        'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY):
        'w',
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_PATH_CONFIG_KEY):
        tmp_folder,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_MODE_CONFIG_KEY):
        'r',
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY):
        elasticsearch_new_index_key_type,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY):
        elasticsearch_client,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY):
        elasticsearch_new_index_key + str(rand),
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY):
        elasticsearch_index_alias,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY):
        METRIC_ES_MAP
    })
    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=ElasticsearchPublisher())
    return job
Exemplo n.º 2
0
def create_snowflake_es_publisher_job():
    """
    Launches databuilder job that extracts data from Neo4J backend and pushes them as search documents
    to Elasticsearch index
    """

    # loader saves data to this location and publisher reads it from here
    extracted_search_data_path = '/var/tmp/amundsen/search_data.json'

    task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                       extractor=Neo4jSearchDataExtractor(),
                       transformer=NoopTransformer())

    # elastic search client instance
    elasticsearch_client = es
    # unique name of new index in Elasticsearch
    elasticsearch_new_index_key = 'tables' + str(uuid.uuid4())
    # related to mapping type from /databuilder/publisher/elasticsearch_publisher.py#L38
    elasticsearch_new_index_key_type = 'table'
    # alias for Elasticsearch used in amundsensearchlibrary/search_service/config.py as an index
    elasticsearch_index_alias = 'table_search_index'

    job_config = ConfigFactory.from_dict({
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': neo4j_endpoint,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}':
            'databuilder.models.table_elasticsearch_document.TableESDocument',
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': neo4j_user,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': neo4j_password,
        f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': extracted_search_data_path,
        f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w',
        f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': extracted_search_data_path,
        f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r',
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}':
            elasticsearch_client,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}':
            elasticsearch_new_index_key,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}':
            elasticsearch_new_index_key_type,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}':
            elasticsearch_index_alias
    })

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=ElasticsearchPublisher())
    job.launch()
Exemplo n.º 3
0
    def test_loading_with_list_of_objects(self):
        # type: () -> None
        """
        Test Loading functionality with list of objects.
        Check to ensure all objects are added to file
        """
        loader = FSElasticsearchJSONLoader()
        loader.init(conf=Scoped.get_scoped_conf(conf=self.conf,
                                                scope=loader.get_scope()))

        data = [TableESDocument(database='test_database',
                                cluster='test_cluster',
                                schema='test_schema',
                                name='test_table',
                                key='test_table_key',
                                last_updated_timestamp=123456789,
                                description='test_description',
                                column_names=['test_col1', 'test_col2'],
                                column_descriptions=['test_comment1', 'test_comment2'],
                                total_usage=10,
                                unique_usage=5,
                                tags=['test_tag1', 'test_tag2'],
                                badges=['badge1'])] * 5

        for d in data:
            loader.load(d)
        loader.close()

        expected = [
            ('{"key": "test_table_key", "column_descriptions": ["test_comment1", "test_comment2"], '
             '"schema": "test_schema", "database": "test_database", "cluster": "test_cluster", '
             '"column_names": ["test_col1", "test_col2"], "name": "test_table", '
             '"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", '
             '"description": "test_description", "unique_usage": 5, "total_usage": 10, '
             '"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"]}')
        ] * 5

        self._check_results_helper(expected=expected)
Exemplo n.º 4
0
    def test_loading_with_single_object(self):
        # type: () -> None
        """
        Test Loading functionality with single python object
        """
        loader = FSElasticsearchJSONLoader()
        loader.init(conf=Scoped.get_scoped_conf(conf=self.conf,
                                                scope=loader.get_scope()))

        data = ElasticsearchDocument(
            elasticsearch_index='test_es_index',
            elasticsearch_type='test_es_type',
            database='test_database',
            cluster='test_cluster',
            schema_name='test_schema',
            table_name='test_table',
            table_key='test_table_key',
            table_last_updated_epoch=123456789,
            table_description='test_description',
            column_names=['test_col1', 'test_col2'],
            column_descriptions=['test_comment1', 'test_comment2'],
            total_usage=10,
            unique_usage=5,
            tag_names=['test_tag1', 'test_tag2'])
        loader.load(data)
        loader.close()

        expected = [
            '{"index": {"_type": "test_es_type", "_index": "test_es_index"}}',
            ('{"table_key": "test_table_key", "column_descriptions": ["test_comment1", "test_comment2"], '
             '"schema_name": "test_schema", "database": "test_database", "cluster": "test_cluster", '
             '"column_names": ["test_col1", "test_col2"], "table_name": "test_table", '
             '"table_last_updated_epoch": 123456789,'
             '"table_description": "test_description", "unique_usage": 5, "total_usage": 10, '
             '"tag_names": ["test_tag1", "test_tag2"]}')
        ]

        self._check_results_helper(expected=expected)
Exemplo n.º 5
0
    def test_loading_with_single_object(self) -> None:
        """
        Test Loading functionality with single python object
        """
        loader = FSElasticsearchJSONLoader()
        loader.init(conf=Scoped.get_scoped_conf(conf=self.conf,
                                                scope=loader.get_scope()))

        data = TableESDocument(database='test_database',
                               cluster='test_cluster',
                               schema='test_schema',
                               name='test_table',
                               key='test_table_key',
                               last_updated_timestamp=123456789,
                               description='test_description',
                               column_names=['test_col1', 'test_col2'],
                               column_descriptions=['test_comment1', 'test_comment2'],
                               total_usage=10,
                               unique_usage=5,
                               tags=['test_tag1', 'test_tag2'],
                               badges=['badge1'],
                               schema_description='schema description',
                               programmatic_descriptions=['test'])
        loader.load(data)
        loader.close()

        expected = [
            ('{"key": "test_table_key", "column_descriptions": ["test_comment1", "test_comment2"], '
             '"schema": "test_schema", "database": "test_database", "cluster": "test_cluster", '
             '"column_names": ["test_col1", "test_col2"], "name": "test_table", '
             '"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", '
             '"description": "test_description", "unique_usage": 5, "total_usage": 10, '
             '"tags": ["test_tag1", "test_tag2"], "schema_description": "schema description", '
             '"programmatic_descriptions": ["test"], '
             '"badges": ["badge1"]}')
        ]

        self._check_results_helper(expected=expected)
Exemplo n.º 6
0
def create_es_publisher_sample_job():
    # loader saves data to this location and publisher reads it from here
    extracted_search_data_path = "/var/tmp/amundsen/search_data.json"

    task = DefaultTask(
        loader=FSElasticsearchJSONLoader(),
        extractor=Neo4jSearchDataExtractor(),
        transformer=NoopTransformer(),
    )

    # elastic search client instance
    elasticsearch_client = es
    # unique name of new index in Elasticsearch
    elasticsearch_new_index_key = f"tables{uuid.uuid4()}"
    # related to mapping type from /databuilder/publisher/elasticsearch_publisher.py#L38
    elasticsearch_new_index_key_type = "table"
    # alias for Elasticsearch used in amundsensearchlibrary/search_service/config.py as an index
    elasticsearch_index_alias = "table_search_index"

    job_config = ConfigFactory.from_dict(
        {
            f"extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}": neo4j_endpoint,
            f"extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}": "databuilder.models.table_elasticsearch_document.TableESDocument",
            f"extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}": neo4j_user,
            f"extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}": neo4j_password,
            f"loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}": extracted_search_data_path,
            f"loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}": "w",
            f"publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}": extracted_search_data_path,
            f"publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}": "r",
            f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}": elasticsearch_client,
            f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}": elasticsearch_new_index_key,
            f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}": elasticsearch_new_index_key_type,
            f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}": elasticsearch_index_alias,
        }
    )

    job = DefaultJob(conf=job_config, task=task, publisher=ElasticsearchPublisher())
    job.launch()
Exemplo n.º 7
0
def create_es_publisher_sample_job(
        elasticsearch_index_alias='table_search_index',
        entity_type='table',
        elasticsearch_mapping=None):
    """
    :param elasticsearch_index_alias:  alias for Elasticsearch used in
                                       amundsensearchlibrary/search_service/config.py as an index
    :param elasticsearch_doc_type_key: name the ElasticSearch index is prepended with. Defaults to `table` resulting in
                                       `table_{uuid}`
    :param model_name:                 the Databuilder model class used in transporting between Extractor and Loader
    :param entity_type:                Entity type handed to the `Neo4jSearchDataExtractor` class, used to determine
                                       Cypher query to extract data from Neo4j. Defaults to `table`.
    :param elasticsearch_mapping:      Elasticsearch field mapping "DDL" handed to the `ElasticsearchPublisher` class,
                                       if None is given (default) it uses the `Table` query baked into the Publisher
    """
    # loader saves data to this location and publisher reads it from here
    extracted_search_data_path = '/var/tmp/amundsen/search_data.json'

    task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                       extractor=AtlasSearchDataExtractor(),
                       transformer=NoopTransformer())

    # elastic search client instance
    elasticsearch_client = es
    # unique name of new index in Elasticsearch
    elasticsearch_new_index_key = f'{entity_type}_{uuid.uuid4()}'

    job_config = ConfigFactory.from_dict({
        'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.ATLAS_URL_CONFIG_KEY):
        atlas_host,
        'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.ATLAS_PORT_CONFIG_KEY):
        atlas_port,
        'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.ATLAS_PROTOCOL_CONFIG_KEY):
        'http',
        'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.ATLAS_VALIDATE_SSL_CONFIG_KEY):
        False,
        'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.ATLAS_USERNAME_CONFIG_KEY):
        atlas_user,
        'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.ATLAS_PASSWORD_CONFIG_KEY):
        atlas_password,
        'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.ATLAS_SEARCH_CHUNK_SIZE_KEY):
        ATLAS_SEARCH_CHUNK_SIZE,
        'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.ATLAS_DETAILS_CHUNK_SIZE_KEY):
        ATLAS_DETAILS_CHUNK_SIZE,
        'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.PROCESS_POOL_SIZE_KEY):
        ATLAS_PROCESS_POOL_SIZE,
        'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.ENTITY_TYPE_KEY):
        entity_type.title(),
        'loader.filesystem.elasticsearch.file_path':
        extracted_search_data_path,
        'loader.filesystem.elasticsearch.mode':
        'w',
        'publisher.elasticsearch.file_path':
        extracted_search_data_path,
        'publisher.elasticsearch.mode':
        'r',
        'publisher.elasticsearch.client':
        elasticsearch_client,
        'publisher.elasticsearch.new_index':
        elasticsearch_new_index_key,
        'publisher.elasticsearch.doc_type':
        '_doc',
        'publisher.elasticsearch.alias':
        elasticsearch_index_alias,
    })

    # only optionally add these keys, so need to dynamically `put` them
    if elasticsearch_mapping:
        job_config.put(
            f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}',
            elasticsearch_mapping)

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=ElasticsearchPublisher())
    return job
def create_es_publisher_sample_job(
        elasticsearch_index_alias='table_search_index',
        elasticsearch_doc_type_key='table',
        model_name='databuilder.models.table_elasticsearch_document.TableESDocument',
        cypher_query=None,
        elasticsearch_mapping=None):
    """
    :param elasticsearch_index_alias:  alias for Elasticsearch used in
                                       amundsensearchlibrary/search_service/config.py as an index
    :param elasticsearch_doc_type_key: name the ElasticSearch index is prepended with. Defaults to `table` resulting in
                                       `table_search_index`
    :param model_name:                 the Databuilder model class used in transporting between Extractor and Loader
    :param cypher_query:               Query handed to the `Neo4jSearchDataExtractor` class, if None is given (default)
                                       it uses the `Table` query baked into the Extractor
    :param elasticsearch_mapping:      Elasticsearch field mapping "DDL" handed to the `ElasticsearchPublisher` class,
                                       if None is given (default) it uses the `Table` query baked into the Publisher
    """
    # loader saves data to this location and publisher reads it from here
    extracted_search_data_path = '/var/tmp/amundsen/search_data.json'

    task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                       extractor=Neo4jSearchDataExtractor(),
                       transformer=NoopTransformer())

    # elastic search client instance
    elasticsearch_client = es
    # unique name of new index in Elasticsearch
    elasticsearch_new_index_key = 'tables' + str(uuid.uuid4())

    job_config = ConfigFactory.from_dict({
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY):
        neo4j_endpoint,
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.MODEL_CLASS_CONFIG_KEY):
        model_name,
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER):
        neo4j_user,
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW):
        neo4j_password,
        'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY):
        extracted_search_data_path,
        'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY):
        'w',
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_PATH_CONFIG_KEY):
        extracted_search_data_path,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_MODE_CONFIG_KEY):
        'r',
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY):
        elasticsearch_client,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY):
        elasticsearch_new_index_key,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY):
        elasticsearch_doc_type_key,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY):
        elasticsearch_index_alias,
    })

    # only optionally add these keys, so need to dynamically `put` them
    if cypher_query:
        job_config.put(
            'extractor.search_data.{}'.format(
                Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY),
            cypher_query)
    if elasticsearch_mapping:
        job_config.put(
            'publisher.elasticsearch.{}'.format(
                ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY),
            elasticsearch_mapping)

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=ElasticsearchPublisher())
    return job
def create_es_publisher_sample_job(
        elasticsearch_index_alias='table_search_index',
        elasticsearch_doc_type_key='table',
        model_name='databuilder.models.table_elasticsearch_document.TableESDocument',
        entity_type='table',
        elasticsearch_mapping=None):
    """
    :param elasticsearch_index_alias:  alias for Elasticsearch used in
                                       amundsensearchlibrary/search_service/config.py as an index
    :param elasticsearch_doc_type_key: name the ElasticSearch index is prepended with. Defaults to `table` resulting in
                                       `table_{uuid}`
    :param model_name:                 the Databuilder model class used in transporting between Extractor and Loader
    :param entity_type:                Entity type handed to the `Neo4jSearchDataExtractor` class, used to determine
                                       Cypher query to extract data from Neo4j. Defaults to `table`.
    :param elasticsearch_mapping:      Elasticsearch field mapping "DDL" handed to the `ElasticsearchPublisher` class,
                                       if None is given (default) it uses the `Table` query baked into the Publisher
    """
    # loader saves data to this location and publisher reads it from here
    extracted_search_data_path = '/var/tmp/amundsen/search_data.json'

    task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                       extractor=Neo4jSearchDataExtractor(),
                       transformer=NoopTransformer())

    # elastic search client instance
    elasticsearch_client = es
    # unique name of new index in Elasticsearch
    elasticsearch_new_index_key = '{}_'.format(
        elasticsearch_doc_type_key) + str(uuid.uuid4())

    job_config = ConfigFactory.from_dict({
        'extractor.search_data.entity_type':
        entity_type,
        'extractor.search_data.extractor.neo4j.graph_url':
        neo4j_endpoint,
        'extractor.search_data.extractor.neo4j.model_class':
        model_name,
        'extractor.search_data.extractor.neo4j.neo4j_auth_user':
        neo4j_user,
        'extractor.search_data.extractor.neo4j.neo4j_auth_pw':
        neo4j_password,
        'extractor.search_data.extractor.neo4j.neo4j_encrypted':
        False,
        'loader.filesystem.elasticsearch.file_path':
        extracted_search_data_path,
        'loader.filesystem.elasticsearch.mode':
        'w',
        'publisher.elasticsearch.file_path':
        extracted_search_data_path,
        'publisher.elasticsearch.mode':
        'r',
        'publisher.elasticsearch.client':
        elasticsearch_client,
        'publisher.elasticsearch.new_index':
        elasticsearch_new_index_key,
        'publisher.elasticsearch.doc_type':
        elasticsearch_doc_type_key,
        'publisher.elasticsearch.alias':
        elasticsearch_index_alias,
    })

    # only optionally add these keys, so need to dynamically `put` them
    if elasticsearch_mapping:
        job_config.put(
            'publisher.elasticsearch.{}'.format(
                ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY),
            elasticsearch_mapping)

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=ElasticsearchPublisher())
    return job
    'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.ENTITY_TYPE_KEY):
    entity_type,
    'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY):
    extracted_search_data_path,
    'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY):
    'w',
    'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_PATH_CONFIG_KEY):
    extracted_search_data_path,
    'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_MODE_CONFIG_KEY):
    'r',
    'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY):
    elasticsearch_client,
    'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY):
    elasticsearch_new_index_key,
    'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY):
    elasticsearch_new_index_key_type,
    'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY):
    elasticsearch_index_alias
})

if __name__ == "__main__":
    task = DefaultTask(extractor=AtlasSearchDataExtractor(),
                       transformer=NoopTransformer(),
                       loader=FSElasticsearchJSONLoader())

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=ElasticsearchPublisher())

    job.launch()
Exemplo n.º 11
0
def create_es_publisher_sample_job(
        elasticsearch_index_alias='table_search_index',
        elasticsearch_doc_type_key='table',
        model_name='databuilder.models.table_elasticsearch_document.TableESDocument',
        entity_type='table',
        elasticsearch_mapping=None):
    """
    :param elasticsearch_index_alias:  alias for Elasticsearch used in
                                       amundsensearchlibrary/search_service/config.py as an index
    :param elasticsearch_doc_type_key: name the ElasticSearch index is prepended with. Defaults to `table` resulting in
                                       `table_{uuid}`
    :param model_name:                 the Databuilder model class used in transporting between Extractor and Loader
    :param entity_type:                Entity type handed to the `Neo4jSearchDataExtractor` class, used to determine
                                       Cypher query to extract data from Neo4j. Defaults to `table`.
    :param elasticsearch_mapping:      Elasticsearch field mapping "DDL" handed to the `ElasticsearchPublisher` class,
                                       if None is given (default) it uses the `Table` query baked into the Publisher
    """
    # loader saves data to this location and publisher reads it from here
    extracted_search_data_path = '/var/tmp/amundsen/search_data.json'
    loader = FSElasticsearchJSONLoader()
    extractor = NeptuneSearchDataExtractor()

    task = DefaultTask(loader=loader,
                       extractor=extractor,
                       transformer=NoopTransformer())

    # elastic search client instance
    elasticsearch_client = es
    # unique name of new index in Elasticsearch
    elasticsearch_new_index_key = '{}_'.format(
        elasticsearch_doc_type_key) + str(uuid.uuid4())
    publisher = ElasticsearchPublisher()

    job_config = ConfigFactory.from_dict({
        extractor.get_scope(): {
            NeptuneSearchDataExtractor.ENTITY_TYPE_CONFIG_KEY: entity_type,
            NeptuneSearchDataExtractor.MODEL_CLASS_CONFIG_KEY: model_name,
            'neptune.client': {
                NeptuneSessionClient.NEPTUNE_HOST_NAME: NEPTUNE_ENDPOINT,
                NeptuneSessionClient.AWS_REGION: AWS_REGION,
                NeptuneSessionClient.AWS_ACCESS_KEY: aws_access_key,
                NeptuneSessionClient.AWS_SECRET_ACCESS_KEY: aws_access_secret,
                NeptuneSessionClient.AWS_SESSION_TOKEN: aws_token
            }
        },
        'loader.filesystem.elasticsearch.file_path':
        extracted_search_data_path,
        'loader.filesystem.elasticsearch.mode':
        'w',
        publisher.get_scope(): {
            'file_path': extracted_search_data_path,
            'mode': 'r',
            'client': elasticsearch_client,
            'new_index': elasticsearch_new_index_key,
            'doc_type': elasticsearch_doc_type_key,
            'alias': elasticsearch_index_alias
        }
    })

    # only optionally add these keys, so need to dynamically `put` them
    if elasticsearch_mapping:
        job_config.put(
            'publisher.elasticsearch.{}'.format(
                ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY),
            elasticsearch_mapping)

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=ElasticsearchPublisher())
    return job
Exemplo n.º 12
0
    if cypher_query:
        job_config.put(
            f"extractor.search_data.{Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY}",
            cypher_query,
        )
    if elasticsearch_mapping:
        job_config.put(
            f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}",
            elasticsearch_mapping,
        )

    return job_config


if __name__ == "__main__":
    feast_job = DefaultJob(
        conf=create_feast_job_config(),
        task=DefaultTask(extractor=FeastExtractor(),
                         loader=FsNeo4jCSVLoader()),
        publisher=neo4j_csv_publisher.Neo4jCsvPublisher(),
    )
    feast_job.launch()

    es_publish_job = DefaultJob(
        conf=create_es_publish_job_config(),
        task=DefaultTask(loader=FSElasticsearchJSONLoader(),
                         extractor=Neo4jSearchDataExtractor()),
        publisher=ElasticsearchPublisher(),
    )
    es_publish_job.launch()
Exemplo n.º 13
0
def create_es_publisher_job(
        neo4j_endpoint,
        neo4j_user,
        neo4j_password,
        temp_folder_path,
        elasticsearch_index_alias='table_search_index',
        elasticsearch_doc_type_key='table',
        model_name='databuilder.models.table_elasticsearch_document.TableESDocument',
        cypher_query=None,
        elasticsearch_mapping=None):

    # loader saves data to this location and publisher reads it from here
    extracted_search_data_path = '{temp_folder_path}/es/search_data.json'.format(
        temp_folder_path=temp_folder_path)

    task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                       extractor=Neo4jSearchDataExtractor(),
                       transformer=NoopTransformer())

    # elastic search client instance
    elasticsearch_client = es
    # unique name of new index in Elasticsearch
    elasticsearch_new_index_key = 'tables' + str(uuid.uuid4())

    job_config = ConfigFactory.from_dict({
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY):
        neo4j_endpoint,
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.MODEL_CLASS_CONFIG_KEY):
        model_name,
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER):
        neo4j_user,
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW):
        neo4j_password,
        'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY):
        extracted_search_data_path,
        'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY):
        'w',
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_PATH_CONFIG_KEY):
        extracted_search_data_path,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_MODE_CONFIG_KEY):
        'r',
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY):
        elasticsearch_client,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY):
        elasticsearch_new_index_key,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY):
        elasticsearch_doc_type_key,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY):
        elasticsearch_index_alias,
    })

    # only optionally add these keys, so need to dynamically `put` them
    if cypher_query:
        job_config.put(
            'extractor.search_data.{}'.format(
                Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY),
            cypher_query)
    if elasticsearch_mapping:
        job_config.put(
            'publisher.elasticsearch.{}'.format(
                ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY),
            elasticsearch_mapping)

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=ElasticsearchPublisher())
    return job