def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index',
                                   elasticsearch_doc_type_key='table',
                                   model_name='databuilder.models.table_elasticsearch_document.TableESDocument',
                                   cypher_query=None,
                                   elasticsearch_mapping=None):
    """
    Build (but do not launch) a databuilder job that extracts search documents
    from Neo4j and publishes them to a brand-new Elasticsearch index.

    :param elasticsearch_index_alias: alias for Elasticsearch used in
        amundsensearchlibrary/search_service/config.py as an index
    :param elasticsearch_doc_type_key: name the Elasticsearch index is prepended with;
        defaults to `table`, resulting in an index named `table_{uuid}`
    :param model_name: the Databuilder model class used in transporting between
        Extractor and Loader
    :param cypher_query: Query handed to the `Neo4jSearchDataExtractor` class;
        if None is given (default) it uses the `Table` query baked into the Extractor
    :param elasticsearch_mapping: Elasticsearch field mapping "DDL" handed to the
        `ElasticsearchPublisher` class; if None is given (default) it uses the
        `Table` mapping baked into the Publisher
    """
    # loader saves data to this location and publisher reads it from here
    extracted_search_data_path = '/var/tmp/amundsen/search_data.json'

    task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                       extractor=Neo4jSearchDataExtractor(),
                       transformer=NoopTransformer())

    # elastic search client instance
    elasticsearch_client = es
    # Unique name of the new index in Elasticsearch.
    # FIX: previously hard-coded to 'tables<uuid>', silently ignoring
    # elasticsearch_doc_type_key; now derived from the doc type (consistent with
    # the other publisher-job variants) so non-table jobs get correctly named indices.
    elasticsearch_new_index_key = f'{elasticsearch_doc_type_key}_{uuid.uuid4()}'

    job_config = ConfigFactory.from_dict({
        'extractor.search_data.entity_type': 'table',
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': neo4j_endpoint,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}': model_name,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': neo4j_user,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': neo4j_password,
        f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': extracted_search_data_path,
        f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w',
        f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': extracted_search_data_path,
        f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r',
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}': elasticsearch_client,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}': elasticsearch_new_index_key,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}': elasticsearch_doc_type_key,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}': elasticsearch_index_alias,
    })

    # only optionally add these keys, so need to dynamically `put` them
    if cypher_query:
        job_config.put(f'extractor.search_data.{Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY}',
                       cypher_query)
    if elasticsearch_mapping:
        job_config.put(f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}',
                       elasticsearch_mapping)

    job = DefaultJob(conf=job_config, task=task, publisher=ElasticsearchPublisher())
    return job
def test_publish_with_data_and_old_index(self) -> None:
    """
    Publish with data while an old index already holds the alias: the new
    index must be created with the default mapping, the records bulk-loaded
    into it, and a single alias-update request must both add the alias to the
    new index and remove the old index.
    """
    mock_data = json.dumps({'KEY_DOESNOT_MATTER': 'NO_VALUE',
                            'KEY_DOESNOT_MATTER2': 'NO_VALUE2'})
    # pretend the alias currently resolves to `test_old_index`
    self.mock_es_client.indices.get_alias.return_value = {'test_old_index': 'DOES_NOT_MATTER'}

    with patch('builtins.open', mock_open(read_data=mock_data)) as mock_file:
        publisher = ElasticsearchPublisher()
        publisher.init(conf=Scoped.get_scoped_conf(conf=self.conf,
                                                   scope=publisher.get_scope()))

        # assert mock was called with test_file_path and test_file_mode
        mock_file.assert_called_once_with(self.test_file_path, self.test_file_mode)

        publisher.publish()

        # ensure indices create endpoint was called with the default mapping
        default_mapping = ElasticsearchPublisher.DEFAULT_ELASTICSEARCH_INDEX_MAPPING
        self.mock_es_client.indices.create.assert_called_once_with(index=self.test_es_new_index,
                                                                   body=default_mapping)

        # bulk endpoint called exactly once: one action line plus the document,
        # which the publisher has tagged with `resource_type` from the doc type config
        self.mock_es_client.bulk.assert_called_once_with(
            [{'index': {'_index': self.test_es_new_index}},
             {'KEY_DOESNOT_MATTER': 'NO_VALUE',
              'KEY_DOESNOT_MATTER2': 'NO_VALUE2',
              'resource_type': 'test_doc_type'}]
        )

        # update alias endpoint called once: alias is added to the new index
        # and the stale index is removed in the same actions request
        self.mock_es_client.indices.update_aliases.assert_called_once_with(
            {'actions': [{"add": {"index": self.test_es_new_index, "alias": self.test_es_alias}},
                         {"remove_index": {"index": 'test_old_index'}}]}
        )
def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index',
                                   elasticsearch_doc_type_key='table',
                                   model_name='databuilder.models.table_elasticsearch_document.TableESDocument',
                                   entity_type='table',
                                   elasticsearch_mapping=None):
    """
    Assemble a databuilder job that reads search documents out of Neo4j and
    ships them to a freshly named Elasticsearch index.

    :param elasticsearch_index_alias: alias for Elasticsearch used in
        amundsensearchlibrary/search_service/config.py as an index
    :param elasticsearch_doc_type_key: name the Elasticsearch index is prepended with;
        defaults to `table`, resulting in `table_{uuid}`
    :param model_name: the Databuilder model class used in transporting between
        Extractor and Loader
    :param entity_type: entity type handed to the `Neo4jSearchDataExtractor`, used to
        determine the Cypher query to extract data from Neo4j; defaults to `table`
    :param elasticsearch_mapping: Elasticsearch field mapping "DDL" handed to the
        `ElasticsearchPublisher`; when None (default) the publisher's built-in
        `Table` mapping is used
    """
    # The loader writes the intermediate JSON here; the publisher reads it back.
    search_data_file = '/var/tmp/amundsen/search_data.json'

    search_task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                              extractor=Neo4jSearchDataExtractor(),
                              transformer=NoopTransformer())

    # Elasticsearch client shared with the publisher config below.
    es_client = es
    # Every run publishes into a uniquely named index, e.g. `table_<uuid4>`.
    new_index_name = f'{elasticsearch_doc_type_key}_{uuid.uuid4()}'

    job_config = ConfigFactory.from_dict({
        'extractor.search_data.entity_type': entity_type,
        'extractor.search_data.extractor.neo4j.graph_url': neo4j_endpoint,
        'extractor.search_data.extractor.neo4j.model_class': model_name,
        'extractor.search_data.extractor.neo4j.neo4j_auth_user': neo4j_user,
        'extractor.search_data.extractor.neo4j.neo4j_auth_pw': neo4j_password,
        'extractor.search_data.extractor.neo4j.neo4j_encrypted': False,
        'loader.filesystem.elasticsearch.file_path': search_data_file,
        'loader.filesystem.elasticsearch.mode': 'w',
        'publisher.elasticsearch.file_path': search_data_file,
        'publisher.elasticsearch.mode': 'r',
        'publisher.elasticsearch.client': es_client,
        'publisher.elasticsearch.new_index': new_index_name,
        'publisher.elasticsearch.doc_type': elasticsearch_doc_type_key,
        'publisher.elasticsearch.alias': elasticsearch_index_alias,
    })

    # The mapping override is optional, so it is `put` into the config lazily.
    if elasticsearch_mapping:
        job_config.put(f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}',
                       elasticsearch_mapping)

    return DefaultJob(conf=job_config, task=search_task, publisher=ElasticsearchPublisher())
def create_es_publisher_sample_job():
    """Build a job that copies Neo4j search data into a new Elasticsearch index."""
    # Intermediate hand-off file: the loader writes it, the publisher reads it.
    search_data_file = '/tmp/amundsen/search_data.json'

    search_task = DefaultTask(
        loader=FSElasticsearchJSONLoader(),
        extractor=Neo4jSearchDataExtractor(),
        transformer=ElasticsearchDocumentTransformer(),
    )

    # elastic search client instance
    es_client = es
    # Unique per-run index name; the alias below is pointed at it after publish.
    new_index = "tables" + str(uuid.uuid4())
    # mapping type from /databuilder/publisher/elasticsearch_publisher.py#L38
    new_index_doc_type = "table"
    # alias for Elasticsearch used in amundsensearchlibrary/search_service/config.py as an index
    index_alias = "table_search_index"

    job_config = ConfigFactory.from_dict({
        f"extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}": neo4j_endpoint,
        f"extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}":
            "databuilder.models.neo4j_data.Neo4jDataResult",
        f"extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}": neo4j_user,
        f"extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}": neo4j_password,
        f"loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}": search_data_file,
        f"loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}": "w",
        f"transformer.elasticsearch.{ElasticsearchDocumentTransformer.ELASTICSEARCH_INDEX_CONFIG_KEY}": new_index,
        f"transformer.elasticsearch.{ElasticsearchDocumentTransformer.ELASTICSEARCH_DOC_CONFIG_KEY}": new_index_doc_type,
        f"publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}": search_data_file,
        f"publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}": "r",
        f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}": es_client,
        f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}": new_index,
        f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}": index_alias,
    })

    return DefaultJob(conf=job_config, task=search_task, publisher=ElasticsearchPublisher())
def create_snowflake_es_publisher_job():
    """
    Launches databuilder job that extracts data from Neo4J backend and pushes them
    as search documents to Elasticsearch index
    """
    # hand-off file between loader (writer) and publisher (reader)
    search_data_file = '/var/tmp/amundsen/search_data.json'

    search_task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                              extractor=Neo4jSearchDataExtractor(),
                              transformer=NoopTransformer())

    # elastic search client instance
    es_client = es
    # unique per-run index name
    new_index = 'tables' + str(uuid.uuid4())
    # related to mapping type from /databuilder/publisher/elasticsearch_publisher.py#L38
    new_index_doc_type = 'table'
    # alias for Elasticsearch used in amundsensearchlibrary/search_service/config.py as an index
    index_alias = 'table_search_index'

    job_config = ConfigFactory.from_dict({
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': neo4j_endpoint,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}':
            'databuilder.models.table_elasticsearch_document.TableESDocument',
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': neo4j_user,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': neo4j_password,
        f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': search_data_file,
        f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w',
        f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': search_data_file,
        f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r',
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}': es_client,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}': new_index,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}': new_index_doc_type,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}': index_alias
    })

    job = DefaultJob(conf=job_config, task=search_task, publisher=ElasticsearchPublisher())
    # fire-and-forget: this helper launches the job rather than returning it
    job.launch()
def create_es_publisher_job(*, elasticsearch, host, neo4j, **kwargs):
    """
    Build (but do not launch) a databuilder job that extracts table search
    documents from Neo4j and publishes them to a new Elasticsearch index
    aliased as `table_search_index`.

    (Note: the previous docstring described parameters of a different
    function signature and has been corrected to match this one.)

    :param elasticsearch: mapping with a "host" entry pointing at the Elasticsearch node
    :param host: mapping with an "es_data_path" entry — the intermediate JSON file
        the loader writes and the publisher reads
    :param neo4j: mapping with "endpoint", "user" and "password" entries for the
        Neo4j backend
    :param kwargs: unused here — presumably accepted so callers can splat a shared
        config dict; TODO confirm against call sites
    """
    elasticsearch_client = Elasticsearch([{'host': elasticsearch["host"]}])
    # unique name of new index in Elasticsearch
    elasticsearch_new_index_key = 'tables' + str(uuid.uuid4())
    data_path = host["es_data_path"]
    job_config = ConfigFactory.from_dict({
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': neo4j["endpoint"],
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}':
            'databuilder.models.table_elasticsearch_document.TableESDocument',
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': neo4j["user"],
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': neo4j["password"],
        f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': data_path,
        f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w',
        f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': data_path,
        f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r',
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}': elasticsearch_client,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}': elasticsearch_new_index_key,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}': 'table',
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}': 'table_search_index',
    })
    task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                       extractor=Neo4jSearchDataExtractor(),
                       transformer=NoopTransformer())
    job = DefaultJob(conf=job_config, task=task, publisher=ElasticsearchPublisher())
    return job
def test_publish_with_no_data(self) -> None:
    """
    Publishing an empty extract file must be a no-op: no calls may be made
    through the Elasticsearch client.
    """
    # simulate an empty search-data file on disk
    with patch('builtins.open', mock_open(read_data='')) as mock_file:
        publisher = ElasticsearchPublisher()
        publisher.init(conf=Scoped.get_scoped_conf(conf=self.conf,
                                                   scope=publisher.get_scope()))

        # the publisher must open the configured file with the configured mode
        mock_file.assert_called_with(self.test_file_path, self.test_file_mode)

        publisher.publish()

        # FIX: assertEqual instead of assertTrue(x == 0) — on failure this
        # reports the actual call count rather than just "False is not true".
        self.assertEqual(self.mock_es_client.call_count, 0)
def create_neo4j_es_job():
    """
    Build a databuilder job that extracts dashboard search documents from Neo4j
    and publishes them to a new Elasticsearch index aliased as
    `dashboard_search_index`, using the DASHBOARD_ES_MAP mapping.
    """
    # hand-off file: the loader writes it, the publisher reads it
    tmp_folder = '/var/tmp/amundsen/dashboard/dashboards_search_data.json'

    task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                       extractor=Neo4jDashboardSearchDataExtractor())

    # elastic search client instance
    elasticsearch_client = es
    elasticsearch_new_index_key = 'dashboards'
    elasticsearch_new_index_key_type = 'dashboard'
    elasticsearch_index_alias = 'dashboard_search_index'
    # FIX: the previous `random.randint(0, 1000)` suffix collides quickly across
    # runs (leftover non-aliased indices make `indices.create` fail); 64 random
    # bits make the per-run index name practically collision-free, matching the
    # uuid4-based naming used by the sibling table jobs.
    rand = str(random.getrandbits(64))

    job_config = ConfigFactory.from_dict({
        'extractor.dashboard_search_data.extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY):
            neo4j_endpoint,
        'extractor.dashboard_search_data.extractor.neo4j.{}'.format(Neo4jExtractor.MODEL_CLASS_CONFIG_KEY):
            'databuilder.models.dashboard_elasticsearch_document.DashboardESDocument',
        'extractor.dashboard_search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER): neo4j_user,
        'extractor.dashboard_search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW): neo4j_password,
        'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY): tmp_folder,
        'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY): 'w',
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_PATH_CONFIG_KEY): tmp_folder,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_MODE_CONFIG_KEY): 'r',
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY):
            elasticsearch_new_index_key_type,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY):
            elasticsearch_client,
        # FIX: `rand` is already a str — dropped the redundant str() wrapper
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY):
            elasticsearch_new_index_key + rand,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY):
            elasticsearch_index_alias,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY):
            DASHBOARD_ES_MAP
    })

    job = DefaultJob(conf=job_config, task=task, publisher=ElasticsearchPublisher())
    return job
def create_es_publisher_job():
    """Construct a databuilder job publishing Neo4j table search data to Elasticsearch."""
    # loader writes the extracted documents here; the publisher reads them back
    data_file = '/var/tmp/amundsen/search_data.json'

    es_task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                          extractor=Neo4jSearchDataExtractor(),
                          transformer=NoopTransformer())

    client = es
    # unique name for this run's target index
    target_index = 'tables' + str(uuid.uuid4())
    # related to mapping type from /databuilder/publisher/elasticsearch_publisher.py#L38
    target_doc_type = 'table'
    # alias for Elasticsearch used in amundsensearchlibrary/search_service/config.py as an index
    target_alias = 'table_search_index'

    config = {
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY): NEO4J_ENDPOINT,
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.MODEL_CLASS_CONFIG_KEY):
            'databuilder.models.table_elasticsearch_document.TableESDocument',
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER): NEO4j_USERNAME,
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW): NEO4j_PASSWORD,
        'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY): data_file,
        'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY): 'w',
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_PATH_CONFIG_KEY): data_file,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_MODE_CONFIG_KEY): 'r',
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY): client,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY): target_index,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY): target_doc_type,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY): target_alias
    }

    return DefaultJob(conf=ConfigFactory.from_dict(config),
                      task=es_task,
                      publisher=ElasticsearchPublisher())
def create_es_publisher_sample_job():
    """Create and immediately launch the Neo4j -> Elasticsearch search-index job."""
    # intermediate file shared by the loader (writer) and the publisher (reader)
    staging_path = "/var/tmp/amundsen/search_data.json"

    es_task = DefaultTask(
        loader=FSElasticsearchJSONLoader(),
        extractor=Neo4jSearchDataExtractor(),
        transformer=NoopTransformer(),
    )

    # elastic search client instance
    client = es
    # unique name of new index in Elasticsearch
    fresh_index = "tables" + str(uuid.uuid4())
    # related to mapping type from /databuilder/publisher/elasticsearch_publisher.py#L38
    fresh_index_doc_type = "table"
    # alias for Elasticsearch used in amundsensearchlibrary/search_service/config.py as an index
    alias = "table_search_index"

    job_config = ConfigFactory.from_dict({
        "extractor.search_data.extractor.neo4j.{}".format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY): neo4j_endpoint,
        "extractor.search_data.extractor.neo4j.{}".format(Neo4jExtractor.MODEL_CLASS_CONFIG_KEY):
            "databuilder.models.table_elasticsearch_document.TableESDocument",
        "extractor.search_data.extractor.neo4j.{}".format(Neo4jExtractor.NEO4J_AUTH_USER): neo4j_user,
        "extractor.search_data.extractor.neo4j.{}".format(Neo4jExtractor.NEO4J_AUTH_PW): neo4j_password,
        "loader.filesystem.elasticsearch.{}".format(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY): staging_path,
        "loader.filesystem.elasticsearch.{}".format(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY): "w",
        "publisher.elasticsearch.{}".format(ElasticsearchPublisher.FILE_PATH_CONFIG_KEY): staging_path,
        "publisher.elasticsearch.{}".format(ElasticsearchPublisher.FILE_MODE_CONFIG_KEY): "r",
        "publisher.elasticsearch.{}".format(ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY): client,
        "publisher.elasticsearch.{}".format(ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY): fresh_index,
        "publisher.elasticsearch.{}".format(ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY): fresh_index_doc_type,
        "publisher.elasticsearch.{}".format(ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY): alias,
    })

    # build and launch right away; this sample does not return the job
    DefaultJob(conf=job_config, task=es_task, publisher=ElasticsearchPublisher()).launch()
def create_es_publisher_sample_job(
        elasticsearch_index_alias='table_search_index',
        entity_type='table',
        elasticsearch_mapping=None):
    """
    Build a job that pulls search documents out of Atlas and publishes them to
    a new Elasticsearch index.

    :param elasticsearch_index_alias: alias for Elasticsearch used in
        amundsensearchlibrary/search_service/config.py as an index
    :param entity_type: entity type handed to the `AtlasSearchDataExtractor`;
        defaults to `table`
    :param elasticsearch_mapping: Elasticsearch field mapping "DDL" handed to the
        `ElasticsearchPublisher`; when None (default) the publisher's built-in
        `Table` mapping is used
    """
    # loader writes the extracted documents here; the publisher reads them back
    staging_file = '/var/tmp/amundsen/search_data.json'

    atlas_task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                             extractor=AtlasSearchDataExtractor(),
                             transformer=NoopTransformer())

    # elastic search client instance
    es_client = es
    # unique name of the new index in Elasticsearch, e.g. `table_<uuid4>`
    fresh_index = f'{entity_type}_{uuid.uuid4()}'

    job_config = ConfigFactory.from_dict({
        f'extractor.atlas_search_data.{AtlasSearchDataExtractor.ATLAS_URL_CONFIG_KEY}': atlas_host,
        f'extractor.atlas_search_data.{AtlasSearchDataExtractor.ATLAS_PORT_CONFIG_KEY}': atlas_port,
        f'extractor.atlas_search_data.{AtlasSearchDataExtractor.ATLAS_PROTOCOL_CONFIG_KEY}': 'http',
        f'extractor.atlas_search_data.{AtlasSearchDataExtractor.ATLAS_VALIDATE_SSL_CONFIG_KEY}': False,
        f'extractor.atlas_search_data.{AtlasSearchDataExtractor.ATLAS_USERNAME_CONFIG_KEY}': atlas_user,
        f'extractor.atlas_search_data.{AtlasSearchDataExtractor.ATLAS_PASSWORD_CONFIG_KEY}': atlas_password,
        f'extractor.atlas_search_data.{AtlasSearchDataExtractor.ATLAS_SEARCH_CHUNK_SIZE_KEY}': ATLAS_SEARCH_CHUNK_SIZE,
        f'extractor.atlas_search_data.{AtlasSearchDataExtractor.ATLAS_DETAILS_CHUNK_SIZE_KEY}': ATLAS_DETAILS_CHUNK_SIZE,
        f'extractor.atlas_search_data.{AtlasSearchDataExtractor.PROCESS_POOL_SIZE_KEY}': ATLAS_PROCESS_POOL_SIZE,
        f'extractor.atlas_search_data.{AtlasSearchDataExtractor.ENTITY_TYPE_KEY}': entity_type.title(),
        'loader.filesystem.elasticsearch.file_path': staging_file,
        'loader.filesystem.elasticsearch.mode': 'w',
        'publisher.elasticsearch.file_path': staging_file,
        'publisher.elasticsearch.mode': 'r',
        'publisher.elasticsearch.client': es_client,
        'publisher.elasticsearch.new_index': fresh_index,
        'publisher.elasticsearch.doc_type': '_doc',
        'publisher.elasticsearch.alias': elasticsearch_index_alias,
    })

    # the mapping override is optional, so `put` it only when supplied
    if elasticsearch_mapping:
        job_config.put(
            f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}',
            elasticsearch_mapping)

    return DefaultJob(conf=job_config, task=atlas_task, publisher=ElasticsearchPublisher())
'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.ENTITY_TYPE_KEY): entity_type, 'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY): extracted_search_data_path, 'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY): 'w', 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_PATH_CONFIG_KEY): extracted_search_data_path, 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_MODE_CONFIG_KEY): 'r', 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY): elasticsearch_client, 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY): elasticsearch_new_index_key, 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY): elasticsearch_new_index_key_type, 'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY): elasticsearch_index_alias }) if __name__ == "__main__": task = DefaultTask(extractor=AtlasSearchDataExtractor(), transformer=NoopTransformer(), loader=FSElasticsearchJSONLoader()) job = DefaultJob(conf=job_config, task=task, publisher=ElasticsearchPublisher()) job.launch()
def create_es_publisher_sample_job(
        elasticsearch_index_alias='table_search_index',
        elasticsearch_doc_type_key='table',
        model_name='databuilder.models.table_elasticsearch_document.TableESDocument',
        entity_type='table',
        elasticsearch_mapping=None):
    """
    Build a databuilder job that extracts search documents from Neptune and
    publishes them to a new Elasticsearch index.

    :param elasticsearch_index_alias: alias for Elasticsearch used in
        amundsensearchlibrary/search_service/config.py as an index
    :param elasticsearch_doc_type_key: name the Elasticsearch index is prepended with;
        defaults to `table`, resulting in `table_{uuid}`
    :param model_name: the Databuilder model class used in transporting between
        Extractor and Loader
    :param entity_type: entity type handed to the `NeptuneSearchDataExtractor`, used to
        determine the query to extract data from Neptune; defaults to `table`
    :param elasticsearch_mapping: Elasticsearch field mapping "DDL" handed to the
        `ElasticsearchPublisher`; when None (default) the publisher's built-in
        `Table` mapping is used
    """
    # loader saves data to this location and publisher reads it from here
    extracted_search_data_path = '/var/tmp/amundsen/search_data.json'

    loader = FSElasticsearchJSONLoader()
    extractor = NeptuneSearchDataExtractor()
    task = DefaultTask(loader=loader,
                       extractor=extractor,
                       transformer=NoopTransformer())

    # elastic search client instance
    elasticsearch_client = es
    # unique name of new index in Elasticsearch
    elasticsearch_new_index_key = '{}_'.format(
        elasticsearch_doc_type_key) + str(uuid.uuid4())

    # this instance both provides the config scope below and performs the publish
    publisher = ElasticsearchPublisher()

    job_config = ConfigFactory.from_dict({
        extractor.get_scope(): {
            NeptuneSearchDataExtractor.ENTITY_TYPE_CONFIG_KEY: entity_type,
            NeptuneSearchDataExtractor.MODEL_CLASS_CONFIG_KEY: model_name,
            'neptune.client': {
                NeptuneSessionClient.NEPTUNE_HOST_NAME: NEPTUNE_ENDPOINT,
                NeptuneSessionClient.AWS_REGION: AWS_REGION,
                NeptuneSessionClient.AWS_ACCESS_KEY: aws_access_key,
                NeptuneSessionClient.AWS_SECRET_ACCESS_KEY: aws_access_secret,
                NeptuneSessionClient.AWS_SESSION_TOKEN: aws_token
            }
        },
        'loader.filesystem.elasticsearch.file_path': extracted_search_data_path,
        'loader.filesystem.elasticsearch.mode': 'w',
        publisher.get_scope(): {
            'file_path': extracted_search_data_path,
            'mode': 'r',
            'client': elasticsearch_client,
            'new_index': elasticsearch_new_index_key,
            'doc_type': elasticsearch_doc_type_key,
            'alias': elasticsearch_index_alias
        }
    })

    # only optionally add these keys, so need to dynamically `put` them
    if elasticsearch_mapping:
        job_config.put(
            'publisher.elasticsearch.{}'.format(
                ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY),
            elasticsearch_mapping)

    # FIX: previously a second, throwaway ElasticsearchPublisher() was created
    # here even though `publisher` (whose scope keys the config above) already
    # exists — reuse the same instance.
    job = DefaultJob(conf=job_config, task=task, publisher=publisher)
    return job
def create_es_publisher_job(
        neo4j_endpoint,
        neo4j_user,
        neo4j_password,
        temp_folder_path,
        elasticsearch_index_alias='table_search_index',
        elasticsearch_doc_type_key='table',
        model_name='databuilder.models.table_elasticsearch_document.TableESDocument',
        cypher_query=None,
        elasticsearch_mapping=None):
    """
    Build a databuilder job that extracts search documents from Neo4j and
    publishes them to a new, uniquely named Elasticsearch index.

    :param neo4j_endpoint: connection URL of the Neo4j backend
    :param neo4j_user: Neo4j username
    :param neo4j_password: Neo4j password
    :param temp_folder_path: base folder for the intermediate JSON hand-off file
    :param elasticsearch_index_alias: alias the published index is registered under
    :param elasticsearch_doc_type_key: document type written into the publisher config
    :param model_name: the Databuilder model class used in transporting between
        Extractor and Loader
    :param cypher_query: optional query for the `Neo4jSearchDataExtractor`; when None
        the extractor's built-in query is used
    :param elasticsearch_mapping: optional Elasticsearch field mapping for the
        `ElasticsearchPublisher`; when None the publisher's built-in mapping is used
    """
    # loader writes the extracted documents here; the publisher reads them back
    staging_file = f'{temp_folder_path}/es/search_data.json'

    es_task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                          extractor=Neo4jSearchDataExtractor(),
                          transformer=NoopTransformer())

    # elastic search client instance
    client = es
    # unique name of new index in Elasticsearch
    fresh_index = 'tables' + str(uuid.uuid4())

    job_config = ConfigFactory.from_dict({
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': neo4j_endpoint,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}': model_name,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': neo4j_user,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': neo4j_password,
        f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': staging_file,
        f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w',
        f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': staging_file,
        f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r',
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}': client,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}': fresh_index,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}': elasticsearch_doc_type_key,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}': elasticsearch_index_alias,
    })

    # optional settings are `put` only when the caller provided them
    if cypher_query:
        job_config.put(f'extractor.search_data.{Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY}',
                       cypher_query)
    if elasticsearch_mapping:
        job_config.put(f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}',
                       elasticsearch_mapping)

    return DefaultJob(conf=job_config, task=es_task, publisher=ElasticsearchPublisher())