def test_multi_yield_chained_transformer(self) -> None:
    """
    Test that ChainedTransformer is able to handle both:
    - transformers which yield multiple nodes
    - transformers which transform single nodes
    """
    transformer = ChainedTransformer([
        AddFakeOwnerTransformer(),
        NoopTransformer(),
        DuplicateTransformer()
    ])

    result = _run_transformer(transformer)

    expected = [
        TEST_DATA[0],
        TEST_DATA[0],
        EXPECTED_OWNERS[0],
        EXPECTED_OWNERS[0],
        TEST_DATA[1],
        TEST_DATA[1],
        EXPECTED_OWNERS[1],
        EXPECTED_OWNERS[1],
    ]
    self.assertEqual(repr(result), repr(expected))
def run_table_lineage_job(table_lineage_path):
    tmp_folder = '/var/tmp/amundsen/table_column'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    extractor = CsvTableLineageExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.csvtablelineage.table_lineage_file_location': table_lineage_path,
        'loader.filesystem_csv_neo4j.node_dir_path': node_files_folder,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relationship_files_folder,
        'loader.filesystem_csv_neo4j.delete_created_directories': True,
        'publisher.neo4j.node_files_directory': node_files_folder,
        'publisher.neo4j.relation_files_directory': relationship_files_folder,
        'publisher.neo4j.neo4j_endpoint': neo4j_endpoint,
        'publisher.neo4j.neo4j_user': neo4j_user,
        'publisher.neo4j.neo4j_password': neo4j_password,
        'publisher.neo4j.neo4j_encrypted': False,
        'publisher.neo4j.job_publish_tag': 'lineage_unique_tag',  # should use unique tag here like {ds}
    })

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())
    job.launch()
def create_table_wm_job(**kwargs):
    sql = textwrap.dedent("""
        SELECT From_unixtime(A0.create_time) as create_time,
               'hive' as `database`,
               C0.NAME as `schema`,
               B0.tbl_name as table_name,
               {func}(A0.part_name) as part_name,
               {watermark} as part_type
        FROM PARTITIONS A0
        LEFT OUTER JOIN TBLS B0 ON A0.tbl_id = B0.tbl_id
        LEFT OUTER JOIN DBS C0 ON B0.db_id = C0.db_id
        WHERE C0.NAME IN {schemas}
        AND B0.tbl_type IN ( 'EXTERNAL_TABLE', 'MANAGED_TABLE' )
        AND A0.PART_NAME NOT LIKE '%%__HIVE_DEFAULT_PARTITION__%%'
        GROUP BY C0.NAME, B0.tbl_name
        ORDER by create_time desc
    """).format(func=kwargs['templates_dict'].get('agg_func'),
                watermark=kwargs['templates_dict'].get('watermark_type'),
                schemas=SUPPORTED_HIVE_SCHEMA_SQL_IN_CLAUSE)

    logging.info('SQL query: {}'.format(sql))

    tmp_folder = '/var/tmp/amundsen/table_{hwm}'.format(
        hwm=kwargs['templates_dict'].get('watermark_type').strip("\""))
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)

    hwm_extractor = SQLAlchemyExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=hwm_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): connection_string(),
        'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.EXTRACT_SQL): sql,
        'extractor.sqlalchemy.model_class': 'databuilder.models.watermark.Watermark',
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password,
    })

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())
    job.launch()
def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index',
                                   elasticsearch_doc_type_key='table',
                                   model_name='databuilder.models.table_elasticsearch_document.TableESDocument',
                                   cypher_query=None,
                                   elasticsearch_mapping=None):
    """
    :param elasticsearch_index_alias:  alias for Elasticsearch used in
                                       amundsensearchlibrary/search_service/config.py as an index
    :param elasticsearch_doc_type_key: name the ElasticSearch index is prepended with. Defaults to `table`
                                       resulting in `table_search_index`
    :param model_name:                 the Databuilder model class used in transporting between Extractor and Loader
    :param cypher_query:               Query handed to the `Neo4jSearchDataExtractor` class, if None is given (default)
                                       it uses the `Table` query baked into the Extractor
    :param elasticsearch_mapping:      Elasticsearch field mapping "DDL" handed to the `ElasticsearchPublisher` class,
                                       if None is given (default) it uses the `Table` mapping baked into the Publisher
    """
    # loader saves data to this location and publisher reads it from here
    extracted_search_data_path = '/var/tmp/amundsen/search_data.json'

    task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                       extractor=Neo4jSearchDataExtractor(),
                       transformer=NoopTransformer())

    # elastic search client instance
    elasticsearch_client = es
    # unique name of new index in Elasticsearch
    elasticsearch_new_index_key = 'tables' + str(uuid.uuid4())

    job_config = ConfigFactory.from_dict({
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': neo4j_endpoint,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}': model_name,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': neo4j_user,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': neo4j_password,
        f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': extracted_search_data_path,
        f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w',
        f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': extracted_search_data_path,
        f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r',
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}': elasticsearch_client,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}': elasticsearch_new_index_key,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}': elasticsearch_doc_type_key,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}': elasticsearch_index_alias,
    })

    # only optionally add these keys, so need to dynamically `put` them
    if cypher_query:
        job_config.put(f'extractor.search_data.{Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY}',
                       cypher_query)
    if elasticsearch_mapping:
        job_config.put(f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}',
                       elasticsearch_mapping)

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=ElasticsearchPublisher())
    return job
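# A minimal usage sketch for the function above (not part of the original script): it assumes the
# surrounding module already defines `neo4j_endpoint`, `neo4j_user`, `neo4j_password` and the
# Elasticsearch client `es`; `custom_table_query` is a hypothetical Cypher override passed through
# to Neo4jSearchDataExtractor.
if __name__ == '__main__':
    custom_table_query = None  # e.g. a Cypher string; None keeps the `Table` query baked into the Extractor
    search_job = create_es_publisher_sample_job(cypher_query=custom_table_query)
    search_job.launch()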
def __init__(self,
             extractor: Extractor,
             transformer: Transformer = NoopTransformer()) -> None:
    self.extractor = extractor
    self.transformer = transformer

    self._closer = Closer()
    self._closer.register(self.extractor.close)
    self._closer.register(self.transformer.close)
def __init__(self, extractor, loader, transformer=NoopTransformer()):
    # type: (Extractor, Loader, Transformer) -> None
    self.extractor = extractor
    self.transformer = transformer
    self.loader = loader

    self._closer = Closer()
    self._closer.register(self.extractor.close)
    self._closer.register(self.transformer.close)
    self._closer.register(self.loader.close)
def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index',
                                   elasticsearch_doc_type_key='table',
                                   model_name='databuilder.models.table_elasticsearch_document.TableESDocument',
                                   entity_type='table',
                                   elasticsearch_mapping=None):
    """
    :param elasticsearch_index_alias:  alias for Elasticsearch used in
                                       amundsensearchlibrary/search_service/config.py as an index
    :param elasticsearch_doc_type_key: name the ElasticSearch index is prepended with. Defaults to `table`
                                       resulting in `table_{uuid}`
    :param model_name:                 the Databuilder model class used in transporting between Extractor and Loader
    :param entity_type:                Entity type handed to the `Neo4jSearchDataExtractor` class, used to determine
                                       Cypher query to extract data from Neo4j. Defaults to `table`.
    :param elasticsearch_mapping:      Elasticsearch field mapping "DDL" handed to the `ElasticsearchPublisher` class,
                                       if None is given (default) it uses the `Table` mapping baked into the Publisher
    """
    # loader saves data to this location and publisher reads it from here
    extracted_search_data_path = '/var/tmp/amundsen/search_data.json'

    task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                       extractor=Neo4jSearchDataExtractor(),
                       transformer=NoopTransformer())

    # elastic search client instance
    elasticsearch_client = es
    # unique name of new index in Elasticsearch
    elasticsearch_new_index_key = '{}_'.format(elasticsearch_doc_type_key) + str(uuid.uuid4())

    job_config = ConfigFactory.from_dict({
        'extractor.search_data.entity_type': entity_type,
        'extractor.search_data.extractor.neo4j.graph_url': neo4j_endpoint,
        'extractor.search_data.extractor.neo4j.model_class': model_name,
        'extractor.search_data.extractor.neo4j.neo4j_auth_user': neo4j_user,
        'extractor.search_data.extractor.neo4j.neo4j_auth_pw': neo4j_password,
        'extractor.search_data.extractor.neo4j.neo4j_encrypted': False,
        'loader.filesystem.elasticsearch.file_path': extracted_search_data_path,
        'loader.filesystem.elasticsearch.mode': 'w',
        'publisher.elasticsearch.file_path': extracted_search_data_path,
        'publisher.elasticsearch.mode': 'r',
        'publisher.elasticsearch.client': elasticsearch_client,
        'publisher.elasticsearch.new_index': elasticsearch_new_index_key,
        'publisher.elasticsearch.doc_type': elasticsearch_doc_type_key,
        'publisher.elasticsearch.alias': elasticsearch_index_alias,
    })

    # only optionally add this key, so need to dynamically `put` it
    if elasticsearch_mapping:
        job_config.put('publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY),
                       elasticsearch_mapping)

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=ElasticsearchPublisher())
    return job
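# A usage sketch for the entity_type variant above, assuming the surrounding script provides the
# Neo4j/Elasticsearch globals it references; the user document model path below is an assumption
# about where such a model would live in databuilder.
if __name__ == '__main__':
    user_search_job = create_es_publisher_sample_job(
        elasticsearch_index_alias='user_search_index',
        elasticsearch_doc_type_key='user',
        model_name='databuilder.models.user_elasticsearch_document.UserESDocument',  # assumed model path
        entity_type='user')
    user_search_job.launch()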
def create_snowflake_es_publisher_job():
    """
    Launches databuilder job that extracts data from Neo4J backend and pushes them as search documents
    to Elasticsearch index
    """
    # loader saves data to this location and publisher reads it from here
    extracted_search_data_path = '/var/tmp/amundsen/search_data.json'

    task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                       extractor=Neo4jSearchDataExtractor(),
                       transformer=NoopTransformer())

    # elastic search client instance
    elasticsearch_client = es
    # unique name of new index in Elasticsearch
    elasticsearch_new_index_key = 'tables' + str(uuid.uuid4())
    # related to mapping type from /databuilder/publisher/elasticsearch_publisher.py#L38
    elasticsearch_new_index_key_type = 'table'
    # alias for Elasticsearch used in amundsensearchlibrary/search_service/config.py as an index
    elasticsearch_index_alias = 'table_search_index'

    job_config = ConfigFactory.from_dict({
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.GRAPH_URL_CONFIG_KEY): neo4j_endpoint,
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.MODEL_CLASS_CONFIG_KEY):
            'databuilder.models.table_elasticsearch_document.TableESDocument',
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_USER): neo4j_user,
        'extractor.search_data.extractor.neo4j.{}'.format(Neo4jExtractor.NEO4J_AUTH_PW): neo4j_password,
        'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY):
            extracted_search_data_path,
        'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY): 'w',
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_PATH_CONFIG_KEY): extracted_search_data_path,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_MODE_CONFIG_KEY): 'r',
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY):
            elasticsearch_client,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY):
            elasticsearch_new_index_key,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY):
            elasticsearch_new_index_key_type,
        'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY):
            elasticsearch_index_alias
    })

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=ElasticsearchPublisher())
    job.launch()
def run_dbt_job(database_name, catalog_file_loc, manifest_file_loc, source_url=None):
    tmp_folder = '/var/tmp/amundsen/dbt_run'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    dbt_extractor = DbtExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=dbt_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    # Catalog and manifest files can be passed in as file locations or a valid python
    # dict, allowing you to retrieve the files from S3 or another source and pass them in
    with open(manifest_file_loc, 'rb') as f:
        manifest_data = json.load(f)

    job_config = ConfigFactory.from_dict({
        'extractor.dbt.database_name': database_name,
        'extractor.dbt.catalog_json': catalog_file_loc,  # File location
        'extractor.dbt.manifest_json': json.dumps(manifest_data),  # JSON-dumped object
        'extractor.dbt.source_url': source_url,
        'loader.filesystem_csv_neo4j.node_dir_path': node_files_folder,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relationship_files_folder,
        'loader.filesystem_csv_neo4j.delete_created_directories': True,
        'publisher.neo4j.node_files_directory': node_files_folder,
        'publisher.neo4j.relation_files_directory': relationship_files_folder,
        'publisher.neo4j.neo4j_endpoint': neo4j_endpoint,
        'publisher.neo4j.neo4j_user': neo4j_user,
        'publisher.neo4j.neo4j_password': neo4j_password,
        'publisher.neo4j.neo4j_encrypted': False,
        'publisher.neo4j.job_publish_tag': 'unique_tag',  # should use unique tag here like {ds}
    })

    DefaultJob(conf=job_config,
               task=task,
               publisher=Neo4jCsvPublisher()).launch()
def run_bq_last_upd_job(job_name):
    # where_clause_suffix = " "
    gcloud_project = "bpy---pedidosya"
    # label_filter = ""

    tmp_folder = '/var/tmp/amundsen/{job_name}'.format(job_name=job_name)
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)

    job_config = ConfigFactory.from_dict({
        'extractor.bigquery_table_metadata.{}'.format(BigQueryLastUpdatedExtractor.PROJECT_ID_KEY): gcloud_project,
        'loader.filesystem_csv_neo4j.node_dir_path': node_files_folder,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relationship_files_folder,
        'loader.filesystem_csv_neo4j.delete_created_directories': True,
        'publisher.neo4j.node_files_directory': node_files_folder,
        'publisher.neo4j.relation_files_directory': relationship_files_folder,
        'publisher.neo4j.neo4j_endpoint': neo4j_endpoint,
        'publisher.neo4j.neo4j_user': neo4j_user,
        'publisher.neo4j.neo4j_password': neo4j_password,
        'publisher.neo4j.neo4j_encrypted': False,
        'publisher.neo4j.job_publish_tag': 'unique_tag',  # should use unique tag here like {ds}
    })

    # if label_filter:
    #     job_config[
    #         'extractor.bigquery_table_metadata.{}'
    #         .format(BigQueryMetadataExtractor.FILTER_KEY)
    #     ] = label_filter

    task = DefaultTask(extractor=BigQueryLastUpdatedExtractor(),
                       loader=FsNeo4jCSVLoader(),
                       transformer=NoopTransformer())

    # job_config is already a ConfigTree, so pass it straight to DefaultJob
    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())
    job.launch()
def create_es_publisher_job(*, elasticsearch, host, neo4j, **kwargs):
    """
    :param elasticsearch: dict with Elasticsearch connection info; `elasticsearch["host"]` is used to build the client
    :param host:          dict providing `es_data_path`, the file the loader writes to and the publisher reads from
    :param neo4j:         dict providing the Neo4j `endpoint`, `user` and `password`
    """
    elasticsearch_client = Elasticsearch([{'host': elasticsearch["host"]}])
    # unique name of new index in Elasticsearch
    elasticsearch_new_index_key = 'tables' + str(uuid.uuid4())
    data_path = host["es_data_path"]

    job_config = ConfigFactory.from_dict({
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': neo4j["endpoint"],
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}':
            'databuilder.models.table_elasticsearch_document.TableESDocument',
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': neo4j["user"],
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': neo4j["password"],
        f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': data_path,
        f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w',
        f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': data_path,
        f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r',
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}': elasticsearch_client,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}':
            elasticsearch_new_index_key,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}': 'table',
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}': 'table_search_index',
    })

    task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                       extractor=Neo4jSearchDataExtractor(),
                       transformer=NoopTransformer())

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=ElasticsearchPublisher())
    return job
def create_sample_job(table_name, model_name):
    sql = textwrap.dedent("""
        select * from {table_name};
    """).format(table_name=table_name)

    tmp_folder = '/tmp/amundsen/{table_name}'.format(table_name=table_name)
    # tmp_folder = os.path.join(
    #     BASE_DIR, "amundsen", f"{table_name}".format(table_name=table_name)
    # )
    node_files_folder = "{tmp_folder}/nodes".format(tmp_folder=tmp_folder)
    relationship_files_folder = "{tmp_folder}/relationships".format(tmp_folder=tmp_folder)

    sql_extractor = SQLAlchemyExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=sql_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        "extractor.sqlalchemy.{}".format(SQLAlchemyExtractor.CONN_STRING): SQLITE_CONN_STRING,
        "extractor.sqlalchemy.{}".format(SQLAlchemyExtractor.EXTRACT_SQL): sql,
        "extractor.sqlalchemy.model_class": model_name,
        "loader.filesystem_csv_neo4j.{}".format(FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder,
        "loader.filesystem_csv_neo4j.{}".format(FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder,
        "loader.filesystem_csv_neo4j.{}".format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): True,
        "publisher.neo4j.{}".format(neo4j_csv_publisher.NODE_FILES_DIR): node_files_folder,
        "publisher.neo4j.{}".format(neo4j_csv_publisher.RELATION_FILES_DIR): relationship_files_folder,
        "publisher.neo4j.{}".format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint,
        "publisher.neo4j.{}".format(neo4j_csv_publisher.NEO4J_USER): neo4j_user,
        "publisher.neo4j.{}".format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password,
        "publisher.neo4j.{}".format(neo4j_csv_publisher.JOB_PUBLISH_TAG): "unique_tag",  # should use unique tag here like {ds}
    })

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())
    return job
def init(self, conf):
    # type: (ConfigTree) -> None
    self._extractor = conf.get(RAW_EXTRACTOR)  # type: Extractor
    self._extractor.init(Scoped.get_scoped_conf(conf, self._extractor.get_scope()))

    regex_transformer = RegexStrReplaceTransformer()  # type: Any
    if conf.get(regex_transformer.get_scope(), None):
        regex_transformer.init(Scoped.get_scoped_conf(conf, regex_transformer.get_scope()))
    else:
        LOGGER.info('{} is not defined. Not using it'.format(regex_transformer.get_scope()))
        regex_transformer = NoopTransformer()

    sql_to_usage_transformer = SqlToTblColUsageTransformer()
    sql_to_usage_transformer.init(Scoped.get_scoped_conf(conf, sql_to_usage_transformer.get_scope()))

    self._transformer = ChainedTransformer((regex_transformer, sql_to_usage_transformer))
def create_es_publisher_job():
    # loader saves data to this location and publisher reads it from here
    extracted_search_data_path = '/var/tmp/amundsen/search_data.json'

    task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                       extractor=Neo4jSearchDataExtractor(),
                       transformer=NoopTransformer())

    elasticsearch_client = es
    # unique name of new index in Elasticsearch
    elasticsearch_new_index_key = 'tables' + str(uuid.uuid4())
    # related to mapping type from /databuilder/publisher/elasticsearch_publisher.py#L38
    elasticsearch_new_index_key_type = 'table'
    # alias for Elasticsearch used in amundsensearchlibrary/search_service/config.py as an index
    elasticsearch_index_alias = 'table_search_index'

    job_config = ConfigFactory.from_dict({
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': NEO4J_ENDPOINT,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}':
            'databuilder.models.table_elasticsearch_document.TableESDocument',
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': NEO4j_USERNAME,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': NEO4j_PASSWORD,
        f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': extracted_search_data_path,
        f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w',
        f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': extracted_search_data_path,
        f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r',
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}': elasticsearch_client,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}': elasticsearch_new_index_key,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}': elasticsearch_new_index_key_type,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}': elasticsearch_index_alias
    })

    return DefaultJob(conf=job_config,
                      task=task,
                      publisher=ElasticsearchPublisher())
def create_sample_job(table_name, model_name):
    sql = textwrap.dedent("""
        select * from {table_name};
    """).format(table_name=table_name)

    tmp_folder = '/var/tmp/amundsen/{table_name}'.format(table_name=table_name)
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)

    sql_extractor = SQLAlchemyExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=sql_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): SQLITE_CONN_STRING,
        'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.EXTRACT_SQL): sql,
        'extractor.sqlalchemy.model_class': model_name,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password,
    })

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())
    return job
def run_csv_job(file_loc, job_name, model):
    tmp_folder = '/var/tmp/amundsen/{job_name}'.format(job_name=job_name)
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)

    csv_extractor = CsvExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=csv_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.csv.file_location': file_loc,
        'extractor.csv.model_class': model,
        'loader.filesystem_csv_neo4j.node_dir_path': node_files_folder,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relationship_files_folder,
        'loader.filesystem_csv_neo4j.delete_created_directories': True,
        'publisher.neo4j.node_files_directory': node_files_folder,
        'publisher.neo4j.relation_files_directory': relationship_files_folder,
        'publisher.neo4j.neo4j_endpoint': neo4j_endpoint,
        'publisher.neo4j.neo4j_user': neo4j_user,
        'publisher.neo4j.neo4j_password': neo4j_password,
        'publisher.neo4j.neo4j_encrypted': False,
        'publisher.neo4j.job_publish_tag': 'unique_tag',  # should use unique tag here like {ds}
    })

    DefaultJob(conf=job_config,
               task=task,
               publisher=Neo4jCsvPublisher()).launch()
def create_table_extract_job(**kwargs):
    tmp_folder = '/var/tmp/amundsen/{metadata_type}'.format(metadata_type=kwargs['metadata_type'])
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)

    bq_meta_extractor = BigQueryMetadataExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=bq_meta_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.bigquery_table_metadata.{}'.format(BigQueryMetadataExtractor.PROJECT_ID_KEY):
            kwargs['PROJECT_ID_KEY'],
        # filter desired datasets only
        'extractor.bigquery_table_metadata.{}'.format(BigQueryMetadataExtractor.FILTER_KEY):
            'labels.set_label:data_platform',
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_tag',  # should use unique tag here like {ds}
    })

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())
    job.launch()
def create_table_extract_job():
    where_clause_suffix = textwrap.dedent("""
        where table_schema in {schemas}
    """).format(schemas=SUPPORTED_SCHEMA_SQL_IN_CLAUSE)

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = '{tmp_folder}/nodes/'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships/'.format(tmp_folder=tmp_folder)

    job_config = ConfigFactory.from_dict({
        'extractor.athena_metadata.{}'.format(AthenaMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY): where_clause_suffix,
        'extractor.athena_metadata.extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING):
            connection_string(),
        'extractor.athena_metadata.{}'.format(AthenaMetadataExtractor.CATALOG_KEY): "'AwsDataCatalog'",
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_tag',  # should use unique tag here like {ds}
    })

    job = DefaultJob(conf=job_config,
                     task=DefaultTask(extractor=AthenaMetadataExtractor(),
                                      loader=FsNeo4jCSVLoader(),
                                      transformer=NoopTransformer()),
                     publisher=Neo4jCsvPublisher())
    job.launch()
def run_csv_job(file_loc, table_name, model):
    tmp_folder = '/var/tmp/amundsen/{table_name}'.format(table_name=table_name)
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)

    csv_extractor = CsvExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=csv_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.csv.{}'.format(CsvExtractor.FILE_LOCATION): file_loc,
        'extractor.csv.model_class': model,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_tag',  # should use unique tag here like {ds}
    })

    DefaultJob(conf=job_config,
               task=task,
               publisher=Neo4jCsvPublisher()).launch()
def run_csv_job(file_loc, job_name, model):
    tmp_folder = '/var/tmp/amundsen/{job_name}'.format(job_name=job_name)
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)

    csv_extractor = CsvExtractor()
    loader = FSNeptuneCSVLoader()
    publisher = NeptuneCSVPublisher()

    task = DefaultTask(
        extractor=csv_extractor,
        loader=loader,
        transformer=NoopTransformer()
    )

    job_config = ConfigFactory.from_dict({
        'extractor.csv.file_location': file_loc,
        'extractor.csv.model_class': model,
        loader.get_scope(): {
            FSNeptuneCSVLoader.NODE_DIR_PATH: node_files_folder,
            FSNeptuneCSVLoader.RELATION_DIR_PATH: relationship_files_folder,
            FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
            FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag'
        },
        publisher.get_scope(): {
            NeptuneCSVPublisher.NODE_FILES_DIR: node_files_folder,
            NeptuneCSVPublisher.RELATION_FILES_DIR: relationship_files_folder,
            NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
            NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
            NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
            NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name
        },
    })

    DefaultJob(
        conf=job_config,
        task=task,
        publisher=publisher
    ).launch()
def create_model_job(self, file_loc, metadata_name, model):
    # type: (str, str, str) -> DefaultJob
    tmp_folder = '/tmp/amundsen/{metadata_name}'.format(metadata_name=metadata_name)
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)

    csv_extractor = CsvExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=csv_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.csv.{}'.format(CsvExtractor.FILE_LOCATION): file_loc,
        'extractor.csv.model_class': model,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): self.neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): self.neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): self.neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
            'unique_tag' + (str(uuid.uuid4()) if self.use_unique_id else "")
    })

    return DefaultJob(conf=job_config,
                      task=task,
                      publisher=Neo4jCsvPublisher())
def create_es_publisher_sample_job():
    # loader saves data to this location and publisher reads it from here
    extracted_search_data_path = "/var/tmp/amundsen/search_data.json"

    task = DefaultTask(
        loader=FSElasticsearchJSONLoader(),
        extractor=Neo4jSearchDataExtractor(),
        transformer=NoopTransformer(),
    )

    # elastic search client instance
    elasticsearch_client = es
    # unique name of new index in Elasticsearch
    elasticsearch_new_index_key = f"tables{uuid.uuid4()}"
    # related to mapping type from /databuilder/publisher/elasticsearch_publisher.py#L38
    elasticsearch_new_index_key_type = "table"
    # alias for Elasticsearch used in amundsensearchlibrary/search_service/config.py as an index
    elasticsearch_index_alias = "table_search_index"

    job_config = ConfigFactory.from_dict(
        {
            f"extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}": neo4j_endpoint,
            f"extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}":
                "databuilder.models.table_elasticsearch_document.TableESDocument",
            f"extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}": neo4j_user,
            f"extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}": neo4j_password,
            f"loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}":
                extracted_search_data_path,
            f"loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}": "w",
            f"publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}": extracted_search_data_path,
            f"publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}": "r",
            f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}": elasticsearch_client,
            f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}":
                elasticsearch_new_index_key,
            f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}":
                elasticsearch_new_index_key_type,
            f"publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}":
                elasticsearch_index_alias,
        }
    )

    job = DefaultJob(conf=job_config, task=task, publisher=ElasticsearchPublisher())
    job.launch()
def create_table_extract_job():
    where_clause_suffix = f"where table_schema in {SUPPORTED_SCHEMA_SQL_IN_CLAUSE}"

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = f'{tmp_folder}/nodes/'
    relationship_files_folder = f'{tmp_folder}/relationships/'

    job_config = ConfigFactory.from_dict({
        f'extractor.athena_metadata.{AthenaMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause_suffix,
        f'extractor.athena_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(),
        f'extractor.athena_metadata.{AthenaMetadataExtractor.CATALOG_KEY}': "'AwsDataCatalog'",
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',  # should use unique tag here like {ds}
    })

    job = DefaultJob(conf=job_config,
                     task=DefaultTask(extractor=AthenaMetadataExtractor(),
                                      loader=FsNeo4jCSVLoader(),
                                      transformer=NoopTransformer()),
                     publisher=Neo4jCsvPublisher())
    job.launch()
def run_csv_job(file_loc, job_name, model):
    tmp_folder = f'/var/tmp/amundsen/{job_name}'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    csv_extractor = CsvExtractor()
    csv_loader = FsAtlasCSVLoader()

    task = DefaultTask(extractor=csv_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.csv.file_location': file_loc,
        'extractor.csv.model_class': model,
        f'loader.filesystem_csv_atlas.{FsAtlasCSVLoader.ENTITY_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_atlas.{FsAtlasCSVLoader.RELATIONSHIP_DIR_PATH}': relationship_files_folder,
        f'loader.filesystem_csv_atlas.{FsAtlasCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.ATLAS_CLIENT}':
            AtlasClient(atlas_endpoint, (atlas_user, atlas_password)),
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.ENTITY_DIR_PATH}': node_files_folder,
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.RELATIONSHIP_DIR_PATH}': relationship_files_folder,
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.ATLAS_ENTITY_CREATE_BATCH_SIZE}': ATLAS_CREATE_BATCH_SIZE,
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.REGISTER_ENTITY_TYPES}': False
    })

    DefaultJob(conf=job_config,
               task=task,
               publisher=AtlasCSVPublisher()).launch()
def run_table_column_job(table_path, column_path):
    tmp_folder = '/var/tmp/amundsen/table_column'
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)

    extractor = CsvTableColumnExtractor()
    csv_loader = FSNeptuneCSVLoader()
    publisher = NeptuneCSVPublisher()

    task = DefaultTask(extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.csvtablecolumn.table_file_location': table_path,
        'extractor.csvtablecolumn.column_file_location': column_path,
        csv_loader.get_scope(): {
            FSNeptuneCSVLoader.NODE_DIR_PATH: node_files_folder,
            FSNeptuneCSVLoader.RELATION_DIR_PATH: relationship_files_folder,
            FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
            FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag'
        },
        publisher.get_scope(): {
            NeptuneCSVPublisher.NODE_FILES_DIR: node_files_folder,
            NeptuneCSVPublisher.RELATION_FILES_DIR: relationship_files_folder,
            NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
            NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
            NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
            NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name,
            NeptuneCSVPublisher.AWS_REGION: AWS_REGION,
            NeptuneCSVPublisher.AWS_ACCESS_KEY: aws_access_key,
            NeptuneCSVPublisher.AWS_SECRET_ACCESS_KEY: aws_access_secret,
            NeptuneCSVPublisher.AWS_SESSION_TOKEN: aws_token
        }
    })

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=publisher)
    job.launch()
def create_bq_job(metadata_type, gcloud_project):
    tmp_folder = f'/var/tmp/amundsen/{metadata_type}'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    bq_meta_extractor = BigQueryMetadataExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=bq_meta_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        f'extractor.bigquery_table_metadata.{BigQueryMetadataExtractor.PROJECT_ID_KEY}': gcloud_project,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',  # should use unique tag here like {ds}
    })

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())
    return job
def create_table_column_job(table_path, column_path):
    tmp_folder = '/var/tmp/amundsen/table_column'
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)

    extractor = CSVTableColumnExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.csvtablecolumn.{}'.format(CSVTableColumnExtractor.TABLE_FILE_LOCATION): table_path,
        'extractor.csvtablecolumn.{}'.format(CSVTableColumnExtractor.COLUMN_FILE_LOCATION): column_path,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_tag',  # should use unique tag here like {ds}
    })

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())
    return job
def create_glue_extractor_job():
    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = Path(tmp_folder, 'nodes')
    relationship_files_folder = Path(tmp_folder, 'relationships')

    job_config = ConfigFactory.from_dict({
        f'extractor.glue.{GlueExtractor.CLUSTER_KEY}': GLUE_CLUSTER_KEY,
        f'extractor.glue.{GlueExtractor.FILTER_KEY}': [],
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': NEO4J_ENDPOINT,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': NEO4j_USERNAME,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': NEO4j_PASSWORD,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': str(int(datetime.utcnow().timestamp()))
    })

    return DefaultJob(conf=job_config,
                      task=DefaultTask(
                          extractor=GlueExtractor(),
                          loader=FsNeo4jCSVLoader(),
                          transformer=NoopTransformer()),
                      publisher=Neo4jCsvPublisher())
def create_es_publisher_sample_job(
        elasticsearch_index_alias='table_search_index',
        entity_type='table',
        elasticsearch_mapping=None):
    """
    :param elasticsearch_index_alias: alias for Elasticsearch used in
                                      amundsensearchlibrary/search_service/config.py as an index
    :param entity_type:               Entity type handed to the `AtlasSearchDataExtractor` class, used to determine
                                      which entities are extracted from Atlas. Defaults to `table`.
    :param elasticsearch_mapping:     Elasticsearch field mapping "DDL" handed to the `ElasticsearchPublisher` class,
                                      if None is given (default) it uses the `Table` mapping baked into the Publisher
    """
    # loader saves data to this location and publisher reads it from here
    extracted_search_data_path = '/var/tmp/amundsen/search_data.json'

    task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                       extractor=AtlasSearchDataExtractor(),
                       transformer=NoopTransformer())

    # elastic search client instance
    elasticsearch_client = es
    # unique name of new index in Elasticsearch
    elasticsearch_new_index_key = f'{entity_type}_{uuid.uuid4()}'

    job_config = ConfigFactory.from_dict({
        'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.ATLAS_URL_CONFIG_KEY): atlas_host,
        'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.ATLAS_PORT_CONFIG_KEY): atlas_port,
        'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.ATLAS_PROTOCOL_CONFIG_KEY): 'http',
        'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.ATLAS_VALIDATE_SSL_CONFIG_KEY): False,
        'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.ATLAS_USERNAME_CONFIG_KEY): atlas_user,
        'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.ATLAS_PASSWORD_CONFIG_KEY): atlas_password,
        'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.ATLAS_SEARCH_CHUNK_SIZE_KEY):
            ATLAS_SEARCH_CHUNK_SIZE,
        'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.ATLAS_DETAILS_CHUNK_SIZE_KEY):
            ATLAS_DETAILS_CHUNK_SIZE,
        'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.PROCESS_POOL_SIZE_KEY):
            ATLAS_PROCESS_POOL_SIZE,
        'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.ENTITY_TYPE_KEY): entity_type.title(),
        'loader.filesystem.elasticsearch.file_path': extracted_search_data_path,
        'loader.filesystem.elasticsearch.mode': 'w',
        'publisher.elasticsearch.file_path': extracted_search_data_path,
        'publisher.elasticsearch.mode': 'r',
        'publisher.elasticsearch.client': elasticsearch_client,
        'publisher.elasticsearch.new_index': elasticsearch_new_index_key,
        'publisher.elasticsearch.doc_type': '_doc',
        'publisher.elasticsearch.alias': elasticsearch_index_alias,
    })

    # only optionally add this key, so need to dynamically `put` it
    if elasticsearch_mapping:
        job_config.put(
            f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}',
            elasticsearch_mapping)

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=ElasticsearchPublisher())
    return job
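# A usage sketch for the Atlas-backed variant above, assuming `es`, `atlas_host`, `atlas_port`,
# `atlas_user`, `atlas_password` and the chunk/pool-size constants are defined by the surrounding
# script; 'dashboard' is shown only as an illustrative second entity type.
if __name__ == '__main__':
    for searchable_entity_type in ('table', 'dashboard'):
        atlas_search_job = create_es_publisher_sample_job(
            elasticsearch_index_alias=f'{searchable_entity_type}_search_index',
            entity_type=searchable_entity_type)
        atlas_search_job.launch()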
    'extractor.atlas_search_data.{}'.format(AtlasSearchDataExtractor.ENTITY_TYPE_KEY): entity_type,
    'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY):
        extracted_search_data_path,
    'loader.filesystem.elasticsearch.{}'.format(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY): 'w',
    'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_PATH_CONFIG_KEY): extracted_search_data_path,
    'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.FILE_MODE_CONFIG_KEY): 'r',
    'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY): elasticsearch_client,
    'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY):
        elasticsearch_new_index_key,
    'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY):
        elasticsearch_new_index_key_type,
    'publisher.elasticsearch.{}'.format(ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY):
        elasticsearch_index_alias
})

if __name__ == "__main__":
    task = DefaultTask(extractor=AtlasSearchDataExtractor(),
                       transformer=NoopTransformer(),
                       loader=FSElasticsearchJSONLoader())

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=ElasticsearchPublisher())

    job.launch()