def run_hive_metastore_job():
    """Build a job that extracts Hive metastore table metadata and publishes
    it to Neo4j through intermediate CSV files.

    Returns:
        DefaultJob: the configured (not yet launched) job.
    """
    # No additional filtering of the metastore query; extend if scoping is needed.
    where_clause_suffix = textwrap.dedent(""" """)

    staging_dir = '/var/tmp/amundsen/table_metadata'
    nodes_dir = f'{staging_dir}/nodes/'
    rels_dir = f'{staging_dir}/relationships/'

    job_config = ConfigFactory.from_dict({
        f'extractor.hive_table_metadata.{HiveTableMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}':
            where_clause_suffix,
        f'extractor.hive_table_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}':
            connection_string(),
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': nodes_dir,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': rels_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': nodes_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': rels_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        # should use unique tag here like {ds}
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',
    })

    task = DefaultTask(extractor=HiveTableMetadataExtractor(),
                       loader=FsNeo4jCSVLoader())
    return DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
def extract_catalog_job():
    """Build a job that extracts table metadata from a Tokern data catalog
    and publishes it to Neo4j via CSV staging files.

    Returns:
        DefaultJob: the configured (not yet launched) job.
    """
    staging_dir = "/var/tmp/amundsen/table_metadata"
    nodes_dir = f"{staging_dir}/nodes/"
    rels_dir = f"{staging_dir}/relationships/"

    job_config = ConfigFactory.from_dict({
        # Tokern catalog connection details are nested as their own config tree.
        f"tokern.catalog.{CatalogExtractor.CATALOG_CONFIG}":
            ConfigFactory.from_dict(tokern_connection),
        f"loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}": nodes_dir,
        f"loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}": rels_dir,
        f"publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}": nodes_dir,
        f"publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}": rels_dir,
        f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}": neo4j_endpoint,
        f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}": neo4j_user,
        f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}": neo4j_password,
        # should use unique tag here like {ds}
        f"publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}": "unique_tag",
    })

    pipeline = DefaultTask(extractor=CatalogExtractor(), loader=FsNeo4jCSVLoader())
    return DefaultJob(conf=job_config, task=pipeline, publisher=Neo4jCsvPublisher())
def run_tableau_query_job():
    """Extract Tableau dashboard query metadata and publish it to Neo4j.

    Launches the job immediately (no return value).
    """
    task = DefaultTask(extractor=TableauDashboardQueryExtractor(),
                       loader=FsNeo4jCSVLoader())

    tmp_folder = '/var/tmp/amundsen/tableau_dashboard_query'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    # BUGFIX: copy the shared base config. The original assigned the
    # module-level common_tableau_config dict and then .update()'d it in
    # place, leaking these job-specific keys into every other job that
    # reuses the same dict.
    dict_config = dict(common_tableau_config)
    dict_config.update({
        'extractor.tableau_dashboard_query.api_base_url': tableau_api_base_url,
        'extractor.tableau_dashboard_query.api_version': tableau_api_version,
        'extractor.tableau_dashboard_query.site_name': tableau_site_name,
        'extractor.tableau_dashboard_query.tableau_personal_access_token_name':
            tableau_personal_access_token_name,
        'extractor.tableau_dashboard_query.tableau_personal_access_token_secret':
            tableau_personal_access_token_secret,
        'extractor.tableau_dashboard_query.excluded_projects': tableau_excluded_projects,
        'extractor.tableau_dashboard_query.cluster': tableau_dashboard_cluster,
        'extractor.tableau_dashboard_query.database': tableau_dashboard_database,
        'extractor.tableau_dashboard_query.transformer.timestamp_str_to_epoch.timestamp_format':
            "%Y-%m-%dT%H:%M:%SZ",
        'extractor.tableau_dashboard_query.verify_request': tableau_verify_request,
        'loader.filesystem_csv_neo4j.node_dir_path': node_files_folder,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relationship_files_folder,
        'loader.filesystem_csv_neo4j.delete_created_directories': True,
        'task.progress_report_frequency': 100,
        'publisher.neo4j.node_files_directory': node_files_folder,
        'publisher.neo4j.relation_files_directory': relationship_files_folder,
    })
    job_config = ConfigFactory.from_dict(dict_config)

    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    job.launch()
def create_table_extract_job():
    """Extract Postgres table/column metadata and publish it to Neo4j.

    Launches the job immediately (no return value).
    """
    # Restrict extraction to the schemas this deployment supports.
    where_clause_suffix = f"st.schemaname in {SUPPORTED_SCHEMA_SQL_IN_CLAUSE}"

    staging_dir = "/var/tmp/amundsen/table_metadata"
    nodes_dir = f"{staging_dir}/nodes/"
    rels_dir = f"{staging_dir}/relationships/"

    config = {
        f"extractor.postgres_metadata.{PostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}":
            where_clause_suffix,
        f"extractor.postgres_metadata.{PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}":
            True,
        f"extractor.postgres_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}":
            connection_string(),
        f"loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}": nodes_dir,
        f"loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}": rels_dir,
        f"publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}": nodes_dir,
        f"publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}": rels_dir,
        f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}": neo4j_endpoint,
        f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}": neo4j_user,
        f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}": neo4j_password,
        # should use unique tag here like {ds}
        f"publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}": "unique_tag",
    }

    task = DefaultTask(extractor=PostgresMetadataExtractor(), loader=FsNeo4jCSVLoader())
    job = DefaultJob(conf=ConfigFactory.from_dict(config),
                     task=task,
                     publisher=Neo4jCsvPublisher())
    job.launch()
def create_sample_db2_job():
    """Build a job that extracts DB2 table metadata and publishes it to Neo4j.

    Returns:
        DefaultJob: the configured (not yet launched) job.
    """
    # Skip system/ignored schemas when scanning the catalog.
    where_clause = f"WHERE c.TABSCHEMA not in ({','.join(IGNORED_SCHEMAS)}) ;"

    staging_dir = '/var/tmp/amundsen/tables'
    nodes_dir = f'{staging_dir}/nodes'
    rels_dir = f'{staging_dir}/relationships'

    pipeline = DefaultTask(extractor=Db2MetadataExtractor(),
                           loader=FsNeo4jCSVLoader())

    job_config = ConfigFactory.from_dict({
        f'extractor.db2_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}':
            DB2_CONN_STRING,
        f'extractor.db2_metadata.{Db2MetadataExtractor.DATABASE_KEY}': 'DEMODB',
        f'extractor.db2_metadata.{Db2MetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': nodes_dir,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': rels_dir,
        # Recreate the staging dirs on every run so stale CSVs never leak in.
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': nodes_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': rels_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag'
    })

    return DefaultJob(conf=job_config, task=pipeline, publisher=Neo4jCsvPublisher())
def create_table_wm_job(**kwargs):
    """Extract table partition watermarks (high/low) from the Hive metastore
    and publish them to Neo4j.

    Expects Airflow-style kwargs with a ``templates_dict`` containing:
      - ``agg_func``: SQL aggregate applied to partition names (e.g. MIN/MAX)
      - ``watermark_type``: watermark label stored as ``part_type``
    Launches the job immediately (no return value).
    """
    # Templated SQL: aggregate partition names per table to derive the watermark.
    sql = textwrap.dedent(""" SELECT From_unixtime(A0.create_time) as create_time, 'hive' as `database`, C0.NAME as `schema`, B0.tbl_name as table_name, {func}(A0.part_name) as part_name, {watermark} as part_type FROM PARTITIONS A0 LEFT OUTER JOIN TBLS B0 ON A0.tbl_id = B0.tbl_id LEFT OUTER JOIN DBS C0 ON B0.db_id = C0.db_id WHERE C0.NAME IN {schemas} AND B0.tbl_type IN ( 'EXTERNAL_TABLE', 'MANAGED_TABLE' ) AND A0.PART_NAME NOT LIKE '%%__HIVE_DEFAULT_PARTITION__%%' GROUP BY C0.NAME, B0.tbl_name ORDER by create_time desc """).format(
        func=kwargs['templates_dict'].get('agg_func'),
        watermark=kwargs['templates_dict'].get('watermark_type'),
        schemas=SUPPORTED_HIVE_SCHEMA_SQL_IN_CLAUSE)
    logging.info('SQL query: {}'.format(sql))

    # Separate staging folder per watermark type; strip surrounding quotes
    # from the (templated) watermark value so the path is clean.
    tmp_folder = '/var/tmp/amundsen/table_{hwm}'.format(
        hwm=kwargs['templates_dict'].get('watermark_type').strip("\""))
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    hwm_extractor = SQLAlchemyExtractor()
    csv_loader = FsNeo4jCSVLoader()
    task = DefaultTask(extractor=hwm_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): connection_string(),
        'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.EXTRACT_SQL): sql,
        # Rows are materialized as Watermark model instances.
        'extractor.sqlalchemy.model_class': 'databuilder.models.watermark.Watermark',
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password,
    })
    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    job.launch()
def create_dashboard_neo4j_job(**kwargs):
    """Build a job that loads dashboard metadata records into Neo4j.

    Returns:
        DefaultJob: the configured (not yet launched) job.
    """
    staging_dir = '/var/tmp/amundsen/table_metadata'
    nodes_dir = '{tmp_folder}/nodes/'.format(tmp_folder=staging_dir)
    rels_dir = '{tmp_folder}/relationships/'.format(tmp_folder=staging_dir)

    job_config = ConfigFactory.from_dict({
        # NOTE(review): `input` presumably refers to a module-level iterable of
        # dashboard records (it shadows the builtin) — confirm against the file.
        'extractor.generic.{}'.format(GenericExtractor.EXTRACTION_ITEMS): iter(input),
        'extractor.generic.{}'.format('model_class'):
            'databuilder.models.dashboard_metadata.DashboardMetadata',
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): nodes_dir,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): rels_dir,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): nodes_dir,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): rels_dir,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password,
        # should use unique tag here like {ds}
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_tag',
    })

    pipeline = DefaultTask(extractor=GenericExtractor(), loader=FsNeo4jCSVLoader())
    return DefaultJob(conf=job_config, task=pipeline, publisher=Neo4jCsvPublisher())
def create_sample_dremio_job():
    """Build a job that extracts Dremio metadata and publishes it to Neo4j.

    Returns:
        DefaultJob: the configured (not yet launched) job.
    """
    staging_dir = '/var/tmp/amundsen/tables'
    nodes_dir = '{}/nodes'.format(staging_dir)
    rels_dir = '{}/relationships'.format(staging_dir)

    pipeline = DefaultTask(extractor=DremioMetadataExtractor(),
                           loader=FsNeo4jCSVLoader())

    job_config = ConfigFactory.from_dict({
        'extractor.dremio.{}'.format(DremioMetadataExtractor.DREMIO_USER_KEY): DREMIO_USER,
        'extractor.dremio.{}'.format(DremioMetadataExtractor.DREMIO_PASSWORD_KEY): DREMIO_PASSWORD,
        'extractor.dremio.{}'.format(DremioMetadataExtractor.DREMIO_HOST_KEY): DREMIO_HOST,
        # Physical datasets (PDS) are excluded; only virtual datasets are catalogued.
        'extractor.dremio.{}'.format(DremioMetadataExtractor.DREMIO_EXCLUDE_PDS_TABLES_KEY): True,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): nodes_dir,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): rels_dir,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): True,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.FORCE_CREATE_DIR): True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): nodes_dir,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): rels_dir,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): NEO4J_ENDPOINT,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): NEO4J_USER,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): NEO4J_PASSWORD,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_tag'
    })

    return DefaultJob(conf=job_config, task=pipeline, publisher=Neo4jCsvPublisher())
def create_last_updated_job():
    """Build a job that records the Elasticsearch last-updated timestamp
    in Neo4j.

    Returns:
        DefaultJob: the configured (not yet launched) job.
    """
    # Loader writes CSVs here; the publisher reads them back from the same paths.
    staging_dir = '/var/tmp/amundsen/last_updated_data'
    nodes_dir = f'{staging_dir}/nodes'
    rels_dir = f'{staging_dir}/relationships'

    pipeline = DefaultTask(extractor=EsLastUpdatedExtractor(),
                           loader=FsNeo4jCSVLoader())

    job_config = ConfigFactory.from_dict({
        'extractor.es_last_updated.model_class':
            'databuilder.models.es_last_updated.ESLastUpdated',
        'loader.filesystem_csv_neo4j.node_dir_path': nodes_dir,
        'loader.filesystem_csv_neo4j.relationship_dir_path': rels_dir,
        'publisher.neo4j.node_files_directory': nodes_dir,
        'publisher.neo4j.relation_files_directory': rels_dir,
        'publisher.neo4j.neo4j_endpoint': neo4j_endpoint,
        'publisher.neo4j.neo4j_user': neo4j_user,
        'publisher.neo4j.neo4j_password': neo4j_password,
        'publisher.neo4j.neo4j_encrypted': False,
        # should use unique tag here like {ds}
        'publisher.neo4j.job_publish_tag': 'unique_lastupdated_tag',
    })

    return DefaultJob(conf=job_config, task=pipeline, publisher=Neo4jCsvPublisher())
def run_csv_job():
    """Build a job that extracts schema-registry metadata and publishes it
    to Neo4j via CSV staging files.

    Returns:
        DefaultJob: the configured (not yet launched) job.
    """
    staging_dir = '/var/tmp/amundsen/product-view'
    nodes_dir = Path(staging_dir, 'nodes')
    rels_dir = Path(staging_dir, 'relationships')

    pipeline = DefaultTask(extractor=SchemaRegExtractor(),
                           loader=FsNeo4jCSVLoader())

    job_config = ConfigFactory.from_dict({
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': nodes_dir,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': rels_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': nodes_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': rels_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': NEO4J_ENDPOINT,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': NEO4j_USERNAME,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': NEO4j_PASSWORD,
        # Epoch-seconds publish tag gives each run a unique tag.
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}':
            str(int(datetime.utcnow().timestamp()))
    })

    return DefaultJob(conf=job_config, task=pipeline, publisher=Neo4jCsvPublisher())
def run_column_lineage_job(column_lineage_path):
    """Load column-level lineage from a CSV file into Neo4j.

    Args:
        column_lineage_path: path to the column-lineage CSV file.

    Launches the job immediately (no return value).
    """
    staging_dir = '/var/tmp/amundsen/table_column'
    nodes_dir = f'{staging_dir}/nodes'
    rels_dir = f'{staging_dir}/relationships'

    pipeline = DefaultTask(CsvColumnLineageExtractor(),
                           loader=FsNeo4jCSVLoader(),
                           transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.csvcolumnlineage.column_lineage_file_location': column_lineage_path,
        'loader.filesystem_csv_neo4j.node_dir_path': nodes_dir,
        'loader.filesystem_csv_neo4j.relationship_dir_path': rels_dir,
        'loader.filesystem_csv_neo4j.delete_created_directories': True,
        'publisher.neo4j.node_files_directory': nodes_dir,
        'publisher.neo4j.relation_files_directory': rels_dir,
        'publisher.neo4j.neo4j_endpoint': neo4j_endpoint,
        'publisher.neo4j.neo4j_user': neo4j_user,
        'publisher.neo4j.neo4j_password': neo4j_password,
        'publisher.neo4j.neo4j_encrypted': False,
        # should use unique tag here like {ds}
        'publisher.neo4j.job_publish_tag': 'lineage_unique_tag',
    })

    DefaultJob(conf=job_config, task=pipeline, publisher=Neo4jCsvPublisher()).launch()
def create_last_updated_job():
    """Build a job that records the Neo4j/Elasticsearch last-updated
    timestamp in Neo4j.

    Returns:
        DefaultJob: the configured (not yet launched) job.
    """
    # Loader writes CSVs here; the publisher reads them back from the same paths.
    staging_dir = '/var/tmp/amundsen/last_updated_data'
    nodes_dir = '{}/nodes'.format(staging_dir)
    rels_dir = '{}/relationships'.format(staging_dir)

    pipeline = DefaultTask(extractor=Neo4jEsLastUpdatedExtractor(),
                           loader=FsNeo4jCSVLoader())

    job_config = ConfigFactory.from_dict({
        'extractor.neo4j_es_last_updated.model_class':
            'databuilder.models.neo4j_es_last_updated.Neo4jESLastUpdated',
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): nodes_dir,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): rels_dir,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): nodes_dir,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): rels_dir,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password,
        # should use unique tag here like {ds}
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_lastupdated_tag',
    })

    return DefaultJob(conf=job_config, task=pipeline, publisher=Neo4jCsvPublisher())
def test_publisher(self):
    # type: () -> None
    """Publish fixture CSVs through Neo4jCsvPublisher against a fully
    mocked Neo4j driver and verify the transaction call counts."""
    with patch.object(GraphDatabase, 'driver') as mock_driver:
        # Wire up driver -> session -> transaction so no real Neo4j is needed.
        mock_session = MagicMock()
        mock_driver.return_value.session.return_value = mock_session

        mock_transaction = MagicMock()
        mock_session.begin_transaction.return_value = mock_transaction

        mock_run = MagicMock()
        mock_transaction.run = mock_run
        mock_commit = MagicMock()
        mock_transaction.commit = mock_commit

        publisher = Neo4jCsvPublisher()

        conf = ConfigFactory.from_dict(
            # Endpoint is intentionally unreachable; the mocked driver
            # means it is never actually contacted.
            {neo4j_csv_publisher.NEO4J_END_POINT_KEY: 'dummy://999.999.999.999:7687/',
             neo4j_csv_publisher.NODE_FILES_DIR: '{}/nodes'.format(self._resource_path),
             neo4j_csv_publisher.RELATION_FILES_DIR: '{}/relations'.format(self._resource_path),
             neo4j_csv_publisher.NEO4J_USER: '******',
             neo4j_csv_publisher.NEO4J_PASSWORD: '******',
             # Random tag keeps repeated test runs isolated from each other.
             neo4j_csv_publisher.JOB_PUBLISH_TAG: '{}'.format(uuid.uuid4())}
        )
        publisher.init(conf)
        publisher.publish()

        self.assertEqual(mock_run.call_count, 6)

        # 2 node files, 1 relation file, and 2 more commits before index creation
        self.assertEqual(mock_commit.call_count, 5)
def run_csv_job(file_loc, job_name, model):
    """Load records from a CSV file into Neo4j.

    Args:
        file_loc: path to the input CSV file.
        job_name: used to namespace the staging directory.
        model: dotted path of the databuilder model class for each row.

    Launches the job immediately (no return value).
    """
    staging_dir = f'/var/tmp/amundsen/{job_name}'
    nodes_dir = f'{staging_dir}/nodes'
    rels_dir = f'{staging_dir}/relationships'

    pipeline = DefaultTask(extractor=CsvExtractor(),
                           loader=FsNeo4jCSVLoader(),
                           transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.csv.file_location': file_loc,
        'extractor.csv.model_class': model,
        'loader.filesystem_csv_neo4j.node_dir_path': nodes_dir,
        'loader.filesystem_csv_neo4j.relationship_dir_path': rels_dir,
        'loader.filesystem_csv_neo4j.delete_created_directories': True,
        'publisher.neo4j.node_files_directory': nodes_dir,
        'publisher.neo4j.relation_files_directory': rels_dir,
        'publisher.neo4j.neo4j_endpoint': neo4j_endpoint,
        'publisher.neo4j.neo4j_user': neo4j_user,
        'publisher.neo4j.neo4j_password': neo4j_password,
        'publisher.neo4j.neo4j_encrypted': False,
        # should use unique tag here like {ds}
        'publisher.neo4j.job_publish_tag': 'unique_tag',
    })

    DefaultJob(conf=job_config, task=pipeline, publisher=Neo4jCsvPublisher()).launch()
def create_sample_dremio_job():
    """Build a job that extracts Dremio table/column metadata and publishes
    it to Neo4j.

    Returns:
        DefaultJob: the configured (not yet launched) job.
    """
    staging_dir = '/var/tmp/amundsen/tables'
    nodes_dir = f'{staging_dir}/nodes'
    rels_dir = f'{staging_dir}/relationships'

    pipeline = DefaultTask(extractor=DremioTableColumnExtractor(),
                           loader=FsNeo4jCSVLoader())

    job_config = ConfigFactory.from_dict({
        f'extractor.dremio.{DremioTableColumnExtractor.DREMIO_USER_KEY}': DREMIO_USER,
        f'extractor.dremio.{DremioTableColumnExtractor.DREMIO_PASSWORD_KEY}': DREMIO_PASSWORD,
        f'extractor.dremio.{DremioTableColumnExtractor.DREMIO_HOST_KEY}': DREMIO_HOST,
        # Physical datasets (PDS) are excluded; only virtual datasets are catalogued.
        f'extractor.dremio.{DremioTableColumnExtractor.DREMIO_EXCLUDE_PDS_TABLES_KEY}': True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': nodes_dir,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': rels_dir,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': nodes_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': rels_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': NEO4J_ENDPOINT,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': NEO4J_USER,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': NEO4J_PASSWORD,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag'
    })

    return DefaultJob(conf=job_config, task=pipeline, publisher=Neo4jCsvPublisher())
def create_dashboard_tables_job():
    """Build a job that links dashboards to the tables they use, reading the
    mapping from a sample CSV and transforming each row into a
    DashboardTable model.

    Returns:
        DefaultJob: the configured (not yet launched) job.
    """
    # loader saves data to these folders and publisher reads it from here
    tmp_folder = '/var/tmp/amundsen/dashboard_table'
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    csv_extractor = CsvExtractor()
    csv_loader = FsNeo4jCSVLoader()

    # Transform chain: split the table_ids string into a list, then build
    # DashboardTable models from the resulting dicts.
    generic_transformer = GenericTransformer()
    dict_to_model_transformer = DictToModel()
    transformer = ChainedTransformer(
        transformers=[generic_transformer, dict_to_model_transformer],
        is_init_transformers=True)

    task = DefaultTask(extractor=csv_extractor,
                       loader=csv_loader,
                       transformer=transformer)
    publisher = Neo4jCsvPublisher()

    job_config = ConfigFactory.from_dict({
        '{}.file_location'.format(csv_extractor.get_scope()):
            'example/sample_data/sample_dashboard_table.csv',
        '{}.{}.{}'.format(transformer.get_scope(), generic_transformer.get_scope(), FIELD_NAME):
            'table_ids',
        '{}.{}.{}'.format(transformer.get_scope(), generic_transformer.get_scope(),
                          CALLBACK_FUNCTION): _str_to_list,
        '{}.{}.{}'.format(transformer.get_scope(), dict_to_model_transformer.get_scope(),
                          MODEL_CLASS):
            'databuilder.models.dashboard.dashboard_table.DashboardTable',
        '{}.node_dir_path'.format(csv_loader.get_scope()): node_files_folder,
        '{}.relationship_dir_path'.format(csv_loader.get_scope()): relationship_files_folder,
        '{}.delete_created_directories'.format(csv_loader.get_scope()): True,
        '{}.node_files_directory'.format(publisher.get_scope()): node_files_folder,
        '{}.relation_files_directory'.format(publisher.get_scope()): relationship_files_folder,
        '{}.neo4j_endpoint'.format(publisher.get_scope()): neo4j_endpoint,
        '{}.neo4j_user'.format(publisher.get_scope()): neo4j_user,
        '{}.neo4j_password'.format(publisher.get_scope()): neo4j_password,
        '{}.neo4j_encrypted'.format(publisher.get_scope()): False,
        # should use unique tag here like {ds}
        '{}.job_publish_tag'.format(publisher.get_scope()): 'unique_tag',
    })

    # BUGFIX: the original had a bare `return` followed by an orphan
    # `DefaultJob(...)` expression, so the function returned None and the
    # job was never constructed.
    return DefaultJob(conf=job_config, task=task, publisher=publisher)
def run_tableau_external_table_job():
    """Extract Tableau external-table metadata and publish it to Neo4j.

    Launches the job immediately (no return value).
    """
    task = DefaultTask(extractor=TableauDashboardExternalTableExtractor(),
                       loader=FsNeo4jCSVLoader())

    tmp_folder = '/var/tmp/amundsen/tableau_dashboard_external_table'
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    # BUGFIX: copy the shared base config. The original assigned the
    # module-level common_tableau_config dict and then .update()'d it in
    # place, leaking these job-specific keys into every other job that
    # reuses the same dict.
    dict_config = dict(common_tableau_config)
    dict_config.update({
        'extractor.tableau_external_table.api_base_url': tableau_api_base_url,
        'extractor.tableau_external_table.api_version': tableau_api_version,
        'extractor.tableau_external_table.site_name': tableau_site_name,
        'extractor.tableau_external_table.tableau_personal_access_token_name':
            tableau_personal_access_token_name,
        'extractor.tableau_external_table.tableau_personal_access_token_secret':
            tableau_personal_access_token_secret,
        'extractor.tableau_external_table.excluded_projects': tableau_excluded_projects,
        'extractor.tableau_external_table.cluster': tableau_dashboard_cluster,
        'extractor.tableau_external_table.database': tableau_dashboard_database,
        'extractor.tableau_external_table.external_cluster_name': tableau_external_table_cluster,
        'extractor.tableau_external_table.external_schema_name': tableau_external_table_schema,
        'extractor.tableau_external_table.external_table_types': tableau_external_table_types,
        'extractor.tableau_external_table.verify_request': tableau_verify_request,
        'loader.filesystem_csv_neo4j.node_dir_path': node_files_folder,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relationship_files_folder,
        'loader.filesystem_csv_neo4j.delete_created_directories': True,
        'task.progress_report_frequency': 100,
        'publisher.neo4j.node_files_directory': node_files_folder,
        'publisher.neo4j.relation_files_directory': relationship_files_folder,
    })
    job_config = ConfigFactory.from_dict(dict_config)

    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    job.launch()
def run_dbt_job(database_name, catalog_file_loc, manifest_file_loc, source_url=None):
    """Extract table/column metadata from dbt catalog and manifest artifacts
    and publish it to Neo4j.

    Args:
        database_name: database name the dbt models belong to.
        catalog_file_loc: path to the dbt catalog.json file.
        manifest_file_loc: path to the dbt manifest.json file.
        source_url: optional base URL used to link back to the dbt source.

    Launches the job immediately (no return value).
    """
    tmp_folder = '/var/tmp/amundsen/dbt_run'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    dbt_extractor = DbtExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=dbt_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    # Catalog and manifest files can be passed in as file locations or a valid python
    # dict, allowing you to retrieve the files from S3 or another source and pass it in
    with open(manifest_file_loc, 'rb') as f:
        manifest_data = json.load(f)

    job_config = ConfigFactory.from_dict({
        'extractor.dbt.database_name': database_name,
        'extractor.dbt.catalog_json': catalog_file_loc,  # File
        'extractor.dbt.manifest_json': json.dumps(manifest_data),  # JSON dumped object
        'extractor.dbt.source_url': source_url,
        'loader.filesystem_csv_neo4j.node_dir_path': node_files_folder,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relationship_files_folder,
        'loader.filesystem_csv_neo4j.delete_created_directories': True,
        'publisher.neo4j.node_files_directory': node_files_folder,
        'publisher.neo4j.relation_files_directory': relationship_files_folder,
        'publisher.neo4j.neo4j_endpoint': neo4j_endpoint,
        'publisher.neo4j.neo4j_user': neo4j_user,
        'publisher.neo4j.neo4j_password': neo4j_password,
        'publisher.neo4j.neo4j_encrypted': False,
        # should use unique tag here like {ds}
        'publisher.neo4j.job_publish_tag': 'unique_tag',
    })

    DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher()).launch()
def run_bq_last_upd_job(job_name):
    """Extract BigQuery table last-updated timestamps and publish them
    to Neo4j.

    Args:
        job_name: used to namespace the staging directory.

    Launches the job immediately (no return value).
    """
    # where_clause_suffix = " "
    gcloud_project = "bpy---pedidosya"
    # label_filter = ""

    tmp_folder = '/var/tmp/amundsen/{job_name}'.format(job_name=job_name)
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    job_config = ConfigFactory.from_dict({
        'extractor.bigquery_table_metadata.{}'.format(
            BigQueryLastUpdatedExtractor.PROJECT_ID_KEY): gcloud_project,
        'loader.filesystem_csv_neo4j.node_dir_path': node_files_folder,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relationship_files_folder,
        'loader.filesystem_csv_neo4j.delete_created_directories': True,
        'publisher.neo4j.node_files_directory': node_files_folder,
        'publisher.neo4j.relation_files_directory': relationship_files_folder,
        'publisher.neo4j.neo4j_endpoint': neo4j_endpoint,
        'publisher.neo4j.neo4j_user': neo4j_user,
        'publisher.neo4j.neo4j_password': neo4j_password,
        'publisher.neo4j.neo4j_encrypted': False,
        # should use unique tag here like {ds}
        'publisher.neo4j.job_publish_tag': 'unique_tag',
    })

    # if label_filter:
    #     job_config[
    #         'extractor.bigquery_table_metadata.{}'
    #         .format(BigQueryMetadataExtractor.FILTER_KEY)
    #     ] = label_filter

    task = DefaultTask(extractor=BigQueryLastUpdatedExtractor(),
                       loader=FsNeo4jCSVLoader(),
                       transformer=NoopTransformer())

    # BUGFIX: job_config is already a ConfigTree; the original wrapped it in
    # ConfigFactory.from_dict() a second time, which expects a plain dict.
    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    job.launch()
def create_tableau_metadata_job(*, host, neo4j, tableau, **kwargs):
    """Build a job that extracts Tableau dashboard metadata and publishes
    it to Neo4j.

    Args:
        host: dict with staging folders ("node_files_folder",
            "relationship_files_folder").
        neo4j: dict with "endpoint", "user" and "password".
        tableau: dict with Tableau connection details ("host",
            "api_version", "site_name", "token_name", "token_secret").

    Returns:
        DefaultJob: the configured (not yet launched) job.
    """
    nodes_dir = host["node_files_folder"]
    rels_dir = host["relationship_files_folder"]

    job_config = ConfigFactory.from_dict({
        'extractor.tableau_dashboard_metadata.tableau_host': tableau["host"],
        'extractor.tableau_dashboard_metadata.api_version': tableau["api_version"],
        'extractor.tableau_dashboard_metadata.site_name': tableau["site_name"],
        'extractor.tableau_dashboard_metadata.tableau_personal_access_token_name':
            tableau["token_name"],
        'extractor.tableau_dashboard_metadata.tableau_personal_access_token_secret':
            tableau["token_secret"],
        'extractor.tableau_dashboard_metadata.excluded_projects': list(),
        'extractor.tableau_dashboard_metadata.cluster': '',
        'extractor.tableau_dashboard_metadata.database': '',
        'extractor.tableau_dashboard_metadata.transformer.timestamp_str_to_epoch.timestamp_format':
            "%Y-%m-%dT%H:%M:%SZ",
        'extractor.tableau_dashboard_metadata.api_base_url': tableau["host"],
        'extractor.tableau_dashboard_metadata.tableau_base_url': '',
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': nodes_dir,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': rels_dir,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': nodes_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': rels_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j["endpoint"],
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j["user"],
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j["password"],
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag'
    })

    pipeline = DefaultTask(extractor=TableauDashboardExtractor(),
                           loader=FsNeo4jCSVLoader())
    return DefaultJob(conf=job_config, task=pipeline, publisher=Neo4jCsvPublisher())
def create_table_metadata_databuilder_job():
    """Launch a data builder job that extracts table and column metadata
    from the MySQL Hive metastore database and publishes it to Neo4j.

    Scopes to the supported schemas, and filters out views and temp tables
    whose names start with a digit. Launches the job immediately.
    """
    where_clause_suffix = textwrap.dedent(""" WHERE d.NAME IN {schemas} AND t.TBL_NAME NOT REGEXP '^[0-9]+' AND t.TBL_TYPE IN ( 'EXTERNAL_TABLE', 'MANAGED_TABLE' ) """).format(schemas=SUPPORTED_HIVE_SCHEMA_SQL_IN_CLAUSE)

    staging_dir = '/var/tmp/amundsen/table_metadata'
    nodes_dir = '{}/nodes/'.format(staging_dir)
    rels_dir = '{}/relationships/'.format(staging_dir)

    job_config = ConfigFactory.from_dict({
        'extractor.hive_table_metadata.{}'.format(
            HiveTableMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY): where_clause_suffix,
        'extractor.hive_table_metadata.extractor.sqlalchemy.{}'.format(
            SQLAlchemyExtractor.CONN_STRING): connection_string(),
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): nodes_dir,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): rels_dir,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): nodes_dir,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): rels_dir,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password,
        # Description nodes are created but never overwritten on re-publish.
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_CREATE_ONLY_NODES):
            [DESCRIPTION_NODE_LABEL],
        # TO-DO unique tag must be added
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_tag',
    })

    pipeline = DefaultTask(extractor=HiveTableMetadataExtractor(),
                           loader=FsNeo4jCSVLoader())
    DefaultJob(conf=job_config, task=pipeline, publisher=Neo4jCsvPublisher()).launch()
def create_sample_snowflake_job():
    """Build a job that extracts Snowflake table metadata and publishes it
    to Neo4j, skipping ignored/staging/history/snapshot schemas and
    warehouse-internal columns.

    Returns:
        DefaultJob: the configured (not yet launched) job.
    """
    where_clause = "WHERE c.TABLE_SCHEMA not in ({0}) \
 AND c.TABLE_SCHEMA not like 'STAGE_%' \
 AND c.TABLE_SCHEMA not like 'HIST_%' \
 AND c.TABLE_SCHEMA not like 'SNAP_%' \
 AND lower(c.COLUMN_NAME) not like 'dw_%';".format(','.join(IGNORED_SCHEMAS))

    staging_dir = '/var/tmp/amundsen/{}'.format('tables')
    nodes_dir = '{}/nodes'.format(staging_dir)
    rels_dir = '{}/relationships'.format(staging_dir)

    pipeline = DefaultTask(extractor=SnowflakeMetadataExtractor(),
                           loader=FsNeo4jCSVLoader())

    job_config = ConfigFactory.from_dict({
        'extractor.snowflake.extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING):
            SNOWFLAKE_CONN_STRING,
        'extractor.snowflake.{}'.format(SnowflakeMetadataExtractor.DATABASE_KEY):
            'YourSnowflakeDbName',
        'extractor.snowflake.{}'.format(SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY):
            where_clause,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): nodes_dir,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): rels_dir,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): True,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.FORCE_CREATE_DIR): True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): nodes_dir,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): rels_dir,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_tag'
    })

    return DefaultJob(conf=job_config, task=pipeline, publisher=Neo4jCsvPublisher())
def create_sample_job(table_name, model_name):
    """Create (but do not launch) a databuilder job that reads every row of
    ``table_name`` from the sample SQLite database and publishes the records
    to Neo4j using the model class named by ``model_name``.
    """
    sql = textwrap.dedent("""
        select * from {table_name};
    """).format(table_name=table_name)

    # Per-table staging area for the intermediate CSV files.
    staging_dir = f'/tmp/amundsen/{table_name}'
    nodes_dir = f'{staging_dir}/nodes'
    rels_dir = f'{staging_dir}/relationships'

    task = DefaultTask(extractor=SQLAlchemyExtractor(),
                       loader=FsNeo4jCSVLoader(),
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': SQLITE_CONN_STRING,
        f'extractor.sqlalchemy.{SQLAlchemyExtractor.EXTRACT_SQL}': sql,
        'extractor.sqlalchemy.model_class': model_name,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': nodes_dir,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': rels_dir,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': nodes_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': rels_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': "unique_tag",  # should use unique tag here like {ds}
    })

    return DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
def create_sample_snowflake_job():
    """Build a databuilder job that extracts Snowflake metadata and publishes
    it to Neo4j; the caller is responsible for launching the returned job.
    """
    # Filter out ignored schemas, staging/history/snapshot schemas and
    # warehouse-internal dw_* columns.
    ignored_csv = ','.join(IGNORED_SCHEMAS)
    where_clause = ("WHERE c.TABLE_SCHEMA not in ({0}) "
                    "AND c.TABLE_SCHEMA not like 'STAGE_%' "
                    "AND c.TABLE_SCHEMA not like 'HIST_%' "
                    "AND c.TABLE_SCHEMA not like 'SNAP_%' "
                    "AND lower(c.COLUMN_NAME) not like 'dw_%';").format(ignored_csv)

    base_dir = '/var/tmp/amundsen/tables'
    nodes_dir = base_dir + '/nodes'
    rels_dir = base_dir + '/relationships'

    task = DefaultTask(extractor=SnowflakeMetadataExtractor(),
                       loader=FsNeo4jCSVLoader())

    job_config = ConfigFactory.from_dict({
        'extractor.snowflake.extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): connection_string(),
        'extractor.snowflake.{}'.format(SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY): SNOWFLAKE_DATABASE_KEY,
        'extractor.snowflake.{}'.format(SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY): where_clause,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): nodes_dir,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): rels_dir,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): True,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.FORCE_CREATE_DIR): True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): nodes_dir,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): rels_dir,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_tag'
    })

    return DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
def run_bq_tu_job(job_name):
    """Extract BigQuery table-usage statistics for the hard-coded GCP project
    and publish them to Neo4j. The job is launched immediately; nothing is
    returned.

    :param job_name: used only to namespace the local staging directory
    """
    gcloud_project = "peya-data-pocs"

    staging_dir = f'/var/tmp/amundsen/{job_name}'
    nodes_dir = f'{staging_dir}/nodes'
    rels_dir = f'{staging_dir}/relationships'

    task = DefaultTask(extractor=BigQueryTableUsageExtractor(),
                       loader=FsNeo4jCSVLoader(),
                       transformer=BigqueryUsageTransformer())

    job_config = ConfigFactory.from_dict({
        f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.PROJECT_ID_KEY}': gcloud_project,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': nodes_dir,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': rels_dir,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': nodes_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': rels_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',  # should use unique tag here like {ds}
    })

    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    job.launch()
def create_snowflake_table_metadata_job():
    """Launch a databuilder job that extracts table and column metadata from
    Snowflake and publishes it to Neo4j. Runs immediately; returns nothing.
    """
    # Restrict to supported schemas and drop warehouse-internal dw_* columns.
    where_clause_suffix = textwrap.dedent("""
        WHERE c.TABLE_SCHEMA IN {schemas}
        AND lower(c.COLUMN_NAME) not like 'dw_%';
    """).format(schemas=SUPPORTED_SCHEMA_SQL_IN_CLAUSE)

    staging_root = '/var/tmp/amundsen/table_metadata'
    nodes_dir = f'{staging_root}/nodes/'
    rels_dir = f'{staging_root}/relationships/'

    job_config = ConfigFactory.from_dict({
        f'extractor.snowflake.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(),
        f'extractor.snowflake.{SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY}': SNOWFLAKE_DATABASE_KEY,
        f'extractor.snowflake.{SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause_suffix,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': nodes_dir,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': rels_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': nodes_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': rels_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'some_unique_tag'  # TO-DO unique tag must be added
    })

    job = DefaultJob(conf=job_config,
                     task=DefaultTask(extractor=SnowflakeMetadataExtractor(),
                                      loader=FsNeo4jCSVLoader()),
                     publisher=Neo4jCsvPublisher())
    job.launch()
def test_preprocessor(self):
    # type: () -> None
    """Publishing with a relation pre-processor configured should execute the
    pre-processed Cypher statements and commit the transaction exactly once.
    """
    with patch.object(GraphDatabase, 'driver') as mock_driver:
        session = MagicMock()
        mock_driver.return_value.session.return_value = session
        transaction = MagicMock()
        session.begin_transaction.return_value = transaction
        run_spy = MagicMock()
        transaction.run = run_spy
        commit_spy = MagicMock()
        transaction.commit = commit_spy

        # Pre-processor always opts in and rewrites every statement to a
        # fixed, parameterless Cypher query.
        preprocessor = MagicMock()
        preprocessor.is_perform_preprocess.return_value = MagicMock(
            return_value=True)
        preprocessor.preprocess_cypher.return_value = (
            'MATCH (f:Foo) RETURN f', {})

        publisher = Neo4jCsvPublisher()
        conf = ConfigFactory.from_dict({
            neo4j_csv_publisher.NEO4J_END_POINT_KEY: 'dummy://999.999.999.999:7687/',
            neo4j_csv_publisher.NODE_FILES_DIR: f'{self._resource_path}/nodes',
            neo4j_csv_publisher.RELATION_FILES_DIR: f'{self._resource_path}/relations',
            neo4j_csv_publisher.RELATION_PREPROCESSOR: preprocessor,
            neo4j_csv_publisher.NEO4J_USER: '******',
            neo4j_csv_publisher.NEO4J_PASSWORD: '******',
            neo4j_csv_publisher.JOB_PUBLISH_TAG: str(uuid.uuid4())
        })
        publisher.init(conf)
        publisher.publish()

        # 2 node files, 1 relation file
        self.assertEqual(run_spy.call_count, 8)
        self.assertEqual(commit_spy.call_count, 1)
def run_mssql_job():
    """Build a databuilder job that extracts MSSQL table metadata and
    publishes it to Neo4j. The configured job is returned, not launched.
    """
    # Restrict extraction to the dbo schema.
    where_clause_suffix = textwrap.dedent("""
        ('dbo')
    """)

    staging_root = '/var/tmp/amundsen/table_metadata'
    nodes_dir = f'{staging_root}/nodes/'
    rels_dir = f'{staging_root}/relationships/'

    job_config = ConfigFactory.from_dict({
        # MSSQL extractor settings
        f'extractor.mssql_metadata.{MSSQLMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause_suffix,
        f'extractor.mssql_metadata.{MSSQLMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}': True,
        f'extractor.mssql_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(),
        # Neo4j loader/publisher settings
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': nodes_dir,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': rels_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': nodes_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': rels_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',  # should use unique tag here like {ds}
    })

    return DefaultJob(conf=job_config,
                      task=DefaultTask(extractor=MSSQLMetadataExtractor(),
                                       loader=FsNeo4jCSVLoader()),
                      publisher=Neo4jCsvPublisher())
def create_sample_job(table_name, model_name):
    """Create a databuilder job that extracts all rows of ``table_name`` from
    the sample SQLite database via SQLAlchemy and publishes them to Neo4j.

    :param table_name: table to read (interpolated into the extraction SQL)
    :param model_name: fully-qualified databuilder model class for the rows
    :return: a configured, unlaunched DefaultJob
    """
    sql = textwrap.dedent("""
        select * from {table_name};
    """).format(table_name=table_name)

    # Per-table staging area for the intermediate CSV files.
    tmp_folder = '/var/tmp/amundsen/{table_name}'.format(table_name=table_name)
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    sql_extractor = SQLAlchemyExtractor()
    csv_loader = FsNeo4jCSVLoader()
    task = DefaultTask(extractor=sql_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): SQLITE_CONN_STRING,
        'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.EXTRACT_SQL): sql,
        'extractor.sqlalchemy.model_class': model_name,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password,
        # FIX: JOB_PUBLISH_TAG was missing here while every other job in this
        # file supplies one to Neo4jCsvPublisher. Should ideally be unique per
        # run (e.g. the scheduler's {ds}).
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_tag',
    })

    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    return job
def create_bq_job(gcloud_project, neo4j_endpoint, neo4j_user, neo4j_password,
                  temp_folder_path, metadata_type):
    """Build a BigQuery -> Neo4j databuilder job for one metadata type.

    :param gcloud_project: GCP project to extract from
    :param neo4j_endpoint: bolt endpoint of the target Neo4j instance
    :param neo4j_user: Neo4j username
    :param neo4j_password: Neo4j password
    :param temp_folder_path: root of the local CSV staging area
    :param metadata_type: selects the extractor/transformer pair
    :return: a configured, unlaunched DefaultJob
    """
    nodes_dir = f'{temp_folder_path}/{metadata_type}/nodes'
    rels_dir = f'{temp_folder_path}/{metadata_type}/relationships'

    # Extractor (and the config key its project id lives under) and
    # transformer both depend on the requested metadata type.
    extractor, extractor_key = create_extractor(metadata_type=metadata_type)
    transformer = create_transformer(metadata_type=metadata_type)
    task = DefaultTask(extractor,
                       loader=FsNeo4jCSVLoader(),
                       transformer=transformer)

    job_config = ConfigFactory.from_dict({
        extractor_key: gcloud_project,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': nodes_dir,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': rels_dir,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': nodes_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': rels_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',  # should use unique tag here like {ds}
    })

    return DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())