def run_csv_job(file_loc, job_name, model):
    """Extract records from a CSV file and publish them to Neptune via S3."""
    tmp_folder = f'/var/tmp/amundsen/{job_name}'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    loader = FSNeptuneCSVLoader()
    publisher = NeptuneCSVPublisher()
    task = DefaultTask(extractor=CsvExtractor(),
                       loader=loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.csv.file_location': file_loc,
        'extractor.csv.model_class': model,
        loader.get_scope(): {
            FSNeptuneCSVLoader.NODE_DIR_PATH: node_files_folder,
            FSNeptuneCSVLoader.RELATION_DIR_PATH: relationship_files_folder,
            FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
            FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag',
        },
        publisher.get_scope(): {
            NeptuneCSVPublisher.NODE_FILES_DIR: node_files_folder,
            NeptuneCSVPublisher.RELATION_FILES_DIR: relationship_files_folder,
            NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
            NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
            NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
            NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name,
        },
    })
    DefaultJob(conf=job_config, task=task, publisher=publisher).launch()
def run_table_column_job(table_path, column_path):
    """Load table and column CSV metadata and publish it to Neptune."""
    tmp_folder = '/var/tmp/amundsen/table_column'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    csv_loader = FSNeptuneCSVLoader()
    publisher = NeptuneCSVPublisher()
    task = DefaultTask(CsvTableColumnExtractor(),
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.csvtablecolumn.table_file_location': table_path,
        'extractor.csvtablecolumn.column_file_location': column_path,
        csv_loader.get_scope(): {
            FSNeptuneCSVLoader.NODE_DIR_PATH: node_files_folder,
            FSNeptuneCSVLoader.RELATION_DIR_PATH: relationship_files_folder,
            FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
            FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag',
        },
        publisher.get_scope(): {
            NeptuneCSVPublisher.NODE_FILES_DIR: node_files_folder,
            NeptuneCSVPublisher.RELATION_FILES_DIR: relationship_files_folder,
            NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
            NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
            NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
            NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name,
        },
    })
    DefaultJob(conf=job_config, task=task, publisher=publisher).launch()
def create_last_updated_job():
    """Build (but do not launch) a job that publishes the ES last-updated timestamp to Neptune."""
    # The loader writes CSVs to these folders; the publisher reads them back.
    tmp_folder = '/var/tmp/amundsen/last_updated_data'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    loader = FSNeptuneCSVLoader()
    publisher = NeptuneCSVPublisher()
    task = DefaultTask(extractor=EsLastUpdatedExtractor(), loader=loader)

    job_config = ConfigFactory.from_dict({
        'extractor.es_last_updated.model_class':
            'databuilder.models.es_last_updated.ESLastUpdated',
        loader.get_scope(): {
            FSNeptuneCSVLoader.NODE_DIR_PATH: node_files_folder,
            FSNeptuneCSVLoader.RELATION_DIR_PATH: relationship_files_folder,
            FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
            FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag',
        },
        publisher.get_scope(): {
            NeptuneCSVPublisher.NODE_FILES_DIR: node_files_folder,
            NeptuneCSVPublisher.RELATION_FILES_DIR: relationship_files_folder,
            NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
            NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
            NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
            NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name,
            'job_publish_tag': 'unique_lastupdated_tag',
        },
    })
    return DefaultJob(conf=job_config, task=task, publisher=publisher)
def run_tableau_query_job():
    """Extract Tableau dashboard queries and publish them to Neo4j.

    Builds the job config by layering query-extractor settings on top of the
    shared ``common_tableau_config``, then launches the job.
    """
    task = DefaultTask(extractor=TableauDashboardQueryExtractor(),
                       loader=FsNeo4jCSVLoader())

    tmp_folder = '/var/tmp/amundsen/tableau_dashboard_query'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    # BUG FIX: previously this aliased the shared module-level dict
    # (`dict_config = common_tableau_config`) and mutated it with .update(),
    # leaking this job's extractor/loader keys into every other tableau job.
    # Work on a shallow copy instead.
    dict_config = dict(common_tableau_config)
    dict_config.update({
        'extractor.tableau_dashboard_query.api_base_url': tableau_api_base_url,
        'extractor.tableau_dashboard_query.api_version': tableau_api_version,
        'extractor.tableau_dashboard_query.site_name': tableau_site_name,
        'extractor.tableau_dashboard_query.tableau_personal_access_token_name':
            tableau_personal_access_token_name,
        'extractor.tableau_dashboard_query.tableau_personal_access_token_secret':
            tableau_personal_access_token_secret,
        'extractor.tableau_dashboard_query.excluded_projects': tableau_excluded_projects,
        'extractor.tableau_dashboard_query.cluster': tableau_dashboard_cluster,
        'extractor.tableau_dashboard_query.database': tableau_dashboard_database,
        'extractor.tableau_dashboard_query.transformer.timestamp_str_to_epoch.timestamp_format':
            "%Y-%m-%dT%H:%M:%SZ",
        'extractor.tableau_dashboard_query.verify_request': tableau_verify_request,
        'loader.filesystem_csv_neo4j.node_dir_path': node_files_folder,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relationship_files_folder,
        'loader.filesystem_csv_neo4j.delete_created_directories': True,
        'task.progress_report_frequency': 100,
        'publisher.neo4j.node_files_directory': node_files_folder,
        'publisher.neo4j.relation_files_directory': relationship_files_folder,
    })
    job_config = ConfigFactory.from_dict(dict_config)

    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    job.launch()
def run_csv_job():
    """Build (but do not launch) a job loading schema-registry data into Neo4j."""
    tmp_folder = '/var/tmp/amundsen/product-view'
    node_files_folder = Path(tmp_folder, 'nodes')
    relationship_files_folder = Path(tmp_folder, 'relationships')

    task = DefaultTask(extractor=SchemaRegExtractor(), loader=FsNeo4jCSVLoader())

    # Publish tag derived from the current UTC timestamp so reruns get fresh tags.
    publish_tag = str(int(datetime.utcnow().timestamp()))
    job_config = ConfigFactory.from_dict({
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': NEO4J_ENDPOINT,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': NEO4j_USERNAME,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': NEO4j_PASSWORD,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': publish_tag,
    })
    return DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
def extract_catalog_job():
    """Build (but do not launch) a job extracting Tokern catalog metadata into Neo4j."""
    tmp_folder = "/var/tmp/amundsen/table_metadata"
    node_files_folder = f"{tmp_folder}/nodes/"
    relationship_files_folder = f"{tmp_folder}/relationships/"

    job_config = ConfigFactory.from_dict({
        f"tokern.catalog.{CatalogExtractor.CATALOG_CONFIG}":
            ConfigFactory.from_dict(tokern_connection),
        f"loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}": node_files_folder,
        f"loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}": relationship_files_folder,
        f"publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}": node_files_folder,
        f"publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}": relationship_files_folder,
        f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}": neo4j_endpoint,
        f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}": neo4j_user,
        f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}": neo4j_password,
        # should use unique tag here like {ds}
        f"publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}": "unique_tag",
    })

    task = DefaultTask(extractor=CatalogExtractor(), loader=FsNeo4jCSVLoader())
    return DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
def create_sample_db2_job():
    """Build (but do not launch) a DB2 metadata extraction job targeting Neo4j."""
    # NOTE(review): joins IGNORED_SCHEMAS without quoting each name — assumes the
    # entries are already SQL-quoted; confirm against the constant's definition.
    where_clause = f"WHERE c.TABSCHEMA not in ({','.join(IGNORED_SCHEMAS)}) ;"

    tmp_folder = '/var/tmp/amundsen/tables'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    task = DefaultTask(extractor=Db2MetadataExtractor(), loader=FsNeo4jCSVLoader())

    job_config = ConfigFactory.from_dict({
        f'extractor.db2_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': DB2_CONN_STRING,
        f'extractor.db2_metadata.{Db2MetadataExtractor.DATABASE_KEY}': 'DEMODB',
        f'extractor.db2_metadata.{Db2MetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',
    })
    return DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
def create_table_extract_job():
    """Extract Postgres table metadata and publish it to Neo4j (launches the job)."""
    where_clause_suffix = f"st.schemaname in {SUPPORTED_SCHEMA_SQL_IN_CLAUSE}"

    tmp_folder = "/var/tmp/amundsen/table_metadata"
    node_files_folder = f"{tmp_folder}/nodes/"
    relationship_files_folder = f"{tmp_folder}/relationships/"

    job_config = ConfigFactory.from_dict({
        f"extractor.postgres_metadata.{PostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}":
            where_clause_suffix,
        f"extractor.postgres_metadata.{PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}": True,
        f"extractor.postgres_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}":
            connection_string(),
        f"loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}": node_files_folder,
        f"loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}": relationship_files_folder,
        f"publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}": node_files_folder,
        f"publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}": relationship_files_folder,
        f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}": neo4j_endpoint,
        f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}": neo4j_user,
        f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}": neo4j_password,
        # should use unique tag here like {ds}
        f"publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}": "unique_tag",
    })

    task = DefaultTask(extractor=PostgresMetadataExtractor(), loader=FsNeo4jCSVLoader())
    DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher()).launch()
def create_sample_dremio_job():
    """Build (but do not launch) a Dremio table/column extraction job targeting Neo4j."""
    tmp_folder = '/var/tmp/amundsen/tables'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    task = DefaultTask(extractor=DremioTableColumnExtractor(), loader=FsNeo4jCSVLoader())

    job_config = ConfigFactory.from_dict({
        f'extractor.dremio.{DremioTableColumnExtractor.DREMIO_USER_KEY}': DREMIO_USER,
        f'extractor.dremio.{DremioTableColumnExtractor.DREMIO_PASSWORD_KEY}': DREMIO_PASSWORD,
        f'extractor.dremio.{DremioTableColumnExtractor.DREMIO_HOST_KEY}': DREMIO_HOST,
        f'extractor.dremio.{DremioTableColumnExtractor.DREMIO_EXCLUDE_PDS_TABLES_KEY}': True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': NEO4J_ENDPOINT,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': NEO4J_USER,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': NEO4J_PASSWORD,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',
    })
    return DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
def run_mysql_job():
    """Build (but do not launch) a MySQL metadata extraction job targeting Neo4j."""
    where_clause_suffix = textwrap.dedent("""
        where c.table_schema = 'mysql'
    """)

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = f'{tmp_folder}/nodes/'
    relationship_files_folder = f'{tmp_folder}/relationships/'

    job_config = ConfigFactory.from_dict({
        f'extractor.mysql_metadata.{MysqlMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}':
            where_clause_suffix,
        f'extractor.mysql_metadata.{MysqlMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}': True,
        f'extractor.mysql_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}':
            connection_string(),
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        # should use unique tag here like {ds}
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',
    })

    task = DefaultTask(extractor=MysqlMetadataExtractor(), loader=FsNeo4jCSVLoader())
    return DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
def run_column_lineage_job(column_lineage_path):
    """Extract column lineage from CSV and publish it to Atlas (launches the job)."""
    tmp_folder = '/var/tmp/amundsen/table_column'
    entity_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    csv_loader = FsAtlasCSVLoader()
    task = DefaultTask(CsvColumnLineageExtractor(),
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.csvcolumnlineage.column_lineage_file_location': column_lineage_path,
        f'loader.filesystem_csv_atlas.{FsAtlasCSVLoader.ENTITY_DIR_PATH}': entity_files_folder,
        f'loader.filesystem_csv_atlas.{FsAtlasCSVLoader.RELATIONSHIP_DIR_PATH}': relationship_files_folder,
        f'loader.filesystem_csv_atlas.{FsAtlasCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.ATLAS_CLIENT}':
            AtlasClient(atlas_endpoint, (atlas_user, atlas_password)),
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.ENTITY_DIR_PATH}': entity_files_folder,
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.RELATIONSHIP_DIR_PATH}': relationship_files_folder,
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.ATLAS_ENTITY_CREATE_BATCH_SIZE}':
            ATLAS_CREATE_BATCH_SIZE,
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.REGISTER_ENTITY_TYPES}': False,
    })
    DefaultJob(conf=job_config, task=task, publisher=AtlasCSVPublisher()).launch()
def create_and_run_tasks_from_yaml(is_full_extraction_enabled=False, verbose=True):
    """Read connection configs from YAML and run a metaframe extraction task per connection."""
    with open(CONNECTION_PATH) as f:
        raw_connection_dicts = yaml.safe_load(f)

    for raw_connection_dict in raw_connection_dicts:
        connection = dump_connection_config_in_schema(raw_connection_dict)

        if connection.type == 'presto':
            extractor, conf = configure_presto_extractor(
                connection,
                is_full_extraction_enabled=is_full_extraction_enabled)
        elif connection.type == 'neo4j':
            extractor, conf = configure_neo4j_extractor(connection)
        elif connection.type == 'bigquery':
            extractor, conf = configure_bigquery_extractor(connection)
        elif connection.type == 'build_script':
            run_build_script(connection)
            # NOTE(review): `break` halts processing of any remaining
            # connections in the YAML list; if later entries should still be
            # processed this likely wants `continue` — confirm intent.
            break
        else:
            break

        conf.put('loader.metaframe.database_name', connection.name)
        task = DefaultTask(
            extractor=extractor,
            transformer=MarkdownTransformer(),
            loader=MetaframeLoader(),
        )
        task.init(conf)
        task.run()
def create_table_extract_job():
    """Build (but do not launch) an Athena metadata extraction job targeting Neo4j."""
    where_clause_suffix = f"where table_schema in {SUPPORTED_SCHEMA_SQL_IN_CLAUSE}"

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = f'{tmp_folder}/nodes/'
    relationship_files_folder = f'{tmp_folder}/relationships/'

    job_config = ConfigFactory.from_dict({
        f'extractor.athena_metadata.{AthenaMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}':
            where_clause_suffix,
        f'extractor.athena_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}':
            connection_string(),
        f'extractor.athena_metadata.{AthenaMetadataExtractor.CATALOG_KEY}': "'AwsDataCatalog'",
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        # should use unique tag here like {ds}
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',
    })

    task = DefaultTask(extractor=AthenaMetadataExtractor(),
                       loader=FsNeo4jCSVLoader(),
                       transformer=NoopTransformer())
    return DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
def run_table_column_job(table_path, column_path):
    """Load table and column CSV metadata and publish it to MySQL (launches the job)."""
    tmp_folder = '/var/tmp/amundsen/table_column'
    record_files_folder = f'{tmp_folder}/records'

    task = DefaultTask(CsvTableColumnExtractor(),
                       loader=FSMySQLCSVLoader(),
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.csvtablecolumn.table_file_location': table_path,
        'extractor.csvtablecolumn.column_file_location': column_path,
        'loader.mysql_filesystem_csv.record_dir_path': record_files_folder,
        'loader.mysql_filesystem_csv.delete_created_directories': True,
        'publisher.mysql.record_files_directory': record_files_folder,
        'publisher.mysql.conn_string': mysql_conn_string,
        'publisher.mysql.job_publish_tag': 'unique_tag',
    })
    DefaultJob(conf=job_config, task=task, publisher=MySQLCSVPublisher()).launch()
def run_table_column_job(table_path, column_path):
    """Load table and column CSV metadata and publish it to Neo4j (launches the job)."""
    tmp_folder = '/var/tmp/amundsen/table_column'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    csv_loader = FsNeo4jCSVLoader()
    task = DefaultTask(CsvTableColumnExtractor(),
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        f'extractor.csvtablecolumn.{CsvTableColumnExtractor.TABLE_FILE_LOCATION}': table_path,
        f'extractor.csvtablecolumn.{CsvTableColumnExtractor.COLUMN_FILE_LOCATION}': column_path,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        # should use unique tag here like {ds}
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',
    })
    DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher()).launch()
def create_glue_extractor_job():
    """Build (but do not launch) a Glue catalog extraction job targeting Neo4j."""
    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = Path(tmp_folder, 'nodes')
    relationship_files_folder = Path(tmp_folder, 'relationships')

    # Publish tag derived from the current UTC timestamp so reruns get fresh tags.
    publish_tag = str(int(datetime.utcnow().timestamp()))
    job_config = ConfigFactory.from_dict({
        f'extractor.glue.{GlueExtractor.CLUSTER_KEY}': GLUE_CLUSTER_KEY,
        f'extractor.glue.{GlueExtractor.FILTER_KEY}': [],
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': NEO4J_ENDPOINT,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': NEO4j_USERNAME,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': NEO4j_PASSWORD,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': publish_tag,
    })

    task = DefaultTask(extractor=GlueExtractor(),
                       loader=FsNeo4jCSVLoader(),
                       transformer=NoopTransformer())
    return DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
def create_dashboard_neo4j_job(**kwargs):
    """Build (but do not launch) a job publishing dashboard metadata to Neo4j."""
    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = f'{tmp_folder}/nodes/'
    relationship_files_folder = f'{tmp_folder}/relationships/'

    job_config = ConfigFactory.from_dict({
        # NOTE(review): `input` here resolves to a module-level name if one is
        # defined elsewhere in the file; otherwise it is the *builtin* input
        # function, and iter() on it raises TypeError. Presumably the intent is
        # an iterable of dashboard records — confirm where `input` comes from.
        'extractor.generic.{}'.format(GenericExtractor.EXTRACTION_ITEMS): iter(input),
        'extractor.generic.{}'.format('model_class'):
            'databuilder.models.dashboard_metadata.DashboardMetadata',
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        # should use unique tag here like {ds}
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',
    })

    task = DefaultTask(extractor=GenericExtractor(), loader=FsNeo4jCSVLoader())
    return DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
def create_table_extract_job(**kwargs):
    """Extract Postgres table metadata and publish it to Neo4j (launches the job)."""
    where_clause_suffix = textwrap.dedent("""
        where table_schema in {schemas}
    """.format(schemas=SUPPORTED_SCHEMA_SQL_IN_CLAUSE))

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = f'{tmp_folder}/nodes/'
    relationship_files_folder = f'{tmp_folder}/relationships/'

    job_config = ConfigFactory.from_dict({
        f'extractor.postgres_metadata.{PostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}':
            where_clause_suffix,
        f'extractor.postgres_metadata.{PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}': True,
        f'extractor.postgres_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}':
            connection_string(),
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        # should use unique tag here like {ds}
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',
    })

    task = DefaultTask(extractor=PostgresMetadataExtractor(), loader=FsNeo4jCSVLoader())
    DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher()).launch()
def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index',
                                   elasticsearch_doc_type_key='table',
                                   model_name='databuilder.models.table_elasticsearch_document.TableESDocument',
                                   cypher_query=None,
                                   elasticsearch_mapping=None):
    """Build (but do not launch) a job that indexes Neo4j search data into Elasticsearch.

    :param elasticsearch_index_alias: alias for Elasticsearch used in
        amundsensearchlibrary/search_service/config.py as an index
    :param elasticsearch_doc_type_key: name the ElasticSearch index is prepended with.
        Defaults to `table` resulting in `table_search_index`
    :param model_name: the Databuilder model class used in transporting between Extractor and Loader
    :param cypher_query: Query handed to the `Neo4jSearchDataExtractor` class,
        if None is given (default) it uses the `Table` query baked into the Extractor
    :param elasticsearch_mapping: Elasticsearch field mapping "DDL" handed to the
        `ElasticsearchPublisher` class, if None is given (default) it uses the
        `Table` query baked into the Publisher
    """
    # The loader writes the extracted documents here; the publisher reads them back.
    extracted_search_data_path = '/var/tmp/amundsen/search_data.json'

    task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                       extractor=Neo4jSearchDataExtractor(),
                       transformer=NoopTransformer())

    # Elasticsearch client instance (module-level `es`).
    elasticsearch_client = es
    # Unique name for the new index in Elasticsearch.
    elasticsearch_new_index_key = 'tables' + str(uuid.uuid4())

    job_config = ConfigFactory.from_dict({
        'extractor.search_data.entity_type': 'table',
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': neo4j_endpoint,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}': model_name,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': neo4j_user,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': neo4j_password,
        f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}':
            extracted_search_data_path,
        f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w',
        f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}':
            extracted_search_data_path,
        f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r',
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}':
            elasticsearch_client,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}':
            elasticsearch_new_index_key,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}':
            elasticsearch_doc_type_key,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}':
            elasticsearch_index_alias,
    })

    # These keys are optional, so only `put` them when values were supplied.
    if cypher_query:
        job_config.put(f'extractor.search_data.{Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY}',
                       cypher_query)
    if elasticsearch_mapping:
        job_config.put(f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}',
                       elasticsearch_mapping)

    return DefaultJob(conf=job_config, task=task, publisher=ElasticsearchPublisher())
def run_csv_job(file_loc, job_name, model):
    """Extract records from a CSV file and publish them to MySQL (launches the job)."""
    tmp_folder = f'/var/tmp/amundsen/{job_name}'
    record_files_folder = f'{tmp_folder}/records'

    task = DefaultTask(extractor=CsvExtractor(),
                       loader=FSMySQLCSVLoader(),
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.csv.file_location': file_loc,
        'extractor.csv.model_class': model,
        'loader.mysql_filesystem_csv.record_dir_path': record_files_folder,
        'loader.mysql_filesystem_csv.delete_created_directories': True,
        'publisher.mysql.record_files_directory': record_files_folder,
        'publisher.mysql.conn_string': mysql_conn_string,
        'publisher.mysql.job_publish_tag': 'unique_tag',
    })
    DefaultJob(conf=job_config, task=task, publisher=MySQLCSVPublisher()).launch()
def create_last_updated_job():
    """Build (but do not launch) a job publishing the ES last-updated timestamp to MySQL."""
    # The loader writes records here; the publisher reads them back.
    tmp_folder = '/var/tmp/amundsen/last_updated_data'
    record_files_folder = f'{tmp_folder}/records'

    task = DefaultTask(extractor=EsLastUpdatedExtractor(), loader=FSMySQLCSVLoader())

    job_config = ConfigFactory.from_dict({
        'extractor.es_last_updated.model_class':
            'databuilder.models.es_last_updated.ESLastUpdated',
        'loader.mysql_filesystem_csv.record_dir_path': record_files_folder,
        'loader.mysql_filesystem_csv.delete_created_directories': True,
        'publisher.mysql.record_files_directory': record_files_folder,
        'publisher.mysql.conn_string': mysql_conn_string,
        'publisher.mysql.job_publish_tag': 'unique_tag',
    })
    return DefaultJob(conf=job_config, task=task, publisher=MySQLCSVPublisher())
def create_sample_dremio_job():
    """Build (but do not launch) a Dremio metadata extraction job targeting Neo4j."""
    tmp_folder = '/var/tmp/amundsen/tables'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    task = DefaultTask(extractor=DremioMetadataExtractor(), loader=FsNeo4jCSVLoader())

    job_config = ConfigFactory.from_dict({
        f'extractor.dremio.{DremioMetadataExtractor.DREMIO_USER_KEY}': DREMIO_USER,
        f'extractor.dremio.{DremioMetadataExtractor.DREMIO_PASSWORD_KEY}': DREMIO_PASSWORD,
        f'extractor.dremio.{DremioMetadataExtractor.DREMIO_HOST_KEY}': DREMIO_HOST,
        f'extractor.dremio.{DremioMetadataExtractor.DREMIO_EXCLUDE_PDS_TABLES_KEY}': True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': NEO4J_ENDPOINT,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': NEO4J_USER,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': NEO4J_PASSWORD,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',
    })
    return DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
def create_table_wm_job(**kwargs):
    """Build and launch a job that loads Hive table watermarks into Neo4j.

    Expects ``kwargs['templates_dict']`` to supply:

    * ``agg_func``       -- SQL aggregate applied to partition names
                            (presumably ``min``/``max`` -- confirm with caller)
    * ``watermark_type`` -- quoted literal emitted as the ``part_type`` column
    """
    template_args = kwargs['templates_dict']
    agg_func = template_args.get('agg_func')
    watermark_type = template_args.get('watermark_type')

    # Query the Hive metastore tables (PARTITIONS/TBLS/DBS) directly.
    sql = textwrap.dedent("""
    SELECT From_unixtime(A0.create_time) as create_time,
    'hive' as `database`,
    C0.NAME as `schema`,
    B0.tbl_name as table_name,
    {func}(A0.part_name) as part_name,
    {watermark} as part_type
    FROM PARTITIONS A0
    LEFT OUTER JOIN TBLS B0 ON A0.tbl_id = B0.tbl_id
    LEFT OUTER JOIN DBS C0 ON B0.db_id = C0.db_id
    WHERE C0.NAME IN {schemas}
    AND B0.tbl_type IN ( 'EXTERNAL_TABLE', 'MANAGED_TABLE' )
    AND A0.PART_NAME NOT LIKE '%%__HIVE_DEFAULT_PARTITION__%%'
    GROUP BY C0.NAME, B0.tbl_name
    ORDER by create_time desc
    """).format(func=agg_func,
                watermark=watermark_type,
                schemas=SUPPORTED_HIVE_SCHEMA_SQL_IN_CLAUSE)
    logging.info('SQL query: {}'.format(sql))

    # Scratch folders shared by the CSV loader (writer) and publisher (reader);
    # the watermark type is quoted, so strip the quotes for the folder name.
    staging_dir = '/var/tmp/amundsen/table_{hwm}'.format(hwm=watermark_type.strip('"'))
    nodes_dir = f'{staging_dir}/nodes'
    relations_dir = f'{staging_dir}/relationships'

    hwm_extractor = SQLAlchemyExtractor()
    csv_loader = FsNeo4jCSVLoader()
    task = DefaultTask(extractor=hwm_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        f'extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(),
        f'extractor.sqlalchemy.{SQLAlchemyExtractor.EXTRACT_SQL}': sql,
        'extractor.sqlalchemy.model_class': 'databuilder.models.watermark.Watermark',
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': nodes_dir,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relations_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': nodes_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relations_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        # NOTE(review): unlike the sibling jobs, no JOB_PUBLISH_TAG is set
        # here -- confirm the publisher version in use tolerates a missing tag.
    })

    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    job.launch()
def create_last_updated_job():
    """Build (without launching) the job that stores the last-updated timestamp in Neo4j."""
    # Scratch area: the loader writes node/relationship CSVs here and the
    # publisher reads them back from the same paths.
    staging_dir = '/var/tmp/amundsen/last_updated_data'
    nodes_dir = f'{staging_dir}/nodes'
    relations_dir = f'{staging_dir}/relationships'

    task = DefaultTask(extractor=Neo4jEsLastUpdatedExtractor(),
                       loader=FsNeo4jCSVLoader())

    job_config = ConfigFactory.from_dict({
        'extractor.neo4j_es_last_updated.model_class':
            'databuilder.models.neo4j_es_last_updated.Neo4jESLastUpdated',
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': nodes_dir,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relations_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': nodes_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relations_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        # should use unique tag here like {ds}
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_last_updated_tag',
    })

    return DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
def run_csv_job(file_loc, job_name, model):
    """Extract rows from a CSV file and publish them to Neo4j.

    :param file_loc: path of the source CSV file
    :param job_name: name used to namespace the scratch directory
    :param model: fully qualified model class the rows are parsed into
    """
    staging_dir = f'/var/tmp/amundsen/{job_name}'
    nodes_dir = f'{staging_dir}/nodes'
    relations_dir = f'{staging_dir}/relationships'

    task = DefaultTask(extractor=CsvExtractor(),
                       loader=FsNeo4jCSVLoader(),
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.csv.file_location': file_loc,
        'extractor.csv.model_class': model,
        'loader.filesystem_csv_neo4j.node_dir_path': nodes_dir,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relations_dir,
        'loader.filesystem_csv_neo4j.delete_created_directories': True,
        'publisher.neo4j.node_files_directory': nodes_dir,
        'publisher.neo4j.relation_files_directory': relations_dir,
        'publisher.neo4j.neo4j_endpoint': neo4j_endpoint,
        'publisher.neo4j.neo4j_user': neo4j_user,
        'publisher.neo4j.neo4j_password': neo4j_password,
        'publisher.neo4j.neo4j_encrypted': False,
        # should use unique tag here like {ds}
        'publisher.neo4j.job_publish_tag': 'unique_tag',
    })

    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    job.launch()
def create_last_updated_job():
    """Build (without launching) the job that stores the last-updated timestamp in Neo4j."""
    # Scratch area shared between the loader (writer) and publisher (reader).
    staging_dir = '/var/tmp/amundsen/last_updated_data'
    nodes_dir = f'{staging_dir}/nodes'
    relations_dir = f'{staging_dir}/relationships'

    task = DefaultTask(extractor=EsLastUpdatedExtractor(),
                       loader=FsNeo4jCSVLoader())

    job_config = ConfigFactory.from_dict({
        'extractor.es_last_updated.model_class':
            'databuilder.models.es_last_updated.ESLastUpdated',
        'loader.filesystem_csv_neo4j.node_dir_path': nodes_dir,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relations_dir,
        'publisher.neo4j.node_files_directory': nodes_dir,
        'publisher.neo4j.relation_files_directory': relations_dir,
        'publisher.neo4j.neo4j_endpoint': neo4j_endpoint,
        'publisher.neo4j.neo4j_user': neo4j_user,
        'publisher.neo4j.neo4j_password': neo4j_password,
        'publisher.neo4j.neo4j_encrypted': False,
        # should use unique tag here like {ds}
        'publisher.neo4j.job_publish_tag': 'unique_lastupdated_tag',
    })

    return DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
def run_table_column_job(table_path, column_path):
    """Load table and column metadata from two CSV files and publish to Neo4j.

    :param table_path: path of the CSV file holding table records
    :param column_path: path of the CSV file holding column records
    """
    staging_dir = '/var/tmp/amundsen/table_column'
    nodes_dir = f'{staging_dir}/nodes'
    relations_dir = f'{staging_dir}/relationships'

    task = DefaultTask(CsvTableColumnExtractor(),
                       loader=FsNeo4jCSVLoader(),
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.csvtablecolumn.table_file_location': table_path,
        'extractor.csvtablecolumn.column_file_location': column_path,
        'loader.filesystem_csv_neo4j.node_dir_path': nodes_dir,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relations_dir,
        'loader.filesystem_csv_neo4j.delete_created_directories': True,
        'publisher.neo4j.node_files_directory': nodes_dir,
        'publisher.neo4j.relation_files_directory': relations_dir,
        'publisher.neo4j.neo4j_endpoint': neo4j_endpoint,
        'publisher.neo4j.neo4j_user': neo4j_user,
        'publisher.neo4j.neo4j_password': neo4j_password,
        'publisher.neo4j.neo4j_encrypted': False,
        # should use unique tag here like {ds}
        'publisher.neo4j.job_publish_tag': 'unique_tag',
    })

    DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher()).launch()
def create_snowflake_table_metadata_job():
    """
    Launches databuilder job that extracts table and column metadata from
    Snowflake database and publishes to Neo4j.
    """
    where_clause_suffix = textwrap.dedent("""
        WHERE c.TABLE_SCHEMA IN {schemas}
        AND lower(c.COLUMN_NAME) not like 'dw_%';
    """).format(schemas=SUPPORTED_SCHEMA_SQL_IN_CLAUSE)

    # Scratch folders shared by the CSV loader (writer) and publisher (reader).
    staging_dir = '/var/tmp/amundsen/table_metadata'
    nodes_dir = f'{staging_dir}/nodes/'
    relations_dir = f'{staging_dir}/relationships/'

    job_config = ConfigFactory.from_dict({
        f'extractor.snowflake.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(),
        # NOTE(review): the value is the module-level SNOWFLAKE_DATABASE_KEY
        # constant -- confirm it holds the database *name*, not a config key.
        f'extractor.snowflake.{SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY}': SNOWFLAKE_DATABASE_KEY,
        f'extractor.snowflake.{SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause_suffix,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': nodes_dir,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relations_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': nodes_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relations_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'some_unique_tag'  # TO-DO unique tag must be added
    })

    task = DefaultTask(extractor=SnowflakeMetadataExtractor(),
                       loader=FsNeo4jCSVLoader())
    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    job.launch()
def create_dashboard_tables_job():
    """Build (without launching) a Neptune job linking dashboards to tables.

    The raw CSV rows pass through a two-stage transformer chain: the generic
    transformer turns the 'table_ids' field into a list, then DictToModel
    converts each row dict into a DashboardTable model.
    """
    # Scratch area: the loader writes node/relationship CSVs here and the
    # Neptune publisher reads them back from the same paths.
    staging_dir = '/var/tmp/amundsen/dashboard_table'
    nodes_dir = f'{staging_dir}/nodes'
    relations_dir = f'{staging_dir}/relationships'

    extractor = CsvExtractor()
    loader = FSNeptuneCSVLoader()
    publisher = NeptuneCSVPublisher()

    field_transformer = GenericTransformer()
    model_transformer = DictToModel()
    transformer = ChainedTransformer(
        transformers=[field_transformer, model_transformer],
        is_init_transformers=True)

    task = DefaultTask(extractor=extractor, loader=loader, transformer=transformer)

    loader_conf = {
        FSNeptuneCSVLoader.NODE_DIR_PATH: nodes_dir,
        FSNeptuneCSVLoader.RELATION_DIR_PATH: relations_dir,
        FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
        FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag',
    }
    publisher_conf = {
        NeptuneCSVPublisher.NODE_FILES_DIR: nodes_dir,
        NeptuneCSVPublisher.RELATION_FILES_DIR: relations_dir,
        NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
        NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
        NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
        NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name,
        NeptuneCSVPublisher.AWS_REGION: AWS_REGION,
        NeptuneCSVPublisher.AWS_ACCESS_KEY: aws_access_key,
        NeptuneCSVPublisher.AWS_SECRET_ACCESS_KEY: aws_access_secret,
        NeptuneCSVPublisher.AWS_SESSION_TOKEN: aws_token,
    }

    job_config = ConfigFactory.from_dict({
        extractor.get_scope(): {
            CsvExtractor.FILE_LOCATION: 'example/sample_data/sample_dashboard_table.csv',
        },
        transformer.get_scope(): {
            field_transformer.get_scope(): {
                FIELD_NAME: 'table_ids',
                CALLBACK_FUNCTION: _str_to_list,
            },
            model_transformer.get_scope(): {
                MODEL_CLASS: 'databuilder.models.dashboard.dashboard_table.DashboardTable',
            },
        },
        loader.get_scope(): loader_conf,
        publisher.get_scope(): publisher_conf,
    })

    return DefaultJob(conf=job_config, task=task, publisher=publisher)
def create_dashboard_tables_job():
    """Build (without launching) a Neo4j job linking dashboards to tables.

    The raw CSV rows pass through a two-stage transformer chain: the generic
    transformer converts the 'table_ids' field to a list via _str_to_list,
    then DictToModel turns each row dict into a DashboardTable model.

    :return: the configured DefaultJob; the caller launches it.
    """
    # loader saves data to these folders and publisher reads it from here
    tmp_folder = '/var/tmp/amundsen/dashboard_table'
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    csv_extractor = CsvExtractor()
    csv_loader = FsNeo4jCSVLoader()
    generic_transformer = GenericTransformer()
    dict_to_model_transformer = DictToModel()
    transformer = ChainedTransformer(
        transformers=[generic_transformer, dict_to_model_transformer],
        is_init_transformers=True)
    task = DefaultTask(extractor=csv_extractor,
                       loader=csv_loader,
                       transformer=transformer)
    publisher = Neo4jCsvPublisher()

    job_config = ConfigFactory.from_dict({
        '{}.file_location'.format(csv_extractor.get_scope()):
            'example/sample_data/sample_dashboard_table.csv',
        '{}.{}.{}'.format(transformer.get_scope(),
                          generic_transformer.get_scope(),
                          FIELD_NAME): 'table_ids',
        '{}.{}.{}'.format(transformer.get_scope(),
                          generic_transformer.get_scope(),
                          CALLBACK_FUNCTION): _str_to_list,
        '{}.{}.{}'.format(transformer.get_scope(),
                          dict_to_model_transformer.get_scope(),
                          MODEL_CLASS):
            'databuilder.models.dashboard.dashboard_table.DashboardTable',
        '{}.node_dir_path'.format(csv_loader.get_scope()): node_files_folder,
        '{}.relationship_dir_path'.format(csv_loader.get_scope()): relationship_files_folder,
        '{}.delete_created_directories'.format(csv_loader.get_scope()): True,
        '{}.node_files_directory'.format(publisher.get_scope()): node_files_folder,
        '{}.relation_files_directory'.format(publisher.get_scope()): relationship_files_folder,
        '{}.neo4j_endpoint'.format(publisher.get_scope()): neo4j_endpoint,
        '{}.neo4j_user'.format(publisher.get_scope()): neo4j_user,
        '{}.neo4j_password'.format(publisher.get_scope()): neo4j_password,
        '{}.neo4j_encrypted'.format(publisher.get_scope()): False,
        '{}.job_publish_tag'.format(publisher.get_scope()): 'unique_tag',  # should use unique tag here like {ds}
    })

    # BUG FIX: previously a bare `return` ended the function (returning None)
    # and the DefaultJob expression sat at module level, constructing and
    # discarding the job at import time. The job must be returned here.
    return DefaultJob(conf=job_config, task=task, publisher=publisher)