Example #1
def run_csv_job(file_loc, job_name, model):
    tmp_folder = '/var/tmp/amundsen/{job_name}'.format(job_name=job_name)
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    csv_extractor = CsvExtractor()
    loader = FSNeptuneCSVLoader()
    publisher = NeptuneCSVPublisher()

    task = DefaultTask(extractor=csv_extractor,
                       loader=loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.csv.file_location': file_loc,
        'extractor.csv.model_class': model,
        loader.get_scope(): {
            FSNeptuneCSVLoader.NODE_DIR_PATH: node_files_folder,
            FSNeptuneCSVLoader.RELATION_DIR_PATH: relationship_files_folder,
            FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
            FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag'
        },
        publisher.get_scope(): {
            NeptuneCSVPublisher.NODE_FILES_DIR: node_files_folder,
            NeptuneCSVPublisher.RELATION_FILES_DIR: relationship_files_folder,
            NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
            NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
            NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
            NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name
        },
    })

    DefaultJob(conf=job_config, task=task, publisher=publisher).launch()
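run_csv_job above takes the CSV location, a per-job scratch-folder name, and the model class each row is deserialized into. A minimal usage sketch, assuming the sample CSVs and model classes that ship with the databuilder examples and the module-level S3/Neptune settings referenced above:

if __name__ == '__main__':
    # Load table metadata and user records into Neptune via the S3 bulk loader.
    run_csv_job('example/sample_data/sample_table.csv', 'test_tables',
                'databuilder.models.table_metadata.TableMetadata')
    run_csv_job('example/sample_data/sample_user.csv', 'test_users',
                'databuilder.models.user.User')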
Example #2
def run_table_column_job(table_path, column_path):
    tmp_folder = '/var/tmp/amundsen/table_column'
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)
    extractor = CsvTableColumnExtractor()
    csv_loader = FSNeptuneCSVLoader()
    publisher = NeptuneCSVPublisher()
    task = DefaultTask(extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())
    job_config = ConfigFactory.from_dict({
        'extractor.csvtablecolumn.table_file_location':
        table_path,
        'extractor.csvtablecolumn.column_file_location':
        column_path,
        csv_loader.get_scope(): {
            FSNeptuneCSVLoader.NODE_DIR_PATH: node_files_folder,
            FSNeptuneCSVLoader.RELATION_DIR_PATH: relationship_files_folder,
            FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
            FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag'
        },
        publisher.get_scope(): {
            NeptuneCSVPublisher.NODE_FILES_DIR: node_files_folder,
            NeptuneCSVPublisher.RELATION_FILES_DIR: relationship_files_folder,
            NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
            NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
            NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
            NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name
        }
    })
    job = DefaultJob(conf=job_config, task=task, publisher=publisher)
    job.launch()
Example #3
def create_last_updated_job():
    # loader saves data to these folders and publisher reads it from here
    tmp_folder = '/var/tmp/amundsen/last_updated_data'
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    loader = FSNeptuneCSVLoader()
    task = DefaultTask(extractor=EsLastUpdatedExtractor(), loader=loader)

    publisher = NeptuneCSVPublisher()

    job_config = ConfigFactory.from_dict({
        'extractor.es_last_updated.model_class':
        'databuilder.models.es_last_updated.ESLastUpdated',
        loader.get_scope(): {
            FSNeptuneCSVLoader.NODE_DIR_PATH: node_files_folder,
            FSNeptuneCSVLoader.RELATION_DIR_PATH: relationship_files_folder,
            FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
            FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag'
        },
        publisher.get_scope(): {
            NeptuneCSVPublisher.NODE_FILES_DIR: node_files_folder,
            NeptuneCSVPublisher.RELATION_FILES_DIR: relationship_files_folder,
            NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
            NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
            NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
            NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name,
            'job_publish_tag': 'unique_lastupdated_tag'
        }
    })

    return DefaultJob(conf=job_config, task=task, publisher=publisher)
Example #4
def run_tableau_query_job():
    task = DefaultTask(extractor=TableauDashboardQueryExtractor(), loader=FsNeo4jCSVLoader())

    tmp_folder = '/var/tmp/amundsen/tableau_dashboard_query'

    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    # copy so the shared Tableau config is not mutated across jobs
    dict_config = dict(common_tableau_config)
    dict_config.update({
        'extractor.tableau_dashboard_query.api_base_url': tableau_api_base_url,
        'extractor.tableau_dashboard_query.api_version': tableau_api_version,
        'extractor.tableau_dashboard_query.site_name': tableau_site_name,
        'extractor.tableau_dashboard_query.tableau_personal_access_token_name': tableau_personal_access_token_name,
        'extractor.tableau_dashboard_query.tableau_personal_access_token_secret': tableau_personal_access_token_secret,
        'extractor.tableau_dashboard_query.excluded_projects': tableau_excluded_projects,
        'extractor.tableau_dashboard_query.cluster': tableau_dashboard_cluster,
        'extractor.tableau_dashboard_query.database': tableau_dashboard_database,
        'extractor.tableau_dashboard_query.transformer.timestamp_str_to_epoch.timestamp_format': "%Y-%m-%dT%H:%M:%SZ",
        'extractor.tableau_dashboard_query.verify_request': tableau_verify_request,
        'loader.filesystem_csv_neo4j.node_dir_path': node_files_folder,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relationship_files_folder,
        'loader.filesystem_csv_neo4j.delete_created_directories': True,
        'task.progress_report_frequency': 100,
        'publisher.neo4j.node_files_directory': node_files_folder,
        'publisher.neo4j.relation_files_directory': relationship_files_folder,
    })
    job_config = ConfigFactory.from_dict(dict_config)

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())

    job.launch()
Example #5
def run_csv_job():
    tmp_folder = '/var/tmp/amundsen/product-view'
    node_files_folder = Path(tmp_folder, 'nodes')
    relationship_files_folder = Path(tmp_folder, 'relationships')

    kafka_extractor = SchemaRegExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=kafka_extractor, loader=csv_loader)

    job_config = ConfigFactory.from_dict({
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}':
        node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}':
        relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}':
        node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}':
        relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}':
        NEO4J_ENDPOINT,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}':
        NEO4j_USERNAME,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}':
        NEO4j_PASSWORD,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}':
        str(int(datetime.utcnow().timestamp()))
    })

    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())

    return job
Example #6
def extract_catalog_job():
    tmp_folder = "/var/tmp/amundsen/table_metadata"
    node_files_folder = f"{tmp_folder}/nodes/"
    relationship_files_folder = f"{tmp_folder}/relationships/"

    job_config = ConfigFactory.from_dict({
        f"tokern.catalog.{CatalogExtractor.CATALOG_CONFIG}":
        ConfigFactory.from_dict(tokern_connection),
        f"loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}":
        node_files_folder,
        f"loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}":
        relationship_files_folder,
        f"publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}":
        node_files_folder,
        f"publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}":
        relationship_files_folder,
        f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}":
        neo4j_endpoint,
        f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}":
        neo4j_user,
        f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}":
        neo4j_password,
        f"publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}":
        "unique_tag",  # should use unique tag here like {ds}
    })
    job = DefaultJob(
        conf=job_config,
        task=DefaultTask(extractor=CatalogExtractor(),
                         loader=FsNeo4jCSVLoader()),
        publisher=Neo4jCsvPublisher(),
    )
    return job
Example #7
def create_sample_db2_job():
    where_clause = f"WHERE c.TABSCHEMA not in ({','.join(IGNORED_SCHEMAS)}) ;"

    tmp_folder = '/var/tmp/amundsen/tables'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    sql_extractor = Db2MetadataExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=sql_extractor,
                       loader=csv_loader)

    job_config = ConfigFactory.from_dict({
        f'extractor.db2_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': DB2_CONN_STRING,
        f'extractor.db2_metadata.{Db2MetadataExtractor.DATABASE_KEY}': 'DEMODB',
        f'extractor.db2_metadata.{Db2MetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag'
    })
    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())
    return job
Example #8
def create_table_extract_job():
    where_clause_suffix = f"st.schemaname in {SUPPORTED_SCHEMA_SQL_IN_CLAUSE}"

    tmp_folder = "/var/tmp/amundsen/table_metadata"
    node_files_folder = f"{tmp_folder}/nodes/"
    relationship_files_folder = f"{tmp_folder}/relationships/"

    job_config = ConfigFactory.from_dict(
        {
            f"extractor.postgres_metadata.{PostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}": where_clause_suffix,
            f"extractor.postgres_metadata.{PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}": True,
            f"extractor.postgres_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}": connection_string(),
            f"loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}": node_files_folder,
            f"loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}": relationship_files_folder,
            f"publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}": node_files_folder,
            f"publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}": relationship_files_folder,
            f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}": neo4j_endpoint,
            f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}": neo4j_user,
            f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}": neo4j_password,
            f"publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}": "unique_tag",  # should use unique tag here like {ds}
        }
    )
    job = DefaultJob(
        conf=job_config,
        task=DefaultTask(
            extractor=PostgresMetadataExtractor(), loader=FsNeo4jCSVLoader()
        ),
        publisher=Neo4jCsvPublisher(),
    )
    job.launch()
Example #9
def create_sample_dremio_job():
    tmp_folder = '/var/tmp/amundsen/tables'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    extractor = DremioTableColumnExtractor()
    loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=extractor,
                       loader=loader)

    job_config = ConfigFactory.from_dict({
        f'extractor.dremio.{DremioTableColumnExtractor.DREMIO_USER_KEY}': DREMIO_USER,
        f'extractor.dremio.{DremioTableColumnExtractor.DREMIO_PASSWORD_KEY}': DREMIO_PASSWORD,
        f'extractor.dremio.{DremioTableColumnExtractor.DREMIO_HOST_KEY}': DREMIO_HOST,
        f'extractor.dremio.{DremioTableColumnExtractor.DREMIO_EXCLUDE_PDS_TABLES_KEY}': True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': NEO4J_ENDPOINT,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': NEO4J_USER,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': NEO4J_PASSWORD,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag'
    })

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())

    return job
Example #10
def run_mysql_job():
    where_clause_suffix = textwrap.dedent("""
        where c.table_schema = 'mysql'
    """)

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = f'{tmp_folder}/nodes/'
    relationship_files_folder = f'{tmp_folder}/relationships/'

    job_config = ConfigFactory.from_dict({
        f'extractor.mysql_metadata.{MysqlMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause_suffix,
        f'extractor.mysql_metadata.{MysqlMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}': True,
        f'extractor.mysql_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(),
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',  # should use unique tag here like {ds}
    })
    job = DefaultJob(conf=job_config,
                     task=DefaultTask(extractor=MysqlMetadataExtractor(), loader=FsNeo4jCSVLoader()),
                     publisher=Neo4jCsvPublisher())
    return job
Example #11
def run_column_lineage_job(column_lineage_path):
    tmp_folder = '/var/tmp/amundsen/table_column'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'
    extractor = CsvColumnLineageExtractor()
    csv_loader = FsAtlasCSVLoader()
    task = DefaultTask(extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())
    job_config = ConfigFactory.from_dict({
        'extractor.csvcolumnlineage.column_lineage_file_location':
        column_lineage_path,
        f'loader.filesystem_csv_atlas.{FsAtlasCSVLoader.ENTITY_DIR_PATH}':
        node_files_folder,
        f'loader.filesystem_csv_atlas.{FsAtlasCSVLoader.RELATIONSHIP_DIR_PATH}':
        relationship_files_folder,
        f'loader.filesystem_csv_atlas.{FsAtlasCSVLoader.SHOULD_DELETE_CREATED_DIR}':
        True,
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.ATLAS_CLIENT}':
        AtlasClient(atlas_endpoint, (atlas_user, atlas_password)),
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.ENTITY_DIR_PATH}':
        node_files_folder,
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.RELATIONSHIP_DIR_PATH}':
        relationship_files_folder,
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.ATLAS_ENTITY_CREATE_BATCH_SIZE}':
        ATLAS_CREATE_BATCH_SIZE,
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.REGISTER_ENTITY_TYPES}':
        False
    })
    job = DefaultJob(conf=job_config, task=task, publisher=AtlasCSVPublisher())
    job.launch()
Example #12
def create_and_run_tasks_from_yaml(is_full_extraction_enabled=False,
                                   verbose=True):
    with open(CONNECTION_PATH) as f:
        raw_connection_dicts = yaml.safe_load(f)

    for raw_connection_dict in raw_connection_dicts:
        connection = dump_connection_config_in_schema(raw_connection_dict)

        if connection.type == 'presto':
            extractor, conf = configure_presto_extractor(
                connection,
                is_full_extraction_enabled=is_full_extraction_enabled)
        elif connection.type == 'neo4j':
            extractor, conf = configure_neo4j_extractor(connection)
        elif connection.type == 'bigquery':
            extractor, conf = configure_bigquery_extractor(connection)
        elif connection.type == 'build_script':
            run_build_script(connection)
            break
        else:
            break

        conf.put('loader.metaframe.database_name', connection.name)

        task = DefaultTask(
            extractor=extractor,
            transformer=MarkdownTransformer(),
            loader=MetaframeLoader(),
        )
        task.init(conf)
        task.run()
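create_and_run_tasks_from_yaml dispatches on each connection's type and uses its name for the loader; the rest of each entry is interpreted by dump_connection_config_in_schema. A hypothetical CONNECTION_PATH payload, sketched with only the fields the loop above actually reads (all names are placeholders):

import yaml

# Hypothetical connections file content; 'type' selects the extractor branch
# and 'name' becomes loader.metaframe.database_name. Any other keys depend on
# dump_connection_config_in_schema and are omitted here.
example_connections_yaml = """
- name: analytics_presto
  type: presto
- name: metadata_graph
  type: neo4j
"""
print(yaml.safe_load(example_connections_yaml))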
Example #13
def create_table_extract_job():
    where_clause_suffix = f"where table_schema in {SUPPORTED_SCHEMA_SQL_IN_CLAUSE}"

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = f'{tmp_folder}/nodes/'
    relationship_files_folder = f'{tmp_folder}/relationships/'

    job_config = ConfigFactory.from_dict({
        f'extractor.athena_metadata.{AthenaMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause_suffix,
        f'extractor.athena_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(),
        f'extractor.athena_metadata.{AthenaMetadataExtractor.CATALOG_KEY}': "'AwsDataCatalog'",
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',  # should use unique tag here like {ds}
    })
    job = DefaultJob(conf=job_config,
                     task=DefaultTask(extractor=AthenaMetadataExtractor(), loader=FsNeo4jCSVLoader(),
                                      transformer=NoopTransformer()),
                     publisher=Neo4jCsvPublisher())

    return job
Example #14
def run_table_column_job(table_path, column_path):
    tmp_folder = '/var/tmp/amundsen/table_column'
    record_files_folder = f'{tmp_folder}/records'

    extractor = CsvTableColumnExtractor()
    csv_loader = FSMySQLCSVLoader()
    task = DefaultTask(extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())
    job_config = ConfigFactory.from_dict({
        'extractor.csvtablecolumn.table_file_location':
        table_path,
        'extractor.csvtablecolumn.column_file_location':
        column_path,
        'loader.mysql_filesystem_csv.record_dir_path':
        record_files_folder,
        'loader.mysql_filesystem_csv.delete_created_directories':
        True,
        'publisher.mysql.record_files_directory':
        record_files_folder,
        'publisher.mysql.conn_string':
        mysql_conn_string,
        'publisher.mysql.job_publish_tag':
        'unique_tag'
    })
    job = DefaultJob(conf=job_config, task=task, publisher=MySQLCSVPublisher())
    job.launch()
Example #15
def run_table_column_job(table_path, column_path):
    tmp_folder = '/var/tmp/amundsen/table_column'
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)
    extractor = CsvTableColumnExtractor()
    csv_loader = FsNeo4jCSVLoader()
    task = DefaultTask(extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())
    job_config = ConfigFactory.from_dict({
        'extractor.csvtablecolumn.{}'.format(CsvTableColumnExtractor.TABLE_FILE_LOCATION): table_path,
        'extractor.csvtablecolumn.{}'.format(CsvTableColumnExtractor.COLUMN_FILE_LOCATION): column_path,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
            node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
            relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR):
            True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
            node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
            relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
            neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
            neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
            neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
            'unique_tag',  # should use unique tag here like {ds}
    })
    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())
    job.launch()
Example #16
def create_glue_extractor_job():

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = Path(tmp_folder, 'nodes')
    relationship_files_folder = Path(tmp_folder, 'relationships')

    job_config = ConfigFactory.from_dict({
        'extractor.glue.{}'.format(GlueExtractor.CLUSTER_KEY):
        GLUE_CLUSTER_KEY,
        'extractor.glue.{}'.format(GlueExtractor.FILTER_KEY): [],
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
        node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
        node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
        NEO4J_ENDPOINT,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
        NEO4j_USERNAME,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
        NEO4j_PASSWORD,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
        str(int(datetime.utcnow().timestamp()))
    })

    return DefaultJob(conf=job_config,
                      task=DefaultTask(extractor=GlueExtractor(),
                                       loader=FsNeo4jCSVLoader(),
                                       transformer=NoopTransformer()),
                      publisher=Neo4jCsvPublisher())
Example #17
def create_dashboard_neo4j_job(**kwargs):

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = '{tmp_folder}/nodes/'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships/'.format(
        tmp_folder=tmp_folder)

    job_config = ConfigFactory.from_dict({
        'extractor.generic.{}'.format(GenericExtractor.EXTRACTION_ITEMS):
        iter(input),
        'extractor.generic.{}'.format('model_class'):
        'databuilder.models.dashboard_metadata.DashboardMetadata',
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
        node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
        node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
        neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
        neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
        neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
        'unique_tag',  # should use unique tag here like {ds}
    })
    job = DefaultJob(conf=job_config,
                     task=DefaultTask(extractor=GenericExtractor(),
                                      loader=FsNeo4jCSVLoader()),
                     publisher=Neo4jCsvPublisher())
    return job
Example #18
def create_table_extract_job(**kwargs):
    where_clause_suffix = textwrap.dedent("""
        where table_schema in {schemas}
    """.format(schemas=SUPPORTED_SCHEMA_SQL_IN_CLAUSE))

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = '{tmp_folder}/nodes/'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships/'.format(tmp_folder=tmp_folder)

    job_config = ConfigFactory.from_dict({
        'extractor.postgres_metadata.{}'.format(PostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY): where_clause_suffix,
        'extractor.postgres_metadata.{}'.format(PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME): True,
        'extractor.postgres_metadata.extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING): connection_string(),
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_tag',  # should use unique tag here like {ds}
        })
    job = DefaultJob(conf=job_config,
                     task=DefaultTask(extractor=PostgresMetadataExtractor(), loader=FsNeo4jCSVLoader()),
                     publisher=Neo4jCsvPublisher())
    job.launch()
Example #19
def create_es_publisher_sample_job(elasticsearch_index_alias='table_search_index',
                                   elasticsearch_doc_type_key='table',
                                   model_name='databuilder.models.table_elasticsearch_document.TableESDocument',
                                   cypher_query=None,
                                   elasticsearch_mapping=None):
    """
    :param elasticsearch_index_alias:  alias for Elasticsearch used in
                                       amundsensearchlibrary/search_service/config.py as an index
    :param elasticsearch_doc_type_key: name the ElasticSearch index is prepended with. Defaults to `table` resulting in
                                       `table_search_index`
    :param model_name:                 the Databuilder model class used in transporting between Extractor and Loader
    :param cypher_query:               Query handed to the `Neo4jSearchDataExtractor` class, if None is given (default)
                                       it uses the `Table` query baked into the Extractor
    :param elasticsearch_mapping:      Elasticsearch field mapping "DDL" handed to the `ElasticsearchPublisher` class,
                                       if None is given (default) it uses the `Table` mapping baked into the Publisher
    """
    # loader saves data to this location and publisher reads it from here
    extracted_search_data_path = '/var/tmp/amundsen/search_data.json'

    task = DefaultTask(loader=FSElasticsearchJSONLoader(),
                       extractor=Neo4jSearchDataExtractor(),
                       transformer=NoopTransformer())

    # elastic search client instance
    elasticsearch_client = es
    # unique name of new index in Elasticsearch
    elasticsearch_new_index_key = 'tables' + str(uuid.uuid4())

    job_config = ConfigFactory.from_dict({
        'extractor.search_data.entity_type': 'table',
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.GRAPH_URL_CONFIG_KEY}': neo4j_endpoint,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.MODEL_CLASS_CONFIG_KEY}': model_name,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_USER}': neo4j_user,
        f'extractor.search_data.extractor.neo4j.{Neo4jExtractor.NEO4J_AUTH_PW}': neo4j_password,
        f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY}': extracted_search_data_path,
        f'loader.filesystem.elasticsearch.{FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY}': 'w',
        f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_PATH_CONFIG_KEY}': extracted_search_data_path,
        f'publisher.elasticsearch.{ElasticsearchPublisher.FILE_MODE_CONFIG_KEY}': 'r',
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_CLIENT_CONFIG_KEY}':
            elasticsearch_client,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_NEW_INDEX_CONFIG_KEY}':
            elasticsearch_new_index_key,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_DOC_TYPE_CONFIG_KEY}':
            elasticsearch_doc_type_key,
        f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_ALIAS_CONFIG_KEY}':
            elasticsearch_index_alias,
    })

    # only optionally add these keys, so need to dynamically `put` them
    if cypher_query:
        job_config.put(f'extractor.search_data.{Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY}',
                       cypher_query)
    if elasticsearch_mapping:
        job_config.put(f'publisher.elasticsearch.{ElasticsearchPublisher.ELASTICSEARCH_MAPPING_CONFIG_KEY}',
                       elasticsearch_mapping)

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=ElasticsearchPublisher())
    return job
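create_es_publisher_sample_job only assembles the job; nothing runs until the caller launches it. A minimal sketch using the defaults (the Table query and mapping baked into the extractor and publisher); the user and dashboard variants in the Amundsen examples follow the same pattern with a different alias, doc type, model_name and a matching cypher_query:

if __name__ == '__main__':
    # Extract table documents from Neo4j and publish them to a fresh
    # Elasticsearch index behind the 'table_search_index' alias.
    create_es_publisher_sample_job().launch()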
Example #20
def run_csv_job(file_loc, job_name, model):
    tmp_folder = f'/var/tmp/amundsen/{job_name}'
    record_files_folder = f'{tmp_folder}/records'

    csv_extractor = CsvExtractor()
    csv_loader = FSMySQLCSVLoader()

    task = DefaultTask(extractor=csv_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.csv.file_location':
        file_loc,
        'extractor.csv.model_class':
        model,
        'loader.mysql_filesystem_csv.record_dir_path':
        record_files_folder,
        'loader.mysql_filesystem_csv.delete_created_directories':
        True,
        'publisher.mysql.record_files_directory':
        record_files_folder,
        'publisher.mysql.conn_string':
        mysql_conn_string,
        'publisher.mysql.job_publish_tag':
        'unique_tag',
    })

    DefaultJob(conf=job_config, task=task,
               publisher=MySQLCSVPublisher()).launch()
Example #21
def create_last_updated_job():
    # loader saves data to these folders and publisher reads it from here
    tmp_folder = '/var/tmp/amundsen/last_updated_data'
    record_files_folder = f'{tmp_folder}/records'

    task = DefaultTask(extractor=EsLastUpdatedExtractor(),
                       loader=FSMySQLCSVLoader())

    job_config = ConfigFactory.from_dict({
        'extractor.es_last_updated.model_class':
        'databuilder.models.es_last_updated.ESLastUpdated',
        'loader.mysql_filesystem_csv.record_dir_path':
        record_files_folder,
        'loader.mysql_filesystem_csv.delete_created_directories':
        True,
        'publisher.mysql.record_files_directory':
        record_files_folder,
        'publisher.mysql.conn_string':
        mysql_conn_string,
        'publisher.mysql.job_publish_tag':
        'unique_tag'
    })

    return DefaultJob(conf=job_config,
                      task=task,
                      publisher=MySQLCSVPublisher())
Example #22
def create_sample_dremio_job():

    tmp_folder = '/var/tmp/amundsen/tables'
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)

    extractor = DremioMetadataExtractor()
    loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=extractor,
                       loader=loader)

    job_config = ConfigFactory.from_dict({
        'extractor.dremio.{}'.format(DremioMetadataExtractor.DREMIO_USER_KEY): DREMIO_USER,
        'extractor.dremio.{}'.format(DremioMetadataExtractor.DREMIO_PASSWORD_KEY): DREMIO_PASSWORD,
        'extractor.dremio.{}'.format(DremioMetadataExtractor.DREMIO_HOST_KEY): DREMIO_HOST,
        'extractor.dremio.{}'.format(DremioMetadataExtractor.DREMIO_EXCLUDE_PDS_TABLES_KEY): True,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): True,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.FORCE_CREATE_DIR): True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): NEO4J_ENDPOINT,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): NEO4J_USER,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): NEO4J_PASSWORD,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_tag'
    })

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())

    return job
Example #23
def create_table_wm_job(**kwargs):
    sql = textwrap.dedent("""
        SELECT From_unixtime(A0.create_time) as create_time,
               'hive'                        as `database`,
               C0.NAME                       as `schema`,
               B0.tbl_name as table_name,
               {func}(A0.part_name) as part_name,
               {watermark} as part_type
        FROM   PARTITIONS A0
               LEFT OUTER JOIN TBLS B0
                            ON A0.tbl_id = B0.tbl_id
               LEFT OUTER JOIN DBS C0
                            ON B0.db_id = C0.db_id
        WHERE  C0.NAME IN {schemas}
               AND B0.tbl_type IN ( 'EXTERNAL_TABLE', 'MANAGED_TABLE' )
               AND A0.PART_NAME NOT LIKE '%%__HIVE_DEFAULT_PARTITION__%%'
        GROUP  BY C0.NAME, B0.tbl_name
        ORDER by create_time desc
    """).format(func=kwargs['templates_dict'].get('agg_func'),
                watermark=kwargs['templates_dict'].get('watermark_type'),
                schemas=SUPPORTED_HIVE_SCHEMA_SQL_IN_CLAUSE)

    logging.info('SQL query: {}'.format(sql))
    tmp_folder = '/var/tmp/amundsen/table_{hwm}'.format(
        hwm=kwargs['templates_dict'].get('watermark_type').strip("\""))
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    hwm_extractor = SQLAlchemyExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=hwm_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING):
        connection_string(),
        'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.EXTRACT_SQL):
        sql,
        'extractor.sqlalchemy.model_class':
        'databuilder.models.watermark.Watermark',
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
        node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
        node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
        neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
        neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
        neo4j_password,
    })
    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    job.launch()
Example #24
def create_last_updated_job():
    # loader saves data to these folders and publisher reads it from here
    tmp_folder = '/var/tmp/amundsen/last_updated_data'
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    task = DefaultTask(extractor=Neo4jEsLastUpdatedExtractor(),
                       loader=FsNeo4jCSVLoader())

    job_config = ConfigFactory.from_dict({
        'extractor.neo4j_es_last_updated.model_class':
        'databuilder.models.neo4j_es_last_updated.Neo4jESLastUpdated',
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
        node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
        node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
        neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
        neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
        neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
        'unique_last_updated_tag',  # should use unique tag here like {ds}
    })

    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    return job
Example #25
def run_csv_job(file_loc, job_name, model):
    tmp_folder = f'/var/tmp/amundsen/{job_name}'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    csv_extractor = CsvExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=csv_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.csv.file_location': file_loc,
        'extractor.csv.model_class': model,
        'loader.filesystem_csv_neo4j.node_dir_path': node_files_folder,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relationship_files_folder,
        'loader.filesystem_csv_neo4j.delete_created_directories': True,
        'publisher.neo4j.node_files_directory': node_files_folder,
        'publisher.neo4j.relation_files_directory': relationship_files_folder,
        'publisher.neo4j.neo4j_endpoint': neo4j_endpoint,
        'publisher.neo4j.neo4j_user': neo4j_user,
        'publisher.neo4j.neo4j_password': neo4j_password,
        'publisher.neo4j.neo4j_encrypted': False,
        'publisher.neo4j.job_publish_tag': 'unique_tag',  # should use unique tag here like {ds}
    })

    DefaultJob(conf=job_config,
               task=task,
               publisher=Neo4jCsvPublisher()).launch()
Example #26
def create_last_updated_job():
    # loader saves data to these folders and publisher reads it from here
    tmp_folder = '/var/tmp/amundsen/last_updated_data'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    task = DefaultTask(extractor=EsLastUpdatedExtractor(),
                       loader=FsNeo4jCSVLoader())

    job_config = ConfigFactory.from_dict({
        'extractor.es_last_updated.model_class':
            'databuilder.models.es_last_updated.ESLastUpdated',

        'loader.filesystem_csv_neo4j.node_dir_path': node_files_folder,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relationship_files_folder,
        'publisher.neo4j.node_files_directory': node_files_folder,
        'publisher.neo4j.relation_files_directory': relationship_files_folder,
        'publisher.neo4j.neo4j_endpoint': neo4j_endpoint,
        'publisher.neo4j.neo4j_user': neo4j_user,
        'publisher.neo4j.neo4j_password': neo4j_password,
        'publisher.neo4j.neo4j_encrypted': False,
        'publisher.neo4j.job_publish_tag': 'unique_lastupdated_tag',  # should use unique tag here like {ds}
    })

    return DefaultJob(conf=job_config,
                      task=task,
                      publisher=Neo4jCsvPublisher())
Example #27
def run_table_column_job(table_path, column_path):
    tmp_folder = '/var/tmp/amundsen/table_column'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'
    extractor = CsvTableColumnExtractor()
    csv_loader = FsNeo4jCSVLoader()
    task = DefaultTask(extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())
    job_config = ConfigFactory.from_dict({
        'extractor.csvtablecolumn.table_file_location': table_path,
        'extractor.csvtablecolumn.column_file_location': column_path,
        'loader.filesystem_csv_neo4j.node_dir_path': node_files_folder,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relationship_files_folder,
        'loader.filesystem_csv_neo4j.delete_created_directories': True,
        'publisher.neo4j.node_files_directory': node_files_folder,
        'publisher.neo4j.relation_files_directory': relationship_files_folder,
        'publisher.neo4j.neo4j_endpoint': neo4j_endpoint,
        'publisher.neo4j.neo4j_user': neo4j_user,
        'publisher.neo4j.neo4j_password': neo4j_password,
        'publisher.neo4j.neo4j_encrypted': False,
        'publisher.neo4j.job_publish_tag': 'unique_tag',  # should use unique tag here like {ds}
    })
    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())
    job.launch()
Example #28
def create_snowflake_table_metadata_job():
    """
    Launches databuilder job that extracts table and column metadata from Snowflake database and publishes
    to Neo4j.
    """

    where_clause_suffix = textwrap.dedent("""
            WHERE c.TABLE_SCHEMA IN {schemas}
            AND lower(c.COLUMN_NAME) not like 'dw_%';
    """).format(schemas=SUPPORTED_SCHEMA_SQL_IN_CLAUSE)

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = f'{tmp_folder}/nodes/'
    relationship_files_folder = f'{tmp_folder}/relationships/'

    job_config = ConfigFactory.from_dict({
        f'extractor.snowflake.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(),
        f'extractor.snowflake.{SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY}': SNOWFLAKE_DATABASE_KEY,
        f'extractor.snowflake.{SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause_suffix,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'some_unique_tag'  # TO-DO unique tag must be added
    })

    job = DefaultJob(conf=job_config,
                     task=DefaultTask(extractor=SnowflakeMetadataExtractor(), loader=FsNeo4jCSVLoader()),
                     publisher=Neo4jCsvPublisher())
    job.launch()
Example #29
def create_dashboard_tables_job():
    # loader saves data to these folders and publisher reads it from here
    tmp_folder = '/var/tmp/amundsen/dashboard_table'
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    csv_extractor = CsvExtractor()
    loader = FSNeptuneCSVLoader()
    publisher = NeptuneCSVPublisher()

    generic_transformer = GenericTransformer()
    dict_to_model_transformer = DictToModel()
    transformer = ChainedTransformer(
        transformers=[generic_transformer, dict_to_model_transformer],
        is_init_transformers=True)

    task = DefaultTask(extractor=csv_extractor,
                       loader=loader,
                       transformer=transformer)

    job_config = ConfigFactory.from_dict({
        csv_extractor.get_scope(): {
            CsvExtractor.FILE_LOCATION:
            'example/sample_data/sample_dashboard_table.csv'
        },
        transformer.get_scope(): {
            generic_transformer.get_scope(): {
                FIELD_NAME: 'table_ids',
                CALLBACK_FUNCTION: _str_to_list
            },
            dict_to_model_transformer.get_scope(): {
                MODEL_CLASS:
                'databuilder.models.dashboard.dashboard_table.DashboardTable',
            }
        },
        loader.get_scope(): {
            FSNeptuneCSVLoader.NODE_DIR_PATH: node_files_folder,
            FSNeptuneCSVLoader.RELATION_DIR_PATH: relationship_files_folder,
            FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
            FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag'
        },
        publisher.get_scope(): {
            NeptuneCSVPublisher.NODE_FILES_DIR: node_files_folder,
            NeptuneCSVPublisher.RELATION_FILES_DIR: relationship_files_folder,
            NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
            NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
            NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
            NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name,
            NeptuneCSVPublisher.AWS_REGION: AWS_REGION,
            NeptuneCSVPublisher.AWS_ACCESS_KEY: aws_access_key,
            NeptuneCSVPublisher.AWS_SECRET_ACCESS_KEY: aws_access_secret,
            NeptuneCSVPublisher.AWS_SESSION_TOKEN: aws_token
        }
    })

    return DefaultJob(conf=job_config, task=task, publisher=publisher)
Example #30
def create_dashboard_tables_job():
    # loader saves data to these folders and publisher reads it from here
    tmp_folder = '/var/tmp/amundsen/dashboard_table'
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    csv_extractor = CsvExtractor()
    csv_loader = FsNeo4jCSVLoader()

    generic_transformer = GenericTransformer()
    dict_to_model_transformer = DictToModel()
    transformer = ChainedTransformer(
        transformers=[generic_transformer, dict_to_model_transformer],
        is_init_transformers=True)

    task = DefaultTask(extractor=csv_extractor,
                       loader=csv_loader,
                       transformer=transformer)
    publisher = Neo4jCsvPublisher()

    job_config = ConfigFactory.from_dict({
        '{}.file_location'.format(csv_extractor.get_scope()):
        'example/sample_data/sample_dashboard_table.csv',
        '{}.{}.{}'.format(transformer.get_scope(),
                          generic_transformer.get_scope(), FIELD_NAME):
        'table_ids',
        '{}.{}.{}'.format(transformer.get_scope(),
                          generic_transformer.get_scope(), CALLBACK_FUNCTION):
        _str_to_list,
        '{}.{}.{}'.format(transformer.get_scope(),
                          dict_to_model_transformer.get_scope(), MODEL_CLASS):
        'databuilder.models.dashboard.dashboard_table.DashboardTable',
        '{}.node_dir_path'.format(csv_loader.get_scope()):
        node_files_folder,
        '{}.relationship_dir_path'.format(csv_loader.get_scope()):
        relationship_files_folder,
        '{}.delete_created_directories'.format(csv_loader.get_scope()):
        True,
        '{}.node_files_directory'.format(publisher.get_scope()):
        node_files_folder,
        '{}.relation_files_directory'.format(publisher.get_scope()):
        relationship_files_folder,
        '{}.neo4j_endpoint'.format(publisher.get_scope()):
        neo4j_endpoint,
        '{}.neo4j_user'.format(publisher.get_scope()):
        neo4j_user,
        '{}.neo4j_password'.format(publisher.get_scope()):
        neo4j_password,
        '{}.neo4j_encrypted'.format(publisher.get_scope()):
        False,
        '{}.job_publish_tag'.format(publisher.get_scope()):
        'unique_tag',  # should use unique tag here like {ds}
    })

    return DefaultJob(conf=job_config, task=task, publisher=publisher)