def run_hive_metastore_job():
    where_clause_suffix = textwrap.dedent("""
    """)

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = f'{tmp_folder}/nodes/'
    relationship_files_folder = f'{tmp_folder}/relationships/'

    job_config = ConfigFactory.from_dict({
        f'extractor.hive_table_metadata.{HiveTableMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}':
        where_clause_suffix,
        f'extractor.hive_table_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}':
        connection_string(),
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}':
        node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}':
        relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}':
        node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}':
        relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}':
        neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}':
        neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}':
        neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}':
        'unique_tag',  # should use unique tag here like {ds}
    })
    job = DefaultJob(conf=job_config,
                     task=DefaultTask(extractor=HiveTableMetadataExtractor(),
                                      loader=FsNeo4jCSVLoader()),
                     publisher=Neo4jCsvPublisher())
    return job
예제 #2
0
def extract_catalog_job():
    tmp_folder = "/var/tmp/amundsen/table_metadata"
    node_files_folder = f"{tmp_folder}/nodes/"
    relationship_files_folder = f"{tmp_folder}/relationships/"

    job_config = ConfigFactory.from_dict({
        f"tokern.catalog.{CatalogExtractor.CATALOG_CONFIG}":
        ConfigFactory.from_dict(tokern_connection),
        f"loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}":
        node_files_folder,
        f"loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}":
        relationship_files_folder,
        f"publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}":
        node_files_folder,
        f"publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}":
        relationship_files_folder,
        f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}":
        neo4j_endpoint,
        f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}":
        neo4j_user,
        f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}":
        neo4j_password,
        f"publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}":
        "unique_tag",  # should use unique tag here like {ds}
    })
    job = DefaultJob(
        conf=job_config,
        task=DefaultTask(extractor=CatalogExtractor(),
                         loader=FsNeo4jCSVLoader()),
        publisher=Neo4jCsvPublisher(),
    )
    return job
def run_tableau_query_job():
    task = DefaultTask(extractor=TableauDashboardQueryExtractor(), loader=FsNeo4jCSVLoader())

    tmp_folder = '/var/tmp/amundsen/tableau_dashboard_query'

    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    dict_config = common_tableau_config
    dict_config.update({
        'extractor.tableau_dashboard_query.api_base_url': tableau_api_base_url,
        'extractor.tableau_dashboard_query.api_version': tableau_api_version,
        'extractor.tableau_dashboard_query.site_name': tableau_site_name,
        'extractor.tableau_dashboard_query.tableau_personal_access_token_name': tableau_personal_access_token_name,
        'extractor.tableau_dashboard_query.tableau_personal_access_token_secret': tableau_personal_access_token_secret,
        'extractor.tableau_dashboard_query.excluded_projects': tableau_excluded_projects,
        'extractor.tableau_dashboard_query.cluster': tableau_dashboard_cluster,
        'extractor.tableau_dashboard_query.database': tableau_dashboard_database,
        'extractor.tableau_dashboard_query.transformer.timestamp_str_to_epoch.timestamp_format': "%Y-%m-%dT%H:%M:%SZ",
        'extractor.tableau_dashboard_query.verify_request': tableau_verify_request,
        'loader.filesystem_csv_neo4j.node_dir_path': node_files_folder,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relationship_files_folder,
        'loader.filesystem_csv_neo4j.delete_created_directories': True,
        'task.progress_report_frequency': 100,
        'publisher.neo4j.node_files_directory': node_files_folder,
        'publisher.neo4j.relation_files_directory': relationship_files_folder,
    })
    job_config = ConfigFactory.from_dict(dict_config)

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())

    job.launch()
예제 #4
0
def create_table_extract_job():
    where_clause_suffix = f"st.schemaname in {SUPPORTED_SCHEMA_SQL_IN_CLAUSE}"

    tmp_folder = "/var/tmp/amundsen/table_metadata"
    node_files_folder = f"{tmp_folder}/nodes/"
    relationship_files_folder = f"{tmp_folder}/relationships/"

    job_config = ConfigFactory.from_dict(
        {
            f"extractor.postgres_metadata.{PostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}": where_clause_suffix,
            f"extractor.postgres_metadata.{PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}": True,
            f"extractor.postgres_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}": connection_string(),
            f"loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}": node_files_folder,
            f"loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}": relationship_files_folder,
            f"publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}": node_files_folder,
            f"publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}": relationship_files_folder,
            f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}": neo4j_endpoint,
            f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}": neo4j_user,
            f"publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}": neo4j_password,
            f"publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}": "unique_tag",  # should use unique tag here like {ds}
        }
    )
    job = DefaultJob(
        conf=job_config,
        task=DefaultTask(
            extractor=PostgresMetadataExtractor(), loader=FsNeo4jCSVLoader()
        ),
        publisher=Neo4jCsvPublisher(),
    )
    job.launch()
예제 #5
0
def create_sample_db2_job():
    where_clause = f"WHERE c.TABSCHEMA not in ({','.join(IGNORED_SCHEMAS)}) ;"

    tmp_folder = '/var/tmp/amundsen/tables'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    sql_extractor = Db2MetadataExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=sql_extractor,
                       loader=csv_loader)

    job_config = ConfigFactory.from_dict({
        f'extractor.db2_metadata.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': DB2_CONN_STRING,
        f'extractor.db2_metadata.{Db2MetadataExtractor.DATABASE_KEY}': 'DEMODB',
        f'extractor.db2_metadata.{Db2MetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag'
    })
    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())
    return job
예제 #6
0
def create_table_wm_job(**kwargs):
    sql = textwrap.dedent("""
        SELECT From_unixtime(A0.create_time) as create_time,
               'hive'                        as `database`,
               C0.NAME                       as `schema`,
               B0.tbl_name as table_name,
               {func}(A0.part_name) as part_name,
               {watermark} as part_type
        FROM   PARTITIONS A0
               LEFT OUTER JOIN TBLS B0
                            ON A0.tbl_id = B0.tbl_id
               LEFT OUTER JOIN DBS C0
                            ON B0.db_id = C0.db_id
        WHERE  C0.NAME IN {schemas}
               AND B0.tbl_type IN ( 'EXTERNAL_TABLE', 'MANAGED_TABLE' )
               AND A0.PART_NAME NOT LIKE '%%__HIVE_DEFAULT_PARTITION__%%'
        GROUP  BY C0.NAME, B0.tbl_name
        ORDER by create_time desc
    """).format(func=kwargs['templates_dict'].get('agg_func'),
                watermark=kwargs['templates_dict'].get('watermark_type'),
                schemas=SUPPORTED_HIVE_SCHEMA_SQL_IN_CLAUSE)

    logging.info('SQL query: {}'.format(sql))
    tmp_folder = '/var/tmp/amundsen/table_{hwm}'.format(
        hwm=kwargs['templates_dict'].get('watermark_type').strip("\""))
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    hwm_extractor = SQLAlchemyExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=hwm_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING):
        connection_string(),
        'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.EXTRACT_SQL):
        sql,
        'extractor.sqlalchemy.model_class':
        'databuilder.models.watermark.Watermark',
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
        node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
        node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
        neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
        neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
        neo4j_password,
    })
    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    job.launch()
def create_dashboard_neo4j_job(**kwargs):

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = '{tmp_folder}/nodes/'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships/'.format(
        tmp_folder=tmp_folder)

    job_config = ConfigFactory.from_dict({
        'extractor.generic.{}'.format(GenericExtractor.EXTRACTION_ITEMS):
        iter(input),
        'extractor.generic.{}'.format('model_class'):
        'databuilder.models.dashboard_metadata.DashboardMetadata',
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
        node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
        node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
        neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
        neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
        neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
        'unique_tag',  # should use unique tag here like {ds}
    })
    job = DefaultJob(conf=job_config,
                     task=DefaultTask(extractor=GenericExtractor(),
                                      loader=FsNeo4jCSVLoader()),
                     publisher=Neo4jCsvPublisher())
    return job
def create_sample_dremio_job():

    tmp_folder = '/var/tmp/amundsen/{}'.format('tables')
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)

    extractor = DremioMetadataExtractor()
    loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=extractor,
                       loader=loader)

    job_config = ConfigFactory.from_dict({
        'extractor.dremio.{}'.format(DremioMetadataExtractor.DREMIO_USER_KEY): DREMIO_USER,
        'extractor.dremio.{}'.format(DremioMetadataExtractor.DREMIO_PASSWORD_KEY): DREMIO_PASSWORD,
        'extractor.dremio.{}'.format(DremioMetadataExtractor.DREMIO_HOST_KEY): DREMIO_HOST,
        'extractor.dremio.{}'.format(DremioMetadataExtractor.DREMIO_EXCLUDE_PDS_TABLES_KEY): True,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): True,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.FORCE_CREATE_DIR): True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): NEO4J_ENDPOINT,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): NEO4J_USER,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): NEO4J_PASSWORD,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_tag'
    })

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())

    return job
예제 #9
0
def create_last_updated_job():
    # loader saves data to these folders and publisher reads it from here
    tmp_folder = '/var/tmp/amundsen/last_updated_data'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    task = DefaultTask(extractor=EsLastUpdatedExtractor(),
                       loader=FsNeo4jCSVLoader())

    job_config = ConfigFactory.from_dict({
        'extractor.es_last_updated.model_class':
            'databuilder.models.es_last_updated.ESLastUpdated',

        'loader.filesystem_csv_neo4j.node_dir_path': node_files_folder,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relationship_files_folder,
        'publisher.neo4j.node_files_directory': node_files_folder,
        'publisher.neo4j.relation_files_directory': relationship_files_folder,
        'publisher.neo4j.neo4j_endpoint': neo4j_endpoint,
        'publisher.neo4j.neo4j_user': neo4j_user,
        'publisher.neo4j.neo4j_password': neo4j_password,
        'publisher.neo4j.neo4j_encrypted': False,
        'publisher.neo4j.job_publish_tag': 'unique_lastupdated_tag',  # should use unique tag here like {ds}
    })

    return DefaultJob(conf=job_config,
                      task=task,
                      publisher=Neo4jCsvPublisher())
예제 #10
0
def run_csv_job():
    tmp_folder = '/var/tmp/amundsen/product-view'
    node_files_folder = Path(tmp_folder, 'nodes')
    relationship_files_folder = Path(tmp_folder, 'relationships')

    kafka_extractor = SchemaRegExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=kafka_extractor, loader=csv_loader)

    job_config = ConfigFactory.from_dict({
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}':
        node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}':
        relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}':
        node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}':
        relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}':
        NEO4J_ENDPOINT,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}':
        NEO4j_USERNAME,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}':
        NEO4j_PASSWORD,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}':
        str(int(datetime.utcnow().timestamp()))
    })

    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())

    return job
예제 #11
0
def run_column_lineage_job(column_lineage_path):
    tmp_folder = '/var/tmp/amundsen/table_column'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'
    extractor = CsvColumnLineageExtractor()
    csv_loader = FsNeo4jCSVLoader()
    task = DefaultTask(extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())
    job_config = ConfigFactory.from_dict({
        'extractor.csvcolumnlineage.column_lineage_file_location': column_lineage_path,
        'loader.filesystem_csv_neo4j.node_dir_path': node_files_folder,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relationship_files_folder,
        'loader.filesystem_csv_neo4j.delete_created_directories': True,
        'publisher.neo4j.node_files_directory': node_files_folder,
        'publisher.neo4j.relation_files_directory': relationship_files_folder,
        'publisher.neo4j.neo4j_endpoint': neo4j_endpoint,
        'publisher.neo4j.neo4j_user': neo4j_user,
        'publisher.neo4j.neo4j_password': neo4j_password,
        'publisher.neo4j.neo4j_encrypted': False,
        'publisher.neo4j.job_publish_tag': 'lineage_unique_tag',  # should use unique tag here like {ds}
    })
    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())
    job.launch()
예제 #12
0
def create_last_updated_job():
    # loader saves data to these folders and publisher reads it from here
    tmp_folder = '/var/tmp/amundsen/last_updated_data'
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    task = DefaultTask(extractor=Neo4jEsLastUpdatedExtractor(),
                       loader=FsNeo4jCSVLoader())

    job_config = ConfigFactory.from_dict({
        'extractor.neo4j_es_last_updated.model_class':
        'databuilder.models.neo4j_es_last_updated.Neo4jESLastUpdated',
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
        node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
        node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
        neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
        neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
        neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
        'unique_lastupdated_tag',  # should use unique tag here like {ds}
    })

    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    return job
예제 #13
0
    def test_publisher(self):
        # type: () -> None
        with patch.object(GraphDatabase, 'driver') as mock_driver:
            mock_session = MagicMock()
            mock_driver.return_value.session.return_value = mock_session

            mock_transaction = MagicMock()
            mock_session.begin_transaction.return_value = mock_transaction

            mock_run = MagicMock()
            mock_transaction.run = mock_run
            mock_commit = MagicMock()
            mock_transaction.commit = mock_commit

            publisher = Neo4jCsvPublisher()

            conf = ConfigFactory.from_dict(
                {neo4j_csv_publisher.NEO4J_END_POINT_KEY:
                 'dummy://999.999.999.999:7687/',
                 neo4j_csv_publisher.NODE_FILES_DIR:
                 '{}/nodes'.format(self._resource_path),
                 neo4j_csv_publisher.RELATION_FILES_DIR:
                 '{}/relations'.format(self._resource_path),
                 neo4j_csv_publisher.NEO4J_USER: '******',
                 neo4j_csv_publisher.NEO4J_PASSWORD: '******',
                 neo4j_csv_publisher.JOB_PUBLISH_TAG: '{}'.format(uuid.uuid4())}
            )
            publisher.init(conf)
            publisher.publish()

            self.assertEqual(mock_run.call_count, 6)

            # 2 node files, 1 relation file, and 2 more commits before index creation
            self.assertEqual(mock_commit.call_count, 5)
예제 #14
0
def run_csv_job(file_loc, job_name, model):
    tmp_folder = f'/var/tmp/amundsen/{job_name}'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    csv_extractor = CsvExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=csv_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.csv.file_location': file_loc,
        'extractor.csv.model_class': model,
        'loader.filesystem_csv_neo4j.node_dir_path': node_files_folder,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relationship_files_folder,
        'loader.filesystem_csv_neo4j.delete_created_directories': True,
        'publisher.neo4j.node_files_directory': node_files_folder,
        'publisher.neo4j.relation_files_directory': relationship_files_folder,
        'publisher.neo4j.neo4j_endpoint': neo4j_endpoint,
        'publisher.neo4j.neo4j_user': neo4j_user,
        'publisher.neo4j.neo4j_password': neo4j_password,
        'publisher.neo4j.neo4j_encrypted': False,
        'publisher.neo4j.job_publish_tag': 'unique_tag',  # should use unique tag here like {ds}
    })

    DefaultJob(conf=job_config,
               task=task,
               publisher=Neo4jCsvPublisher()).launch()
def create_sample_dremio_job():
    tmp_folder = f'/var/tmp/amundsen/{"tables"}'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    extractor = DremioTableColumnExtractor()
    loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=extractor,
                       loader=loader)

    job_config = ConfigFactory.from_dict({
        f'extractor.dremio.{DremioTableColumnExtractor.DREMIO_USER_KEY}': DREMIO_USER,
        f'extractor.dremio.{DremioTableColumnExtractor.DREMIO_PASSWORD_KEY}': DREMIO_PASSWORD,
        f'extractor.dremio.{DremioTableColumnExtractor.DREMIO_HOST_KEY}': DREMIO_HOST,
        f'extractor.dremio.{DremioTableColumnExtractor.DREMIO_EXCLUDE_PDS_TABLES_KEY}': True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': NEO4J_ENDPOINT,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': NEO4J_USER,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': NEO4J_PASSWORD,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag'
    })

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())

    return job
def create_dashboard_tables_job():
    # loader saves data to these folders and publisher reads it from here
    tmp_folder = '/var/tmp/amundsen/dashboard_table'
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    csv_extractor = CsvExtractor()
    csv_loader = FsNeo4jCSVLoader()

    generic_transformer = GenericTransformer()
    dict_to_model_transformer = DictToModel()
    transformer = ChainedTransformer(
        transformers=[generic_transformer, dict_to_model_transformer],
        is_init_transformers=True)

    task = DefaultTask(extractor=csv_extractor,
                       loader=csv_loader,
                       transformer=transformer)
    publisher = Neo4jCsvPublisher()

    job_config = ConfigFactory.from_dict({
        '{}.file_location'.format(csv_extractor.get_scope()):
        'example/sample_data/sample_dashboard_table.csv',
        '{}.{}.{}'.format(transformer.get_scope(),
                          generic_transformer.get_scope(), FIELD_NAME):
        'table_ids',
        '{}.{}.{}'.format(transformer.get_scope(),
                          generic_transformer.get_scope(), CALLBACK_FUNCTION):
        _str_to_list,
        '{}.{}.{}'.format(transformer.get_scope(),
                          dict_to_model_transformer.get_scope(), MODEL_CLASS):
        'databuilder.models.dashboard.dashboard_table.DashboardTable',
        '{}.node_dir_path'.format(csv_loader.get_scope()):
        node_files_folder,
        '{}.relationship_dir_path'.format(csv_loader.get_scope()):
        relationship_files_folder,
        '{}.delete_created_directories'.format(csv_loader.get_scope()):
        True,
        '{}.node_files_directory'.format(publisher.get_scope()):
        node_files_folder,
        '{}.relation_files_directory'.format(publisher.get_scope()):
        relationship_files_folder,
        '{}.neo4j_endpoint'.format(publisher.get_scope()):
        neo4j_endpoint,
        '{}.neo4j_user'.format(publisher.get_scope()):
        neo4j_user,
        '{}.neo4j_password'.format(publisher.get_scope()):
        neo4j_password,
        '{}.neo4j_encrypted'.format(publisher.get_scope()):
        False,
        '{}.job_publish_tag'.format(publisher.get_scope()):
        'unique_tag',  # should use unique tag here like {ds}
    })

    return DefaultJob(conf=job_config, task=task, publisher=publisher)
예제 #17
0
def run_tableau_external_table_job():
    task = DefaultTask(extractor=TableauDashboardExternalTableExtractor(),
                       loader=FsNeo4jCSVLoader())

    tmp_folder = '/var/tmp/amundsen/tableau_dashboard_external_table'

    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    dict_config = common_tableau_config
    dict_config.update({
        'extractor.tableau_external_table.api_base_url':
        tableau_api_base_url,
        'extractor.tableau_external_table.api_version':
        tableau_api_version,
        'extractor.tableau_external_table.site_name':
        tableau_site_name,
        'extractor.tableau_external_table.tableau_personal_access_token_name':
        tableau_personal_access_token_name,
        'extractor.tableau_external_table.tableau_personal_access_token_secret':
        tableau_personal_access_token_secret,
        'extractor.tableau_external_table.excluded_projects':
        tableau_excluded_projects,
        'extractor.tableau_external_table.cluster':
        tableau_dashboard_cluster,
        'extractor.tableau_external_table.database':
        tableau_dashboard_database,
        'extractor.tableau_external_table.external_cluster_name':
        tableau_external_table_cluster,
        'extractor.tableau_external_table.external_schema_name':
        tableau_external_table_schema,
        'extractor.tableau_external_table.external_table_types':
        tableau_external_table_types,
        'extractor.tableau_external_table.verify_request':
        tableau_verify_request,
        'loader.filesystem_csv_neo4j.node_dir_path':
        node_files_folder,
        'loader.filesystem_csv_neo4j.relationship_dir_path':
        relationship_files_folder,
        'loader.filesystem_csv_neo4j.delete_created_directories':
        True,
        'task.progress_report_frequency':
        100,
        'publisher.neo4j.node_files_directory':
        node_files_folder,
        'publisher.neo4j.relation_files_directory':
        relationship_files_folder,
    })
    job_config = ConfigFactory.from_dict(dict_config)

    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())

    job.launch()
예제 #18
0
def run_dbt_job(database_name,
                catalog_file_loc,
                manifest_file_loc,
                source_url=None):
    tmp_folder = '/var/tmp/amundsen/dbt_run'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    dbt_extractor = DbtExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=dbt_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    # Catalop and manifest files can be passed in as file locations or a valid python
    # dict, allowing you to retrieve the files from S3 or another source and pass it in
    with open(manifest_file_loc, 'rb') as f:
        manifest_data = json.load(f)

    job_config = ConfigFactory.from_dict({
        'extractor.dbt.database_name':
        database_name,
        'extractor.dbt.catalog_json':
        catalog_file_loc,  # File
        'extractor.dbt.manifest_json':
        json.dumps(manifest_data),  # JSON Dumped objecy
        'extractor.dbt.source_url':
        source_url,
        'loader.filesystem_csv_neo4j.node_dir_path':
        node_files_folder,
        'loader.filesystem_csv_neo4j.relationship_dir_path':
        relationship_files_folder,
        'loader.filesystem_csv_neo4j.delete_created_directories':
        True,
        'publisher.neo4j.node_files_directory':
        node_files_folder,
        'publisher.neo4j.relation_files_directory':
        relationship_files_folder,
        'publisher.neo4j.neo4j_endpoint':
        neo4j_endpoint,
        'publisher.neo4j.neo4j_user':
        neo4j_user,
        'publisher.neo4j.neo4j_password':
        neo4j_password,
        'publisher.neo4j.neo4j_encrypted':
        False,
        'publisher.neo4j.job_publish_tag':
        'unique_tag',  # should use unique tag here like {ds}
    })

    DefaultJob(conf=job_config, task=task,
               publisher=Neo4jCsvPublisher()).launch()
def run_bq_last_upd_job(job_name):

    #where_clause_suffix = " "

    gcloud_project = "bpy---pedidosya"
    #label_filter = ""

    tmp_folder = '/var/tmp/amundsen/{job_name}'.format(job_name=job_name)
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    job_config = ConfigFactory.from_dict({
        'extractor.bigquery_table_metadata.{}'.format(BigQueryLastUpdatedExtractor.PROJECT_ID_KEY):
        gcloud_project,
        'loader.filesystem_csv_neo4j.node_dir_path':
        node_files_folder,
        'loader.filesystem_csv_neo4j.relationship_dir_path':
        relationship_files_folder,
        'loader.filesystem_csv_neo4j.delete_created_directories':
        True,
        'publisher.neo4j.node_files_directory':
        node_files_folder,
        'publisher.neo4j.relation_files_directory':
        relationship_files_folder,
        'publisher.neo4j.neo4j_endpoint':
        neo4j_endpoint,
        'publisher.neo4j.neo4j_user':
        neo4j_user,
        'publisher.neo4j.neo4j_password':
        neo4j_password,
        'publisher.neo4j.neo4j_encrypted':
        False,
        'publisher.neo4j.job_publish_tag':
        'unique_tag',  # should use unique tag here like {ds}
    })

    #if label_filter:
    #    job_config[
    #        'extractor.bigquery_table_metadata.{}'
    #        .format(BigQueryMetadataExtractor.FILTER_KEY)
    #        ] = label_filter

    task = DefaultTask(extractor=BigQueryLastUpdatedExtractor(),
                       loader=FsNeo4jCSVLoader(),
                       transformer=NoopTransformer())

    job = DefaultJob(conf=ConfigFactory.from_dict(job_config),
                     task=task,
                     publisher=Neo4jCsvPublisher())

    job.launch()
def create_tableau_metadata_job(*, host, neo4j, tableau, **kwargs):
    node_files_folder = host["node_files_folder"]
    relationship_files_folder = host["relationship_files_folder"]
    job_config = ConfigFactory.from_dict({
        'extractor.tableau_dashboard_metadata.tableau_host':
        tableau["host"],
        'extractor.tableau_dashboard_metadata.api_version':
        tableau["api_version"],
        'extractor.tableau_dashboard_metadata.site_name':
        tableau["site_name"],
        'extractor.tableau_dashboard_metadata.tableau_personal_access_token_name':
        tableau["token_name"],
        'extractor.tableau_dashboard_metadata.tableau_personal_access_token_secret':
        tableau["token_secret"],
        'extractor.tableau_dashboard_metadata.excluded_projects':
        list(),
        'extractor.tableau_dashboard_metadata.cluster':
        '',
        'extractor.tableau_dashboard_metadata.database':
        '',
        'extractor.tableau_dashboard_metadata.transformer.timestamp_str_to_epoch.timestamp_format':
        "%Y-%m-%dT%H:%M:%SZ",
        'extractor.tableau_dashboard_metadata.api_base_url':
        tableau["host"],
        'extractor.tableau_dashboard_metadata.tableau_base_url':
        '',
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}':
        node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}':
        relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}':
        True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}':
        True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}':
        node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}':
        relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}':
        neo4j["endpoint"],
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}':
        neo4j["user"],
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}':
        neo4j["password"],
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}':
        'unique_tag'
    })
    task = DefaultTask(extractor=TableauDashboardExtractor(),
                       loader=FsNeo4jCSVLoader())
    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    return job
예제 #21
0
def create_table_metadata_databuilder_job():
    """
    Launches data builder job that extracts table and column metadata from MySQL Hive metastore database,
    and publishes to Neo4j.
    @param kwargs:
    @return:
    """

    # Adding to where clause to scope schema, filter out temp tables which start with numbers and views
    where_clause_suffix = textwrap.dedent("""
        WHERE d.NAME IN {schemas}
        AND t.TBL_NAME NOT REGEXP '^[0-9]+'
        AND t.TBL_TYPE IN ( 'EXTERNAL_TABLE', 'MANAGED_TABLE' )
    """).format(schemas=SUPPORTED_HIVE_SCHEMA_SQL_IN_CLAUSE)

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = '{tmp_folder}/nodes/'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships/'.format(
        tmp_folder=tmp_folder)

    job_config = ConfigFactory.from_dict({
        'extractor.hive_table_metadata.{}'.format(HiveTableMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY):
        where_clause_suffix,
        'extractor.hive_table_metadata.extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING):
        connection_string(),
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
        node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
        node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
        neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
        neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
        neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_CREATE_ONLY_NODES):
        [DESCRIPTION_NODE_LABEL],
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
        'unique_tag',  # TO-DO unique tag must be added
    })

    job = DefaultJob(conf=job_config,
                     task=DefaultTask(extractor=HiveTableMetadataExtractor(),
                                      loader=FsNeo4jCSVLoader()),
                     publisher=Neo4jCsvPublisher())
    job.launch()
예제 #22
0
def create_sample_snowflake_job():

    where_clause = "WHERE c.TABLE_SCHEMA not in ({0}) \
            AND c.TABLE_SCHEMA not like 'STAGE_%' \
            AND c.TABLE_SCHEMA not like 'HIST_%' \
            AND c.TABLE_SCHEMA not like 'SNAP_%' \
            AND lower(c.COLUMN_NAME) not like 'dw_%';".format(
        ','.join(IGNORED_SCHEMAS))

    tmp_folder = '/var/tmp/amundsen/{}'.format('tables')
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    sql_extractor = SnowflakeMetadataExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=sql_extractor, loader=csv_loader)

    job_config = ConfigFactory.from_dict({
        'extractor.snowflake.extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING):
        SNOWFLAKE_CONN_STRING,
        'extractor.snowflake.{}'.format(SnowflakeMetadataExtractor.DATABASE_KEY):
        'YourSnowflakeDbName',
        'extractor.snowflake.{}'.format(SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY):
        where_clause,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
        node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
        relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR):
        True,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.FORCE_CREATE_DIR):
        True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
        node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
        neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
        neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
        neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
        'unique_tag'
    })
    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    return job
def create_sample_job(table_name, model_name):
    sql = textwrap.dedent("""
    select * from {table_name};
    """).format(table_name=table_name)

    tmp_folder = '/tmp/amundsen/{table_name}'.format(table_name=table_name)

    # tmp_folder = os.path.join(
    #     BASE_DIR, "amundsen", f"{table_name}".format(table_name=table_name)
    # )
    node_files_folder = "{tmp_folder}/nodes".format(tmp_folder=tmp_folder)
    relationship_files_folder = "{tmp_folder}/relationships".format(
        tmp_folder=tmp_folder)

    sql_extractor = SQLAlchemyExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=sql_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        "extractor.sqlalchemy.{}".format(SQLAlchemyExtractor.CONN_STRING):
        SQLITE_CONN_STRING,
        "extractor.sqlalchemy.{}".format(SQLAlchemyExtractor.EXTRACT_SQL):
        sql,
        "extractor.sqlalchemy.model_class":
        model_name,
        "loader.filesystem_csv_neo4j.{}".format(FsNeo4jCSVLoader.NODE_DIR_PATH):
        node_files_folder,
        "loader.filesystem_csv_neo4j.{}".format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
        relationship_files_folder,
        "loader.filesystem_csv_neo4j.{}".format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR):
        True,
        "publisher.neo4j.{}".format(neo4j_csv_publisher.NODE_FILES_DIR):
        node_files_folder,
        "publisher.neo4j.{}".format(neo4j_csv_publisher.RELATION_FILES_DIR):
        relationship_files_folder,
        "publisher.neo4j.{}".format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
        neo4j_endpoint,
        "publisher.neo4j.{}".format(neo4j_csv_publisher.NEO4J_USER):
        neo4j_user,
        "publisher.neo4j.{}".format(neo4j_csv_publisher.NEO4J_PASSWORD):
        neo4j_password,
        "publisher.neo4j.{}".format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
        "unique_tag",  # should use unique tag here like {ds}
    })
    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    return job
def create_sample_snowflake_job():
    where_clause = f"WHERE c.TABLE_SCHEMA not in ({','.join(IGNORED_SCHEMAS)}) \
            AND c.TABLE_SCHEMA not like 'STAGE_%' \
            AND c.TABLE_SCHEMA not like 'HIST_%' \
            AND c.TABLE_SCHEMA not like 'SNAP_%' \
            AND lower(c.COLUMN_NAME) not like 'dw_%';"

    tmp_folder = '/var/tmp/amundsen/tables'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    sql_extractor = SnowflakeMetadataExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=sql_extractor, loader=csv_loader)

    job_config = ConfigFactory.from_dict({
        f'extractor.snowflake.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}':
        connection_string(),
        f'extractor.snowflake.{SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY}':
        SNOWFLAKE_DATABASE_KEY,
        f'extractor.snowflake.{SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}':
        where_clause,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}':
        node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}':
        relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}':
        True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}':
        True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}':
        node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}':
        relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}':
        neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}':
        neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}':
        neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}':
        'unique_tag'
    })
    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    return job
예제 #25
0
def run_bq_tu_job(job_name):
    
    #where_clause_suffix = " "
    gcloud_project = "peya-data-pocs"
    #label_filter = ""

    tmp_folder = '/var/tmp/amundsen/{job_name}'.format(job_name=job_name)
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)

    bq_usage_extractor = BigQueryTableUsageExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=bq_usage_extractor,
                       loader=csv_loader,
                       transformer=BigqueryUsageTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
            gcloud_project,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
            node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
            relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR):
            True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
            node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
            relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
            neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
            neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
            neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
            'unique_tag',  # should use unique tag here like {ds}
    })


    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())    

    job.launch()
예제 #26
0
def create_snowflake_table_metadata_job():
    """
    Launches databuilder job that extracts table and column metadata from Snowflake database and publishes
    to Neo4j.
    """

    where_clause_suffix = textwrap.dedent("""
            WHERE c.TABLE_SCHEMA IN {schemas}
            AND lower(c.COLUMN_NAME) not like 'dw_%';
    """).format(schemas=SUPPORTED_SCHEMA_SQL_IN_CLAUSE)

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = '{tmp_folder}/nodes/'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships/'.format(
        tmp_folder=tmp_folder)

    job_config = ConfigFactory.from_dict({
        'extractor.snowflake.extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING):
        connection_string(),
        'extractor.snowflake.{}'.format(SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY):
        SNOWFLAKE_DATABASE_KEY,
        'extractor.snowflake.{}'.format(SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY):
        where_clause_suffix,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
        node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
        node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
        neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
        neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
        neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
        'some_unique_tag'  # TO-DO unique tag must be added
    })

    job = DefaultJob(conf=job_config,
                     task=DefaultTask(extractor=SnowflakeMetadataExtractor(),
                                      loader=FsNeo4jCSVLoader()),
                     publisher=Neo4jCsvPublisher())
    job.launch()
    def test_preprocessor(self):
        # type: () -> None
        with patch.object(GraphDatabase, 'driver') as mock_driver:
            mock_session = MagicMock()
            mock_driver.return_value.session.return_value = mock_session

            mock_transaction = MagicMock()
            mock_session.begin_transaction.return_value = mock_transaction

            mock_run = MagicMock()
            mock_transaction.run = mock_run
            mock_commit = MagicMock()
            mock_transaction.commit = mock_commit

            mock_preprocessor = MagicMock()
            mock_preprocessor.is_perform_preprocess.return_value = MagicMock(
                return_value=True)
            mock_preprocessor.preprocess_cypher.return_value = (
                'MATCH (f:Foo) RETURN f', {})

            publisher = Neo4jCsvPublisher()

            conf = ConfigFactory.from_dict({
                neo4j_csv_publisher.NEO4J_END_POINT_KEY:
                'dummy://999.999.999.999:7687/',
                neo4j_csv_publisher.NODE_FILES_DIR:
                '{}/nodes'.format(self._resource_path),
                neo4j_csv_publisher.RELATION_FILES_DIR:
                '{}/relations'.format(self._resource_path),
                neo4j_csv_publisher.RELATION_PREPROCESSOR:
                mock_preprocessor,
                neo4j_csv_publisher.NEO4J_USER:
                '******',
                neo4j_csv_publisher.NEO4J_PASSWORD:
                '******',
                neo4j_csv_publisher.JOB_PUBLISH_TAG:
                '{}'.format(uuid.uuid4())
            })
            publisher.init(conf)
            publisher.publish()

            self.assertEqual(mock_run.call_count, 8)

            # 2 node files, 1 relation file
            self.assertEqual(mock_commit.call_count, 1)
예제 #28
0
def run_mssql_job():
    where_clause_suffix = textwrap.dedent("""
        ('dbo')
    """)

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = '{tmp_folder}/nodes/'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships/'.format(
        tmp_folder=tmp_folder)

    job_config = ConfigFactory.from_dict({
        # MSSQL Loader
        'extractor.mssql_metadata.{}'.format(
            MSSQLMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY): where_clause_suffix,
        'extractor.mssql_metadata.{}'.format(
            MSSQLMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME): True,
        'extractor.mssql_metadata.extractor.sqlalchemy.{}'.format(
            SQLAlchemyExtractor.CONN_STRING): connection_string(),
        # NEO4J Loader
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
            node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
            relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
            node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
            relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
            neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
            neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
            neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
            'unique_tag',  # should use unique tag here like {ds}
    })

    job = DefaultJob(
        conf=job_config,
        task=DefaultTask(
            extractor=MSSQLMetadataExtractor(),
            loader=FsNeo4jCSVLoader()),
        publisher=Neo4jCsvPublisher())
    return job
예제 #29
0
def create_sample_job(table_name, model_name):
    sql = textwrap.dedent("""
    select * from {table_name};
    """).format(table_name=table_name)

    tmp_folder = '/var/tmp/amundsen/{table_name}'.format(table_name=table_name)
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    sql_extractor = SQLAlchemyExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=sql_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.CONN_STRING):
        SQLITE_CONN_STRING,
        'extractor.sqlalchemy.{}'.format(SQLAlchemyExtractor.EXTRACT_SQL):
        sql,
        'extractor.sqlalchemy.model_class':
        model_name,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
        node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
        relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR):
        True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
        node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
        neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
        neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
        neo4j_password,
    })
    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    return job
예제 #30
0
파일: bigquery_job.py 프로젝트: cpnat/fram
def create_bq_job(gcloud_project, neo4j_endpoint, neo4j_user, neo4j_password,
                  temp_folder_path, metadata_type):

    node_files_folder = '{temp_folder_path}/{metadata_type}/nodes'\
        .format(temp_folder_path=temp_folder_path, metadata_type=metadata_type)

    relationship_files_folder = '{temp_folder_path}/{metadata_type}/relationships'\
        .format(temp_folder_path=temp_folder_path, metadata_type=metadata_type)

    extractor, extractor_key = create_extractor(metadata_type=metadata_type)
    transformer = create_transformer(metadata_type=metadata_type)

    task = DefaultTask(extractor,
                       loader=FsNeo4jCSVLoader(),
                       transformer=transformer)

    job_config = ConfigFactory.from_dict({
        extractor_key:
        gcloud_project,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
        node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
        relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR):
        True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
        node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
        relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
        neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
        neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
        neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
        'unique_tag',  # should use unique tag here like {ds}
    })

    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())

    return job