def create_last_updated_job():
    """Assemble the job that publishes the Neo4j/ES "last updated" record.

    The task pairs ``Neo4jEsLastUpdatedExtractor`` with ``FsNeo4jCSVLoader``
    and the job hands the staged CSV files to ``Neo4jCsvPublisher``.
    Connection settings (``neo4j_endpoint``, ``neo4j_user``,
    ``neo4j_password``) come from names defined outside this function.

    :return: a configured ``DefaultJob``; it is not launched here.
    """
    # Staging area shared by both ends of the pipeline: the loader writes
    # its CSV files here and the publisher reads them from the same place.
    staging_root = '/var/tmp/amundsen/last_updated_data'
    nodes_dir = f'{staging_root}/nodes'
    relations_dir = f'{staging_root}/relationships'

    last_updated_task = DefaultTask(extractor=Neo4jEsLastUpdatedExtractor(),
                                    loader=FsNeo4jCSVLoader())

    extractor_settings = {
        'extractor.neo4j_es_last_updated.model_class':
            'databuilder.models.neo4j_es_last_updated.Neo4jESLastUpdated',
    }
    loader_settings = {
        'loader.filesystem_csv_neo4j.node_dir_path': nodes_dir,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relations_dir,
    }
    publisher_settings = {
        'publisher.neo4j.node_files_directory': nodes_dir,
        'publisher.neo4j.relation_files_directory': relations_dir,
        'publisher.neo4j.neo4j_endpoint': neo4j_endpoint,
        'publisher.neo4j.neo4j_user': neo4j_user,
        'publisher.neo4j.neo4j_password': neo4j_password,
        'publisher.neo4j.neo4j_encrypted': False,
        # should use unique tag here like {ds}
        'publisher.neo4j.job_publish_tag': 'unique_lastupdated_tag',
    }
    conf = ConfigFactory.from_dict(
        {**extractor_settings, **loader_settings, **publisher_settings})

    return DefaultJob(conf=conf,
                      task=last_updated_task,
                      publisher=Neo4jCsvPublisher())
def create_bq_job(metadata_type, gcloud_project):
    """Assemble a BigQuery metadata extraction job that publishes to Neo4j.

    :param metadata_type: name used as the last component of the staging
        directory (``/var/tmp/amundsen/<metadata_type>``).
    :param gcloud_project: value stored under the extractor's
        ``PROJECT_ID_KEY`` setting.
    :return: a configured ``DefaultJob``; it is not launched here.
    """
    staging_root = f'/var/tmp/amundsen/{metadata_type}'
    nodes_dir = f'{staging_root}/nodes'
    relations_dir = f'{staging_root}/relationships'

    extract_and_load = DefaultTask(extractor=BigQueryMetadataExtractor(),
                                   loader=FsNeo4jCSVLoader(),
                                   transformer=NoopTransformer())

    # Configuration scopes; the key names themselves come from the
    # constants exposed by the extractor, loader and publisher modules.
    extractor_ns = 'extractor.bigquery_table_metadata'
    loader_ns = 'loader.filesystem_csv_neo4j'
    publisher_ns = 'publisher.neo4j'

    settings = {
        f'{extractor_ns}.{BigQueryMetadataExtractor.PROJECT_ID_KEY}': gcloud_project,
        f'{loader_ns}.{FsNeo4jCSVLoader.NODE_DIR_PATH}': nodes_dir,
        f'{loader_ns}.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relations_dir,
        f'{loader_ns}.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'{publisher_ns}.{neo4j_csv_publisher.NODE_FILES_DIR}': nodes_dir,
        f'{publisher_ns}.{neo4j_csv_publisher.RELATION_FILES_DIR}': relations_dir,
        f'{publisher_ns}.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'{publisher_ns}.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'{publisher_ns}.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        # should use unique tag here like {ds}
        f'{publisher_ns}.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',
    }

    return DefaultJob(conf=ConfigFactory.from_dict(settings),
                      task=extract_and_load,
                      publisher=Neo4jCsvPublisher())
def create_glue_extractor_job():
    """Assemble a Glue metadata extraction job that publishes to Neo4j.

    The extractor is configured with ``GLUE_CLUSTER_KEY`` and an empty
    filter list; Neo4j connection settings (``NEO4J_ENDPOINT``,
    ``NEO4j_USERNAME``, ``NEO4j_PASSWORD``) come from names defined outside
    this function. The publish tag is the current Unix time in whole
    seconds.

    :return: a configured ``DefaultJob``; it is not launched here.
    """
    # Function-scope import: the module is only known to expose `datetime`.
    from datetime import timezone

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = Path(tmp_folder, 'nodes')
    relationship_files_folder = Path(tmp_folder, 'relationships')

    # Use an aware UTC datetime. `datetime.utcnow()` is naive, and
    # `.timestamp()` interprets naive values as *local* time, which shifts
    # the epoch by the host's UTC offset on any non-UTC machine.
    publish_tag = str(int(datetime.now(timezone.utc).timestamp()))

    job_config = ConfigFactory.from_dict({
        f'extractor.glue.{GlueExtractor.CLUSTER_KEY}': GLUE_CLUSTER_KEY,
        f'extractor.glue.{GlueExtractor.FILTER_KEY}': [],
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': NEO4J_ENDPOINT,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': NEO4j_USERNAME,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': NEO4j_PASSWORD,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': publish_tag,
    })

    return DefaultJob(conf=job_config,
                      task=DefaultTask(extractor=GlueExtractor(),
                                       loader=FsNeo4jCSVLoader(),
                                       transformer=NoopTransformer()),
                      publisher=Neo4jCsvPublisher())
def run_table_column_job(table_path, column_path):
    """Load table and column CSV files and publish them to Neo4j.

    Builds the job from ``CsvTableColumnExtractor``, ``FsNeo4jCSVLoader``
    and ``Neo4jCsvPublisher`` and launches it immediately.

    :param table_path: stored as the extractor's ``table_file_location``.
    :param column_path: stored as the extractor's ``column_file_location``.
    """
    staging_root = '/var/tmp/amundsen/table_column'
    nodes_dir = f'{staging_root}/nodes'
    relations_dir = f'{staging_root}/relationships'

    table_column_task = DefaultTask(CsvTableColumnExtractor(),
                                    loader=FsNeo4jCSVLoader(),
                                    transformer=NoopTransformer())

    extractor_settings = {
        'extractor.csvtablecolumn.table_file_location': table_path,
        'extractor.csvtablecolumn.column_file_location': column_path,
    }
    loader_settings = {
        'loader.filesystem_csv_neo4j.node_dir_path': nodes_dir,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relations_dir,
        'loader.filesystem_csv_neo4j.delete_created_directories': True,
    }
    publisher_settings = {
        'publisher.neo4j.node_files_directory': nodes_dir,
        'publisher.neo4j.relation_files_directory': relations_dir,
        'publisher.neo4j.neo4j_endpoint': neo4j_endpoint,
        'publisher.neo4j.neo4j_user': neo4j_user,
        'publisher.neo4j.neo4j_password': neo4j_password,
        # should use unique tag here like {ds}
        'publisher.neo4j.job_publish_tag': 'unique_tag',
    }
    conf = ConfigFactory.from_dict(
        {**extractor_settings, **loader_settings, **publisher_settings})

    DefaultJob(conf=conf,
               task=table_column_task,
               publisher=Neo4jCsvPublisher()).launch()
def run_table_column_job(table_path, column_path):
    """Load table and column CSV files and publish them to Neo4j.

    Configuration key names are taken from the constants exposed by
    ``CsvTableColumnExtractor``, ``FsNeo4jCSVLoader`` and the
    ``neo4j_csv_publisher`` module. The job is launched immediately.

    :param table_path: stored under the extractor's ``TABLE_FILE_LOCATION``.
    :param column_path: stored under the extractor's ``COLUMN_FILE_LOCATION``.
    """
    staging_root = '/var/tmp/amundsen/table_column'
    nodes_dir = f'{staging_root}/nodes'
    relations_dir = f'{staging_root}/relationships'

    table_column_task = DefaultTask(CsvTableColumnExtractor(),
                                    loader=FsNeo4jCSVLoader(),
                                    transformer=NoopTransformer())

    # Configuration scopes shared by the keys below.
    extractor_ns = 'extractor.csvtablecolumn'
    loader_ns = 'loader.filesystem_csv_neo4j'
    publisher_ns = 'publisher.neo4j'

    settings = {
        f'{extractor_ns}.{CsvTableColumnExtractor.TABLE_FILE_LOCATION}': table_path,
        f'{extractor_ns}.{CsvTableColumnExtractor.COLUMN_FILE_LOCATION}': column_path,
        f'{loader_ns}.{FsNeo4jCSVLoader.NODE_DIR_PATH}': nodes_dir,
        f'{loader_ns}.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relations_dir,
        f'{loader_ns}.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'{publisher_ns}.{neo4j_csv_publisher.NODE_FILES_DIR}': nodes_dir,
        f'{publisher_ns}.{neo4j_csv_publisher.RELATION_FILES_DIR}': relations_dir,
        f'{publisher_ns}.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'{publisher_ns}.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'{publisher_ns}.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        # should use unique tag here like {ds}
        f'{publisher_ns}.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',
    }

    DefaultJob(conf=ConfigFactory.from_dict(settings),
               task=table_column_task,
               publisher=Neo4jCsvPublisher()).launch()
def create_table_extract_job():
    """Extract Postgres table metadata and publish it to Neo4j.

    Despite the ``create_`` prefix, this function also launches the job and
    returns nothing. The extractor is restricted by a WHERE-clause suffix
    built from ``SUPPORTED_SCHEMA_SQL_IN_CLAUSE`` and connects using the
    value returned by ``connection_string()``; both are defined outside
    this function.
    """
    schema_filter = f'st.schemaname in {SUPPORTED_SCHEMA_SQL_IN_CLAUSE}'

    # Note the trailing slashes: the directory paths are passed on as-is.
    staging_root = '/var/tmp/amundsen/table_metadata'
    nodes_dir = f'{staging_root}/nodes/'
    relations_dir = f'{staging_root}/relationships/'

    # Configuration scopes shared by the keys below.
    extractor_ns = 'extractor.postgres_metadata'
    loader_ns = 'loader.filesystem_csv_neo4j'
    publisher_ns = 'publisher.neo4j'

    settings = {
        f'{extractor_ns}.{PostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': schema_filter,
        f'{extractor_ns}.{PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}': True,
        f'{extractor_ns}.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(),
        f'{loader_ns}.{FsNeo4jCSVLoader.NODE_DIR_PATH}': nodes_dir,
        f'{loader_ns}.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relations_dir,
        f'{publisher_ns}.{neo4j_csv_publisher.NODE_FILES_DIR}': nodes_dir,
        f'{publisher_ns}.{neo4j_csv_publisher.RELATION_FILES_DIR}': relations_dir,
        f'{publisher_ns}.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'{publisher_ns}.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'{publisher_ns}.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        # should use unique tag here like {ds}
        f'{publisher_ns}.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',
    }
    conf = ConfigFactory.from_dict(settings)

    extract_task = DefaultTask(extractor=PostgresMetadataExtractor(),
                               loader=FsNeo4jCSVLoader())
    DefaultJob(conf=conf,
               task=extract_task,
               publisher=Neo4jCsvPublisher()).launch()
def run_tableau_external_table_job():
    """Extract Tableau dashboard external tables and publish them to Neo4j.

    Job settings are layered on top of ``common_tableau_config``; the
    Tableau connection values (``tableau_api_base_url`` and friends) are
    names defined outside this function. The job is launched immediately
    and nothing is returned.
    """
    task = DefaultTask(extractor=TableauDashboardExternalTableExtractor(),
                       loader=FsNeo4jCSVLoader())

    tmp_folder = '/var/tmp/amundsen/tableau_dashboard_external_table'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    # Work on a copy. Updating `common_tableau_config` in place would leak
    # this job's keys into the shared settings object, where any other code
    # reading it afterwards would pick them up.
    dict_config = dict(common_tableau_config)
    dict_config.update({
        'extractor.tableau_external_table.api_base_url': tableau_api_base_url,
        'extractor.tableau_external_table.api_version': tableau_api_version,
        'extractor.tableau_external_table.site_name': tableau_site_name,
        'extractor.tableau_external_table.tableau_personal_access_token_name':
            tableau_personal_access_token_name,
        'extractor.tableau_external_table.tableau_personal_access_token_secret':
            tableau_personal_access_token_secret,
        'extractor.tableau_external_table.excluded_projects': tableau_excluded_projects,
        'extractor.tableau_external_table.cluster': tableau_dashboard_cluster,
        'extractor.tableau_external_table.database': tableau_dashboard_database,
        'extractor.tableau_external_table.external_cluster_name': tableau_external_table_cluster,
        'extractor.tableau_external_table.external_schema_name': tableau_external_table_schema,
        'extractor.tableau_external_table.external_table_types': tableau_external_table_types,
        'extractor.tableau_external_table.verify_request': tableau_verify_request,
        'loader.filesystem_csv_neo4j.node_dir_path': node_files_folder,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relationship_files_folder,
        'loader.filesystem_csv_neo4j.delete_created_directories': True,
        'task.progress_report_frequency': 100,
        'publisher.neo4j.node_files_directory': node_files_folder,
        'publisher.neo4j.relation_files_directory': relationship_files_folder,
    })
    job_config = ConfigFactory.from_dict(dict_config)

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())
    job.launch()
def test_publisher(self) -> None:
    """Publish the fixture CSV files through a mocked Neo4j driver.

    ``GraphDatabase.driver`` is patched so nothing connects anywhere; the
    test then checks how many times the transaction's ``run`` and
    ``commit`` were invoked.
    """
    with patch.object(GraphDatabase, 'driver') as driver_factory:
        # Mock chain: driver(...) -> .session() -> .begin_transaction()
        run_spy = MagicMock()
        commit_spy = MagicMock()

        transaction = MagicMock()
        transaction.run = run_spy
        transaction.commit = commit_spy

        session = MagicMock()
        session.begin_transaction.return_value = transaction
        driver_factory.return_value.session.return_value = session

        settings = {
            neo4j_csv_publisher.NEO4J_END_POINT_KEY: 'dummy://999.999.999.999:7687/',
            neo4j_csv_publisher.NODE_FILES_DIR: f'{self._resource_path}/nodes',
            neo4j_csv_publisher.RELATION_FILES_DIR: f'{self._resource_path}/relations',
            neo4j_csv_publisher.NEO4J_USER: '******',
            neo4j_csv_publisher.NEO4J_PASSWORD: '******',
            neo4j_csv_publisher.JOB_PUBLISH_TAG: str(uuid.uuid4()),
        }

        csv_publisher = Neo4jCsvPublisher()
        csv_publisher.init(ConfigFactory.from_dict(settings))
        csv_publisher.publish()

        # 2 node files, 1 relation file
        self.assertEqual(run_spy.call_count, 6)
        self.assertEqual(commit_spy.call_count, 1)
def create_table_wm_job(**kwargs):
    """Extract Hive partition watermarks and publish them to Neo4j.

    Reads ``agg_func`` and ``watermark_type`` from
    ``kwargs['templates_dict']``, substitutes them into a query over the
    ``PARTITIONS``/``TBLS``/``DBS`` tables, and runs that query through
    ``SQLAlchemyExtractor`` with ``databuilder.models.watermark.Watermark``
    as the model class. The job is launched immediately and nothing is
    returned.

    :param kwargs: must contain a ``templates_dict`` mapping.
    """
    templates = kwargs['templates_dict']
    watermark_type = templates.get('watermark_type')

    # `%%` is kept verbatim by str.format; only the {named} fields change.
    query_template = textwrap.dedent("""
        SELECT From_unixtime(min(A0.create_time)) as create_time,
               'hive' as `database`,
               C0.NAME as `schema`,
               B0.tbl_name as table_name,
               {func}(A0.part_name) as part_name,
               {watermark} as part_type
        FROM PARTITIONS A0
             LEFT OUTER JOIN TBLS B0
                          ON A0.tbl_id = B0.tbl_id
             LEFT OUTER JOIN DBS C0
                          ON B0.db_id = C0.db_id
        WHERE C0.NAME IN {schemas}
              AND B0.tbl_type IN ( 'EXTERNAL_TABLE', 'MANAGED_TABLE' )
              AND A0.PART_NAME NOT LIKE '%%__HIVE_DEFAULT_PARTITION__%%'
        GROUP BY C0.NAME, B0.tbl_name
        ORDER by create_time desc
    """)
    sql = query_template.format(func=templates.get('agg_func'),
                                watermark=watermark_type,
                                schemas=SUPPORTED_HIVE_SCHEMA_SQL_IN_CLAUSE)
    logging.info(f'SQL query: {sql}')

    # The watermark type arrives wrapped in double quotes (it is spliced
    # into the SQL as a string literal); drop them for the directory name.
    watermark_label = watermark_type.strip('"')
    staging_root = f'/var/tmp/amundsen/table_{watermark_label}'
    nodes_dir = f'{staging_root}/nodes'
    relations_dir = f'{staging_root}/relationships'

    watermark_task = DefaultTask(extractor=SQLAlchemyExtractor(),
                                 loader=FsNeo4jCSVLoader(),
                                 transformer=NoopTransformer())

    # Configuration scopes shared by the keys below.
    extractor_ns = 'extractor.sqlalchemy'
    loader_ns = 'loader.filesystem_csv_neo4j'
    publisher_ns = 'publisher.neo4j'

    settings = {
        f'{extractor_ns}.{SQLAlchemyExtractor.CONN_STRING}': connection_string(),
        f'{extractor_ns}.{SQLAlchemyExtractor.EXTRACT_SQL}': sql,
        'extractor.sqlalchemy.model_class': 'databuilder.models.watermark.Watermark',
        f'{loader_ns}.{FsNeo4jCSVLoader.NODE_DIR_PATH}': nodes_dir,
        f'{loader_ns}.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relations_dir,
        f'{publisher_ns}.{neo4j_csv_publisher.NODE_FILES_DIR}': nodes_dir,
        f'{publisher_ns}.{neo4j_csv_publisher.RELATION_FILES_DIR}': relations_dir,
        f'{publisher_ns}.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'{publisher_ns}.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'{publisher_ns}.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        # TO-DO unique tag must be added
        f'{publisher_ns}.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',
    }

    DefaultJob(conf=ConfigFactory.from_dict(settings),
               task=watermark_task,
               publisher=Neo4jCsvPublisher()).launch()
relationship_files_folder, f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint, f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user, f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password, f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_CREATE_ONLY_NODES}': [DESCRIPTION_NODE_LABEL], 'publisher.neo4j.job_publish_tag': 'some_unique_tag' # TO-DO unique tag must be added }) return job_config if __name__ == "__main__": # This assumes you are running on a spark cluster (for example databricks cluster) # that is configured with a hive metastore that # has pointers to all of your delta tables # Because of this, this code CANNOT run as a normal python operator on airflow. spark = SparkSession.builder.appName( "Amundsen Delta Lake Metadata Extraction").getOrCreate() job_config = create_delta_lake_job_config() dExtractor = DeltaLakeMetadataExtractor() dExtractor.set_spark(spark) job = DefaultJob(conf=job_config, task=DefaultTask(extractor=dExtractor, loader=FsNeo4jCSVLoader()), publisher=Neo4jCsvPublisher()) job.launch()