def test_keypath_and_pagesize_can_be_set(self, mock_build: Any) -> None:
    """Page size and key path config keys are accepted; a bogus key path raises."""
    scope_prefix = 'extractor.bigquery_table_metadata'
    conf = ConfigFactory.from_dict({
        f'{scope_prefix}.{BigQueryMetadataExtractor.PROJECT_ID_KEY}': 'your-project-here',
        f'{scope_prefix}.{BigQueryMetadataExtractor.PAGE_SIZE_KEY}': 200,
        f'{scope_prefix}.{BigQueryMetadataExtractor.KEY_PATH_KEY}': '/tmp/doesnotexist',
    })
    mock_build.return_value = MockBigQueryClient(ONE_DATASET, ONE_TABLE, TABLE_DATA)
    extractor = BigQueryMetadataExtractor()
    # The nonexistent credentials file must surface as FileNotFoundError at init time.
    with self.assertRaises(FileNotFoundError):
        extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))
def _create_big_query_extractor(
    source: CatSource,
) -> Tuple[BigQueryMetadataExtractor, Any]:
    """Build a BigQuery metadata extractor and the scoped config derived from *source*."""
    extractor = BigQueryMetadataExtractor()
    scope = extractor.get_scope()
    # Map each source attribute onto its scoped configuration key.
    settings = {
        "connection_name": source.name,
        "key_path": source.key_path,
        "project_id": source.project_id,
        "project_credentials": source.project_credentials,
        "page_size": source.page_size,
        "filter_key": source.filter_key,
        "included_tables_regex": source.included_tables_regex,
    }
    conf = ConfigFactory.from_dict(
        {f"{scope}.{key}": value for key, value in settings.items()}
    )
    return extractor, conf
def test_empty_dataset(self, mock_build: Any) -> None:
    """A dataset containing no tables produces no extraction result."""
    mock_build.return_value = MockBigQueryClient(ONE_DATASET, NO_TABLES, None)
    extractor = BigQueryMetadataExtractor()
    scoped_conf = Scoped.get_scoped_conf(conf=self.conf, scope=extractor.get_scope())
    extractor.init(scoped_conf)
    self.assertIsNone(extractor.extract())
def test_can_handle_datasets(self, mock_build: Any) -> None:
    """A project with no datasets at all produces no extraction result."""
    mock_build.return_value = MockBigQueryClient(NO_DATASETS, None, None)
    extractor = BigQueryMetadataExtractor()
    scoped_conf = Scoped.get_scoped_conf(conf=self.conf, scope=extractor.get_scope())
    extractor.init(scoped_conf)
    self.assertIsNone(extractor.extract())
def create_extractor(metadata_type):
    """Return an (extractor, project-id config key) pair for *metadata_type*.

    Raises:
        ValueError: if *metadata_type* is not a supported MetadataType.
    """
    if metadata_type == MetadataType.DSL:
        return (
            BigQueryMetadataExtractor(),
            f'extractor.bigquery_table_metadata.{BigQueryMetadataExtractor.PROJECT_ID_KEY}',
        )
    if metadata_type == MetadataType.USAGE:
        return (
            BigQueryTableUsageExtractor(),
            f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.PROJECT_ID_KEY}',
        )
    raise ValueError('Invalid metadata_type')
def test_view(self, mock_build: Any) -> None:
    """A BigQuery view is extracted as TableMetadata flagged with is_view=True."""
    mock_build.return_value = MockBigQueryClient(ONE_DATASET, ONE_VIEW, VIEW_DATA)
    extractor = BigQueryMetadataExtractor()
    scoped_conf = Scoped.get_scoped_conf(conf=self.conf, scope=extractor.get_scope())
    extractor.init(scoped_conf)
    result = extractor.extract()
    self.assertIsInstance(result, TableMetadata)
    self.assertEqual(result.is_view, True)
def test_table_part_of_table_date_range(self, mock_build: Any) -> None:
    """Date-sharded tables collapse into a single record named by their prefix."""
    mock_build.return_value = MockBigQueryClient(ONE_DATASET, TABLE_DATE_RANGE, TABLE_DATA)
    extractor = BigQueryMetadataExtractor()
    extractor.init(Scoped.get_scoped_conf(conf=self.conf, scope=extractor.get_scope()))
    result = extractor.extract()
    table_name = result.name
    count = 0
    # Drain the extractor, counting how many records come out.
    while result:
        count += 1
        result = extractor.extract()
    # All shards within the date range must be merged into exactly one record.
    self.assertEqual(count, 1)
    self.assertEqual(table_name, 'date_range_')
def test_accepts_dataset_filter_by_label(self, mock_build: Any) -> None:
    """A label-based dataset filter is accepted and extraction still succeeds."""
    scope_prefix = 'extractor.bigquery_table_metadata'
    conf = ConfigFactory.from_dict({
        f'{scope_prefix}.{BigQueryMetadataExtractor.PROJECT_ID_KEY}': 'your-project-here',
        f'{scope_prefix}.{BigQueryMetadataExtractor.FILTER_KEY}': 'label.key:value',
    })
    mock_build.return_value = MockBigQueryClient(ONE_DATASET, ONE_TABLE, TABLE_DATA)
    extractor = BigQueryMetadataExtractor()
    extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))
    self.assertIsInstance(extractor.extract(), TableMetadata)
def test_table_without_columns(self, mock_build: Any) -> None:
    """A table with no schema yields metadata with an empty column list."""
    mock_build.return_value = MockBigQueryClient(ONE_DATASET, ONE_TABLE, NO_COLS)
    extractor = BigQueryMetadataExtractor()
    scoped_conf = Scoped.get_scoped_conf(conf=self.conf, scope=extractor.get_scope())
    extractor.init(scoped_conf)
    result = extractor.extract()
    # Identity fields come straight from the mocked client fixtures.
    self.assertEqual(result.database, 'bigquery')
    self.assertEqual(result.cluster, 'your-project-here')
    self.assertEqual(result.schema, 'fdgdfgh')
    self.assertEqual(result.name, 'nested_recs')
    self.assertEqual(result.description.text, "")
    # No schema in the fixture -> no columns, and a plain table (not a view).
    self.assertEqual(result.columns, [])
    self.assertEqual(result.is_view, False)
def create_table_extract_job(**kwargs):
    """Build and launch a job that extracts BigQuery table metadata into Neo4j.

    Expected kwargs:
        metadata_type: namespaces the temporary CSV staging folders.
        PROJECT_ID_KEY: the GCP project id passed to the extractor.

    NOTE(review): relies on module-level neo4j_endpoint / neo4j_user /
    neo4j_password being defined before this is called — confirm at call site.
    """
    tmp_folder = f"/var/tmp/amundsen/{kwargs['metadata_type']}"
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    bq_meta_extractor = BigQueryMetadataExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=bq_meta_extractor,
                       loader=csv_loader,
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        f'extractor.bigquery_table_metadata.{BigQueryMetadataExtractor.PROJECT_ID_KEY}':
            kwargs['PROJECT_ID_KEY'],
        # Filter to the desired datasets only.
        f'extractor.bigquery_table_metadata.{BigQueryMetadataExtractor.FILTER_KEY}':
            'labels.set_label:data_platform',
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}':
            node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}':
            relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}':
            True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}':
            node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}':
            relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}':
            neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}':
            neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}':
            neo4j_password,
        # Should use a unique tag here, e.g. the run date {ds}.
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}':
            'unique_tag',
    })

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())
    job.launch()
def test_table_with_nested_records(self, mock_build: Any) -> None:
    """Nested RECORD fields flatten into dotted column names at each depth."""
    mock_build.return_value = MockBigQueryClient(ONE_DATASET, ONE_TABLE, NESTED_DATA)
    extractor = BigQueryMetadataExtractor()
    extractor.init(Scoped.get_scoped_conf(conf=self.conf, scope=extractor.get_scope()))
    result = extractor.extract()
    # Each nesting level yields a column whose name is the dotted path.
    expected = [
        ('nested', 'RECORD'),
        ('nested.nested2', 'RECORD'),
        ('nested.nested2.ahah', 'STRING'),
    ]
    for column, (name, col_type) in zip(result.columns, expected):
        self.assertEqual(column.name, name)
        self.assertEqual(column.type, col_type)
def test_normal_table(self, mock_build: Any) -> None:
    """An ordinary table extracts with full identity and column metadata."""
    mock_build.return_value = MockBigQueryClient(ONE_DATASET, ONE_TABLE, TABLE_DATA)
    extractor = BigQueryMetadataExtractor()
    scoped_conf = Scoped.get_scoped_conf(conf=self.conf, scope=extractor.get_scope())
    extractor.init(scoped_conf)
    result = extractor.extract()
    # Table identity fields come straight from the mocked client fixtures.
    self.assertEqual(result.database, 'bigquery')
    self.assertEqual(result.cluster, 'your-project-here')
    self.assertEqual(result.schema, 'fdgdfgh')
    self.assertEqual(result.name, 'nested_recs')
    self.assertEqual(result.description.text, "")
    # First column carries its BigQuery type and description.
    first_col = result.columns[0]
    self.assertEqual(first_col.name, 'test')
    self.assertEqual(first_col.type, 'STRING')
    self.assertEqual(first_col.description.text, 'some_description')
    self.assertEqual(result.is_view, False)
def create_bq_job(metadata_type, gcloud_project):
    """Assemble a Neo4j-publishing job that extracts BigQuery table metadata.

    NOTE(review): relies on module-level neo4j_endpoint / neo4j_user /
    neo4j_password being defined — confirm at call site.
    """
    tmp_folder = f'/var/tmp/amundsen/{metadata_type}'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    task = DefaultTask(extractor=BigQueryMetadataExtractor(),
                       loader=FsNeo4jCSVLoader(),
                       transformer=NoopTransformer())

    job_config = ConfigFactory.from_dict({
        f'extractor.bigquery_table_metadata.{BigQueryMetadataExtractor.PROJECT_ID_KEY}':
            gcloud_project,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}':
            node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}':
            relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}':
            True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}':
            node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}':
            relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}':
            neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}':
            neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}':
            neo4j_password,
        # should use unique tag here like {ds}
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}':
            'unique_tag',
    })

    return DefaultJob(conf=job_config,
                      task=task,
                      publisher=Neo4jCsvPublisher())