def test_timestamp_pagesize_settings(self, mock_build: Any) -> None:
    """
    Test that TIMESTAMP_KEY and PAGE_SIZE_KEY config values are forwarded
    to the logging API request body (pageSize and the timestamp filter).
    """
    TIMESTAMP = '2019-01-01T00:00:00.00Z'
    PAGESIZE = 215
    config_dict = {
        'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
            'your-project-here',
        'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.TIMESTAMP_KEY):
            TIMESTAMP,
        'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PAGE_SIZE_KEY):
            PAGESIZE,
    }
    conf = ConfigFactory.from_dict(config_dict)

    client = MockLoggingClient(CORRECT_DATA)
    mock_build.return_value = client
    extractor = BigQueryTableUsageExtractor()
    extractor.init(
        Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

    # Inspect the request body the extractor sent to the mocked client.
    args, kwargs = client.b.list.call_args
    body = kwargs['body']

    self.assertEqual(body['pageSize'], PAGESIZE)
    # assertIn instead of assertEqual(x in y, True): same check, but the
    # failure output shows the actual filter string instead of "False != True".
    self.assertIn(TIMESTAMP, body['filter'])
def test_key_path(self, mock_build: Any) -> None:
    """
    Test that KEY_PATH_KEY makes the extractor build credentials from a
    service-account key file.
    """
    with tempfile.NamedTemporaryFile() as keyfile:
        # The key material is kept base64-encoded so automated secret
        # scanners (github etc.) do not raise false positives; it is decoded
        # into a temp file here and handed to the extractor by path.
        keyfile.write(base64.b64decode(KEYFILE_DATA))
        keyfile.flush()

        scope = 'extractor.bigquery_table_usage'
        conf = ConfigFactory.from_dict({
            '{}.{}'.format(scope, BigQueryTableUsageExtractor.PROJECT_ID_KEY):
                'your-project-here',
            '{}.{}'.format(scope, BigQueryTableUsageExtractor.KEY_PATH_KEY):
                keyfile.name,
        })

        mock_build.return_value = MockLoggingClient(CORRECT_DATA)
        extractor = BigQueryTableUsageExtractor()
        extractor.init(
            Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

        # The credentials are attached to the http object passed to build().
        _, kwargs = mock_build.call_args
        creds = kwargs['http'].credentials
        self.assertEqual(creds.project_id, 'your-project-here')
        self.assertEqual(creds.service_account_email, '*****@*****.**')
def test_email_filter_counted(self, mock_build: Any) -> None:
    """
    A usage entry whose email matches EMAIL_PATTERN is extracted and counted.
    """
    scope = 'extractor.bigquery_table_usage'
    conf = ConfigFactory.from_dict({
        '{}.{}'.format(scope, BigQueryTableUsageExtractor.PROJECT_ID_KEY):
            'your-project-here',
        '{}.{}'.format(scope, BigQueryTableUsageExtractor.EMAIL_PATTERN):
            '.*@test.com.*',
    })

    mock_build.return_value = MockLoggingClient(CORRECT_DATA)
    extractor = BigQueryTableUsageExtractor()
    extractor.init(
        Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

    result = extractor.extract()
    assert result is not None
    self.assertIsInstance(result, tuple)

    key, value = result
    self.assertIsInstance(key, TableColumnUsageTuple)
    self.assertIsInstance(value, int)

    self.assertEqual(key.database, 'bigquery')
    self.assertEqual(key.cluster, 'bigquery-public-data')
    self.assertEqual(key.schema, 'austin_incidents')
    self.assertEqual(key.table, 'incidents_2008')
    self.assertEqual(key.email, '*****@*****.**')
    self.assertEqual(value, 1)
def test_basic_extraction(self, mock_build: Any) -> None:
    """
    Test Extraction using mock class

    Annotations added for consistency with the sibling tests in this module.
    """
    config_dict = {
        'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
            'your-project-here',
    }
    conf = ConfigFactory.from_dict(config_dict)

    mock_build.return_value = MockLoggingClient(CORRECT_DATA)
    extractor = BigQueryTableUsageExtractor()
    extractor.init(
        Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

    result = extractor.extract()
    # Guard before unpacking, matching the other extraction tests; also
    # narrows the Optional return for type checkers.
    assert result is not None
    self.assertIsInstance(result, tuple)
    (key, value) = result
    self.assertIsInstance(key, TableColumnUsageTuple)
    self.assertIsInstance(value, int)

    self.assertEqual(key.database, 'bigquery')
    self.assertEqual(key.cluster, 'bigquery-public-data')
    self.assertEqual(key.schema, 'austin_incidents')
    self.assertEqual(key.table, 'incidents_2008')
    self.assertEqual(key.email, '*****@*****.**')
    self.assertEqual(value, 1)
def test_counting_referenced_table_belonging_to_different_project(
        self, mock_build: Any) -> None:
    """
    With COUNT_READS_ONLY_FROM_PROJECT disabled, a referenced table in a
    project other than the extractor's PROJECT_ID_KEY is still counted.
    """
    scope = 'extractor.bigquery_table_usage'
    conf = ConfigFactory.from_dict({
        f'{scope}.{BigQueryTableUsageExtractor.PROJECT_ID_KEY}':
            'your-project-here',
        f'{scope}.{BigQueryTableUsageExtractor.COUNT_READS_ONLY_FROM_PROJECT_ID_KEY}':
            False,
    })

    mock_build.return_value = MockLoggingClient(CORRECT_DATA)
    extractor = BigQueryTableUsageExtractor()
    extractor.init(
        Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

    result = extractor.extract()
    assert result is not None
    self.assertIsInstance(result, tuple)

    key, value = result
    self.assertIsInstance(key, TableColumnUsageTuple)
    self.assertIsInstance(value, int)

    self.assertEqual(key.database, 'bigquery')
    self.assertEqual(key.cluster, 'bigquery-public-data')
    self.assertEqual(key.schema, 'austin_incidents')
    self.assertEqual(key.table, 'incidents_2008')
    self.assertEqual(key.email, '*****@*****.**')
    self.assertEqual(value, 1)
def create_extractor(metadata_type):
    """
    Return an (extractor, config-key) pair for the requested metadata type.

    The config key is the scoped PROJECT_ID_KEY for the chosen extractor.
    Raises ValueError for an unrecognized metadata_type.
    """
    if metadata_type == MetadataType.DSL:
        return (BigQueryMetadataExtractor(),
                'extractor.bigquery_table_metadata.{}'.format(
                    BigQueryMetadataExtractor.PROJECT_ID_KEY))
    if metadata_type == MetadataType.USAGE:
        return (BigQueryTableUsageExtractor(),
                'extractor.bigquery_table_usage.{}'.format(
                    BigQueryTableUsageExtractor.PROJECT_ID_KEY))
    raise ValueError('Invalid metadata_type')
def test_no_entries(self, mock_build: Any) -> None:
    """Extraction yields nothing when the logging API returns no entries."""
    conf = ConfigFactory.from_dict({
        'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
            'your-project-here',
    })

    mock_build.return_value = MockLoggingClient(NO_ENTRIES)
    extractor = BigQueryTableUsageExtractor()
    extractor.init(
        Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

    self.assertIsNone(extractor.extract())
def test_email_filter_not_counted(self, mock_build: Any) -> None:
    """
    A usage entry whose email does not match EMAIL_PATTERN is dropped,
    so extraction yields nothing.
    """
    scope = 'extractor.bigquery_table_usage'
    conf = ConfigFactory.from_dict({
        '{}.{}'.format(scope, BigQueryTableUsageExtractor.PROJECT_ID_KEY):
            'your-project-here',
        '{}.{}'.format(scope, BigQueryTableUsageExtractor.EMAIL_PATTERN):
            'emailFilter',
    })

    mock_build.return_value = MockLoggingClient(CORRECT_DATA)
    extractor = BigQueryTableUsageExtractor()
    extractor.init(
        Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

    self.assertIsNone(extractor.extract())
def test_failed_jobs_should_not_be_counted(self, mock_build: Any) -> None:
    """Log entries for failed jobs must not produce any usage records."""
    conf = ConfigFactory.from_dict({
        f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.PROJECT_ID_KEY}':
            'bigquery-public-data',
    })

    failing_client = MockLoggingClient(FAILURE)
    mock_build.return_value = failing_client
    extractor = BigQueryTableUsageExtractor()
    extractor.init(
        Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

    self.assertIsNone(extractor.extract())
def test_failed_jobs_not_counted_for_other_project(self, mock_build: Any) -> None:
    """
    Failed jobs must not be counted when the configured project differs
    from the one in the log data.

    NOTE(review): this method previously reused the name
    test_failed_jobs_should_not_be_counted; if both definitions live in the
    same class, the later one silently shadows the earlier so only one ever
    runs. Renamed so both tests execute. Type annotations added for
    consistency with the rest of the module.
    """
    config_dict = {
        'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
            'your-project-here',
    }
    conf = ConfigFactory.from_dict(config_dict)

    client = MockLoggingClient(FAILURE)
    mock_build.return_value = client
    extractor = BigQueryTableUsageExtractor()
    extractor.init(
        Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

    result = extractor.extract()
    self.assertIsNone(result)
def create_bq_job(metadata_type, gcloud_project):
    """
    Build a DefaultJob that extracts BigQuery table usage for the given
    project, stages nodes/relationships as CSV under /var/tmp/amundsen,
    and publishes them to Neo4j.
    """
    tmp_folder = f'/var/tmp/amundsen/{metadata_type}'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    task = DefaultTask(extractor=BigQueryTableUsageExtractor(),
                       loader=FsNeo4jCSVLoader(),
                       transformer=BigqueryUsageTransformer())

    usage_scope = 'extractor.bigquery_table_usage'
    loader_scope = 'loader.filesystem_csv_neo4j'
    publisher_scope = 'publisher.neo4j'
    job_config = ConfigFactory.from_dict({
        f'{usage_scope}.{BigQueryTableUsageExtractor.PROJECT_ID_KEY}':
            gcloud_project,
        f'{loader_scope}.{FsNeo4jCSVLoader.NODE_DIR_PATH}':
            node_files_folder,
        f'{loader_scope}.{FsNeo4jCSVLoader.RELATION_DIR_PATH}':
            relationship_files_folder,
        f'{loader_scope}.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}':
            True,
        f'{publisher_scope}.{neo4j_csv_publisher.NODE_FILES_DIR}':
            node_files_folder,
        f'{publisher_scope}.{neo4j_csv_publisher.RELATION_FILES_DIR}':
            relationship_files_folder,
        f'{publisher_scope}.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}':
            neo4j_endpoint,
        f'{publisher_scope}.{neo4j_csv_publisher.NEO4J_USER}':
            neo4j_user,
        f'{publisher_scope}.{neo4j_csv_publisher.NEO4J_PASSWORD}':
            neo4j_password,
        # should use unique tag here like {ds}
        f'{publisher_scope}.{neo4j_csv_publisher.JOB_PUBLISH_TAG}':
            'unique_tag',
    })

    return DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
def test_not_counting_referenced_table_belonging_to_different_project(
        self, mock_build: Any) -> None:
    """
    Test result when referenced table belongs to a project different from
    the PROJECT_ID_KEY of the extractor: nothing is extracted.
    """
    config_dict = {
        f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.PROJECT_ID_KEY}':
            'your-project-here',
    }
    conf = ConfigFactory.from_dict(config_dict)

    mock_build.return_value = MockLoggingClient(CORRECT_DATA)
    extractor = BigQueryTableUsageExtractor()
    extractor.init(
        Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

    result = extractor.extract()
    # Use the unittest assertion rather than a bare assert: it survives
    # `python -O` and matches the style of the other tests in this module.
    self.assertIsNone(result)