Example #1
    def test_table_without_partitions(self, mock_build: Any) -> None:
        mock_build.return_value = MockBigQueryClient(ONE_DATASET, ONE_TABLE, None)
        extractor = BigQueryWatermarkExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=self.conf,
                                              scope=extractor.get_scope()))
        result = extractor.extract()
        self.assertIsNone(result)
Example #2
    def init(self, conf: ConfigTree) -> None:
        conf = conf.with_fallback(VerticaMetadataExtractor.DEFAULT_CONFIG)
        self._cluster = '{}'.format(conf.get_string(VerticaMetadataExtractor.CLUSTER_KEY))

        if conf.get_bool(VerticaMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME):
            cluster_source = "c.table_catalog"
        else:
            cluster_source = "'{}'".format(self._cluster)

        self._database = conf.get_string(VerticaMetadataExtractor.DATABASE_KEY, default='vertica')

        self.sql_stmt = VerticaMetadataExtractor.SQL_STATEMENT.format(
            where_clause_suffix=conf.get_string(VerticaMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY),
            cluster_source=cluster_source
        )

        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope())\
            .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        self.sql_stmt = sql_alch_conf.get_string(SQLAlchemyExtractor.EXTRACT_SQL)

        LOGGER.info('SQL for vertica metadata: {}'.format(self.sql_stmt))

        self._alchemy_extractor.init(sql_alch_conf)
        self._extract_iter: Union[None, Iterator] = None
Example #3
    def init(self, conf):
        # type: (ConfigTree) -> None
        self.conf = conf.with_fallback(PrestoTableMetadataExtractor.DEFAULT_CONFIG)
        self._database = "{}".format(
            self.conf.get_string(PrestoTableMetadataExtractor.DATABASE_KEY)
        )
        self._cluster = self.conf.get(PrestoTableMetadataExtractor.CLUSTER_KEY, None)
        LOGGER.info("Cluster name: {}".format(self._cluster))

        if self._cluster is not None:
            cluster_prefix = self._cluster + "."
        else:
            cluster_prefix = ""

        self.sql_stmt = PrestoTableMetadataExtractor.SQL_STATEMENT.format(
            cluster_prefix=cluster_prefix,
            where_clause_suffix=self.conf.get_string(
                PrestoTableMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY
            )
            or "",
        )

        LOGGER.info("SQL for presto: {}".format(self.sql_stmt))

        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alch_conf = Scoped.get_scoped_conf(
            self.conf, self._alchemy_extractor.get_scope()
        ).with_fallback(
            ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt})
        )

        self._alchemy_extractor.init(sql_alch_conf)
        self._extract_iter = None  # type: Union[None, Iterator]
Example #4
    def test_dashboard_metadata_extractor(self) -> None:

        config = ConfigFactory.from_dict({
            'extractor.tableau_dashboard_metadata.tableau_host':
            'tableau_host',
            'extractor.tableau_dashboard_metadata.api_version':
            'tableau_api_version',
            'extractor.tableau_dashboard_metadata.site_name':
            'tableau_site_name',
            'extractor.tableau_dashboard_metadata.tableau_personal_access_token_name':
            'tableau_personal_access_token_name',
            'extractor.tableau_dashboard_metadata.tableau_personal_access_token_secret':
            'tableau_personal_access_token_secret',
            'extractor.tableau_dashboard_metadata.excluded_projects': [],
            'extractor.tableau_dashboard_metadata.cluster':
            'tableau_dashboard_cluster',
            'extractor.tableau_dashboard_metadata.database':
            'tableau_dashboard_database',
            'extractor.tableau_dashboard_metadata.transformer.timestamp_str_to_epoch.timestamp_format':
            '%Y-%m-%dT%H:%M:%SZ',
        })

        extractor = TableauDashboardExtractor()
        extractor.init(
            Scoped.get_scoped_conf(conf=config, scope=extractor.get_scope()))
        record = extractor.extract()

        self.assertEqual(record.dashboard_id, 'Test Workbook')
        self.assertEqual(record.dashboard_name, 'Test Workbook')
        self.assertEqual(record.dashboard_group_id, 'Test Project')
        self.assertEqual(record.dashboard_group, 'Test Project')
        self.assertEqual(record.product, 'tableau')
        self.assertEqual(record.cluster, 'tableau_dashboard_cluster')
        self.assertEqual(record.created_timestamp, 1586323921)
Example #5
    def test_loading_with_different_object(self):
        # type: () -> None
        """
        Test Loading functionality with a python Dict object
        """
        loader = FSElasticsearchJSONLoader()
        loader.init(conf=Scoped.get_scoped_conf(conf=self.conf,
                                                scope=loader.get_scope()))

        data = dict(database='test_database',
                    cluster='test_cluster',
                    schema_name='test_schema',
                    table_name='test_table',
                    table_key='test_table_key',
                    table_last_updated_epoch=123456789,
                    table_description='test_description',
                    column_names=['test_col1', 'test_col2'],
                    column_descriptions=['test_comment1', 'test_comment2'],
                    total_usage=10,
                    unique_usage=5,
                    tag_names=['test_tag1', 'test_tag2'])

        with self.assertRaises(Exception) as context:
            loader.load(data)  # type: ignore
        self.assertTrue(
            "Record not of type 'ElasticsearchDocument'!" in str(context.exception))

        loader.close()
Example #6
    def test_key_path(self, mock_build):
        """
        Test key_path can be used
        """

        with tempfile.NamedTemporaryFile() as keyfile:
            # There are many github scanners looking for API / cloud keys, so in order not to get a
            # false positive triggering everywhere, I base64 encoded the key.
            # This is written to a tempfile as part of this test and then used.
            keyfile.write(base64.b64decode(KEYFILE_DATA))
            keyfile.flush()
            config_dict = {
                'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
                    'your-project-here',
                'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.KEY_PATH_KEY):
                    keyfile.name,
            }
            conf = ConfigFactory.from_dict(config_dict)

            mock_build.return_value = MockLoggingClient(CORRECT_DATA)
            extractor = BigQueryTableUsageExtractor()
            extractor.init(Scoped.get_scoped_conf(conf=conf,
                                                  scope=extractor.get_scope()))

            args, kwargs = mock_build.call_args
            creds = kwargs['http'].credentials
            self.assertEqual(creds.project_id, 'your-project-here')
            self.assertEqual(creds.service_account_email, '*****@*****.**')
Example #7
    def init(self, conf: ConfigTree) -> None:
        conf = conf.with_fallback(
            SnowflakeTableLastUpdatedExtractor.DEFAULT_CONFIG)

        if conf.get_bool(SnowflakeTableLastUpdatedExtractor.
                         USE_CATALOG_AS_CLUSTER_NAME):
            cluster_source = "t.table_catalog"
        else:
            cluster_source = "'{}'".format(
                conf.get_string(
                    SnowflakeTableLastUpdatedExtractor.CLUSTER_KEY))

        self._database = conf.get_string(
            SnowflakeTableLastUpdatedExtractor.DATABASE_KEY)
        self._snowflake_database = conf.get_string(
            SnowflakeTableLastUpdatedExtractor.SNOWFLAKE_DATABASE_KEY)

        self.sql_stmt = SnowflakeTableLastUpdatedExtractor.SQL_STATEMENT.format(
            where_clause_suffix=conf.get_string(
                SnowflakeTableLastUpdatedExtractor.WHERE_CLAUSE_SUFFIX_KEY),
            cluster_source=cluster_source,
            database=self._snowflake_database)

        LOGGER.info(
            'SQL for snowflake table last updated timestamp: {}'.format(
                self.sql_stmt))

        # use an sql_alchemy_extractor to execute sql
        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope()) \
            .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        self._alchemy_extractor.init(sql_alch_conf)
        self._extract_iter: Union[None, Iterator] = None
Example #8
    def test_table_part_of_table_date_range(self, mock_build):
        mock_build.return_value = MockBigQueryClient(ONE_DATASET,
                                                     TABLE_DATE_RANGE, None)
        extractor = BigQueryWatermarkExtractor()
        extractor.init(
            Scoped.get_scoped_conf(conf=self.conf,
                                   scope=extractor.get_scope()))

        result = extractor.extract()
        self.assertEqual(result.part_type, 'low_watermark')
        self.assertEqual(result.database, 'bigquery')
        self.assertEqual(result.schema, 'fdgdfgh')
        self.assertEqual(result.table, 'date_range_')
        self.assertEqual(result.cluster, 'your-project-here')
        self.assertEqual(
            result.create_time,
            datetime.fromtimestamp(1557577779).strftime('%Y-%m-%d %H:%M:%S'))
        self.assertEqual(result.parts, [('__table__', '20190101')])

        # the second extract() call yields the matching high watermark
        result = extractor.extract()
        self.assertEqual(result.part_type, 'high_watermark')
        self.assertEqual(result.database, 'bigquery')
        self.assertEqual(result.schema, 'fdgdfgh')
        self.assertEqual(result.table, 'date_range_')
        self.assertEqual(result.cluster, 'your-project-here')
        self.assertEqual(
            result.create_time,
            datetime.fromtimestamp(1557577779).strftime('%Y-%m-%d %H:%M:%S'))
        self.assertEqual(result.parts, [('__table__', '20190102')])
Example #9
    def test_publish_with_data_and_old_index(self) -> None:
        """
        Test Publish functionality with data and with old_index in place
        """
        mock_data = json.dumps({'KEY_DOESNOT_MATTER': 'NO_VALUE',
                                'KEY_DOESNOT_MATTER2': 'NO_VALUE2'})
        self.mock_es_client.indices.get_alias.return_value = {'test_old_index': 'DOES_NOT_MATTER'}

        with patch('builtins.open', mock_open(read_data=mock_data)) as mock_file:
            publisher = ElasticsearchPublisher()
            publisher.init(conf=Scoped.get_scoped_conf(conf=self.conf,
                                                       scope=publisher.get_scope()))

            # assert mock was called with test_file_path and test_file_mode
            mock_file.assert_called_once_with(self.test_file_path, self.test_file_mode)

            publisher.publish()
            # ensure indices create endpoint was called
            default_mapping = ElasticsearchPublisher.DEFAULT_ELASTICSEARCH_INDEX_MAPPING
            self.mock_es_client.indices.create.assert_called_once_with(index=self.test_es_new_index,
                                                                       body=default_mapping)

            # bulk endpoint called once
            self.mock_es_client.bulk.assert_called_once_with(
                [{'index': {'_index': self.test_es_new_index}},
                 {'KEY_DOESNOT_MATTER': 'NO_VALUE',
                  'KEY_DOESNOT_MATTER2': 'NO_VALUE2',
                  'resource_type': 'test_doc_type'}]
            )

            # update alias endpoint called once
            self.mock_es_client.indices.update_aliases.assert_called_once_with(
                {'actions': [{"add": {"index": self.test_es_new_index, "alias": self.test_es_alias}},
                             {"remove_index": {"index": 'test_old_index'}}]}
            )
Example #10
    def _get_non_partitioned_table_sql_alchemy_extractor(self):
        # type: () -> Extractor
        """
        Getting an SQLAlchemy extractor that extracts storage location for non-partitioned table for further probing
        last updated timestamp

        :return: SQLAlchemyExtractor
        """
        if HiveTableLastUpdatedExtractor.NON_PARTITIONED_TABLE_WHERE_CLAUSE_SUFFIX_KEY in self._conf:
            where_clause_suffix = """
            {}
            AND {}
            """.format(
                self._conf.get_string(
                    HiveTableLastUpdatedExtractor.
                    NON_PARTITIONED_TABLE_WHERE_CLAUSE_SUFFIX_KEY),
                HiveTableLastUpdatedExtractor.ADDTIONAL_WHERE_CLAUSE)
        else:
            where_clause_suffix = 'WHERE {}'.format(
                HiveTableLastUpdatedExtractor.ADDTIONAL_WHERE_CLAUSE)

        sql_stmt = HiveTableLastUpdatedExtractor.NON_PARTITIONED_TABLE_SQL_STATEMENT.format(
            where_clause_suffix=where_clause_suffix)

        LOGGER.info(
            'SQL for non-partitioned table against Hive metastore: {}'.format(
                sql_stmt))

        sql_alchemy_extractor = SQLAlchemyExtractor()
        sql_alchemy_conf = Scoped.get_scoped_conf(self._conf, sql_alchemy_extractor.get_scope()) \
            .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: sql_stmt}))
        sql_alchemy_extractor.init(sql_alchemy_conf)
        return sql_alchemy_extractor
Example #11
    def test_table_with_field_partitions(self, mock_build):
        mock_build.return_value = MockBigQueryClient(
            ONE_DATASET, TIME_PARTITIONED_WITH_FIELD, PARTITION_DATA)
        extractor = BigQueryWatermarkExtractor()
        extractor.init(
            Scoped.get_scoped_conf(conf=self.conf,
                                   scope=extractor.get_scope()))
        result = extractor.extract()
        self.assertEqual(result.part_type, 'low_watermark')
        self.assertEqual(result.database, 'bigquery')
        self.assertEqual(result.schema, 'fdgdfgh')
        self.assertEqual(result.table, 'other')
        self.assertEqual(result.cluster, 'your-project-here')
        self.assertEqual(
            result.create_time,
            datetime.fromtimestamp(1547512241).strftime('%Y-%m-%d %H:%M:%S'))
        self.assertEqual(result.parts, [('processed_date', '20180802')])

        # the second extract() call yields the matching high watermark
        result = extractor.extract()
        self.assertEqual(result.part_type, 'high_watermark')
        self.assertEqual(result.database, 'bigquery')
        self.assertEqual(result.schema, 'fdgdfgh')
        self.assertEqual(result.table, 'other')
        self.assertEqual(result.cluster, 'your-project-here')
        self.assertEqual(
            result.create_time,
            datetime.fromtimestamp(1547512241).strftime('%Y-%m-%d %H:%M:%S'))
        self.assertEqual(result.parts, [('processed_date', '20180804')])
Example #12
    def test_extraction_one_object(self, mock_salesforce: Any) -> None:
        mock_salesforce.return_value = MockSalesForce()
        config_dict: Dict = {
            f"extractor.salesforce_metadata.{SalesForceExtractor.OBJECT_NAMES_KEY}": [
                "Account"
            ],
            **self.config,
        }
        conf = ConfigFactory.from_dict(config_dict)

        mock_salesforce.return_value = MockSalesForce()
        extractor = SalesForceExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))
        result = extractor.extract()
        self.assertIsInstance(result, TableMetadata)

        expected = TableMetadata(
            "salesforce",
            "gold",
            "default",
            "Account",
            None,
            [
                ColumnMetadata("Id", "The Account Id", "id", 0, []),
                ColumnMetadata("isDeleted", "Deleted?", "bool", 1, []),
            ],
            False,
            [],
        )

        self.assertEqual(expected.__repr__(), result.__repr__())

        self.assertIsNone(extractor.extract())
Example #13
    def init(self, conf: ConfigTree) -> None:
        self._conf = conf
        self.query = """query {
          workbooks {
            name
            projectName
            upstreamTables {
              name
              schema
              database {
                name
                connectionType
              }
            }
          }
        }"""
        self._extractor = self._build_extractor()

        transformers = []
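        # DictToModel converts each workbook dict returned by the GraphQL query
        # into a databuilder DashboardTable model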
        dict_to_model_transformer = DictToModel()
        dict_to_model_transformer.init(conf=Scoped.get_scoped_conf(
            self._conf, dict_to_model_transformer.get_scope()
        ).with_fallback(
            ConfigFactory.from_dict({
                MODEL_CLASS:
                'databuilder.models.dashboard.dashboard_table.DashboardTable'
            })))
        transformers.append(dict_to_model_transformer)
        self._transformer = ChainedTransformer(transformers=transformers)
Example #14
    def test_extraction_with_model_class(self) -> None:
        """
        Test Extraction using model class
        """
        config_dict = {
            f'extractor.csv.{CsvExtractor.FILE_LOCATION}':
            'example/sample_data/sample_table.csv',
            'extractor.csv.model_class':
            'databuilder.models.table_metadata.TableMetadata',
        }
        self.conf = ConfigFactory.from_dict(config_dict)
        extractor = CsvExtractor()
        extractor.init(
            Scoped.get_scoped_conf(conf=self.conf,
                                   scope=extractor.get_scope()))

        result = extractor.extract()
        self.assertEqual(result.name, 'test_table1')
        self.assertEqual(result.description.text, '1st test table')
        self.assertEqual(result.database, 'hive')
        self.assertEqual(result.cluster, 'gold')
        self.assertEqual(result.schema, 'test_schema')
        self.assertEqual(result.tags, ['tag1', 'tag2'])
        self.assertEqual(result.is_view, 'false')

        result2 = extractor.extract()
        self.assertEqual(result2.name, 'test_table2')
        self.assertEqual(result2.is_view, 'false')

        result3 = extractor.extract()
        self.assertEqual(result3.name, 'test_view1')
        self.assertEqual(result3.is_view, 'true')
Example #15
    def test_amundsen_dataset_key(self) -> None:
        """
        Test _amundsen_dataset_key method
        """
        config_dict = {
            f'extractor.openlineage_tablelineage.{OpenLineageTableLineageExtractor.TABLE_LINEAGE_FILE_LOCATION}':
            'example/sample_data/openlineage/sample_openlineage_events.ndjson',
            f'extractor.openlineage_tablelineage.{OpenLineageTableLineageExtractor.CLUSTER_NAME}':
            'datalab',
        }
        self.conf = ConfigFactory.from_dict(config_dict)
        extractor = OpenLineageTableLineageExtractor()
        extractor.init(
            Scoped.get_scoped_conf(conf=self.conf,
                                   scope=extractor.get_scope()))

        mock_dataset = {
            'name': 'mock_table',
            'namespace': 'postgresql',
            'database': 'testdb'
        }

        self.assertEqual('postgresql://datalab.testdb/mock_table',
                         extractor._amundsen_dataset_key(mock_dataset))
        extractor.ol_namespace_override = 'hive'
        self.assertEqual('hive://datalab.testdb/mock_table',
                         extractor._amundsen_dataset_key(mock_dataset))
Example #16
    def init(self, conf):
        # type: (ConfigTree) -> None
        conf = Scoped.get_scoped_conf(conf, self.get_scope()) \
            .with_fallback(conf) \
            .with_fallback(DEFAULT_CONFIG)
        self.target_nodes = set(conf.get_list(TARGET_NODES))
        self.target_relations = set(conf.get_list(TARGET_RELATIONS))
        self.batch_size = conf.get_int(BATCH_SIZE)
        self.dry_run = conf.get_bool(DRY_RUN)
        self.staleness_pct = conf.get_int(STALENESS_MAX_PCT)
        self.staleness_pct_dict = conf.get(STALENESS_PCT_MAX_DICT)

        if JOB_PUBLISH_TAG in conf and MS_TO_EXPIRE in conf:
            raise Exception('Cannot have both {} and {} in job config'.format(
                JOB_PUBLISH_TAG, MS_TO_EXPIRE))

        self.ms_to_expire = None
        if MS_TO_EXPIRE in conf:
            self.ms_to_expire = conf.get_int(MS_TO_EXPIRE)
            if self.ms_to_expire < conf.get_int(MIN_MS_TO_EXPIRE):
                raise Exception('{} is too small'.format(MS_TO_EXPIRE))
            self.marker = '(timestamp() - {})'.format(
                conf.get_int(MS_TO_EXPIRE))
        else:
            self.marker = conf.get_string(JOB_PUBLISH_TAG)

        self._driver = \
            GraphDatabase.driver(conf.get_string(NEO4J_END_POINT_KEY),
                                 max_connection_life_time=conf.get_int(NEO4J_MAX_CONN_LIFE_TIME_SEC),
                                 auth=(conf.get_string(NEO4J_USER), conf.get_string(NEO4J_PASSWORD)))
Example #17
    def test_basic_extraction(self, mock_build):
        """
        Test Extraction using mock class
        """
        config_dict = {
            'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
                'your-project-here',
        }
        conf = ConfigFactory.from_dict(config_dict)

        mock_build.return_value = MockLoggingClient(CORRECT_DATA)
        extractor = BigQueryTableUsageExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=conf,
                                              scope=extractor.get_scope()))
        result = extractor.extract()
        self.assertIsInstance(result, tuple)

        (key, value) = result
        self.assertIsInstance(key, TableColumnUsageTuple)
        self.assertIsInstance(value, int)

        self.assertEqual(key.database, 'bigquery')
        self.assertEqual(key.cluster, 'bigquery-public-data')
        self.assertEqual(key.schema, 'austin_incidents')
        self.assertEqual(key.table, 'incidents_2008')
        self.assertEqual(key.email, '*****@*****.**')
        self.assertEqual(value, 1)
Example #18
    def _get_extractor(self, index_names: List[str]) -> Any:
        extractor = ElasticsearchWatermarkExtractor()
        extractor.init(
            Scoped.get_scoped_conf(conf=self._get_config(index_names),
                                   scope=extractor.get_scope()))

        return extractor
Example #19
    def test_timestamp_pagesize_settings(self, mock_build):
        """
        Test timestamp and pagesize can be set
        """
        TIMESTAMP = '2019-01-01T00:00:00.00Z'
        PAGESIZE = 215

        config_dict = {
            'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
                'your-project-here',
            'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.TIMESTAMP_KEY):
                TIMESTAMP,
            'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PAGE_SIZE_KEY):
                PAGESIZE,
        }
        conf = ConfigFactory.from_dict(config_dict)

        client = MockLoggingClient(CORRECT_DATA)
        mock_build.return_value = client
        extractor = BigQueryTableUsageExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=conf,
                                              scope=extractor.get_scope()))

        args, kwargs = client.b.list.call_args
        body = kwargs['body']

        self.assertEqual(body['pageSize'], PAGESIZE)
        self.assertIn(TIMESTAMP, body['filter'])
Example #20
    def init(self, conf):
        conf = conf.with_fallback(self.DEFAULT_CONFIG)

        self._cluster = "{}".format(conf.get_string(self.CLUSTER_KEY))

        self._database = conf.get_string(self.DATABASE_KEY)

        self.sql_stmt = self._get_sql_statement(
            use_catalog_as_cluster_name=conf.get_bool(
                self.USE_CATALOG_AS_CLUSTER_NAME),
            where_clause_suffix=conf.get_string(self.WHERE_CLAUSE_SUFFIX_KEY),
        )

        self._alchemy_extractor = SQLAlchemyExtractor()

        sql_alch_conf = Scoped.get_scoped_conf(
            conf, SQLALCHEMY_ENGINE_SCOPE).with_fallback(
                ConfigFactory.from_dict(
                    {SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        self.sql_stmt = sql_alch_conf.get_string(
            SQLAlchemyExtractor.EXTRACT_SQL)

        LOGGER.info("SQL for postgres metadata: %s", self.sql_stmt)

        self._alchemy_extractor.init(sql_alch_conf)
        self._extract_iter: Union[None, Iterator] = None
Example #21
    def init(self, conf: ConfigTree) -> None:
        self._conf = conf
        self.query = """query externalTables($externalTableTypes: [String]) {
          databases (filter: {connectionTypeWithin: $externalTableTypes}) {
            name
            connectionType
            description
            tables {
                name
            }
          }
        }"""
        self.query_variables = {
            'externalTableTypes':
            self._conf.get_list(
                TableauDashboardExternalTableExtractor.EXTERNAL_TABLE_TYPES)
        }
        self._extractor = self._build_extractor()

        transformers = []
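        # DictToModel converts each external-table dict returned by the GraphQL
        # query into a databuilder TableMetadata model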
        dict_to_model_transformer = DictToModel()
        dict_to_model_transformer.init(conf=Scoped.get_scoped_conf(
            self._conf, dict_to_model_transformer.get_scope()).with_fallback(
                ConfigFactory.from_dict({
                    MODEL_CLASS:
                    'databuilder.models.table_metadata.TableMetadata'
                })))
        transformers.append(dict_to_model_transformer)
        self._transformer = ChainedTransformer(transformers=transformers)
Example #22
    def test_extraction_with_model_class(self: Any, mock_method: Any) -> None:
        """
        Test Extraction using model class
        """
        config_dict = {
            'extractor.sqlalchemy.conn_string':
            'TEST_CONNECTION',
            'extractor.sqlalchemy.extract_sql':
            'SELECT 1 FROM TEST_TABLE;',
            'extractor.sqlalchemy.model_class':
            'tests.unit.extractor.test_sql_alchemy_extractor.TableMetadataResult'
        }
        self.conf = ConfigFactory.from_dict(config_dict)

        extractor = SQLAlchemyExtractor()
        extractor.results = [
            dict(database='test_database',
                 schema='test_schema',
                 name='test_table',
                 description='test_description',
                 column_name='test_column_name',
                 column_type='test_column_type',
                 column_comment='test_column_comment',
                 owner='test_owner')
        ]

        extractor.init(
            Scoped.get_scoped_conf(conf=self.conf,
                                   scope=extractor.get_scope()))

        result = extractor.extract()

        self.assertIsInstance(result, TableMetadataResult)
        self.assertEqual(result.name, 'test_table')
Example #23
    def init(self, conf: ConfigTree) -> None:
        conf = conf.with_fallback(BasePostgresMetadataExtractor.DEFAULT_CONFIG)
        self._cluster = conf.get_string(
            BasePostgresMetadataExtractor.CLUSTER_KEY)

        self._database = conf.get_string(
            BasePostgresMetadataExtractor.DATABASE_KEY, default='postgres')

        self.sql_stmt = self.get_sql_statement(
            use_catalog_as_cluster_name=conf.get_bool(
                BasePostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME),
            where_clause_suffix=conf.get_string(
                BasePostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY),
        )

        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope())\
            .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        self.sql_stmt = sql_alch_conf.get_string(
            SQLAlchemyExtractor.EXTRACT_SQL)

        LOGGER.info('SQL for postgres metadata: %s', self.sql_stmt)

        self._alchemy_extractor.init(sql_alch_conf)
        self._extract_iter: Union[None, Iterator] = None
Example #24
    def init(self, conf: ConfigTree) -> None:
        conf = conf.with_fallback(SqliteMetadataExtractor.DEFAULT_CONFIG)
        self._cluster = conf.get_string(SqliteMetadataExtractor.CLUSTER_KEY)

        self._database = conf.get_string(SqliteMetadataExtractor.DATABASE_KEY,
                                         default="sqlite")

        self.sql_stmt = SqliteMetadataExtractor.SQL_STATEMENT.format(
            where_clause_suffix=conf.get_string(
                SqliteMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY),
            cluster_source=self._cluster,
        )

        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alch_conf = Scoped.get_scoped_conf(
            conf, self._alchemy_extractor.get_scope()).with_fallback(
                ConfigFactory.from_dict(
                    {SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        self.sql_stmt = sql_alch_conf.get_string(
            SQLAlchemyExtractor.EXTRACT_SQL)

        LOGGER.info("SQL for sqlite metadata: %s", self.sql_stmt)

        self._alchemy_extractor.init(sql_alch_conf)
        self._extract_iter: Union[None, Iterator] = None
Example #25
    def test_loading_with_single_object(self):
        # type: () -> None
        """
        Test Loading functionality with single python object
        """
        loader = FSElasticsearchJSONLoader()
        loader.init(conf=Scoped.get_scoped_conf(conf=self.conf,
                                                scope=loader.get_scope()))

        data = TableESDocument(
            database='test_database',
            cluster='test_cluster',
            schema_name='test_schema',
            table_name='test_table',
            table_key='test_table_key',
            table_last_updated_epoch=123456789,
            table_description='test_description',
            column_names=['test_col1', 'test_col2'],
            column_descriptions=['test_comment1', 'test_comment2'],
            total_usage=10,
            unique_usage=5,
            tag_names=['test_tag1', 'test_tag2'])
        loader.load(data)
        loader.close()

        expected = [(
            '{"table_key": "test_table_key", "column_descriptions": ["test_comment1", "test_comment2"], '
            '"schema_name": "test_schema", "database": "test_database", "cluster": "test_cluster", '
            '"column_names": ["test_col1", "test_col2"], "table_name": "test_table", '
            '"table_last_updated_epoch": 123456789,'
            '"table_description": "test_description", "unique_usage": 5, "total_usage": 10, '
            '"tag_names": ["test_tag1", "test_tag2"]}')]

        self._check_results_helper(expected=expected)
Example #26
    def init(self, conf):
        # type: (ConfigTree) -> None
        conf = conf.with_fallback(SnowflakeMetadataExtractor.DEFAULT_CONFIG)
        self._cluster = '{}'.format(
            conf.get_string(SnowflakeMetadataExtractor.CLUSTER_KEY))

        if conf.get_bool(
                SnowflakeMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME):
            cluster_source = "c.table_catalog"
        else:
            cluster_source = "'{}'".format(self._cluster)

        self._database = conf.get_string(
            SnowflakeMetadataExtractor.DATABASE_KEY)
        if six.PY2:
            self._database = self._database.encode('utf-8', 'ignore')

        self.sql_stmt = SnowflakeMetadataExtractor.SQL_STATEMENT.format(
            where_clause_suffix=conf.get_string(
                SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY),
            cluster_source=cluster_source,
            database=self._database)

        LOGGER.info('SQL for snowflake metadata: {}'.format(self.sql_stmt))

        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope())\
            .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        self._alchemy_extractor.init(sql_alch_conf)
        self._extract_iter = None  # type: Union[None, Iterator]
Example #27
    def init(self, conf: ConfigTree) -> None:
        conf = Scoped.get_scoped_conf(conf, self.get_scope()) \
            .with_fallback(conf) \
            .with_fallback(DEFAULT_CONFIG)
        self.target_nodes = set(conf.get_list(TARGET_NODES))
        self.target_relations = set(conf.get_list(TARGET_RELATIONS))
        self.batch_size = conf.get_int(BATCH_SIZE)
        self.dry_run = conf.get_bool(DRY_RUN)
        self.staleness_pct = conf.get_int(STALENESS_MAX_PCT)
        self.staleness_pct_dict = conf.get(STALENESS_PCT_MAX_DICT)
        self.retain_data_with_no_publisher_metadata = conf.get_bool(RETAIN_DATA_WITH_NO_PUBLISHER_METADATA)

        if JOB_PUBLISH_TAG in conf and MS_TO_EXPIRE in conf:
            raise Exception(f'Cannot have both {JOB_PUBLISH_TAG} and {MS_TO_EXPIRE} in job config')

        self.ms_to_expire = None
        if MS_TO_EXPIRE in conf:
            self.ms_to_expire = conf.get_int(MS_TO_EXPIRE)
            if self.ms_to_expire < conf.get_int(MIN_MS_TO_EXPIRE):
                raise Exception(f'{MS_TO_EXPIRE} is too small')
            self.marker = self.ms_to_expire
        else:
            self.marker = conf.get_string(JOB_PUBLISH_TAG)

        trust = neo4j.TRUST_SYSTEM_CA_SIGNED_CERTIFICATES if conf.get_bool(NEO4J_VALIDATE_SSL) \
            else neo4j.TRUST_ALL_CERTIFICATES
        self._driver = \
            GraphDatabase.driver(conf.get_string(NEO4J_END_POINT_KEY),
                                 max_connection_life_time=conf.get_int(NEO4J_MAX_CONN_LIFE_TIME_SEC),
                                 auth=(conf.get_string(NEO4J_USER), conf.get_string(NEO4J_PASSWORD)),
                                 encrypted=conf.get_bool(NEO4J_ENCRYPTED),
                                 trust=trust)
Example #28
    def test_extraction_with_multiple_query_result(self):
        # type: (Any) -> None
        """
        Test Extraction with multiple result from query
        """
        with patch.object(Neo4jExtractor, '_get_driver'):
            extractor = Neo4jExtractor()
            extractor.init(
                Scoped.get_scoped_conf(conf=self.conf,
                                       scope=extractor.get_scope()))

            extractor.results = [
                'test_result1', 'test_result2', 'test_result3'
            ]

            result = extractor.extract()
            self.assertEqual(result, 'test_result1')

            result = extractor.extract()
            self.assertEqual(result, 'test_result2')

            result = extractor.extract()
            self.assertEqual(result, 'test_result3')

            # Ensure next result is None
            result = extractor.extract()
            self.assertIsNone(result)
Example #29
    def init(self, conf):
        # type: (ConfigTree) -> None
        """
        Initialize Neo4jExtractor object from configuration and use that for extraction
        """
        self.conf = conf
        self.entity = conf.get_string(Neo4jSearchDataExtractor.ENTITY_TYPE,
                                      default='table').lower()
        # extract cypher query from conf, if specified, else use default query
        if Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY in conf:
            self.cypher_query = conf.get_string(
                Neo4jSearchDataExtractor.CYPHER_QUERY_CONFIG_KEY)
        else:
            default_query = Neo4jSearchDataExtractor.DEFAULT_QUERY_BY_ENTITY[
                self.entity]
            self.cypher_query = self._add_publish_tag_filter(
                conf.get_string(JOB_PUBLISH_TAG, ''),
                cypher_query=default_query)

        self.neo4j_extractor = Neo4jExtractor()
        # write the cypher query in configs in Neo4jExtractor scope
        key = self.neo4j_extractor.get_scope() + '.' + \
            Neo4jExtractor.CYPHER_QUERY_CONFIG_KEY
        self.conf.put(key, self.cypher_query)
        # initialize neo4j_extractor from configs
        self.neo4j_extractor.init(
            Scoped.get_scoped_conf(self.conf,
                                   self.neo4j_extractor.get_scope()))
Example #30
    def test_empty_dataset(self, mock_build: Any) -> None:
        mock_build.return_value = MockBigQueryClient(ONE_DATASET, NO_TABLES, None)
        extractor = BigQueryWatermarkExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=self.conf,
                                              scope=extractor.get_scope()))
        result = extractor.extract()
        self.assertIsNone(result)