Example #1
def test_query_formatting():
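    # With format_sql_queries=True, the stored top query should be the
    # pretty-printed version of the raw SQL rather than the original one-liner.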
    test_email = "*****@*****.**"
    test_query = "select * from foo where id in (select id from bar);"
    formatted_test_query: str = "SELECT *\n  FROM foo\n WHERE id in (\n        SELECT id\n          FROM bar\n       );"
    event_time = datetime(2020, 1, 1)

    floored_ts = get_time_bucket(event_time, BucketDuration.DAY)

    resource = "test_db.test_schema.test_table"

    ta = _TestAggregatedDataset(bucket_start_time=floored_ts, resource=resource)
    ta.add_read_entry(
        test_email,
        test_query,
        [],
    )
    wu: MetadataWorkUnit = ta.make_usage_workunit(
        bucket_duration=BucketDuration.DAY,
        urn_builder=lambda x: x,
        top_n_queries=10,
        format_sql_queries=True,
    )
    assert wu.id == "2020-01-01T00:00:00-test_db.test_schema.test_table"
    assert isinstance(wu.get_metadata()["metadata"], MetadataChangeProposalWrapper)
    du: DatasetUsageStatisticsClass = wu.get_metadata()["metadata"].aspect
    assert du.totalSqlQueries == 1
    assert du.topSqlQueries
    assert du.topSqlQueries.pop() == formatted_test_query
Example #2
    def _aggregate_access_events(
        self, events: List[RedshiftJoinedAccessEvent]
    ) -> Dict[datetime, Dict[RedshiftTableRef, AggregatedDataset]]:
        datasets: Dict[datetime,
                       Dict[RedshiftTableRef,
                            AggregatedDataset]] = collections.defaultdict(dict)

        for event in events:
            floored_ts = get_time_bucket(event.starttime,
                                         self.config.bucket_duration)

            resource = f"{event.database}.{event.schema_}.{event.table}"

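            # Get a reference to this resource's aggregate in the bucket (creating it if absent).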
            agg_bucket = datasets[floored_ts].setdefault(
                resource,
                AggregatedDataset(bucket_start_time=floored_ts,
                                  resource=resource),
            )

            # Append the configured email domain to the username.
            # Due to a current limitation in the user stats UI, we need to provide an email to show users.
            username = f"{event.usename if event.usename else 'unknown'}@{self.config.email_domain}"
            logger.info(f"username: {username}")
            agg_bucket.add_read_entry(
                username,
                event.text,
                [],  # TODO: not currently supported by redshift; find column level changes
            )
        return datasets
Example #3
def test_query_trimming():
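    # The per-query character budget is derived from total_budget_for_query_list
    # and top_n_queries, so the stored query is trimmed and suffixed with " ..."
    # (see the final assert).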
    test_email: str = "*****@*****.**"
    test_query: str = "select * from test where a > 10 and b > 20 order by a asc"
    top_n_queries: int = 10
    total_budget_for_query_list: int = 200
    event_time = datetime(2020, 1, 1)
    floored_ts = get_time_bucket(event_time, BucketDuration.DAY)
    resource = "test_db.test_schema.test_table"

    ta = _TestAggregatedDataset(bucket_start_time=floored_ts, resource=resource)
    ta.total_budget_for_query_list = total_budget_for_query_list
    ta.add_read_entry(
        test_email,
        test_query,
        [],
    )
    wu: MetadataWorkUnit = ta.make_usage_workunit(
        bucket_duration=BucketDuration.DAY,
        urn_builder=lambda x: x,
        top_n_queries=top_n_queries,
        format_sql_queries=False,
    )

    assert wu.id == "2020-01-01T00:00:00-test_db.test_schema.test_table"
    assert isinstance(wu.get_metadata()["metadata"], MetadataChangeProposalWrapper)
    du: DatasetUsageStatisticsClass = wu.get_metadata()["metadata"].aspect
    assert du.totalSqlQueries == 1
    assert du.topSqlQueries
    assert du.topSqlQueries.pop() == "select * f ..."
Example #4
    def _aggregate_access_events(
        self, events_iterable: Iterable[RedshiftAccessEvent]
    ) -> AggregatedAccessEvents:
        datasets: AggregatedAccessEvents = collections.defaultdict(dict)
        for event in events_iterable:
            floored_ts: datetime = get_time_bucket(event.starttime,
                                                   self.config.bucket_duration)
            resource: str = f"{event.database}.{event.schema_}.{event.table}"
            # Get a reference to the bucket value (or initialize it if not yet in the dict) and update it.
            agg_bucket: AggregatedDataset = datasets[floored_ts].setdefault(
                resource,
                AggregatedDataset(
                    bucket_start_time=floored_ts,
                    resource=resource,
                    user_email_pattern=self.config.user_email_pattern,
                ),
            )
            # Due to a current limitation in the user stats UI, we need to provide an email to show users.
            user_email: str = event.username if event.username else "unknown"
            if "@" not in user_email:
                user_email += f"@{self.config.email_domain}"
            logger.info(f"user_email: {user_email}")
            agg_bucket.add_read_entry(
                user_email,
                event.text,
                [],  # TODO: not currently supported by redshift; find column level changes
            )
        return datasets
Example #5
def test_multiple_query_without_columns():
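    # Repeated reads should bump the per-query and per-user frequency counters;
    # no columns are recorded since every entry passes an empty column list.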
    test_email = "*****@*****.**"
    test_email2 = "*****@*****.**"
    test_query = "select * from test"
    test_query2 = "select * from test2"
    event_time = datetime(2020, 1, 1)
    floored_ts = get_time_bucket(event_time, BucketDuration.DAY)
    resource = "test_db.test_schema.test_table"

    ta = _TestAggregatedDataset(bucket_start_time=floored_ts, resource=resource)
    ta.add_read_entry(
        test_email,
        test_query,
        [],
    )
    ta.add_read_entry(
        test_email,
        test_query,
        [],
    )
    ta.add_read_entry(
        test_email2,
        test_query2,
        [],
    )

    assert ta.queryCount == 3
    assert ta.queryFreq[test_query] == 2
    assert ta.userFreq[test_email] == 2
    assert ta.queryFreq[test_query2] == 1
    assert ta.userFreq[test_email2] == 1
    assert len(ta.columnFreq) == 0
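
For reference, the counting behavior these asserts exercise can be sketched in plain Python. This is an illustrative stand-in, not the real aggregation class (which also tracks query budgets, email patterns, and workunit generation); the attribute and method names are taken from the test above.

import collections
from datetime import datetime


class SketchAggregatedDataset:
    """Minimal sketch of the bucket-level aggregate used by the tests."""

    def __init__(self, bucket_start_time: datetime, resource: str) -> None:
        self.bucket_start_time = bucket_start_time
        self.resource = resource
        self.queryCount = 0
        self.queryFreq: collections.Counter = collections.Counter()
        self.userFreq: collections.Counter = collections.Counter()
        self.columnFreq: collections.Counter = collections.Counter()

    def add_read_entry(self, user_email: str, query: str, columns: list) -> None:
        # Each read bumps the total count plus the per-query, per-user,
        # and per-column frequencies.
        self.queryCount += 1
        self.queryFreq[query] += 1
        self.userFreq[user_email] += 1
        for column in columns:
            self.columnFreq[column] += 1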
Example #6
    def _aggregate_access_events(
        self, events: Iterable[SnowflakeJoinedAccessEvent]
    ) -> Dict[datetime, Dict[SnowflakeTableRef, AggregatedDataset]]:
        datasets: Dict[datetime,
                       Dict[SnowflakeTableRef,
                            AggregatedDataset]] = collections.defaultdict(dict)

        for event in events:
            floored_ts = get_time_bucket(event.query_start_time,
                                         self.config.bucket_duration)

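            # Depending on apply_view_usage_to_tables, attribute usage to the base
            # tables behind views or to the objects accessed directly.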
            accessed_data = (event.base_objects_accessed
                             if self.config.apply_view_usage_to_tables else
                             event.direct_objects_accessed)
            for object in accessed_data:
                resource = object.objectName
                agg_bucket = datasets[floored_ts].setdefault(
                    resource,
                    AggregatedDataset(bucket_start_time=floored_ts,
                                      resource=resource),
                )
                agg_bucket.add_read_entry(
                    event.email,
                    event.query_text,
                    [colRef.columnName.lower() for colRef in object.columns]
                    if object.columns is not None else [],
                )

        return datasets
Example #7
    def _aggregate_enriched_read_events(
        self, events: Iterable[ReadEvent]
    ) -> Dict[datetime, Dict[BigQueryTableRef, AggregatedDataset]]:
        # TODO: handle partitioned tables

        # TODO: perhaps we need to continuously prune this, rather than
        # storing it all in one big object.
        datasets: Dict[datetime,
                       Dict[BigQueryTableRef,
                            AggregatedDataset]] = collections.defaultdict(dict)

        for event in events:
            floored_ts = get_time_bucket(event.timestamp,
                                         self.config.bucket_duration)
            resource = event.resource.remove_extras()

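            # Anonymous resources (e.g. query-result caches) cannot be attributed
            # to a stable table, so they are reported as dropped.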
            if resource.is_anonymous():
                self.report.report_dropped(str(resource))
                continue

            agg_bucket = datasets[floored_ts].setdefault(
                resource,
                AggregatedDataset(bucket_start_time=floored_ts,
                                  resource=resource),
            )
            agg_bucket.add_read_entry(event.actor_email, event.query,
                                      event.fieldsRead)

        return datasets
Example #8
    def _aggregate_access_events(
        self, events: List[ClickHouseJoinedAccessEvent]
    ) -> Dict[datetime, Dict[ClickHouseTableRef, AggregatedDataset]]:
        datasets: Dict[
            datetime, Dict[ClickHouseTableRef, AggregatedDataset]
        ] = collections.defaultdict(dict)

        for event in events:
            floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)

            resource = (
                f'{self.config.platform_instance+"." if self.config.platform_instance else ""}'
                f"{event.schema_}.{event.table}"
            )

            agg_bucket = datasets[floored_ts].setdefault(
                resource,
                AggregatedDataset(bucket_start_time=floored_ts, resource=resource),
            )

            # Due to a current limitation in the user stats UI, we need to provide an email to show users.
            user_email = f"{event.usename if event.usename else 'unknown'}"
            if "@" not in user_email:
                user_email += f"@{self.config.email_domain}"
            logger.info(f"user_email: {user_email}")
            agg_bucket.add_read_entry(
                user_email,
                event.query,
                event.columns,
            )
        return datasets
Example #9
def test_make_usage_workunit():
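    # The workunit id is expected to combine the bucket start time with the resource name.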
    test_email = "*****@*****.**"
    test_query = "select * from test"
    event_time = datetime(2020, 1, 1)
    floored_ts = get_time_bucket(event_time, BucketDuration.DAY)
    resource = "test_db.test_schema.test_table"

    ta = _TestAggregatedDataset(bucket_start_time=floored_ts,
                                resource=resource)
    ta.add_read_entry(
        test_email,
        test_query,
        [],
    )
    wu: MetadataWorkUnit = ta.make_usage_workunit(
        bucket_duration=BucketDuration.DAY,
        urn_builder=lambda x: x,
        top_n_queries=10)

    assert wu.id == "2020-01-01T00:00:00-test_db.test_schema.test_table"
    assert isinstance(wu.get_metadata()["metadata"],
                      MetadataChangeProposalWrapper)
    du: DatasetUsageStatisticsClass = wu.get_metadata()["metadata"].aspect
    assert du.totalSqlQueries == 1
    assert du.topSqlQueries
    assert du.topSqlQueries.pop() == test_query
Example #10
    def _aggregate_access_events(
        self, events: Iterable[SnowflakeJoinedAccessEvent]
    ) -> Iterable[Union[AggregatedAccessEvents, MetadataWorkUnit]]:
        """
        Emits aggregated access events combined with operational workunits from the events.
        """
        datasets: AggregatedAccessEvents = collections.defaultdict(dict)

        for event in events:
            floored_ts = get_time_bucket(event.query_start_time,
                                         self.config.bucket_duration)

            accessed_data = (event.base_objects_accessed
                             if self.config.apply_view_usage_to_tables else
                             event.direct_objects_accessed)
            for object in accessed_data:
                resource = object.objectName
                agg_bucket = datasets[floored_ts].setdefault(
                    resource,
                    AggregatedDataset(
                        bucket_start_time=floored_ts,
                        resource=resource,
                        user_email_pattern=self.config.user_email_pattern,
                    ),
                )
                agg_bucket.add_read_entry(
                    event.email,
                    event.query_text,
                    [colRef.columnName.lower() for colRef in object.columns]
                    if object.columns is not None else [],
                )
            if self.config.include_operational_stats:
                yield from self._get_operation_aspect_work_unit(event)

        yield datasets
Example #11
    def _aggregate_enriched_read_events(
        self, events: Iterable[ReadEvent]
    ) -> Dict[datetime, Dict[BigQueryTableRef, AggregatedDataset]]:
        # TODO: handle partitioned tables

        # TODO: perhaps we need to continuously prune this, rather than
        # storing it all in one big object.
        datasets: Dict[datetime,
                       Dict[BigQueryTableRef,
                            AggregatedDataset]] = collections.defaultdict(dict)

        num_aggregated: int = 0
        for event in events:
            floored_ts = get_time_bucket(event.timestamp,
                                         self.config.bucket_duration)
            resource: Optional[BigQueryTableRef] = None
            try:
                resource = event.resource.remove_extras()
            except Exception as e:
                self.report.report_warning(
                    str(event.resource), f"Failed to clean up resource: {e}")
                logger.warning(
                    f"Failed to process event {event.resource}: {e}")
                continue

            if resource.is_temporary_table(
                    self.config.temp_table_dataset_prefix):
                logger.debug(f"Dropping temporary table {resource}")
                self.report.report_dropped(str(resource))
                continue

            agg_bucket = datasets[floored_ts].setdefault(
                resource,
                AggregatedDataset(
                    bucket_start_time=floored_ts,
                    resource=resource,
                    user_email_pattern=self.config.user_email_pattern,
                ),
            )
            agg_bucket.add_read_entry(event.actor_email, event.query,
                                      event.fieldsRead)
            num_aggregated += 1
        logger.info(f"Total number of events aggregated = {num_aggregated}.")
        bucket_level_stats: str = "\n\t" + "\n\t".join([
            f'bucket:{db.strftime("%m-%d-%Y:%H:%M:%S")}, size={len(ads)}'
            for db, ads in datasets.items()
        ])
        logger.debug(
            f"Number of buckets created = {len(datasets)}. Per-bucket details:{bucket_level_stats}"
        )

        return datasets
Example #12
    def _aggregate_access_events(
        self, events: List[TrinoJoinedAccessEvent]
    ) -> Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]]:
        datasets: Dict[datetime,
                       Dict[TrinoTableRef,
                            AggregatedDataset]] = collections.defaultdict(dict)

        for event in events:
            floored_ts = get_time_bucket(event.starttime,
                                         self.config.bucket_duration)
            for metadata in event.accessed_metadata:

                # Skip system queries (catalog names starting with "$system@").
                if metadata.catalog_name.startswith("$system@"):
                    logging.debug(
                        f"Skipping system query for {metadata.catalog_name}..."
                    )
                    continue

                # Keep only queries against the configured catalog.
                if metadata.catalog_name != self.config.database:
                    continue

                resource = (
                    f"{metadata.catalog_name}.{metadata.schema_name}.{metadata.table}"
                )

                agg_bucket = datasets[floored_ts].setdefault(
                    resource,
                    AggregatedDataset(
                        bucket_start_time=floored_ts,
                        resource=resource,
                        user_email_pattern=self.config.user_email_pattern,
                    ),
                )

                # Append the configured email domain when the username is not already an email address.
                # Due to a current limitation in the user stats UI, we need to provide an email to show users.
                if "@" in parseaddr(event.usr)[1]:
                    username = event.usr
                else:
                    username = f"{event.usr if event.usr else 'unknown'}@{self.config.email_domain}"

                agg_bucket.add_read_entry(
                    username,
                    event.query,
                    metadata.columns,
                )
        return datasets
Example #13
def test_add_one_query_with_ignored_user():
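    # A read entry from a user matching the deny pattern should be dropped
    # entirely, leaving all counters at zero.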
    test_email = "*****@*****.**"
    test_query = "select * from test"
    event_time = datetime(2020, 1, 1)
    floored_ts = get_time_bucket(event_time, BucketDuration.DAY)
    resource = "test_db.test_schema.test_table"

    ta = _TestAggregatedDataset(
        bucket_start_time=floored_ts,
        resource=resource,
        user_email_pattern=AllowDenyPattern(deny=["*****@*****.**"]),
    )
    ta.add_read_entry(
        test_email,
        test_query,
        [],
    )

    assert ta.queryCount == 0
    assert ta.queryFreq[test_query] == 0
    assert ta.userFreq[test_email] == 0
    assert len(ta.columnFreq) == 0
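
The deny behavior above comes from AllowDenyPattern. A minimal usage sketch, assuming AllowDenyPattern from datahub.configuration.common with its regex-based allowed() check, and hypothetical addresses since the test's emails are masked:

from datahub.configuration.common import AllowDenyPattern

# Hypothetical addresses for illustration only.
pattern = AllowDenyPattern(deny=["ignored_user@example.com"])
assert not pattern.allowed("ignored_user@example.com")  # denied: read entry is dropped
assert pattern.allowed("someone_else@example.com")  # allowed: read entry is counted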