def test_query_formatting():
    """With format_sql_queries=True, the emitted top query is the pretty-printed SQL."""
    email = "*****@*****.**"
    raw_query = "select * from foo where id in (select id from bar);"
    pretty_query: str = "SELECT *\n FROM foo\n WHERE id in (\n SELECT id\n FROM bar\n );"
    occurred_at = datetime(2020, 1, 1)
    bucket_start = get_time_bucket(occurred_at, BucketDuration.DAY)
    dataset_ref = "test_db.test_schema.test_table"

    aggregated = _TestAggregatedDataset(
        bucket_start_time=bucket_start, resource=dataset_ref
    )
    aggregated.add_read_entry(email, raw_query, [])

    workunit: MetadataWorkUnit = aggregated.make_usage_workunit(
        bucket_duration=BucketDuration.DAY,
        urn_builder=lambda x: x,
        top_n_queries=10,
        format_sql_queries=True,
    )

    assert workunit.id == "2020-01-01T00:00:00-test_db.test_schema.test_table"
    assert isinstance(
        workunit.get_metadata()["metadata"], MetadataChangeProposalWrapper
    )
    usage_stats: DatasetUsageStatisticsClass = workunit.get_metadata()[
        "metadata"
    ].aspect
    assert usage_stats.totalSqlQueries == 1
    assert usage_stats.topSqlQueries
    assert usage_stats.topSqlQueries.pop() == pretty_query
def _aggregate_access_events(
    self, events: List[RedshiftJoinedAccessEvent]
) -> Dict[datetime, Dict[RedshiftTableRef, AggregatedDataset]]:
    """Bucket Redshift access events by floored timestamp and table.

    Each event contributes one read entry (user + query text) to the
    ``AggregatedDataset`` for its ``db.schema.table`` resource within the
    time bucket derived from ``event.starttime``.

    :param events: joined access events pulled from Redshift system tables.
    :return: mapping of bucket start time -> {table ref -> aggregated stats}.
    """
    datasets: Dict[
        datetime, Dict[RedshiftTableRef, AggregatedDataset]
    ] = collections.defaultdict(dict)

    for event in events:
        floored_ts = get_time_bucket(event.starttime, self.config.bucket_duration)
        resource = f"{event.database}.{event.schema_}.{event.table}"
        agg_bucket = datasets[floored_ts].setdefault(
            resource,
            AggregatedDataset(bucket_start_time=floored_ts, resource=resource),
        )

        # Current limitation in the user stats UI: we must provide an email to
        # show users. FIX: only append the configured domain when the username
        # is not already an email address — previously a value like
        # "user@corp.com" became "user@corp.com@email_domain" (the newer
        # connector variants already guard on "@").
        username = event.usename if event.usename else "unknown"
        if "@" not in username:
            username += f"@{self.config.email_domain}"
        # Lazy %-style args avoid building the string when INFO is disabled.
        logger.info("username: %s", username)

        agg_bucket.add_read_entry(
            username,
            event.text,
            [],  # TODO: not currently supported by redshift; find column level changes
        )
    return datasets
def test_query_trimming():
    """A query longer than the query-list budget is truncated and suffixed with ' ...'."""
    email: str = "*****@*****.**"
    long_query: str = "select * from test where a > 10 and b > 20 order by a asc"
    top_n_queries: int = 10
    budget: int = 200
    occurred_at = datetime(2020, 1, 1)
    bucket_start = get_time_bucket(occurred_at, BucketDuration.DAY)
    dataset_ref = "test_db.test_schema.test_table"

    aggregated = _TestAggregatedDataset(
        bucket_start_time=bucket_start, resource=dataset_ref
    )
    # Shrink the budget so trimming kicks in for this short query.
    aggregated.total_budget_for_query_list = budget
    aggregated.add_read_entry(email, long_query, [])

    workunit: MetadataWorkUnit = aggregated.make_usage_workunit(
        bucket_duration=BucketDuration.DAY,
        urn_builder=lambda x: x,
        top_n_queries=top_n_queries,
        format_sql_queries=False,
    )

    assert workunit.id == "2020-01-01T00:00:00-test_db.test_schema.test_table"
    assert isinstance(
        workunit.get_metadata()["metadata"], MetadataChangeProposalWrapper
    )
    usage_stats: DatasetUsageStatisticsClass = workunit.get_metadata()[
        "metadata"
    ].aspect
    assert usage_stats.totalSqlQueries == 1
    assert usage_stats.topSqlQueries
    assert usage_stats.topSqlQueries.pop() == "select * f ..."
def _aggregate_access_events(
    self, events_iterable: Iterable[RedshiftAccessEvent]
) -> AggregatedAccessEvents:
    """Fold raw Redshift access events into per-bucket, per-table aggregates.

    Each event adds one read entry to the ``AggregatedDataset`` keyed by its
    ``db.schema.table`` resource inside the floored time bucket.
    """
    buckets: AggregatedAccessEvents = collections.defaultdict(dict)
    for event in events_iterable:
        window_start: datetime = get_time_bucket(
            event.starttime, self.config.bucket_duration
        )
        table_ref: str = f"{event.database}.{event.schema_}.{event.table}"
        # Get a reference to the bucket value (or initialize it if not yet present).
        dataset_agg: AggregatedDataset = buckets[window_start].setdefault(
            table_ref,
            AggregatedDataset(
                bucket_start_time=window_start,
                resource=table_ref,
                user_email_pattern=self.config.user_email_pattern,
            ),
        )
        # Current limitation in the user stats UI: we must provide an email to
        # show users, so append the configured domain to bare usernames.
        user_email: str = f"{event.username if event.username else 'unknown'}"
        if "@" not in user_email:
            user_email += f"@{self.config.email_domain}"
        logger.info(f"user_email: {user_email}")
        dataset_agg.add_read_entry(
            user_email,
            event.text,
            [],  # TODO: not currently supported by redshift; find column level changes
        )
    return buckets
def test_multiple_query_without_columns():
    """Repeated reads bump per-query/per-user frequency counters; no columns recorded."""
    first_user = "*****@*****.**"
    second_user = "*****@*****.**"
    first_query = "select * from test"
    second_query = "select * from test2"
    occurred_at = datetime(2020, 1, 1)
    bucket_start = get_time_bucket(occurred_at, BucketDuration.DAY)
    dataset_ref = "test_db.test_schema.test_table"

    aggregated = _TestAggregatedDataset(
        bucket_start_time=bucket_start, resource=dataset_ref
    )
    # Two reads of the same query by the same user, then one by a second user.
    aggregated.add_read_entry(first_user, first_query, [])
    aggregated.add_read_entry(first_user, first_query, [])
    aggregated.add_read_entry(second_user, second_query, [])

    assert aggregated.queryCount == 3
    assert aggregated.queryFreq[first_query] == 2
    assert aggregated.userFreq[first_user] == 2
    assert aggregated.queryFreq[second_query] == 1
    assert aggregated.userFreq[second_user] == 1
    assert len(aggregated.columnFreq) == 0
def _aggregate_access_events(
    self, events: Iterable[SnowflakeJoinedAccessEvent]
) -> Dict[datetime, Dict[SnowflakeTableRef, AggregatedDataset]]:
    """Bucket Snowflake access events by time window and accessed table.

    A single query event may touch several objects; each accessed object
    gets its own read entry (with lower-cased column names when the access
    metadata reports columns) in the bucket keyed by the query's floored
    start time.

    :param events: joined Snowflake access-history events.
    :return: mapping of bucket start time -> {table ref -> aggregated stats}.
    """
    datasets: Dict[
        datetime, Dict[SnowflakeTableRef, AggregatedDataset]
    ] = collections.defaultdict(dict)
    for event in events:
        floored_ts = get_time_bucket(
            event.query_start_time, self.config.bucket_duration
        )
        # Optionally attribute view usage back to the underlying base tables.
        accessed_data = (
            event.base_objects_accessed
            if self.config.apply_view_usage_to_tables
            else event.direct_objects_accessed
        )
        # FIX: loop variable renamed from `object`, which shadowed the builtin.
        for accessed_object in accessed_data:
            resource = accessed_object.objectName
            agg_bucket = datasets[floored_ts].setdefault(
                resource,
                AggregatedDataset(bucket_start_time=floored_ts, resource=resource),
            )
            agg_bucket.add_read_entry(
                event.email,
                event.query_text,
                [colRef.columnName.lower() for colRef in accessed_object.columns]
                if accessed_object.columns is not None
                else [],
            )
    return datasets
def _aggregate_enriched_read_events(
    self, events: Iterable[ReadEvent]
) -> Dict[datetime, Dict[BigQueryTableRef, AggregatedDataset]]:
    """Fold BigQuery read events into per-bucket, per-table aggregates.

    Anonymous resources are dropped (and reported); everything else adds a
    read entry (actor email, query, fields read) to its bucket.
    """
    # TODO: handle partitioned tables
    # TODO: perhaps we need to continuously prune this, rather than
    # storing it all in one big object.
    buckets: Dict[
        datetime, Dict[BigQueryTableRef, AggregatedDataset]
    ] = collections.defaultdict(dict)
    for read_event in events:
        window_start = get_time_bucket(
            read_event.timestamp, self.config.bucket_duration
        )
        table_ref = read_event.resource.remove_extras()
        if table_ref.is_anonymous():
            self.report.report_dropped(str(table_ref))
            continue
        dataset_agg = buckets[window_start].setdefault(
            table_ref,
            AggregatedDataset(bucket_start_time=window_start, resource=table_ref),
        )
        dataset_agg.add_read_entry(
            read_event.actor_email, read_event.query, read_event.fieldsRead
        )
    return buckets
def _aggregate_access_events(
    self, events: List[ClickHouseJoinedAccessEvent]
) -> Dict[datetime, Dict[ClickHouseTableRef, AggregatedDataset]]:
    """Fold ClickHouse access events into per-bucket, per-table aggregates.

    The resource key is ``[platform_instance.]schema.table``; each event adds
    one read entry (user email, query, columns) to its time bucket.
    """
    buckets: Dict[
        datetime, Dict[ClickHouseTableRef, AggregatedDataset]
    ] = collections.defaultdict(dict)
    for event in events:
        window_start = get_time_bucket(event.starttime, self.config.bucket_duration)
        # Prefix the table ref with the platform instance when one is configured.
        instance_prefix = (
            f"{self.config.platform_instance}."
            if self.config.platform_instance
            else ""
        )
        table_ref = f"{instance_prefix}{event.schema_}.{event.table}"
        dataset_agg = buckets[window_start].setdefault(
            table_ref,
            AggregatedDataset(bucket_start_time=window_start, resource=table_ref),
        )
        # Current limitation in the user stats UI: we must provide an email to
        # show users, so append the configured domain to bare usernames.
        user_email = f"{event.usename if event.usename else 'unknown'}"
        if "@" not in user_email:
            user_email += f"@{self.config.email_domain}"
        logger.info(f"user_email: {user_email}")
        dataset_agg.add_read_entry(user_email, event.query, event.columns)
    return buckets
def test_make_usage_workunit():
    """A single read entry yields one workunit with the raw query as top query."""
    email = "*****@*****.**"
    raw_query = "select * from test"
    occurred_at = datetime(2020, 1, 1)
    bucket_start = get_time_bucket(occurred_at, BucketDuration.DAY)
    dataset_ref = "test_db.test_schema.test_table"

    aggregated = _TestAggregatedDataset(
        bucket_start_time=bucket_start, resource=dataset_ref
    )
    aggregated.add_read_entry(email, raw_query, [])

    workunit: MetadataWorkUnit = aggregated.make_usage_workunit(
        bucket_duration=BucketDuration.DAY,
        urn_builder=lambda x: x,
        top_n_queries=10,
    )

    assert workunit.id == "2020-01-01T00:00:00-test_db.test_schema.test_table"
    assert isinstance(
        workunit.get_metadata()["metadata"], MetadataChangeProposalWrapper
    )
    usage_stats: DatasetUsageStatisticsClass = workunit.get_metadata()[
        "metadata"
    ].aspect
    assert usage_stats.totalSqlQueries == 1
    assert usage_stats.topSqlQueries
    assert usage_stats.topSqlQueries.pop() == raw_query
def _aggregate_access_events(
    self, events: Iterable[SnowflakeJoinedAccessEvent]
) -> Iterable[Union[AggregatedAccessEvents, MetadataWorkUnit]]:
    """
    Emits aggregated access events combined with operational workunits from the events.

    For each event, every accessed object (base objects when
    ``apply_view_usage_to_tables`` is set, direct objects otherwise) gets a
    read entry in its time bucket. When ``include_operational_stats`` is on,
    operational workunits are yielded per event as they are processed; the
    full aggregate mapping is yielded last.
    """
    datasets: AggregatedAccessEvents = collections.defaultdict(dict)
    for event in events:
        floored_ts = get_time_bucket(
            event.query_start_time, self.config.bucket_duration
        )
        accessed_data = (
            event.base_objects_accessed
            if self.config.apply_view_usage_to_tables
            else event.direct_objects_accessed
        )
        # FIX: loop variable renamed from `object`, which shadowed the builtin.
        for accessed_object in accessed_data:
            resource = accessed_object.objectName
            agg_bucket = datasets[floored_ts].setdefault(
                resource,
                AggregatedDataset(
                    bucket_start_time=floored_ts,
                    resource=resource,
                    user_email_pattern=self.config.user_email_pattern,
                ),
            )
            agg_bucket.add_read_entry(
                event.email,
                event.query_text,
                [colRef.columnName.lower() for colRef in accessed_object.columns]
                if accessed_object.columns is not None
                else [],
            )
        if self.config.include_operational_stats:
            yield from self._get_operation_aspect_work_unit(event)
    yield datasets
def _aggregate_enriched_read_events(
    self, events: Iterable[ReadEvent]
) -> Dict[datetime, Dict[BigQueryTableRef, AggregatedDataset]]:
    """Aggregate enriched BigQuery read events into per-bucket, per-table stats.

    Events whose resource cannot be cleaned up are reported and skipped, as
    are temporary tables matching ``temp_table_dataset_prefix``. Per-bucket
    sizes are logged at DEBUG for diagnostics.

    :param events: enriched read events from BigQuery audit logs.
    :return: mapping of bucket start time -> {table ref -> aggregated stats}.
    """
    # TODO: handle partitioned tables
    # TODO: perhaps we need to continuously prune this, rather than
    # storing it all in one big object.
    datasets: Dict[
        datetime, Dict[BigQueryTableRef, AggregatedDataset]
    ] = collections.defaultdict(dict)
    num_aggregated: int = 0
    for event in events:
        floored_ts = get_time_bucket(event.timestamp, self.config.bucket_duration)
        resource: Optional[BigQueryTableRef] = None
        try:
            resource = event.resource.remove_extras()
        except Exception as e:
            self.report.report_warning(
                str(event.resource), f"Failed to clean up resource, {e}"
            )
            # BUG FIX: the previous call passed `e` as a stray positional
            # argument to logger.warning with no matching %-placeholder,
            # which triggers a formatting error inside the logging module.
            # Attach the exception via exc_info instead.
            logger.warning(
                "Failed to process event %s", str(event.resource), exc_info=e
            )
            continue
        if resource.is_temporary_table(self.config.temp_table_dataset_prefix):
            logger.debug(f"Dropping temporary table {resource}")
            self.report.report_dropped(str(resource))
            continue
        agg_bucket = datasets[floored_ts].setdefault(
            resource,
            AggregatedDataset(
                bucket_start_time=floored_ts,
                resource=resource,
                user_email_pattern=self.config.user_email_pattern,
            ),
        )
        agg_bucket.add_read_entry(event.actor_email, event.query, event.fieldsRead)
        num_aggregated += 1
    logger.info(f"Total number of events aggregated = {num_aggregated}.")
    bucket_level_stats: str = "\n\t" + "\n\t".join(
        [
            f'bucket:{db.strftime("%m-%d-%Y:%H:%M:%S")}, size={len(ads)}'
            for db, ads in datasets.items()
        ]
    )
    logger.debug(
        f"Number of buckets created = {len(datasets)}. Per-bucket details:{bucket_level_stats}"
    )
    return datasets
def _aggregate_access_events(
    self, events: List[TrinoJoinedAccessEvent]
) -> Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]]:
    """Fold Trino access events into per-bucket, per-table aggregates.

    System queries (catalog names starting with ``$system@``) and catalogs
    other than the configured database are skipped.
    """
    buckets: Dict[
        datetime, Dict[TrinoTableRef, AggregatedDataset]
    ] = collections.defaultdict(dict)
    for event in events:
        window_start = get_time_bucket(event.starttime, self.config.bucket_duration)
        for accessed in event.accessed_metadata:
            # Skipping queries starting with $system@
            if accessed.catalog_name.startswith("$system@"):
                logging.debug(
                    f"Skipping system query for {accessed.catalog_name}..."
                )
                continue
            # Filtering down queries to the selected catalog
            if accessed.catalog_name != self.config.database:
                continue
            table_ref = (
                f"{accessed.catalog_name}.{accessed.schema_name}.{accessed.table}"
            )
            dataset_agg = buckets[window_start].setdefault(
                table_ref,
                AggregatedDataset(
                    bucket_start_time=window_start,
                    resource=table_ref,
                    user_email_pattern=self.config.user_email_pattern,
                ),
            )
            # add @unknown.com to username
            # current limitation in user stats UI, we need to provide email to show users
            if "@" in parseaddr(event.usr)[1]:
                user_identity = event.usr
            else:
                user_identity = (
                    f"{event.usr if event.usr else 'unknown'}"
                    f"@{self.config.email_domain}"
                )
            dataset_agg.add_read_entry(user_identity, event.query, accessed.columns)
    return buckets
def test_add_one_query_with_ignored_user():
    """A read from a user matching the deny pattern leaves all counters at zero."""
    email = "*****@*****.**"
    raw_query = "select * from test"
    occurred_at = datetime(2020, 1, 1)
    bucket_start = get_time_bucket(occurred_at, BucketDuration.DAY)
    dataset_ref = "test_db.test_schema.test_table"

    aggregated = _TestAggregatedDataset(
        bucket_start_time=bucket_start,
        resource=dataset_ref,
        user_email_pattern=AllowDenyPattern(deny=["*****@*****.**"]),
    )
    aggregated.add_read_entry(email, raw_query, [])

    # The denied user's read must be ignored entirely.
    assert aggregated.queryCount == 0
    assert aggregated.queryFreq[raw_query] == 0
    assert aggregated.userFreq[email] == 0
    assert len(aggregated.columnFreq) == 0