def col_split(dataset, request: Request, column_split_spec: ColumnSplitSpec, *args, **kwargs):
    """
    Split query in 2 steps if a large number of columns is being selected.
        - First query only selects event_id and project_id.
        - Second query selects all fields for only those events.
        - Shrink the date range.

    ``query_func`` is a free name captured from the enclosing scope
    (this function is the inner closure of a query-splitting decorator).
    Returns the result of the second (full) query, or of the minimal
    query when it produced no rows.
    """
    # The query function may mutate the request body during query
    # evaluation, so we need to copy the body to ensure that the query has
    # not been modified by the time we're ready to run the full query.
    minimal_request = copy.deepcopy(request)
    minimal_request.query.set_selected_columns(column_split_spec.get_min_columns())
    result = query_func(dataset, minimal_request, *args, **kwargs)
    del minimal_request

    # Hoist the row list once instead of re-reading result.result["data"]
    # in every comprehension below.
    data = result.result["data"]
    if data:
        # Copy again: the first pass may have mutated the request, and the
        # narrowing below must not leak into the caller's request object.
        request = copy.deepcopy(request)

        # Narrow the second query to exactly the events found by the first.
        event_ids = list({event[column_split_spec.id_column] for event in data})
        request.query.add_conditions([(column_split_spec.id_column, "IN", event_ids)])
        request.query.set_offset(0)
        request.query.set_limit(len(event_ids))

        # Restrict the project filter to the projects actually seen.
        project_ids = list({event[column_split_spec.project_column] for event in data})
        request.extensions["project"]["project"] = project_ids

        # Shrink the date range to the [min, max] of the observed timestamps.
        timestamp_field = column_split_spec.timestamp_column
        timestamps = [event[timestamp_field] for event in data]
        request.extensions["timeseries"]["from_date"] = util.parse_datetime(
            min(timestamps)
        ).isoformat()
        # We add 1 second since this gets translated to ('timestamp', '<', to_date)
        # and events are stored with a granularity of 1 second.
        request.extensions["timeseries"]["to_date"] = (
            util.parse_datetime(max(timestamps)) + timedelta(seconds=1)
        ).isoformat()

    return query_func(dataset, request, *args, **kwargs)
def get_split_query_spec(self) -> Union[None, ColumnSplitSpec]:
    """Describe how events-dataset queries may be split into two passes.

    The returned spec names the id, project, and timestamp columns used
    to narrow the second (full) query after the minimal first query.
    """
    spec = ColumnSplitSpec(
        id_column="events.event_id",
        project_column="events.project_id",
        timestamp_column="events.timestamp",
    )
    return spec