def _project_column(
    cls,
    query: Select,
    taxon: Taxon,
    source_df_column: Optional[DataframeColumn],
) -> ColumnAndDataframeColumn:
    """
    Builds the projection SQL column for the given taxon, together with the
    DataframeColumn describing that projected column.

    :param query: Query whose columns are being projected from
    :param taxon: Taxon to project
    :param source_df_column: Description of the source column (required so we
        can tell whether the source value is an array)
    :return: Tuple of (labeled SQL column, DataframeColumn describing it)
    :raises HuskyInvalidTelException: when TEL expression handling fails
    """
    try:
        projected_col = query.columns[taxon.slug_safe_sql_identifier]
        assert (
            source_df_column is not None
        ), f'DataframeColumn is required for dimension types. taxon_slug: {taxon.slug}'
        if source_df_column.quantity_type == ValueQuantityType.array:
            # Arrays are cast to a string on selection, so the projected
            # quantity is always scalar.
            projected_col = cast(projected_col, String)
        described_col = DataframeColumn(TaxonExpressionStr(taxon.slug), taxon, ValueQuantityType.scalar)
        return projected_col.label(taxon.slug_safe_sql_identifier), described_col
    except TelExpressionException as error:
        raise HuskyInvalidTelException(error, taxon.slug)
def get_mocked_dataframe_columns_map(taxon_slugs: List[str]) -> Dict[TaxonExpressionStr, DataframeColumn]:
    """
    Helper fn that creates DataframeColumn map, where all columns are scalar by default.

    :param taxon_slugs: Taxon slugs to build mocked columns for
    :return: Map from slug expression to a scalar DataframeColumn
    """
    columns_by_slug: Dict[TaxonExpressionStr, DataframeColumn] = {}
    for slug_expr, taxon in mock_get_taxons_map(None, taxon_slugs).items():
        columns_by_slug[slug_expr] = DataframeColumn(slug_expr, taxon, ValueQuantityType.scalar)
    return columns_by_slug
def calculate_dataframe(
    self,
    ctx: HuskyQueryContext,
    df: Dataframe,
    physical_data_sources: Set[str],
    grouping_sets: Optional[GroupingSets] = None,
    filter_clause: Optional[FilterClause] = None,
) -> Dataframe:
    """
    Applies in this order:
    - pre aggregation logic
    - aggregation by group by or grouping sets
    - optional step of window function aggregation
    - after aggregation logic
    - filters. Filters are applied here to simplify the final query and apply filtering before filling date gaps.

    :param ctx: Query context (provides the SQL dialect)
    :param df: Input dataframe whose query is wrapped by the aggregation steps
    :param physical_data_sources: Physical data sources recorded on the resulting dataframe
    :param grouping_sets: Optional grouping sets forwarded to the aggregation step
    :param filter_clause: Optional post-aggregation filter applied to the final query
    :return: New Dataframe wrapping the aggregated (and optionally filtered) query
    """
    pre_agg_columns = [
    ]  # Columns with applied aggregation function in aggregation step
    # Columns to select from window step - columns that are not removed and dont need window step
    select_from_window_step: List[ColumnClause] = []
    df_columns: List[DataframeColumn] = [
    ]  # Final df columns after all steps.
    group_columns = []  # Columns without an aggregation fn; they become GROUP BY keys
    final_columns: List[ColumnClause] = []  # Columns selected by the post-aggregation query
    # Pre-aggregation step: split plan formulas into aggregated vs. grouped columns.
    for pre_formula in self.taxon_manager.plan.metric_pre:
        col = pre_formula.formula.label(pre_formula.label)
        aggregation_fn = self.AGGREGATION_FUNCTIONS_MAP.get(
            pre_formula.aggregation.type)
        if aggregation_fn:
            # we know the aggregation function so let's use it
            pre_agg_columns.append(
                aggregation_fn(col).label(pre_formula.label))
        else:
            # if no aggregation function is defined, then we simply group by this formula
            group_columns.append(col)
            select_from_window_step.append(col)
    # taxon slugs used in group by clause
    dimension_taxon_slugs = {
        group_column.name
        for group_column in group_columns
    }
    # Post-aggregation step: render each post formula against the dimension slugs
    # and describe the resulting columns for the output dataframe.
    for post_formula, taxon in self.taxon_manager.plan.metric_post:
        post_formula_sql = post_formula.render_formula(
            ctx.dialect, dimension_taxon_slugs)
        col = post_formula_sql.label(taxon.slug_safe_sql_identifier)
        final_columns.append(col)
        df_columns.append(DataframeColumn(taxon.slug_expr, taxon))
    # Aggregation query with column logic. This is the first aggregation step, regular group by
    # or a common table expression with multiple group by statements in case of grouping sets.
    pre_query = self._add_aggregation(df.query, pre_agg_columns,
                                      group_columns, grouping_sets)
    # Post aggregation logic
    post_query = Select(
        columns=sort_columns(final_columns)).select_from(pre_query)
    slug_to_column = Dataframe.dataframe_columns_to_map(df_columns)
    if filter_clause:
        # Build minimal model info (just the quoted identifier) so the filter
        # builder can reference the projected columns by slug.
        taxon_model_info = {
            str(slug):
            TaxonModelInfo(safe_quote_identifier(slug, ctx.dialect))
            for slug in slug_to_column.keys()
        }
        post_query = FilterBuilder.augment_query(ctx, post_query,
                                                 taxon_model_info,
                                                 filter_clause)
    return Dataframe(post_query, slug_to_column, df.used_model_names,
                     physical_data_sources)
def _build_comparison_blend_query(
    cls,
    ctx: HuskyQueryContext,
    config_arg: BlendingDataRequest,
    taxon_manager: BlendingTaxonManager,
    query_info: BlendingQueryInfo,
    allowed_physical_data_sources: Optional[Set[str]] = None,
) -> Optional[Dataframe]:
    """
    Builds comparison query for each subrequest and then blends them all into one comparison dataframe.

    :param ctx: Query context (provides the SQL dialect)
    :param config_arg: Original blending request; cloned before mutation
    :param taxon_manager: Taxon manager holding the TEL plan and taxon maps
    :param query_info: Aggregated query info; comparison subrequest infos are appended to it
    :param allowed_physical_data_sources: Optional whitelist passed through to the query builder
    :return: Blended comparison dataframe, or None when no subrequest produced comparison taxons
    """
    dataframes = []
    config = BlendingDataRequest(config_arg.to_native(
    ))  # Clone, coz we will be modifying subqueries
    assert config.comparison, 'Comparison must be defined when trying to build comparison query..'
    comparison: ComparisonConfig = config.comparison
    for _subrequest in config.data_subrequests:
        subrequest = cls._build_comparison_subrequest(
            _subrequest, comparison, taxon_manager)
        data_source = subrequest.properties.data_source
        # if no comparison taxons were found for this subrequest, skip creating comparison query for it as well
        if len(subrequest.taxons) == 0:
            continue
        bm_sub_query_info = QueryInfo.create(subrequest)
        query_info.comparison_subrequests_info.append(bm_sub_query_info)
        # Build comparison dataframe and add it to a list.
        # TODO pass down TelPlan for comparisons
        # ComparisonRequestBuilder might have added filters (typically for company id project id)
        # Me create new filter templates for this comparison subrequest.
        filter_templates = TelPlanner.get_preaggregation_filter_templates(
            ctx,
            [
                subrequest.preaggregation_filters,
                subrequest.scope.preaggregation_filters
            ],
            taxon_manager.taxon_map,
            data_source,
        )
        dataframes.append(
            QueryBuilder.build_query(
                ctx,
                subrequest,
                bm_sub_query_info,
                taxon_manager.used_taxons,
                dimension_templates=taxon_manager.plan.
                comparison_data_source_formula_templates[data_source],
                filter_templates=filter_templates,
                allowed_physical_data_sources=allowed_physical_data_sources,
            ))
    # if no comparison subrequests were created, there is no need to blend data frames
    if len(dataframes) == 0:
        return None
    # Blend all comparison dataframes into one
    # TODO pass down TelPlan for comparisons
    data_source_formula_templates = taxon_manager.plan.comparison_data_source_formula_templates
    dataframe = blend_dataframes(ctx, dataframes,
                                 data_source_formula_templates)
    # Prefix all comparison metric columns with 'comparison@' and create comparison taxon for it.
    query = dataframe.query
    final_columns = []
    aliased_taxon_by_slug: Dict[TaxonExpressionStr, DataframeColumn] = dict()
    for slug, df_column in dataframe.slug_to_column.items():
        # Alias metrics with comparison@ prefix, and select dimensions..
        if df_column.taxon.is_dimension:
            # Dimensions keep their slug; copy the taxon so the blended
            # dataframe does not share taxon instances with the inputs.
            new_taxon = df_column.taxon.copy(deep=True)
            new_slug = TaxonExpressionStr(f'{slug}')
        else:
            # Metrics get a new comparison taxon (and thus a new slug).
            new_slug, new_taxon = BlendingTaxonManager.create_comparison_taxon(
                df_column.taxon)
        final_columns.append(query.c[safe_identifier(slug)].label(
            new_taxon.slug_safe_sql_identifier))
        aliased_taxon_by_slug[new_slug] = DataframeColumn(
            new_slug, new_taxon, df_column.quantity_type)
    for pre_formulas in data_source_formula_templates.values():
        # and also select the dim columns from dim templates.
        for pre_formula in pre_formulas:
            final_columns.append(
                literal_column(
                    quote_identifier(pre_formula.label, ctx.dialect)))
    renamed_cols_query = select(sort_columns(final_columns)).select_from(
        dataframe.query)
    return Dataframe(renamed_cols_query, aliased_taxon_by_slug,
                     dataframe.used_model_names,
                     dataframe.used_physical_data_sources)
def query(
    cls,
    select_query: Select,
    taxon_model_info_map: Dict[str, TaxonModelInfo],
    projection_taxons: SlugExprTaxonMap,
    data_source: str,
    order_by: Optional[List[TaxonDataOrder]],
    limit: Optional[int],
    offset: Optional[int],
    used_physical_data_sources: Set[str],
    dimension_templates: Optional[List[SqlFormulaTemplate]] = None,
) -> Dataframe:
    """
    Generates the final projected dataframe

    :param select_query: Original query fetching all necessary fields
    :param taxon_model_info_map: Map of taxon slug expression to taxon model info
    :param projection_taxons: List of taxons meant to be projected by the final query
    :param data_source: Virtual data source for this subrequest
    :param order_by: List of clauses for order by
    :param limit: Limit for the query
    :param offset: Offset for the query
    :param used_physical_data_sources: Physical data sources recorded on the resulting dataframe
    :param dimension_templates: List of dimension templates
    :return: Final dataframe including all requested taxons
    """
    group_by = []
    selectors = []
    projected_df_columns: Dict[TaxonExpressionStr, DataframeColumn] = {}
    for taxon in projection_taxons.values():
        # apply aggregation, if you need to
        agg_type = taxon.tel_metadata_aggregation_type
        if agg_type and agg_type in cls._AGGREGATION_FUNCTIONS_MAP:
            col = cls._AGGREGATION_FUNCTIONS_MAP[agg_type](column(taxon.slug_safe_sql_identifier))
        else:
            col = column(taxon.slug_safe_sql_identifier)
        col = col.label(taxon.slug_safe_sql_identifier)
        # create appropriate dataframe column
        # Calculated taxons are always scalar; otherwise take the quantity
        # type from the model info when it is available.
        value_quality_type = ValueQuantityType.scalar
        if not taxon.calculation and taxon.slug_expr in taxon_model_info_map:
            value_quality_type = taxon_model_info_map[taxon.slug_expr].quantity_type
        df_column_name = TaxonExpressionStr(taxon.slug)
        projected_df_columns[df_column_name] = DataframeColumn(df_column_name, taxon, value_quality_type)
        # make sure we select this column in the query
        selectors.append(col)
        # check whether this taxon should be in group by clause
        if agg_type in cls._GROUP_BY_AGGREGATION_TYPES:
            group_by.append(col)
    # make sure we select all columns for dimension templates
    for dim_template in dimension_templates or []:
        col = column(dim_template.label)
        selectors.append(col)
        # we should group by all dimension templates
        group_by.append(col)
    # On purpose adding this value to emulate USING ON FALSE => PROD-8136
    selectors.append(literal(data_source).label(HUSKY_QUERY_DATA_SOURCE_COLUMN_NAME))
    # using literal_column here because some database engines do not like grouping by constant
    group_by.append(literal_column(HUSKY_QUERY_DATA_SOURCE_COLUMN_NAME))
    # created this query
    new_query = Select(
        columns=sort_columns(selectors),
        order_by=[nullslast(ORDER_BY_FUNCTIONS[item.type](item.taxon)) for item in (order_by or [])],
        group_by=sort_columns(group_by),
    ).select_from(select_query)
    if limit is not None:
        new_query = new_query.limit(limit)
    if offset is not None:
        new_query = new_query.offset(offset)
    # collect names of all used models
    used_model_names = {
        model_info.model_name
        for model_info in taxon_model_info_map.values()
        if model_info.model_name is not None
    }
    return Dataframe(new_query, projected_df_columns, used_model_names, used_physical_data_sources)