def __init__(
    self,
    formula: ClauseElement,
    label: str,
    aggregation: Optional[AggregationDefinition] = None,
    data_source: Optional[str] = None,
):
    """
    Stores the formula together with its label, aggregation and data source.

    :param formula: SQL clause element representing the formula
    :param label: Label of the formula (stored after being passed through safe_identifier)
    :param aggregation: Aggregation definition; when it is falsy, a definition of type "sum" is used instead
    :param data_source: Optional data source, stored as given
    """
    self.formula = formula
    self.label = safe_identifier(label)

    # fall back to SUM aggregation when no aggregation definition was provided
    if not aggregation:
        aggregation = AggregationDefinition(type=AggregationType.sum)
    self.aggregation = aggregation

    self.data_source = data_source
def project_dataframe(
    cls,
    calc_df: Dataframe,
    return_taxons: Dict[TaxonExpressionStr, Taxon],
    physical_data_sources: Set[str],
    order_by: Optional[List[TaxonDataOrder]] = None,
    limit: Optional[int] = None,
    offset: Optional[int] = None,
) -> Dataframe:
    """
    Builds the final dataframe on top of the calculation dataframe. Steps, in this order:
    - check that every taxon used in order_by is among the returned taxons
    - project columns (delegated to cls._project_columns)
    - ordering (NULL values last)
    - limiting and offsetting

    :param calc_df: Dataframe to select from
    :param return_taxons: Taxons to be returned, keyed by taxon expression
    :param physical_data_sources: Physical data sources recorded on the resulting dataframe
    :param order_by: Optional ordering rules
    :param limit: Optional limit of returned rows
    :param offset: Optional offset of returned rows
    :raises InvalidRequest: When order_by references a taxon missing in return_taxons
    :return: Projected dataframe
    """
    ordering_rules = order_by or []

    for rule in ordering_rules:
        if rule.taxon in return_taxons:
            continue
        raise InvalidRequest(
            'request.order_by', f'Taxon "{rule.taxon}" used in order_by clause must be also selected.'
        )

    sql_and_df_columns, projection_query = cls._project_columns(calc_df.query, calc_df, return_taxons)
    final_query = projection_query.select_from(calc_df.query)

    projected_df_columns = Dataframe.dataframe_columns_to_map(
        [df_column for _sql_column, df_column in sql_and_df_columns]
    )

    if ordering_rules:
        order_clauses = []
        for rule in ordering_rules:
            direction_function = ORDER_BY_FUNCTIONS[rule.type]
            order_clauses.append(nullslast(direction_function(column(safe_identifier(rule.taxon)))))
        final_query = final_query.order_by(*order_clauses)

    if limit is not None:
        final_query = final_query.limit(limit)

    if offset is not None:
        final_query = final_query.offset(offset)

    return Dataframe(
        final_query,
        projected_df_columns,
        calc_df.used_model_names,
        used_physical_data_sources=physical_data_sources,
    )
def _rebuild_taxon_info_map_inner_query(self):
    """
    Replaces the internal taxon model info map with a rebuilt one, because the raw data are selected
    using an inner query.

    Every taxon gets a new info whose first argument is the safe identifier of its slug expression.
    Taxons which also have a filter template get the rendered filter formula as that argument instead.
    """
    rebuilt_map = {
        slug_expression: TaxonModelInfo(
            safe_identifier(slug_expression), original_info.model_name, original_info.quantity_type
        )
        for slug_expression, original_info in self.taxon_model_info_map.items()
    }

    for filter_slug, formula in self.filter_templates.items():
        if filter_slug not in rebuilt_map:
            continue

        current_info = rebuilt_map[filter_slug]
        # the formula is rendered using accessors from the map as it looks at this point,
        # i.e. including entries replaced by previously processed filter templates
        render_params = {
            used_slug: rebuilt_map[used_slug].taxon_sql_accessor for used_slug in formula.used_taxons
        }
        rendered_accessor = formula.render_formula(**render_params)
        rebuilt_map[filter_slug] = TaxonModelInfo(
            rendered_accessor, current_info.model_name, current_info.quantity_type
        )

    self.taxon_model_info_map = rebuilt_map
def _build_query_window_aggregations(
    self,
    taxon_to_model: Dict[TaxonSlugExpression, HuskyModel],
    ordered_query_joins: Sequence[QueryJoins],
) -> Select:
    """
    Generates query for taxons which need window functions for aggregation.

    The inner query selects every projection taxon (wrapped in a window function where needed),
    the outer query then selects from it.

    :param taxon_to_model: Map of taxon slugs (key) and models they are coming from (value)
    :param ordered_query_joins: List of joins
    :return: Outer query selecting from the inner query
    """
    inner_columns = []
    # iterate taxons sorted by the string form of their slug expression
    projection_items = sorted(self.projection_taxons.items(), key=lambda item: str(item[0]))

    for taxon_slug_expression, taxon in projection_items:
        model = taxon_to_model[taxon_slug_expression]

        uses_window_function = (
            taxon.tel_metadata
            and taxon.tel_metadata.aggregation_definition
            and taxon.tel_metadata.aggregation_definition.params
            and taxon.tel_metadata_aggregation_type in self._AGGREGATION_WINDOW_FUNCTIONS
        )

        if not uses_window_function:
            # no window function needed, so the column is rendered "as-is"
            column = literal_column(model.taxon_sql_accessor(self.ctx, taxon.slug))
        else:
            # collect the ORDER BY columns of the window
            window_params = cast(AggregationParamsSortDimension, taxon.tel_metadata.aggregation_definition.params)
            window_ordering = []
            for sort_dimension in window_params.sort_dimensions:
                sort_model = taxon_to_model[TaxonSlugExpression(sort_dimension.taxon)]
                sort_accessor = sort_model.taxon_sql_accessor(self.ctx, sort_dimension.taxon)
                # ascending order is used when the sort dimension does not define one
                direction = sort_dimension.order_by or TaxonOrderType.asc
                window_ordering.append(nullslast(ORDER_BY_FUNCTIONS[direction](literal_column(sort_accessor))))

            # wrap the column into the window aggregation function
            window_function = self._AGGREGATION_WINDOW_FUNCTIONS[taxon.tel_metadata_aggregation_type]
            aggregated = window_function(literal_column(model.taxon_sql_accessor(self.ctx, taxon.slug)))
            column = aggregated.over(partition_by=self.get_partition_by_columns(model), order_by=window_ordering)

        inner_columns.append(column.label(taxon.slug_safe_sql_identifier))

    # the inner query selects from the joined models
    joined_models = self._build_from_joins(ordered_query_joins)
    inner_query = select(inner_columns).select_from(joined_models)

    # scope filters are applied to the inner query (using the taxon model info map before it is rebuilt)
    inner_query = ScopeGuard.add_scope_row_filters(self.ctx, self.scope, inner_query, self.taxon_model_info_map)

    # the taxon model info map is rebuilt, because the outer query selects from the inner query
    self._rebuild_taxon_info_map_inner_query()

    # finally, prepare the outer query on which GROUP BY can be safely applied
    outer_query = self._build_selectors(lambda _, taxon_slug: safe_identifier(taxon_slug))
    return outer_query.select_from(inner_query)
def generate_identifier(column_name: str, slug: str, include_unknown_values: bool) -> str:
    """
    Generate SQL identifier for the mapping

    :param column_name: Column name used in the identifier
    :param slug: Slug used in the identifier
    :param include_unknown_values: Flag whose string form is the last part of the identifier
    :return: Result of safe_identifier applied to "__om_<column_name>_<slug>_<include_unknown_values>"
    """
    unknown_values_flag = str(include_unknown_values)
    raw_identifier = f'__om_{column_name}_{slug}_{unknown_values_flag}'
    return safe_identifier(raw_identifier)
def generate_cte_name(slug: str) -> str:
    """
    Generated name for CTE representing override mapping with the slug

    :param slug: Slug of the override mapping
    :return: Result of safe_identifier applied to "__om_<slug>"
    """
    raw_cte_name = f'__om_{slug}'
    return safe_identifier(raw_cte_name)
def _add_aggregation(
    cls,
    inner_query: Select,
    aggregation_columns: List[ColumnClause],
    group_by_columns: List[ColumnClause],
    grouping_sets: Optional[GroupingSets] = None,
) -> Select:
    """
    Aggregates raw metric taxons. Groups by given dimension taxons or grouping sets.

    :param inner_query: Query to aggregate
    :param aggregation_columns: List of columns with applied aggregation function
    :param group_by_columns: List of columns to group by
    :param grouping_sets: Optional list of grouping sets to group by instead
    :return: Aggregated query (UNION ALL of one query per grouping set, when grouping sets are given)
    """
    if not grouping_sets:
        # Without grouping sets, all dimensions are used for grouping.
        return (
            Select(columns=sort_columns(group_by_columns + aggregation_columns))
            .select_from(inner_query)
            .group_by(*sort_columns(group_by_columns))
        )

    # _PANORAMIC_GROUPINGSETS_NULL gets unioned with columns that can be date(time) or number, so all group
    # columns must be cast to text. Some DB engines fail when casting and grouping happen in one query,
    # therefore the group columns are stringified in the CTE and not in the GROUP BY queries below.
    group_by_column_names = {col.name for col in group_by_columns}
    stringified_group_columns = [
        cast(col, sqlalchemy.VARCHAR).label(col.name) if col.name in group_by_column_names else col
        for col in inner_query.columns
    ]

    # common table expression shared by the queries of all grouping sets
    cte_select = Select(columns=sort_columns(stringified_group_columns)).select_from(inner_query)
    cte_query = cte_select.cte('__cte_grouping_sets')

    grouping_sets_queries = []
    for grouping_set in grouping_sets:
        safe_grouping_set = [safe_identifier(col) for col in grouping_set]

        gs_group_columns = []
        gs_null_columns = []
        for col in group_by_columns:
            if col.name in safe_grouping_set:
                # dimension in the grouping set, used to aggregate values with GROUP BY
                gs_group_columns.append(col)
            else:
                # extra dimension not in the grouping set, returned as the custom null value
                gs_null_columns.append(literal_column(f"'{_PANORAMIC_GROUPINGSETS_NULL}'").label(col.name))

        grouping_set_select = Select(
            columns=sort_columns(gs_group_columns + gs_null_columns + aggregation_columns)
        ).select_from(cte_query)
        grouping_sets_queries.append(grouping_set_select.group_by(*sort_columns(gs_group_columns)))

    return union_all(*grouping_sets_queries)
def _build_comparison_blend_query(
    cls,
    ctx: HuskyQueryContext,
    config_arg: BlendingDataRequest,
    taxon_manager: BlendingTaxonManager,
    query_info: BlendingQueryInfo,
    allowed_physical_data_sources: Optional[Set[str]] = None,
) -> Optional[Dataframe]:
    """
    Builds comparison query for each subrequest and then blends them all into one comparison dataframe.

    :param ctx: Husky query context
    :param config_arg: Blending request; it is cloned before use
    :param taxon_manager: Blending taxon manager (source of the plan, taxon map and used taxons)
    :param query_info: Blending query info; info of every built comparison subrequest is appended to it
    :param allowed_physical_data_sources: Optional set passed down to the query builder
    :return: Comparison dataframe, or None when no subrequest has any comparison taxons
    """
    # Work with a clone of the request, because the subqueries are going to be modified
    config = BlendingDataRequest(config_arg.to_native())
    assert config.comparison, 'Comparison must be defined when trying to build comparison query..'
    comparison: ComparisonConfig = config.comparison

    comparison_dataframes = []
    for original_subrequest in config.data_subrequests:
        subrequest = cls._build_comparison_subrequest(original_subrequest, comparison, taxon_manager)
        data_source = subrequest.properties.data_source

        # a subrequest without any comparison taxons gets no comparison query
        if len(subrequest.taxons) == 0:
            continue

        subrequest_info = QueryInfo.create(subrequest)
        query_info.comparison_subrequests_info.append(subrequest_info)

        # TODO pass down TelPlan for comparisons
        # ComparisonRequestBuilder might have added filters (typically for company id project id),
        # so new filter templates are created for this comparison subrequest.
        preaggregation_filters = [subrequest.preaggregation_filters, subrequest.scope.preaggregation_filters]
        filter_templates = TelPlanner.get_preaggregation_filter_templates(
            ctx, preaggregation_filters, taxon_manager.taxon_map, data_source
        )

        # build the comparison dataframe of this subrequest
        subrequest_dataframe = QueryBuilder.build_query(
            ctx,
            subrequest,
            subrequest_info,
            taxon_manager.used_taxons,
            dimension_templates=taxon_manager.plan.comparison_data_source_formula_templates[data_source],
            filter_templates=filter_templates,
            allowed_physical_data_sources=allowed_physical_data_sources,
        )
        comparison_dataframes.append(subrequest_dataframe)

    # without any comparison dataframes, there is nothing to blend
    if len(comparison_dataframes) == 0:
        return None

    # Blend all comparison dataframes into one
    # TODO pass down TelPlan for comparisons
    data_source_formula_templates = taxon_manager.plan.comparison_data_source_formula_templates
    dataframe = blend_dataframes(ctx, comparison_dataframes, data_source_formula_templates)

    # Metric columns are aliased to their comparison taxons, dimension columns are selected as they are
    query = dataframe.query
    final_columns = []
    aliased_taxon_by_slug: Dict[TaxonExpressionStr, DataframeColumn] = dict()
    for slug, df_column in dataframe.slug_to_column.items():
        if not df_column.taxon.is_dimension:
            new_slug, new_taxon = BlendingTaxonManager.create_comparison_taxon(df_column.taxon)
        else:
            new_taxon = df_column.taxon.copy(deep=True)
            new_slug = TaxonExpressionStr(f'{slug}')

        source_column = query.c[safe_identifier(slug)]
        final_columns.append(source_column.label(new_taxon.slug_safe_sql_identifier))
        aliased_taxon_by_slug[new_slug] = DataframeColumn(new_slug, new_taxon, df_column.quantity_type)

    # the dim columns from dim templates are selected as well
    final_columns.extend(
        literal_column(quote_identifier(pre_formula.label, ctx.dialect))
        for pre_formulas in data_source_formula_templates.values()
        for pre_formula in pre_formulas
    )

    renamed_cols_query = select(sort_columns(final_columns)).select_from(dataframe.query)
    return Dataframe(
        renamed_cols_query,
        aliased_taxon_by_slug,
        dataframe.used_model_names,
        dataframe.used_physical_data_sources,
    )
def left_join_dataframes(
    ctx: HuskyQueryContext, data_dataframe: Dataframe, comparison_dataframe: Dataframe, tel_plan: TelPlan
) -> Dataframe:
    """
    Produces new DF, that is DATA_DF LEFT JOIN COMPARISON_DF on the comparison join columns of the TEL plan.

    :param ctx: Husky query context
    :param data_dataframe: df to left join to
    :param comparison_dataframe: other df
    :param tel_plan: Current TEL plan
    :return: Left joined dataframe
    """
    # Both queries are aliased, so that their columns can be referenced easily.
    data_table = data_dataframe.query.alias('data_dataframe')
    comparison_table = comparison_dataframe.query.alias('comparison_dataframe')

    # Taxon slugs of both DFs merged together (comparison DF wins for slugs present in both).
    columns_by_slug = dict(data_dataframe.slug_to_column)
    columns_by_slug.update(comparison_dataframe.slug_to_column)

    # Columns are selected from the specific data frame (data or comparison) and labeled without that prefix,
    # since the names are already unique (from TEL planner).
    select_columns = {
        literal_column(f'data_dataframe.{safe_quote_identifier(slug, ctx.dialect)}').label(safe_identifier(slug))
        for slug in data_dataframe.slug_to_column
    }
    # From the comparison DF, only the comparison taxons are selected.
    select_columns.update(
        literal_column(f'comparison_dataframe.{safe_quote_identifier(slug, ctx.dialect)}').label(
            safe_identifier(slug)
        )
        for slug, df_column in comparison_dataframe.slug_to_column.items()
        if df_column.taxon.is_comparison_taxon
    )
    # The data source formula labels are selected explicitly from the data table.
    select_columns.update(data_table.c[template.label] for template in tel_plan.dimension_formulas)

    # Dimensions can have NULL values and NULL = NULL does not evaluate to TRUE in SQL, thus every join column
    # gets a second condition, which matches rows where both sides are NULL.
    join_on_conditions = [
        or_(
            data_table.c[join_column] == comparison_table.c[join_column],
            and_(data_table.c[join_column].is_(None), comparison_table.c[join_column].is_(None)).self_group(),
        )
        for join_column in tel_plan.comparison_join_columns
    ]

    if not join_on_conditions:
        # With no comparison dimensions defined, the comparison dataframe also has no dimensions
        # (thus it is one row) and can be safely joined to the data dataframe without ON clause.
        # 1=1 is the easiest way to do a join without ON clause in alchemy...
        join_on_conditions.append(text('1=1'))

    joined_tables = data_table.outerjoin(comparison_table, and_(*join_on_conditions))
    q = select(sort_columns(list(select_columns))).select_from(joined_tables)

    return Dataframe(
        q,
        columns_by_slug,
        data_dataframe.used_model_names | comparison_dataframe.used_model_names,
        data_dataframe.used_physical_data_sources | comparison_dataframe.used_physical_data_sources,
    )
def slug_safe_sql_identifier(self):
    """
    Returns the slug passed through safe_identifier, so it is safe to use on any database,
    especially on BigQuery.

    :return: Safe identifier created from the slug
    """
    safe_slug = safe_identifier(self.slug)
    return safe_slug
def __init__(
    self,
    name: TaxonExpressionStr,
    taxon: Taxon,
    quantity_type: ValueQuantityType = ValueQuantityType.scalar,
):
    """
    Stores the column name (as a safe identifier), its taxon and quantity type.

    :param name: Name of the column; stored after being passed through safe_identifier
    :param taxon: Taxon of the column
    :param quantity_type: Quantity type of the values (scalar by default)
    """
    sanitized_name = safe_identifier(name)
    self.name = sanitized_name
    self.taxon = taxon
    self.quantity_type = quantity_type
def __init__(self, template: SqlTemplate, label: str, data_source: str, used_taxons: Set[str]):
    """
    Stores the SQL template together with its label, data source and used taxons.

    :param template: SQL template
    :param label: Label of the template; stored after being passed through safe_identifier
    :param data_source: Data source, stored as given
    :param used_taxons: Set of used taxons, stored as given
    """
    self.template = template
    sanitized_label = safe_identifier(label)
    self.label = sanitized_label
    self.data_source = data_source
    self.used_taxons = used_taxons