def make_reference_filters(filters, ref_dimension, offset_func):
    """
    Copies and replaces the reference dimension's definition in all of the filters applied to a dataset query.

    This is used to shift the dimension filters to fit the reference window.

    :param filters: The filters applied to a dataset query.
    :param ref_dimension: The dimension the reference is anchored on; filters on this dimension are shifted.
    :param offset_func: A callable applied to the filter's ``start`` and ``stop`` to shift them into the
        reference window.
    :return: A new list of filters with metric filters dropped and the reference dimension's bounds shifted.
    """
    shifted_filters = []

    for candidate in filters:
        # Metric filters should not be part of the reference
        if is_metric_field(candidate.field):
            continue

        if candidate.field is not ref_dimension:
            shifted_filters.append(candidate)
            continue

        # NOTE: Important to apply the offset function to the start and stop properties because the date math can
        # become expensive over many rows
        window_filter = copy.copy(candidate)
        window_filter.start = offset_func(window_filter.start)
        window_filter.stop = offset_func(window_filter.stop)
        shifted_filters.append(window_filter)

    return shifted_filters
def _blend_query(dimensions, metrics, orders, field_maps, queries, query_builder):
    """
    Blends the per-dataset sub-queries into a single query.

    The first query in ``queries`` acts as the base; the remaining ones are joined onto it via
    ``_perform_join_operations``. Dimensions and metrics are then selected from the joined
    sub-queries, and order-by clauses are applied.

    :param dimensions: Blender-level dimension fields (re-read from ``query_builder`` after metric patching).
    :param metrics: Blender-level metric fields to select.
    :param orders: Iterable of ``(field, orientation)`` pairs for the final ORDER BY.
    :param field_maps: One field map per dataset, aligned with ``queries``.
    :param queries: Per-dataset sub-queries; the first is the join base.
    :param query_builder: The blender query builder owning the dimensions.
    :return: A single blended pypika query.
    """
    base_query, *join_queries = queries
    base_field_map, *join_field_maps = field_maps
    # Only the base query's first reference (if any) is used when resolving sub-query fields.
    reference = base_query._references[0] if base_query._references else None

    blender_query = _perform_join_operations(dimensions, base_query, base_field_map, join_queries, join_field_maps)

    mocked_metrics = set()

    # WARNING: In order to make complex fields work, the get_sql for each field is monkey patched in. This must
    # happen here because a complex metric by definition references values selected from the dataset subqueries.
    for metric in find_dataset_fields(metrics):
        subquery_field = _get_sq_field_for_blender_field(metric, queries, field_maps, reference)
        # Keep the original get_sql so it can be restored after the query is built (see loop below).
        metric._origin_get_sql = metric.get_sql
        metric.get_sql = subquery_field.get_sql
        mocked_metrics.add(metric)

    # WARNING: Artificial dimensions (i.e. dimensions created dynamically), which depend on a metric,
    # can only be properly mapped once the metrics' get_sql methods are monkey patched. That's the case
    # for set dimensions. Therefore, dimensions needs to be re-read from the query builder.
    dimensions = query_builder.dimensions

    sq_dimensions = [_get_sq_field_for_blender_field(d, queries, field_maps) for d in dimensions]
    sq_metrics = [_get_sq_field_for_blender_field(m, queries, field_maps, reference) for m in metrics]

    blender_query = blender_query.select(*sq_dimensions).select(*sq_metrics)

    for field, orientation in orders:
        # Comparing fields using the is operator (i.e. object id) doesn't work for set
        # dimensions, which are dynamically generated. The dunder hash of Field class
        # does the job properly though, given set dimensions are treated
        # in particular while object id is used for anything else.
        if not is_metric_field(field):
            # Don't add the reference type to dimensions.
            orderby_field = _get_sq_field_for_blender_field(field, queries, field_maps)
        else:
            orderby_field = _get_sq_field_for_blender_field(field, queries, field_maps, reference)

        blender_query = blender_query.orderby(orderby_field, order=orientation)

    # Undo the get_sql's mocks above, otherwise reused datasets might produce different results. This issue can
    # affect tests, for instance.
    for metric in mocked_metrics:
        metric.get_sql = metric._origin_get_sql

    return blender_query
def _replace_field_if_needed(field: Field, fields_per_set_filter, target_dataset: 'DataSet') -> Tuple[Field, ...]:
    """
    Swaps a field for a dynamically built set dimension when a set filter targets it.

    :param field: The field to check; may be wrapped in a ``DimensionModifier``.
    :param fields_per_set_filter: Mapping of unwrapped fields to their set filter.
    :param target_dataset: The dataset the set dimension is built against.
    :return: A 1-tuple with the original field (no set filter), a 1-tuple with the set dimension
        (dimension replaced), or a 2-tuple of (set dimension, original field).
    """
    set_filter = fields_per_set_filter.get(_unwrap_field(field))

    # No set filter targets this field — keep it untouched.
    if not set_filter:
        return (field, )

    set_dimension = _make_set_dimension(set_filter, target_dataset)

    # Preserve the modifier (e.g. rollup) by re-wrapping the new set dimension in a copy of it.
    if isinstance(field, DimensionModifier):
        modified_set_dimension = deepcopy(field)
        modified_set_dimension.dimension = set_dimension
        set_dimension = modified_set_dimension

    if set_filter.will_replace_referenced_dimension and not is_metric_field(set_dimension):
        # Metrics should not be replaced.
        return (set_dimension, )
    else:
        return (set_dimension, field)
def _make_set_dimension(set_filter: Field, target_dataset: 'DataSet') -> Field:
    """
    Returns a new dimension that uses a CASE statement as its definition, in order to represent membership to a set,
    given the provided conditional.

    :param set_filter: A ResultSet instance.
    :param target_dataset: A DataSet instance, that will be used as the dataset for which the new dimension
        will be applied to.
    :return: A new Field instance.
    """
    # Unwrap nested fireant Field wrappers until the raw pypika term/criterion is reached.
    old_definition = set_filter.filter
    while hasattr(old_definition, 'definition'):
        old_definition = old_definition.definition
    old_definition = deepcopy(old_definition)
    # Rendered (unquoted) SQL of the conditional; used for default labels and aliasing.
    old_definition_sql = old_definition.get_sql(quote_char="")

    set_dimension = deepcopy(set_filter.filter.field)
    is_metric = is_metric_field(set_dimension)

    # When using data blending, the dataset table of the set filter needs to be re-mapped to the table in the
    # target dataset (i.e. primary or secondary). Otherwise table not found issues would pop up when resolving
    # the joins.
    if target_dataset and not is_metric:
        target_dataset_definition = target_dataset.fields[set_dimension.alias].definition
        target_dataset_leaf_definition = target_dataset_definition
        while hasattr(target_dataset_leaf_definition, 'definition'):
            # Sometimes a fireant's Field can have nested fireant Fields.
            target_dataset_leaf_definition = target_dataset_leaf_definition.definition
        old_definition = _definition_field_for_data_blending(
            target_dataset_definition, target_dataset_leaf_definition, old_definition)

    set_term = set_filter.set_label
    complement_term = set_filter.complement_label

    # When no custom labels are provided, fall back to labels derived from the conditional's SQL.
    if not set_term and not complement_term:
        set_term = "set({})".format(old_definition_sql)
        complement_term = "complement({})".format(old_definition_sql)

    if not set_filter.will_group_complement:
        # NOTE(review): the ELSE branch becomes the original field's definition object here (not a label
        # string), so non-matching rows keep their raw value — confirm this is the intended contract.
        complement_term = set_filter.filter.field.definition

    if is_metric or not set_filter.will_replace_referenced_dimension:
        # When keeping a referenced dimension, we name the set dimension with a custom alias, so as to have no
        # alias clashes. That prevents issues with rollups/share dimensions, given the original dimension
        # is maintained. Also, metrics need to have the same treatment, given that, unlike dimensions, they are
        # never replaced.
        set_dimension.alias = alias_selector("set({})".format(old_definition_sql))

    set_dimension.data_type = DataType.text
    set_dimension.label = "Set({})".format(old_definition_sql) if not set_filter.set_label else set_filter.set_label
    set_dimension.definition = Case().when(old_definition, set_term).else_(complement_term)
    # Necessary for set feature to work properly with data blending's field mapping.
    set_dimension.is_artificial = True

    return set_dimension
def _make_set_dimension(set_filter: Field, target_dataset: 'DataSet') -> Field:
    """
    Returns a new dimension that uses a CASE statement as its definition, in order to represent membership to a set,
    given the provided conditional.

    NOTE(review): another ``_make_set_dimension`` with the same signature appears earlier in this file;
    in Python the later definition shadows the earlier one at import time — confirm which version is intended.

    :param set_filter: A ResultSet instance.
    :param target_dataset: A DataSet instance, that will be used as the dataset for which the new dimension
        will be applied to.
    :return: A new Field instance.
    """
    # Unwrap nested fireant Field wrappers until the raw pypika term/criterion is reached.
    old_definition = set_filter.filter
    while hasattr(old_definition, 'definition'):
        old_definition = old_definition.definition
    old_definition = deepcopy(old_definition)
    # Rendered (unquoted) SQL of the conditional; used for default labels and aliasing.
    old_definition_sql = old_definition.get_sql(quote_char="")

    set_dimension = deepcopy(set_filter.filter.field)
    is_metric = is_metric_field(set_dimension)

    # When using data blending, the dataset table of the set filter needs to be re-mapped to the table in the
    # target dataset (i.e. primary or secondary). The easiest way to do that is to select the field in the
    # target dataset directly.
    # Single-term criteria (IN, IS NULL, BETWEEN, bitwise AND): re-point the criterion's term.
    if (target_dataset and not is_metric and isinstance(
            old_definition,
        (
            terms.ContainsCriterion,
            terms.NullCriterion,
            terms.BetweenCriterion,
            terms.BitwiseAndCriterion,
        ),
    )):
        target_dataset_definition = deepcopy(target_dataset.fields[set_dimension.alias].definition)
        # Literal values and function calls are left untouched; only column-like terms are re-mapped.
        if not isinstance(old_definition.term, (terms.ValueWrapper, terms.Function)):
            old_definition.term = target_dataset_definition

    # Two-sided criteria (e.g. comparisons): re-point whichever side is column-like.
    if target_dataset and not is_metric and isinstance(old_definition, terms.BasicCriterion):
        target_dataset_definition = deepcopy(target_dataset.fields[set_dimension.alias].definition)
        if not isinstance(old_definition.left, (terms.ValueWrapper, terms.Function)):
            old_definition.left = target_dataset_definition
        if not isinstance(old_definition.right, (terms.ValueWrapper, terms.Function)):
            old_definition.right = target_dataset_definition

    set_term = set_filter.set_label
    complement_term = set_filter.complement_label

    # When no custom labels are provided, fall back to labels derived from the conditional's SQL.
    if not set_term and not complement_term:
        set_term = "set({})".format(old_definition_sql)
        complement_term = "complement({})".format(old_definition_sql)

    if not set_filter.will_group_complement:
        # NOTE(review): the ELSE branch becomes the original field's definition object here (not a label
        # string), so non-matching rows keep their raw value — confirm this is the intended contract.
        complement_term = set_filter.filter.field.definition

    if is_metric or not set_filter.will_replace_referenced_dimension:
        # When keeping a referenced dimension, we name the set dimension with a custom alias, so as to have no
        # alias clashes. That prevents issues with rollups/share dimensions, given the original dimension
        # is maintained. Also, metrics need to have the same treatment, given that, unlike dimensions, they are
        # never replaced.
        set_dimension.alias = alias_selector("set({})".format(old_definition_sql))

    set_dimension.data_type = DataType.text
    set_dimension.label = "Set({})".format(old_definition_sql) if not set_filter.set_label else set_filter.set_label
    set_dimension.definition = Case().when(old_definition, set_term).else_(complement_term)
    # Necessary for set feature to work properly with data blending's field mapping.
    set_dimension.is_artificial = True

    return set_dimension
def sql(self):
    """
    Serialize this query builder to a list of Pypika/SQL queries.

    This function will return one query for every combination of reference and rolled up dimension (including null
    options).

    This collects all of the metrics in each widget, dimensions, and filters and builds a corresponding pypika query
    to fetch the data. When references are used, the base query normally produced is wrapped in an outer query and a
    query for each reference is joined based on the referenced dimension shifted.

    :return: a list of Pypika's Query subclass instances.
    """
    # First run validation for the query on all widgets
    self._validate()

    datasets, field_maps = _datasets_and_field_maps(self.dataset, self._filters)
    selected_blender_dimensions = self.dimensions
    selected_blender_dimensions_aliases = {dimension.alias for dimension in selected_blender_dimensions}
    selected_blender_metrics = find_metrics_for_widgets(self._widgets)
    selected_blender_metrics_aliases = {metric.alias for metric in selected_blender_metrics}

    operations = find_operations_for_widgets(self._widgets)
    share_dimensions = find_share_dimensions(selected_blender_dimensions, operations)
    non_set_filters = omit_set_filters(self._filters)

    # Add fields to be ordered on, to metrics if they aren't yet selected in metrics or dimensions
    # To think about: if the selected order_by field is a dimension, should we add it to dimensions?
    for field, _ in self.orders:
        if (field.alias not in selected_blender_metrics_aliases
                and field.alias not in selected_blender_dimensions_aliases):
            selected_blender_metrics.append(field)

    # Needed dimensions in final query as tuples of (dimension, is_selected_dimension)
    needed_blender_dimensions = [(dimension_field, True) for dimension_field in selected_blender_dimensions]

    # Add dimension filters which are not selected to the pool of needed dimensions
    for filter_ in non_set_filters:
        if not is_metric_field(filter_.field) and (filter_.field.alias
                                                   not in selected_blender_dimensions_aliases):
            needed_blender_dimensions.append((filter_.field, False))

    selected_metrics_as_dataset_fields = find_dataset_fields(selected_blender_metrics)

    # Determine for each dataset which metrics and dimensions need to be selected.
    # These four lists are indexed by dataset position in `datasets`.
    dataset_dimensions = [[] for _ in range(len(datasets))]
    dataset_metrics = []
    dataset_filters = []
    dataset_included_in_final_query = [False] * len(datasets)

    # First determine the metrics. If a metric is requested, and the dataset has it, add it for that dataset.
    # We include metrics used in filters. We also save for each dataset the mapped metrics and filters
    for dataset_index, dataset in enumerate(datasets):
        dataset_metrics.append(
            map_blender_fields_to_dataset_fields(
                selected_metrics_as_dataset_fields,
                field_maps[dataset_index],
                dataset,
            ))
        dataset_filters.append(
            map_blender_fields_to_dataset_fields(non_set_filters, field_maps[dataset_index], dataset))

        # Metric selected from this dataset, so include it.
        if dataset_metrics[dataset_index]:
            dataset_included_in_final_query[dataset_index] = True
            continue

        # Filter with metric from this dataset selected, so include it.
        for filter_ in dataset_filters[dataset_index]:
            if is_metric_field(filter_.field):
                dataset_included_in_final_query[dataset_index] = True
                break

    # Second map the dimensions and find the dimensions which are unique to a dataset. Include those.
    # Also save for each dimension of which datasets it is part of.
    dimensions_dataset_info = []
    for blender_dimension_field, is_selected_dimension in needed_blender_dimensions:
        dimension_dataset_info = []
        for dataset_index, dataset in enumerate(datasets):
            mapped_dimension = map_blender_field_to_dataset_field(
                blender_dimension_field, field_maps[dataset_index], dataset)
            if mapped_dimension is not None:
                dimension_dataset_info.append((dataset_index, mapped_dimension, is_selected_dimension))

        if len(dimension_dataset_info) == 0:
            # This case should only happen when using sets, otherwise I would have raised the following exception:
            # raise Exception("Dimension requested that was not part of any dataset.")
            pass
        elif len(dimension_dataset_info) == 1:
            # This is the only dataset that has this dimension, assign it
            dataset_index, _, _ = dimension_dataset_info[0]
            dataset_included_in_final_query[dataset_index] = True

        if dimension_dataset_info:
            dimensions_dataset_info.append(dimension_dataset_info)

    # Add all the dimensions to the subqueries that are already selected for the final query
    # Add dimensions that are not yet accounted for to the first dataset that has it
    for dimension_dataset_info in dimensions_dataset_info:
        dimension_accounted_for = False
        first_dataset_that_has_the_dimension = None
        for (dataset_index, mapped_dimension, is_selected_dimension) in dimension_dataset_info:
            # If the dataset is already part of the final query, add this dimension
            if dataset_included_in_final_query[dataset_index]:
                dimension_accounted_for = True
                if is_selected_dimension:
                    dataset_dimensions[dataset_index].append(mapped_dimension)

            # Update first_dataset_that_has_the_dimension if needed
            if not dimension_accounted_for and first_dataset_that_has_the_dimension is None:
                first_dataset_that_has_the_dimension = (
                    dataset_index,
                    mapped_dimension,
                    is_selected_dimension,
                )

        if not dimension_accounted_for:
            # Dimension not yet accounted for! Take first dataset that has the dimension.
            dataset_index, mapped_dimension, is_selected_dimension = first_dataset_that_has_the_dimension
            dataset_included_in_final_query[dataset_index] = True
            if is_selected_dimension:
                dataset_dimensions[dataset_index].append(mapped_dimension)

    # Build the actual per-dataset sub-queries, skipping datasets not needed for the final query.
    datasets_queries = []
    filtered_field_maps = []
    for dataset_index, dataset in enumerate(datasets):
        if dataset_included_in_final_query[dataset_index]:
            datasets_queries.append(
                _build_dataset_query(
                    dataset,
                    field_maps[dataset_index],
                    dataset_metrics[dataset_index],
                    dataset_dimensions[dataset_index],
                    dataset_filters[dataset_index],
                    self._references,
                    operations,
                    share_dimensions,
                ))
            # Filter the field maps of which the dataset is not going to be in the final query.
            filtered_field_maps.append(field_maps[dataset_index])

    """
    A dataset query can yield one or more sql queries, depending on how many types of references or dimensions
    with totals are selected. A blended dataset query must yield the same number and types of sql queries, but
    each blended together. The individual dataset queries built above will always yield the same number of sql
    queries, so here those lists of sql queries are zipped.

          base   ref  totals ref+totals
    ds1 | ds1_a ds1_b ds1_c  ds1_d
    ds2 | ds2_a ds2_b ds2_c  ds2_d

    More concretely, using the diagram above as a reference, a dataset query with 1 reference and 1 totals
    dimension would yield 4 sql queries. With data blending with 1 reference and 1 totals dimension, 4 sql
    queries must also be produced.  The following lines convert the list of rows of the table in the diagram to
    a list of columns. Each set of queries in a column are then reduced to a single data blending sql query.
    """
    per_dataset_queries_count = max([len(dataset_queries) for dataset_queries in datasets_queries])
    # There will be the same amount of query sets as the longest length of queries for a single dataset
    query_sets = [[] for _ in range(per_dataset_queries_count)]

    # Add the queries returned for each dataset to the correct queryset
    for dataset_index, dataset_queries in enumerate(datasets_queries):
        for i, query in enumerate(dataset_queries):
            query_sets[i].append(query)

    # Blend each column of sub-queries into one query, then apply pagination.
    blended_queries = []
    for queryset in query_sets:
        blended_query = _blend_query(
            selected_blender_dimensions,
            selected_blender_metrics,
            self.orders,
            filtered_field_maps,
            queryset,
            self,
        )

        blended_query = self._apply_pagination(blended_query)

        if blended_query:
            blended_queries.append(blended_query)

    return blended_queries