Example #1
def make_reference_filters(filters, ref_dimension, offset_func):
    """
    Copies the filters applied to a dataset query and shifts the start and stop values of any filter on the
    reference dimension.

    This is used to shift the dimension filters to fit the reference window.

    :param filters: The filters applied to a dataset query.
    :param ref_dimension: The dimension field used as the reference dimension.
    :param offset_func: A function that shifts a filter boundary value by the reference offset.
    :return: A list of filters with the reference dimension's start and stop values shifted.
    """
    reference_filters = []
    for ref_filter in filters:
        # Metric filters should not be part of the reference
        if is_metric_field(ref_filter.field):
            continue

        if ref_filter.field is ref_dimension:
            # NOTE: It is important to apply the offset function to the start and stop properties here, rather
            # than to the dimension itself, because date math over many rows can become expensive
            ref_filter = copy.copy(ref_filter)
            ref_filter.start = offset_func(ref_filter.start)
            ref_filter.stop = offset_func(ref_filter.stop)

        reference_filters.append(ref_filter)

    return reference_filters
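
To see the boundary shift in isolation, here is a minimal, self-contained sketch of the same idea. RangeFilter and week_back are hypothetical stand-ins for fireant's filter objects and offset function, not the library's real classes:

import copy
from datetime import date, timedelta

class RangeFilter:
    """Hypothetical stand-in for a range filter with start/stop bounds."""
    def __init__(self, field, start, stop):
        self.field, self.start, self.stop = field, start, stop

def week_back(value):
    # Example offset function: shift a boundary one week into the past.
    return value - timedelta(weeks=1)

timestamp = object()  # stands in for the reference dimension field
f = RangeFilter(timestamp, start=date(2024, 1, 8), stop=date(2024, 1, 14))

# Shift only the two boundary values, exactly as make_reference_filters does.
shifted = copy.copy(f)
shifted.start = week_back(shifted.start)  # date(2024, 1, 1)
shifted.stop = week_back(shifted.stop)    # date(2024, 1, 7)
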
Example #2
def _blend_query(dimensions, metrics, orders, field_maps, queries,
                 query_builder):
    base_query, *join_queries = queries
    base_field_map, *join_field_maps = field_maps

    reference = base_query._references[0] if base_query._references else None
    blender_query = _perform_join_operations(dimensions, base_query,
                                             base_field_map, join_queries,
                                             join_field_maps)

    mocked_metrics = set()

    # WARNING: In order to make complex fields work, the get_sql for each field is monkey patched in. This must
    # happen here because a complex metric by definition references values selected from the dataset subqueries.
    for metric in find_dataset_fields(metrics):
        subquery_field = _get_sq_field_for_blender_field(
            metric, queries, field_maps, reference)
        metric._origin_get_sql = metric.get_sql
        metric.get_sql = subquery_field.get_sql
        mocked_metrics.add(metric)

    # WARNING: Artificial dimensions (i.e. dimensions created dynamically), which depend on a metric,
    # can only be properly mapped once the metrics' get_sql methods are monkey patched. That's the case
    # for set dimensions. Therefore, the dimensions need to be re-read from the query builder.
    dimensions = query_builder.dimensions

    sq_dimensions = [
        _get_sq_field_for_blender_field(d, queries, field_maps)
        for d in dimensions
    ]
    sq_metrics = [
        _get_sq_field_for_blender_field(m, queries, field_maps, reference)
        for m in metrics
    ]

    blender_query = blender_query.select(*sq_dimensions).select(*sq_metrics)

    for field, orientation in orders:
        # Comparing fields using the is operator (i.e. object id) doesn't work for set
        # dimensions, which are generated dynamically. The Field class's __hash__ handles
        # this properly though, since set dimensions are special-cased there while object
        # id is used for everything else.
        if not is_metric_field(field):
            # Don't add the reference type to dimensions.
            orderby_field = _get_sq_field_for_blender_field(
                field, queries, field_maps)
        else:
            orderby_field = _get_sq_field_for_blender_field(
                field, queries, field_maps, reference)

        blender_query = blender_query.orderby(orderby_field, order=orientation)

    # Undo the get_sql mocks above; otherwise reused datasets might produce different results. This issue can
    # affect tests, for instance.
    for metric in mocked_metrics:
        metric.get_sql = metric._origin_get_sql

    return blender_query
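
The patch/restore dance around get_sql is easier to see on a toy object. This is only a sketch of the pattern used above, not fireant code:

class Metric:
    def get_sql(self):
        return "complex_expression"

m = Metric()
m._origin_get_sql = m.get_sql            # keep the original bound method
m.get_sql = lambda: "sq0.metric_alias"   # instance attribute shadows the class method
assert m.get_sql() == "sq0.metric_alias"

m.get_sql = m._origin_get_sql            # restore, so a reused object behaves as before
assert m.get_sql() == "complex_expression"
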
Example #3
def _replace_field_if_needed(field: Field, fields_per_set_filter,
                             target_dataset: 'DataSet') -> Tuple[Field, ...]:
    set_filter = fields_per_set_filter.get(_unwrap_field(field))

    if not set_filter:
        return (field, )

    set_dimension = _make_set_dimension(set_filter, target_dataset)

    if isinstance(field, DimensionModifier):
        modified_set_dimension = deepcopy(field)
        modified_set_dimension.dimension = set_dimension
        set_dimension = modified_set_dimension

    if set_filter.will_replace_referenced_dimension and not is_metric_field(
            set_dimension):
        # Metrics should not be replaced.
        return (set_dimension, )
    else:
        return (set_dimension, field)
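
A caller would typically flatten the returned 1- or 2-tuples back into a single field list. The helper below is a hypothetical illustration of that, not part of the original code:

from itertools import chain

def replace_all(fields, fields_per_set_filter, target_dataset):
    # Flatten the tuple returned for each field into one flat list.
    return list(chain.from_iterable(
        _replace_field_if_needed(field, fields_per_set_filter, target_dataset)
        for field in fields
    ))
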
Example #4
def _make_set_dimension(set_filter: 'ResultSet', target_dataset: 'DataSet') -> Field:
    """
    Returns a new dimension whose definition is a CASE statement that represents membership in a set,
    based on the provided conditional.

    :param set_filter: A ResultSet instance.
    :param target_dataset: A DataSet instance to which the new dimension will be applied.
    :return: A new Field instance.
    """
    old_definition = set_filter.filter

    while hasattr(old_definition, 'definition'):
        old_definition = old_definition.definition

    old_definition = deepcopy(old_definition)
    old_definition_sql = old_definition.get_sql(quote_char="")

    set_dimension = deepcopy(set_filter.filter.field)

    is_metric = is_metric_field(set_dimension)

    # When using data blending, the dataset table of the set filter needs to be re-mapped to the table in the
    # target dataset (i.e. primary or secondary). Otherwise, "table not found" issues would pop up when
    # resolving the joins.
    if target_dataset and not is_metric:
        target_dataset_definition = target_dataset.fields[
            set_dimension.alias].definition
        target_dataset_leaf_definition = target_dataset_definition

        while hasattr(target_dataset_leaf_definition, 'definition'):
            # A fireant Field can sometimes contain nested fireant Fields.
            target_dataset_leaf_definition = target_dataset_leaf_definition.definition

        old_definition = _definition_field_for_data_blending(
            target_dataset_definition, target_dataset_leaf_definition,
            old_definition)

    set_term = set_filter.set_label
    complement_term = set_filter.complement_label

    if not set_term and not complement_term:
        set_term = "set({})".format(old_definition_sql)
        complement_term = "complement({})".format(old_definition_sql)

    if not set_filter.will_group_complement:
        complement_term = set_filter.filter.field.definition

    if is_metric or not set_filter.will_replace_referenced_dimension:
        # When keeping a referenced dimension, we name the set dimension with a custom alias, so as to have no
        # alias clashes. That prevents issues with rollups/share dimensions, given the original dimension
        # is maintained. Also, metrics need to have the same treatment, given that, unlike dimensions, they are
        # never replaced.
        set_dimension.alias = alias_selector(
            "set({})".format(old_definition_sql))

    set_dimension.data_type = DataType.text
    set_dimension.label = "Set({})".format(
        old_definition_sql
    ) if not set_filter.set_label else set_filter.set_label
    set_dimension.definition = Case().when(old_definition,
                                           set_term).else_(complement_term)

    # Necessary for set feature to work properly with data blending's field mapping.
    set_dimension.is_artificial = True

    return set_dimension
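
For reference, the kind of CASE definition this function builds can be reproduced with pypika directly. The table and column names below are made up for illustration:

from pypika import Case, Query, Table

politician = Table("politician")
in_set = politician.party == "Democrat"  # the set filter's conditional

set_dim = (
    Case()
    .when(in_set, "set(party='Democrat')")
    .else_("complement(party='Democrat')")
)
print(Query.from_(politician).select(set_dim.as_("$set_dim")))
# SELECT CASE WHEN "party"='Democrat' THEN ... ELSE ... END "$set_dim" FROM "politician"
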
Example #5
def _make_set_dimension(set_filter: 'ResultSet', target_dataset: 'DataSet') -> Field:
    """
    Returns a new dimension whose definition is a CASE statement that represents membership in a set,
    based on the provided conditional.

    :param set_filter: A ResultSet instance.
    :param target_dataset: A DataSet instance to which the new dimension will be applied.
    :return: A new Field instance.
    """
    old_definition = set_filter.filter

    while hasattr(old_definition, 'definition'):
        old_definition = old_definition.definition

    old_definition = deepcopy(old_definition)
    old_definition_sql = old_definition.get_sql(quote_char="")

    set_dimension = deepcopy(set_filter.filter.field)

    is_metric = is_metric_field(set_dimension)

    # When using data blending, the dataset table of the set filter needs to be re-mapped to the table in the
    # target dataset (i.e. primary or secondary). The easiest way to do that is to select the field in the
    # target dataset directly.
    if target_dataset and not is_metric and isinstance(
        old_definition,
        (
            terms.ContainsCriterion,
            terms.NullCriterion,
            terms.BetweenCriterion,
            terms.BitwiseAndCriterion,
        ),
    ):
        target_dataset_definition = deepcopy(
            target_dataset.fields[set_dimension.alias].definition)

        if not isinstance(old_definition.term,
                          (terms.ValueWrapper, terms.Function)):
            old_definition.term = target_dataset_definition

    if target_dataset and not is_metric and isinstance(old_definition,
                                                       terms.BasicCriterion):
        target_dataset_definition = deepcopy(
            target_dataset.fields[set_dimension.alias].definition)

        if not isinstance(old_definition.left,
                          (terms.ValueWrapper, terms.Function)):
            old_definition.left = target_dataset_definition

        if not isinstance(old_definition.right,
                          (terms.ValueWrapper, terms.Function)):
            old_definition.right = target_dataset_definition

    set_term = set_filter.set_label
    complement_term = set_filter.complement_label

    if not set_term and not complement_term:
        set_term = "set({})".format(old_definition_sql)
        complement_term = "complement({})".format(old_definition_sql)

    if not set_filter.will_group_complement:
        complement_term = set_filter.filter.field.definition

    if is_metric or not set_filter.will_replace_referenced_dimension:
        # When keeping a referenced dimension, we name the set dimension with a custom alias, so as to have no
        # alias clashes. That prevents issues with rollups/share dimensions, given the original dimension
        # is maintained. Also, metrics need to have the same treatment, given that, unlike dimensions, they are
        # never replaced.
        set_dimension.alias = alias_selector(
            "set({})".format(old_definition_sql))

    set_dimension.data_type = DataType.text
    set_dimension.label = "Set({})".format(
        old_definition_sql
    ) if not set_filter.set_label else set_filter.set_label
    set_dimension.definition = Case().when(old_definition,
                                           set_term).else_(complement_term)

    # Necessary for set feature to work properly with data blending's field mapping.
    set_dimension.is_artificial = True

    return set_dimension
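
The criterion surgery above (re-assigning .term, .left and .right) can be tried on a bare pypika criterion. The table names are illustrative only:

from pypika import Table

ds1, ds2 = Table("ds1"), Table("ds2")

crit = ds1.party == "Democrat"  # a BasicCriterion with .left and .right
crit.left = ds2.party           # re-point the column side at the target dataset's field
# crit now compares ds2's party column against the literal, mirroring the
# left/right re-mapping done in the function above.
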
Example #6
    def sql(self):
        """
        Serialize this query builder to a list of Pypika/SQL queries. This function will return one query for every
        combination of reference and rolled up dimension (including null options).

        This collects all of the metrics in each widget, along with the dimensions and filters, and builds a
        corresponding pypika query to fetch the data. When references are used, the base query normally produced
        is wrapped in an outer query, and a query for each reference is joined on the shifted reference dimension.

        :return: a list of Pypika's Query subclass instances.
        """
        # First run validation for the query on all widgets
        self._validate()

        datasets, field_maps = _datasets_and_field_maps(
            self.dataset, self._filters)

        selected_blender_dimensions = self.dimensions
        selected_blender_dimensions_aliases = {
            dimension.alias
            for dimension in selected_blender_dimensions
        }
        selected_blender_metrics = find_metrics_for_widgets(self._widgets)
        selected_blender_metrics_aliases = {
            metric.alias
            for metric in selected_blender_metrics
        }

        operations = find_operations_for_widgets(self._widgets)
        share_dimensions = find_share_dimensions(selected_blender_dimensions,
                                                 operations)
        non_set_filters = omit_set_filters(self._filters)

        # Add order-by fields to the metrics if they aren't yet selected as metrics or dimensions.
        # To think about: if the selected order_by field is a dimension, should we add it to dimensions?
        for field, _ in self.orders:
            if (field.alias not in selected_blender_metrics_aliases and
                    field.alias not in selected_blender_dimensions_aliases):
                selected_blender_metrics.append(field)

        # Needed dimensions in final query as tuples of (dimension, is_selected_dimension)
        needed_blender_dimensions = [
            (dimension_field, True)
            for dimension_field in selected_blender_dimensions
        ]
        # Add dimension filters which are not selected to the pool of needed dimensions
        for filter_ in non_set_filters:
            if not is_metric_field(filter_.field) and (
                    filter_.field.alias
                    not in selected_blender_dimensions_aliases):
                needed_blender_dimensions.append((filter_.field, False))

        selected_metrics_as_dataset_fields = find_dataset_fields(
            selected_blender_metrics)

        # Determine for each dataset which metrics and dimensions need to be selected
        dataset_dimensions = [[] for _ in range(len(datasets))]
        dataset_metrics = []
        dataset_filters = []
        dataset_included_in_final_query = [False] * len(datasets)

        # First determine the metrics. If a metric is requested, and the dataset has it, add it for that dataset.
        # We include metrics used in filters. We also save the mapped metrics and filters for each dataset.
        for dataset_index, dataset in enumerate(datasets):

            dataset_metrics.append(
                map_blender_fields_to_dataset_fields(
                    selected_metrics_as_dataset_fields,
                    field_maps[dataset_index],
                    dataset,
                ))

            dataset_filters.append(
                map_blender_fields_to_dataset_fields(non_set_filters,
                                                     field_maps[dataset_index],
                                                     dataset))

            # Metric selected from this dataset, so include it.
            if dataset_metrics[dataset_index]:
                dataset_included_in_final_query[dataset_index] = True
                continue

            # Filter with metric from this dataset selected, so include it.
            for filter_ in dataset_filters[dataset_index]:
                if is_metric_field(filter_.field):
                    dataset_included_in_final_query[dataset_index] = True
                    break

        # Second, map the dimensions and find the dimensions which are unique to a dataset. Include those.
        # Also save, for each dimension, which datasets it is part of.
        dimensions_dataset_info = []
        for blender_dimension_field, is_selected_dimension in needed_blender_dimensions:
            dimension_dataset_info = []

            for dataset_index, dataset in enumerate(datasets):
                mapped_dimension = map_blender_field_to_dataset_field(
                    blender_dimension_field, field_maps[dataset_index],
                    dataset)

                if mapped_dimension is not None:
                    dimension_dataset_info.append(
                        (dataset_index, mapped_dimension,
                         is_selected_dimension))

            if len(dimension_dataset_info) == 0:
                # This case should only happen when using sets; otherwise the following exception
                # would be raised here:
                # raise Exception("Dimension requested that was not part of any dataset.")
                pass
            elif len(dimension_dataset_info) == 1:
                # This is the only dataset that has this dimension, assign it
                dataset_index, _, _ = dimension_dataset_info[0]
                dataset_included_in_final_query[dataset_index] = True

            if dimension_dataset_info:
                dimensions_dataset_info.append(dimension_dataset_info)

        # Add all the dimensions to the subqueries that are already selected for the final query
        # Add dimensions that are not yet accounted for to the first dataset that has it
        for dimension_dataset_info in dimensions_dataset_info:
            dimension_accounted_for = False
            first_dataset_that_has_the_dimension = None
            for (dataset_index, mapped_dimension,
                 is_selected_dimension) in dimension_dataset_info:
                # If the dataset is already part of the final query, add this dimension
                if dataset_included_in_final_query[dataset_index]:
                    dimension_accounted_for = True
                    if is_selected_dimension:
                        dataset_dimensions[dataset_index].append(
                            mapped_dimension)

                # Update first_dataset_that_has_the_dimension if needed
                if not dimension_accounted_for and first_dataset_that_has_the_dimension is None:
                    first_dataset_that_has_the_dimension = (
                        dataset_index,
                        mapped_dimension,
                        is_selected_dimension,
                    )

            if not dimension_accounted_for:
                # Dimension not yet accounted for! Take first dataset that has the dimension.
                dataset_index, mapped_dimension, is_selected_dimension = first_dataset_that_has_the_dimension
                dataset_included_in_final_query[dataset_index] = True
                if is_selected_dimension:
                    dataset_dimensions[dataset_index].append(mapped_dimension)

        datasets_queries = []
        filtered_field_maps = []
        for dataset_index, dataset in enumerate(datasets):
            if dataset_included_in_final_query[dataset_index]:
                datasets_queries.append(
                    _build_dataset_query(
                        dataset,
                        field_maps[dataset_index],
                        dataset_metrics[dataset_index],
                        dataset_dimensions[dataset_index],
                        dataset_filters[dataset_index],
                        self._references,
                        operations,
                        share_dimensions,
                    ))
                # Keep only the field maps whose dataset is included in the final query.
                filtered_field_maps.append(field_maps[dataset_index])
        """
        A dataset query can yield one or more sql queries, depending on how many types of references or dimensions 
        with totals are selected. A blended dataset query must yield the same number and types of sql queries, but each
        blended together. The individual dataset queries built above will always yield the same number of sql queries, 
        so here those lists of sql queries are zipped.
        
               base   ref  totals ref+totals
        ds1 | ds1_a  ds1_b  ds1_c   ds1_d  
        ds2 | ds2_a  ds2_b  ds2_c   ds2_d  
        
        More concretely, using the diagram above as a reference, a dataset query with 1 reference and 1 totals dimension
        would yield 4 sql queries. With data blending with 1 reference and 1 totals dimension, 4 sql queries must also 
        be produced.  The following lines convert the list of rows of the table in the diagram to a list of columns.
        Each set of queries in a column are then reduced to a single data blending sql query.
        """

        per_dataset_queries_count = max(
            [len(dataset_queries) for dataset_queries in datasets_queries])
        # There will be as many query sets as the largest number of queries produced for a single dataset
        query_sets = [[] for _ in range(per_dataset_queries_count)]

        # Add the queries returned for each dataset to the correct query set
        for dataset_index, dataset_queries in enumerate(datasets_queries):
            for i, query in enumerate(dataset_queries):
                query_sets[i].append(query)

        blended_queries = []
        for queryset in query_sets:
            blended_query = _blend_query(
                selected_blender_dimensions,
                selected_blender_metrics,
                self.orders,
                filtered_field_maps,
                queryset,
                self,
            )
            blended_query = self._apply_pagination(blended_query)

            if blended_query:
                blended_queries.append(blended_query)

        return blended_queries
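
The row-to-column conversion described in the docstring is effectively a transpose. Assuming every included dataset yields the same number of queries, the loop above is equivalent to this sketch:

datasets_queries = [["ds1_a", "ds1_b"], ["ds2_a", "ds2_b"]]  # toy stand-ins for query objects
query_sets = [list(column) for column in zip(*datasets_queries)]
# [['ds1_a', 'ds2_a'], ['ds1_b', 'ds2_b']]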