Example #1
    def setUp(self):
        self.registry = MeasureRegistry()

        data_dir = os.path.join(os.path.dirname(__file__), 'data')

        people = (
            PandasMeasureProvider(
                name='people',
                data=os.path.join(data_dir, 'people.csv')
            )
            .provides_identifier('person', expr='id', role='primary')
            .provides_identifier('geography', expr='id_country', role='foreign')
            .provides_dimension('name')
            .provides_measure('age')
        )
        self.registry.register(people)

        transactions = (
            PandasMeasureProvider(
                name='transactions',
                data=os.path.join(data_dir, 'transactions.csv')
            )
            .provides_identifier('transaction', expr='id', role='primary')
            .provides_identifier('person:buyer', expr='id_buyer', role='foreign')
            .provides_identifier('person:seller', expr='id_seller', role='foreign')
            .provides_measure('value')
        )
        self.registry.register(transactions)

        self.registry.show()
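With this registry, measures from 'transactions' can be segmented by attributes of 'people' through the 'person:buyer' / 'person:seller' foreign keys. A minimal usage sketch, assuming the registry exposes an evaluate(...) method with the signature documented in Example #3 (the feature path below is illustrative):

    # Sum transaction values, segmented by the buyer's name; the join from
    # 'transactions' to 'people' is resolved via the 'person:buyer' identifier.
    results = self.registry.evaluate(
        'transaction',
        measures=['value'],
        segment_by=['person:buyer/name'],
    )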
Example #2
    def setUp(self):
        self.registry = MetaMeasureProvider()

        data_dir = os.path.join(os.path.dirname(__file__), 'data')

        people = (
            PandasMeasureProvider(
                name='people',
                data=os.path.join(data_dir, 'people.csv')
            )
            .add_identifier('person', expr='id', role='primary')
            .add_dimension('name')
            .add_measure('age')
            .add_partition('ds')
        )
        self.registry.register(people)

        people2 = (
            PandasMeasureProvider(
                name='people2',
                data=os.path.join(data_dir, 'people.csv')
            )
            .add_identifier('person', expr='id', role='unique')
            .add_identifier('geography', expr='id_geography', role='foreign')
            .add_partition('ds')
        )
        self.registry.register(people2)

        geographies = (
            PandasMeasureProvider(
                name='geographies',
                data=os.path.join(data_dir, 'geographies.csv')
            )
            .add_identifier('geography', expr='id_geography', role='primary')
            .add_dimension('name')
            .add_measure('population')
            .add_partition('ds')
        )
        self.registry.register(geographies)

        transactions = (
            PandasMeasureProvider(
                name='transactions',
                data=os.path.join(data_dir, 'transactions.csv')
            )
            .add_identifier('transaction', expr='id', role='primary')
            .add_identifier('person:buyer', expr='id_buyer', role='foreign')
            .add_identifier('person:seller', expr='id_seller', role='foreign')
            .add_measure('value')
            .add_partition('ds', requires_constraint=True)
        )
        self.registry.register(transactions)
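This second registry additionally links people to geographies (via 'people2'), so transaction measures can be segmented by geographic attributes. A rough sketch under the same assumptions as above (the evaluate API and feature paths are taken from Example #3 and hedged accordingly; the 'ds' value is made up):

    # Segment transaction value by the buyer's geography name, traversing
    # person:buyer -> geography -> geographies. The 'ds' partition on
    # 'transactions' was declared with requires_constraint=True, so a
    # constraint on it must be supplied.
    results = self.registry.evaluate(
        'transaction',
        measures=['value'],
        segment_by=['person:buyer/geography/name'],
        where={'ds': '2018-01-01'},
    )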
Example #3
    def evaluate(self,
                 unit_type,
                 measures=None,
                 segment_by=None,
                 where=None,
                 joins=None,
                 stats=True,
                 covariates=False,
                 context=None,
                 stats_registry=None,
                 **opts):
        """
        This method evaluates the requested `measures` in this MeasureProvider
        segmented by the dimensions in `segment_by` after joining in the
        joins in `joins` and subject to the constraints in `where`; treating
        `unit_type` objects as indivisible.

        Args:
            unit_type (str, _StatisticalUnitIdentifier): The unit to treat as
                indivisible in this analysis.
            measures (list<str, _Measure>): The measures to be calculated.
            segment_by (list<str, _Feature>): The dimensions by which to segment
                the measure computations.
            where (dict, list, tuple, BaseConstraint): The constraints within
                which measures should be computed.
            joins (list): The joins required to satisfy this evaluation
                (split internally into compatible pre-joins and incompatible
                post-joins).
            stats (bool): Whether to keep track of the distribution of the
                measures, rather than just their sum.
            covariates (bool, list<tuple>): If a bool, whether to compute all
                covariates; otherwise a list of tuples of measures within
                which all pairs of covariates should be computed.
            context (dict): The context in which to perform the evaluation.
            stats_registry: The registry of statistical aggregations to use
                when tracking measure distributions.
            opts (dict): Additional arguments to be passed on to `._evaluate`
                implementations.

        Returns:
            EvaluatedMeasures: A wrapper around the dataframe of the results of the computation.
        """
        from mensor.backends.pandas import PandasMeasureProvider  # We need this for some pandas transformations

        # Split joins into compatible and incompatible joins; 'joins_pre' and
        # 'joins_post' (so-called because compatible joins occur before any
        # computation in this method).
        joins_pre = [j for j in joins if j.compatible]
        joins_post = [j for j in joins if not j.compatible]

        # If there are post-joins, we will need to add the 'count' measure
        # (assuming it has not already been requested), so that we can weight
        # post-joins appropriately.
        if len(joins_post) > 0 and 'count' not in measures:
            count_measure = self.measures['count'].as_private
            measures[count_measure] = count_measure

        # If there are post-joins, we need to ensure that the pre- operations
        # performed within the `._evaluate` method do not prematurely suppress
        # private fields that are needed later to join in the post-joins.
        # We therefore modify the privacy of fields for the `._evaluate` stage
        # depending on whether they are needed later. We also suppress any
        # external fields not provided by pre-joins, so that `._evaluate`
        # implementations need not concern themselves with them.

        # Moreover, if there are post-joins and where constraints, some of the constraints
        # may need to be applied after post-joins. As such, we split the where
        # constraints into where_pre and where_post.
        measures_pre, segment_by_pre, where_pre, measures_post, segment_by_post, where_post = (
            self._compat_fields_split(measures,
                                      segment_by,
                                      where,
                                      joins_post=joins_post))

        # Allow the MeasureProvider instance to evaluate all pre- computations.
        result = self._evaluate(unit_type,
                                measures_pre,
                                segment_by=segment_by_pre,
                                where=where_pre,
                                joins=joins_pre,
                                stats_registry=stats_registry,
                                stats=stats and not joins_post,
                                covariates=covariates,
                                context=context,
                                **opts)

        if len(joins_post) > 0:

            # Join in precomputed incompatible joins
            # TODO: Clean-up how joined measures are detected (remembering measure fields have suffixes)
            joined_measure_fields = set()
            for join in joins_post:
                joined_measure_fields.update(join.object.measure_fields)
                result = result.merge(join.object.raw,
                                      left_on=join.left_on,
                                      right_on=join.right_on,
                                      how=join.how)

            # Check columns in resulting dataframe
            expected_columns = _Measure.get_all_fields(
                measures_post,
                unit_type=unit_type,
                rebase_agg=True,
                stats_registry=stats_registry,
                stats=False) + [f.via_name for f in segment_by_post]
            excess_columns = set(result.columns).difference(expected_columns)
            missing_columns = set(expected_columns).difference(result.columns)
            if len(excess_columns):
                # Remove any unnecessary columns (such as now-unused join keys)
                result = result.drop(excess_columns, axis=1)
            if len(missing_columns):
                raise RuntimeError(
                    'Data is missing columns: {}.'.format(missing_columns))

            # All newly joined-in measures need to be multiplied by the count
            # series of this dataframe, so that they are properly weighted.
            if len(joined_measure_fields) > 0:
                result = result.apply(
                    lambda col: result['count|raw'] * col
                    if col.name in joined_measure_fields else col,
                    axis=0)

            result = PandasMeasureProvider._finalise_dataframe(
                df=result,
                unit_type=unit_type,
                measures=measures_post,
                segment_by=segment_by_post,
                where=where_post,
                stats=stats,
                stats_registry=stats_registry,
                covariates=covariates,
                rebase_agg=False,
                reagg=False)

        return EvaluatedMeasures.for_measures(result,
                                              stats_registry=stats_registry)
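The count-weighting step above is worth a small standalone illustration. The following is a minimal sketch in plain pandas (the column names are made up, but follow the '<measure>|raw' suffix convention used in this method) of why post-joined measure columns are multiplied by the local 'count|raw' series:

    import pandas as pd

    # Each row of the pre-computed result already aggregates several
    # underlying unit_type instances, recorded in 'count|raw'.
    result = pd.DataFrame({
        'name': ['Alice', 'Bob'],
        'count|raw': [2, 3],
        'population|raw': [100, 100],  # hypothetical measure brought in by a post-join
    })
    joined_measure_fields = {'population|raw'}

    # Scale only the joined-in measure columns by the row counts, so that a
    # later re-aggregation produces a count-weighted total rather than a
    # plain (unweighted) sum.
    result = result.apply(
        lambda col: result['count|raw'] * col
        if col.name in joined_measure_fields else col,
        axis=0)

    # 'population|raw' is now [200, 300]; summing gives 500, i.e. 100 per
    # underlying unit across the 5 units, as intended.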