예제 #1
0
    def for_ts(
        cls,
        first_enrollment_date,
        last_date_full_data,
        time_series_period,
        num_dates_enrollment,
    ):
        """Build a ``TimeLimits`` made of consecutive, equal-length windows.

        The enrollment period spans ``num_dates_enrollment`` dates starting
        at ``first_enrollment_date``. As many whole analysis windows as fit
        between the last enrollment date and ``last_date_full_data`` are
        generated, each one day long ('daily') or seven days long ('weekly').

        Args:
            first_enrollment_date (str): First date on which enrollment
                events were received; the start date of the experiment.
            last_date_full_data (str): The most recent date for which we
                have complete data, e.g. '2019-03-22'. Data after this date
                is not taken into account when counting windows.
            time_series_period: 'daily' or 'weekly'.
            num_dates_enrollment (int): Number of enrollment dates to take.
                Mandatory, because it determines how many points the time
                series has.

        Returns:
            An instance of ``cls`` whose ``analysis_windows`` is a tuple of
            ``AnalysisWindow`` objects, one per period.

        Raises:
            ValueError: If ``time_series_period`` is not supported, if
                ``num_dates_enrollment`` is not positive, or if not even
                one full period of data is available.
        """
        if time_series_period not in ("daily", "weekly"):
            raise ValueError(
                "Unsupported time series period {}".format(time_series_period)
            )

        if num_dates_enrollment <= 0:
            raise ValueError("Number of enrollment dates must be a positive number")

        # The period has been validated above, so anything that is not
        # "weekly" is "daily".
        period_days = 7 if time_series_period == "weekly" else 1

        final_enrollment = add_days(first_enrollment_date, num_dates_enrollment - 1)
        # Inclusive count of dates from the final enrollment date up to the
        # last date with full data.
        dates_available = date_sub(last_date_full_data, final_enrollment) + 1
        period_count = dates_available // period_days

        if period_count <= 0:
            raise ValueError("Insufficient data")

        windows = []
        for index in range(period_count):
            window_start = index * period_days
            windows.append(
                AnalysisWindow(window_start, window_start + period_days - 1)
            )
        windows = tuple(windows)

        # Data is needed until the last-enrolled client finishes the last window.
        final_date_needed = add_days(final_enrollment, windows[-1].end)

        return cls(
            first_enrollment_date=first_enrollment_date,
            last_enrollment_date=final_enrollment,
            first_date_data_required=first_enrollment_date,
            last_date_data_required=final_date_needed,
            analysis_windows=windows,
        )
예제 #2
0
    def _validate_first_date_data_required(self, attribute, value):
        assert self.first_date_data_required <= self.last_date_data_required

        min_analysis_window_start = min(aw.start for aw in self.analysis_windows)
        assert self.first_date_data_required == add_days(
            self.first_enrollment_date, min_analysis_window_start
        )
예제 #3
0
파일: analysis.py 프로젝트: matf/jetstream
    def _calculate_metrics(
        self,
        exp: mozanalysis.experiment.Experiment,
        time_limits: TimeLimits,
        period: AnalysisPeriod,
        dry_run: bool,
    ):
        """
        Build the metrics query for the last analysis window and run it.

        The query is always built. It is only executed (and the view
        published) when ``dry_run`` is false. In both cases the name
        returned by ``self._table_name`` is returned; when the query is
        executed, that is the table the results are written to.
        """
        all_windows = time_limits.analysis_windows
        window_index = len(all_windows)
        final_window = all_windows[-1]

        # TODO: Add this functionality to TimeLimits.
        # Narrow the limits to the final window only; the first date of
        # required data moves to that window's start offset.
        narrowed_limits = attr.evolve(
            time_limits,
            analysis_windows=[final_window],
            first_date_data_required=add_days(
                time_limits.first_enrollment_date, final_window.start
            ),
        )

        destination_table = self._table_name(period.value, window_index)

        requested_metrics = {summary.metric for summary in self.config.metrics[period]}
        query = exp.build_query(
            requested_metrics,
            narrowed_limits,
            "normandy",
            self.config.experiment.enrollment_query,
            self.config.experiment.segments,
        )

        if dry_run:
            logger.info(
                "Dry run; not actually calculating %s metrics for %s",
                period.value,
                self.config.experiment.normandy_slug,
            )
            return destination_table

        logger.info(
            "Executing query for %s (%s)",
            self.config.experiment.normandy_slug,
            period.value,
        )
        self.bigquery.execute(query, destination_table)
        self._publish_view(period)

        return destination_table
예제 #4
0
def test_process_data_source_df(spark):
    """Check that ``_process_data_source_df`` trims data to the time limits.

    Two single-window ``TimeLimits`` are exercised (window starting at day 0
    and at day 2, both three dates long). For each, the min and max dates of
    the processed frame must equal the limits' first and last required dates.
    """
    first_day = '20190101'
    last_full_day = '20190114'
    experiment = Experiment('experiment-with-8-day-cohort', first_day, 8)
    raw_df = _get_data_source_df(spark)

    def _limits_starting_at(start_days):
        # Three-date analysis window beginning `start_days` after enrollment.
        return TimeLimits.for_single_analysis_window(
            first_enrollment_date=experiment.start_date,
            last_date_full_data=last_full_day,
            analysis_start_days=start_days,
            analysis_length_dates=3,
            num_dates_enrollment=experiment.num_dates_enrollment)

    def _assert_trimmed_to(processed_df, limits):
        # The processed frame must span exactly the required date range.
        assert _simple_return_agg_date(
            F.min, processed_df) == limits.first_date_data_required
        assert _simple_return_agg_date(
            F.max, processed_df) == limits.last_date_data_required

    # Are the fixtures sufficiently complicated that we're actually testing
    # things? The raw data must extend beyond the experiment on both sides.
    assert _simple_return_agg_date(F.min, raw_df) < first_day
    assert _simple_return_agg_date(F.max, raw_df) > last_full_day

    limits_from_0 = _limits_starting_at(0)
    assert limits_from_0.first_date_data_required == first_day
    assert limits_from_0.last_date_data_required == '20190110'

    processed_from_0 = experiment._process_data_source_df(raw_df, limits_from_0)
    _assert_trimmed_to(processed_from_0, limits_from_0)

    limits_from_2 = _limits_starting_at(2)
    assert limits_from_2.first_date_data_required == add_days(first_day, 2)
    assert limits_from_2.last_date_data_required == '20190112'

    processed_from_2 = experiment._process_data_source_df(raw_df, limits_from_2)
    _assert_trimmed_to(processed_from_2, limits_from_2)

    # The processed frame can be addressed through the 'data_source' alias;
    # the raw frame cannot.
    assert processed_from_0.select(F.col('data_source.client_id'))
    with pytest.raises(AnalysisException):
        assert raw_df.select(F.col('data_source.client_id'))
예제 #5
0
def _get_data_source_df(spark):
    """Create a small Spark DataFrame fixture of per-client, per-date rows.

    One row is produced for each of two clients on each of 32 consecutive
    dates starting at '20181215'. Every row carries the client's branch
    under the 'a-stub' key of ``experiments`` and a constant ``1.0``.

    Args:
        spark: Object providing ``createDataFrame`` (the Spark session
            fixture).

    Returns:
        The DataFrame returned by ``spark.createDataFrame``.
    """
    client_branch_pairs = [
        ('aaaa', 'control'),
        ('bbbb', 'test'),
    ]
    submission_dates = [add_days('20181215', offset) for offset in range(32)]

    rows = []
    for client_id, branch_name in client_branch_pairs:
        for submission_date in submission_dates:
            rows.append(
                [client_id, submission_date, {'a-stub': branch_name}, 1.0]
            )

    column_names = [
        "client_id",
        "submission_date_s3",
        "experiments",
        "constant_one",
    ]
    return spark.createDataFrame(rows, column_names)
예제 #6
0
    def for_single_analysis_window(
        cls,
        first_enrollment_date,
        last_date_full_data,
        analysis_start_days,
        analysis_length_dates,
        num_dates_enrollment=None,
    ):
        """Build a ``TimeLimits`` containing exactly one analysis window.

        Args:
            first_enrollment_date (str): First date on which enrollment
                events were received; the start date of the experiment.
            last_date_full_data (str): The most recent date for which we
                have complete data, e.g. '2019-03-22'. If you want to ignore
                all data collected after a certain date (e.g. when the
                experiment recipe was deactivated), then do that here.
            analysis_start_days (int): The start of the analysis window,
                measured in 'days since the client enrolled'. Data
                collected outside the analysis window is ignored.
            analysis_length_dates (int): The length of the analysis window,
                measured in dates.
            num_dates_enrollment (int, optional): Only include this many
                dates of enrollments. If ``None``, the last enrollment date
                is derived from ``last_date_full_data`` and the end of the
                analysis window, i.e. the maximum the data allows.
                Typically ``7n+1``, e.g. ``8``. The factor ``7`` removes
                weekly seasonality, and the ``+1`` accounts for the fact
                that enrollment typically starts a few hours before UTC
                midnight.

        Returns:
            An instance of ``cls`` with a one-element ``analysis_windows``
            tuple.

        Raises:
            ValueError: If the last date of data required lies after
                ``last_date_full_data``.
        """
        window_end = analysis_start_days + analysis_length_dates - 1
        analysis_window = AnalysisWindow(analysis_start_days, window_end)

        if num_dates_enrollment is not None:
            last_enrollment_date = add_days(
                first_enrollment_date, num_dates_enrollment - 1
            )
        else:
            # Enroll for as long as the available data permits: step back
            # from the last full date by the window's end offset.
            last_enrollment_date = add_days(last_date_full_data, -analysis_window.end)

        first_date_data_required = add_days(
            first_enrollment_date, analysis_window.start
        )
        last_date_data_required = add_days(last_enrollment_date, analysis_window.end)

        if last_date_data_required > last_date_full_data:
            message_template = (
                "You said you wanted {} dates of enrollment, "
                "and need data from the {}th day after enrollment. "
                "For that, you need to wait until we have data for {}."
            )
            raise ValueError(
                message_template.format(
                    num_dates_enrollment,
                    analysis_window.end,
                    last_date_data_required,
                )
            )

        return cls(
            first_enrollment_date=first_enrollment_date,
            last_enrollment_date=last_enrollment_date,
            first_date_data_required=first_date_data_required,
            last_date_data_required=last_date_data_required,
            analysis_windows=(analysis_window,),
        )
예제 #7
0
 def _validate_last_date_data_required(self, attribute, value):
     max_analysis_window_end = max(aw.end for aw in self.analysis_windows)
     assert self.last_date_data_required == add_days(
         self.last_enrollment_date, max_analysis_window_end
     )
예제 #8
0
def test_add_days():
    """Check ``add_days`` for a zero, a positive and a negative offset."""
    base_date = '2019-01-01'
    # (offset in days, expected result); the last case crosses a year boundary.
    cases = (
        (0, '2019-01-01'),
        (1, '2019-01-02'),
        (-1, '2018-12-31'),
    )
    for offset, expected in cases:
        assert add_days(base_date, offset) == expected
예제 #9
0
    def calculate_metrics(
        self,
        exp: mozanalysis.experiment.Experiment,
        time_limits: TimeLimits,
        period: AnalysisPeriod,
        analysis_basis: AnalysisBasis,
        dry_run: bool,
    ):
        """
        Compute metrics for the last analysis window of an experiment.

        On a dry run nothing is built or executed; only a log line is
        emitted. Otherwise the metrics query is built for the metrics whose
        analysis bases match ``analysis_basis``, executed, and the view is
        published. In both cases the name returned by ``self._table_name``
        is returned; when the query is executed, that is the table the
        results are written to.
        """
        all_windows = time_limits.analysis_windows
        window_index = len(all_windows)
        final_window = all_windows[-1]

        # TODO: Add this functionality to TimeLimits.
        # Narrow the limits to the final window only; the first date of
        # required data moves to that window's start offset.
        narrowed_limits = attr.evolve(
            time_limits,
            analysis_windows=[final_window],
            first_date_data_required=add_days(
                time_limits.first_enrollment_date, final_window.start
            ),
        )

        destination_table = self._table_name(
            period.value, window_index, analysis_basis=analysis_basis
        )
        normalized_slug = bq_normalize_name(self.config.experiment.normandy_slug)

        if dry_run:
            logger.info(
                "Dry run; not actually calculating %s metrics for %s",
                period.value,
                self.config.experiment.normandy_slug,
            )
            return destination_table

        logger.info(
            "Executing query for %s (%s)",
            self.config.experiment.normandy_slug,
            period.value,
        )

        enrollments_table_name = f"enrollments_{normalized_slug}"

        # If a custom exposure signal has been defined in the config, it
        # needs to be passed into the metrics computation.
        configured_signal = self.config.experiment.exposure_signal
        if configured_signal:
            exposure_signal = configured_signal.to_mozanalysis_exposure_signal(
                narrowed_limits
            )
        else:
            exposure_signal = None

        # Keep a metric when its analysis bases equal the requested basis or
        # contain it.
        selected_metrics = set()
        for summary in self.config.metrics[period]:
            bases = summary.metric.analysis_bases
            if bases == analysis_basis or analysis_basis in bases:
                selected_metrics.add(summary.metric.to_mozanalysis_metric())

        metrics_sql = exp.build_metrics_query(
            selected_metrics,
            narrowed_limits,
            enrollments_table_name,
            analysis_basis,
            exposure_signal,
        )

        self.bigquery.execute(metrics_sql, destination_table)
        self._publish_view(period, analysis_basis=analysis_basis.value)

        return destination_table
예제 #10
0
    def get_enrollments(self,
                        spark,
                        study_type='pref_flip',
                        end_date=None,
                        debug_dupes=False):
        """Return a DataFrame of enrolled clients.

        This works for pref-flip and addon studies.

        The underlying queries are different for pref-flip vs addon
        studies, because as of 2019/04/02, branch information isn't
        reliably available in the ``events`` table for addon experiments:
        branch may be NULL for all enrollments. The enrollment
        information for them is most reliably available in
        ``telemetry_shield_study_parquet``. Once this issue is resolved,
        we will probably start using normandy events for all desktop
        studies.
        Ref: https://bugzilla.mozilla.org/show_bug.cgi?id=1536644

        Args:
            spark: The spark context.
            study_type (str): One of the following strings:

                * 'pref_flip'
                * 'addon'

                or a callable that accepts a spark context as an argument
                and returns a Spark DataFrame containing all enrollment events
                ever conducted using that method, with columns ``client_id``,
                ``experiment_slug``, ``branch``, ``enrollment_date``,
                and ``addon_version`` if it's relevant.

            end_date (str, optional): Ignore enrollments after this
                date: for faster queries on stale experiments. If you
                set ``num_dates_enrollment`` then do not set this; at best
                it would be redundant, at worst it's contradictory.

            debug_dupes (bool, optional): Include a column ``num_events``
                giving the number of enrollment events associated with
                the ``client_id`` and ``branch``.

        Returns:
            A Spark DataFrame of enrollment data. One row per
            enrollment. Columns:

                * client_id (str)
                * enrollment_date (str): e.g. '20190329'
                * branch (str)
                * num_events (int, optional)

        Raises:
            ValueError: If ``study_type`` is neither callable nor one of
                the recognized strings; if the experiment has an
                ``addon_version`` but the enrollments have no
                ``addon_version`` column; or if both ``end_date`` and the
                experiment's ``num_dates_enrollment`` are set.
        """
        if callable(study_type):
            enrollments = study_type(spark)
        elif study_type == 'pref_flip':
            enrollments = self._get_enrollments_view_normandy(spark)
        elif study_type == 'addon':
            enrollments = self._get_enrollments_view_addon(spark)
        # elif study_type == 'glean':
        #     raise NotImplementedError
        else:
            raise ValueError("Unrecognized study_type: {}".format(study_type))

        # Restrict to this experiment, from its start date onwards.
        enrollments = enrollments.filter(
            enrollments.enrollment_date >= self.start_date).filter(
                enrollments.experiment_slug == self.experiment_slug)

        if self.addon_version:
            if "addon_version" not in enrollments.columns:
                # The message is rendered with str.format, so the
                # placeholder has to be "{}"; a "%s" placeholder would be
                # emitted verbatim and study_type would never be shown.
                raise ValueError(
                    ("Experiment constructed with an addon_version but your  "
                     "study_type ({}) is incompatible with addon versions."
                     ).format(study_type))
            enrollments = enrollments.filter(
                enrollments.addon_version == self.addon_version).drop(
                    enrollments.addon_version)

        if self.num_dates_enrollment is not None:
            if end_date is not None:
                raise ValueError(
                    "Don't specify both 'end_date' and "
                    "'num_dates_enrollment'; you might contradict yourself.")
            # Keep enrollments up to and including the last enrollment date.
            enrollments = enrollments.filter(
                enrollments.enrollment_date <= add_days(
                    self.start_date, self.num_dates_enrollment - 1))
        elif end_date is not None:
            enrollments = enrollments.filter(
                enrollments.enrollment_date <= end_date)

        # Deduplicate enrollment events. Optionally keep track of what
        # had to be deduplicated. Deduplicating a client who enrolls in
        # multiple branches is left as an exercise to the reader :|
        enrollments = enrollments.groupBy(
            enrollments.client_id, enrollments.branch).agg(*(
                [F.min(enrollments.enrollment_date).alias('enrollment_date')] +
                ([F.count(enrollments.enrollment_date).
                  alias('num_events')] if debug_dupes else [])))

        enrollments.cache()

        return enrollments
예제 #11
0
def test_add_days():
    """Check ``add_days`` for a zero, a positive and a negative offset."""
    base_date = "2019-01-01"
    # (offset in days, expected result); the last case crosses a year boundary.
    cases = (
        (0, "2019-01-01"),
        (1, "2019-01-02"),
        (-1, "2018-12-31"),
    )
    for offset, expected in cases:
        assert add_days(base_date, offset) == expected