Exemplo n.º 1
0
    def get_time_series_data(
        self,
        bq_context,
        metric_list,
        last_date_full_data,
        time_series_period="weekly",
        enrollments_query_type="normandy",
        custom_enrollments_query=None,
        custom_exposure_query=None,
        exposure_signal=None,
        segment_list=None,
    ):
        """Compute per-client metric values for a series of analysis windows.

        Conceptually this is like calling :meth:`.get_single_window_data`
        once per analysis window and then regrouping the output, except
        that the work is done with one enrollments query followed by one
        metrics query.

        Args:
            bq_context (BigQueryContext): BigQuery configuration and client.
            metric_list (list of mozanalysis.metric.Metric): Metrics to
                compute for every client.
            last_date_full_data (str): Latest date with complete data,
                e.g. '2019-03-22'. To disregard everything collected
                after some date (for instance the date the experiment
                recipe was deactivated), pass that date here.
            time_series_period ('daily' or 'weekly'): Length of each
                analysis window.
            enrollments_query_type ('normandy', 'glean-event' or 'fenix-fallback'):
                Which built-in query is used to find the experiment's
                enrollments. Ignored when ``custom_enrollments_query``
                is supplied.
            custom_enrollments_query (str): Complete SQL query that is
                embedded in the main query as::

                    WITH raw_enrollments AS ({custom_enrollments_query})

                N.B. the rows it returns must be uniquely keyed by
                (client_id, branch); otherwise the results will be subtly
                wrong.

            custom_exposure_query (str): Complete SQL query that is
                embedded in the main query as::

                    WITH ...
                    exposures AS ({custom_exposure_query})

                When omitted, exposure is derived from `exposure_signal`
                if that is given, and otherwise from Normandy and Nimbus
                exposure events. `custom_exposure_query` wins over
                `exposure_signal` when both are passed.

            exposure_signal (ExposureSignal): Optional definition of the
                moment a client counts as exposed to the experiment. When
                omitted, Normandy exposure events are used for desktop and
                Nimbus exposure events for Fenix and iOS.
            segment_list (list of mozanalysis.segment.Segment): User
                segments to study.

        Returns:
            A :class:`mozanalysis.experiment.TimeSeriesResult` from which
            a pandas DataFrame of per-client metric data can be fetched
            for each analysis window. Every such DataFrame follows "the
            standard format": one row per client, a few metadata columns,
            and one column per metric and per sanity-check metric.
            Columns (order not guaranteed):

                * branch (str): The client's branch
                * other columns of ``enrollments``.
                * [metric 1]: The client's value for the first metric in
                  ``metric_list``.
                * ...
                * [metric n]: The client's value for the nth (final)
                  metric in ``metric_list``.
                * [sanity check 1]: The client's value for the first
                  sanity check metric for the first data source that
                  supports sanity checks.
                * ...
                * [sanity check n]: The client's value for the last
                  sanity check metric for the last data source that
                  supports sanity checks.
        """
        ts_limits = TimeLimits.for_ts(
            self.start_date,
            last_date_full_data,
            time_series_period,
            self.num_dates_enrollment,
        )

        # Stage 1: run the enrollments query. The table name embeds a hash of
        # the SQL, so a different query yields a different table name.
        enrollments_query = self.build_enrollments_query(
            time_limits=ts_limits,
            enrollments_query_type=enrollments_query_type,
            custom_enrollments_query=custom_enrollments_query,
            custom_exposure_query=custom_exposure_query,
            exposure_signal=exposure_signal,
            segment_list=segment_list,
        )
        enrollments_name_parts = [
            last_date_full_data,
            "enrollments",
            self.experiment_slug,
            hash_ish(enrollments_query),
        ]
        enrollments_table = sanitize_table_name_for_bq(
            "_".join(enrollments_name_parts)
        )
        bq_context.run_query(enrollments_query, enrollments_table)

        # Stage 2: run the metrics query, which reads the enrollments table
        # through its fully qualified name.
        qualified_enrollments = bq_context.fully_qualify_table_name(enrollments_table)
        metrics_query = self.build_metrics_query(
            metric_list=metric_list,
            time_limits=ts_limits,
            enrollments_table=qualified_enrollments,
        )
        results_name_parts = [
            last_date_full_data,
            self.experiment_slug,
            hash_ish(metrics_query),
        ]
        results_table = sanitize_table_name_for_bq("_".join(results_name_parts))
        bq_context.run_query(metrics_query, results_table)

        qualified_results = bq_context.fully_qualify_table_name(results_table)
        return TimeSeriesResult(
            fully_qualified_table_name=qualified_results,
            analysis_windows=ts_limits.analysis_windows,
        )
Exemplo n.º 2
0
    def get_time_series_data(self,
                             bq_context,
                             metric_list,
                             last_date_full_data,
                             time_series_period='weekly',
                             enrollments_query_type='normandy',
                             custom_enrollments_query=None):
        """Compute per-client metric values for a series of analysis windows.

        Conceptually this is like calling :meth:`.get_single_window_data`
        once per analysis window and then regrouping the output.

        Args:
            bq_context (BigQueryContext): BigQuery configuration and client.
            metric_list (list of mozanalysis.metric.Metric): Metrics to
                compute for every client.
            last_date_full_data (str): Latest date with complete data,
                e.g. '2019-03-22'. To disregard everything collected
                after some date (for instance the date the experiment
                recipe was deactivated), pass that date here.
            time_series_period ('daily' or 'weekly'): Length of each
                analysis window.
            enrollments_query_type (str): Which query type is used to
                find the experiment's enrollments. Ignored when
                custom_enrollments_query is supplied.
            custom_enrollments_query (str): Complete SQL query that is
                embedded in the main query as::

                    WITH raw_enrollments AS ({custom_enrollments_query})

        Returns:
            A TimeSeriesResult from which a pandas DataFrame of
            per-client metric data can be fetched for each analysis
            window. Every such DataFrame follows "the standard format":
            one row per client, a few metadata columns, and one column
            per metric and per sanity-check metric.
            Columns (order not guaranteed):

                * branch (str): The client's branch
                * other columns of ``enrollments``.
                * [metric 1]: The client's value for the first metric in
                  ``metric_list``.
                * ...
                * [metric n]: The client's value for the nth (final)
                  metric in ``metric_list``.
                * [sanity check 1]: The client's value for the first
                  sanity check metric for the first data source that
                  supports sanity checks.
                * ...
                * [sanity check n]: The client's value for the last
                  sanity check metric for the last data source that
                  supports sanity checks.
        """
        ts_limits = TimeLimits.for_ts(
            self.start_date,
            last_date_full_data,
            time_series_period,
            self.num_dates_enrollment,
        )

        query = self.build_query(
            metric_list,
            ts_limits,
            enrollments_query_type,
            custom_enrollments_query,
        )

        # The table name embeds a hash of the SQL, so a different query
        # yields a different table name.
        name_parts = [last_date_full_data, self.experiment_slug, hash_ish(query)]
        results_table = sanitize_table_name_for_bq('_'.join(name_parts))

        bq_context.run_query(query, results_table)

        qualified_results = bq_context.fully_qualify_table_name(results_table)
        return TimeSeriesResult(
            fully_qualified_table_name=qualified_results,
            analysis_windows=ts_limits.analysis_windows,
        )
Exemplo n.º 3
0
    def get_single_window_data(
        self,
        bq_context,
        metric_list,
        last_date_full_data,
        analysis_start_days,
        analysis_length_days,
        enrollments_query_type="normandy",
        custom_enrollments_query=None,
        custom_exposure_query=None,
        exposure_signal=None,
        segment_list=None,
    ):
        """Fetch per-client metric values for one analysis window.

        The values are also written to a permanent BigQuery table whose
        name gets printed. Calling this function again will simply read
        the results back from that table.

        Args:
            bq_context (BigQueryContext): BigQuery configuration and client.
            metric_list (list of mozanalysis.metric.Metric): Metrics to
                compute for every client.
            last_date_full_data (str): Latest date with complete data,
                e.g. '2019-03-22'. To disregard everything collected
                after some date (for instance the date the experiment
                recipe was deactivated), pass that date here.
            analysis_start_days (int): Where the analysis window begins,
                counted in 'days since the client enrolled'. Data
                collected outside the window is ignored.
            analysis_length_days (int): How many days the analysis window
                spans.
            enrollments_query_type ('normandy', 'glean-event' or 'fenix-fallback'):
                Which built-in query is used to find the experiment's
                enrollments. Ignored when ``custom_enrollments_query``
                is supplied.
            custom_enrollments_query (str): Complete SQL query that is
                embedded in the main query as::

                    WITH raw_enrollments AS ({custom_enrollments_query})

                N.B. the rows it returns must be uniquely keyed by
                (client_id, branch); otherwise the results will be subtly
                wrong.

            custom_exposure_query (str): Complete SQL query that is
                embedded in the main query as::

                    WITH ...
                    exposures AS ({custom_exposure_query})

                When omitted, exposure is derived from `exposure_signal`
                if that is given, and otherwise from Normandy and Nimbus
                exposure events. `custom_exposure_query` wins over
                `exposure_signal` when both are passed.

            exposure_signal (ExposureSignal): Optional definition of the
                moment a client counts as exposed to the experiment. When
                omitted, Normandy exposure events are used for desktop and
                Nimbus exposure events for Fenix and iOS.
            segment_list (list of mozanalysis.segment.Segment): User
                segments to study.

        Returns:
            A pandas DataFrame of experiment data with one row per
            ``client_id``: a few metadata columns, one column per metric
            in ``metric_list``, and one column per sanity-check metric.
            Columns (order not guaranteed):

                * client_id (str): Not necessary for "happy path" analyses.
                * branch (str): The client's branch
                * other columns of ``enrollments``.
                * [metric 1]: The client's value for the first metric in
                  ``metric_list``.
                * ...
                * [metric n]: The client's value for the nth (final)
                  metric in ``metric_list``.
                * [sanity check 1]: The client's value for the first
                  sanity check metric for the first data source that
                  supports sanity checks.
                * ...
                * [sanity check n]: The client's value for the last
                  sanity check metric for the last data source that
                  supports sanity checks.

            This layout - the schema together with the guarantee of one
            row per enrolled client, whether or not the client has data
            in ``data_source`` - was agreed upon by the DS team and is
            the standard format for queried experimental data.
        """
        window_limits = TimeLimits.for_single_analysis_window(
            self.start_date,
            last_date_full_data,
            analysis_start_days,
            analysis_length_days,
            self.num_dates_enrollment,
        )

        # Stage 1: run the enrollments query. The table name embeds a hash of
        # the SQL, so a different query yields a different table name.
        enrollments_query = self.build_enrollments_query(
            time_limits=window_limits,
            enrollments_query_type=enrollments_query_type,
            custom_enrollments_query=custom_enrollments_query,
            custom_exposure_query=custom_exposure_query,
            exposure_signal=exposure_signal,
            segment_list=segment_list,
        )
        enrollments_name_parts = [
            last_date_full_data,
            "enrollments",
            self.experiment_slug,
            hash_ish(enrollments_query),
        ]
        enrollments_table = sanitize_table_name_for_bq(
            "_".join(enrollments_name_parts)
        )
        bq_context.run_query(enrollments_query, enrollments_table)

        # Stage 2: run the metrics query, which reads the enrollments table
        # through its fully qualified name.
        qualified_enrollments = bq_context.fully_qualify_table_name(enrollments_table)
        metrics_query = self.build_metrics_query(
            metric_list=metric_list,
            time_limits=window_limits,
            enrollments_table=qualified_enrollments,
        )
        results_name_parts = [
            last_date_full_data,
            self.experiment_slug,
            hash_ish(metrics_query),
        ]
        results_table = sanitize_table_name_for_bq("_".join(results_name_parts))

        query_result = bq_context.run_query(metrics_query, results_table)
        return query_result.to_dataframe()
Exemplo n.º 4
0
    def get_single_window_data(self,
                               bq_context,
                               metric_list,
                               last_date_full_data,
                               analysis_start_days,
                               analysis_length_days,
                               enrollments_query_type='normandy',
                               custom_enrollments_query=None):
        """Fetch per-client metric values for one analysis window.

        The values are also written to a permanent BigQuery table whose
        name gets printed. Calling this function again will simply read
        the results back from that table.

        Args:
            bq_context (BigQueryContext): BigQuery configuration and client.
            metric_list (list of mozanalysis.metric.Metric): Metrics to
                compute for every client.
            last_date_full_data (str): Latest date with complete data,
                e.g. '2019-03-22'. To disregard everything collected
                after some date (for instance the date the experiment
                recipe was deactivated), pass that date here.
            analysis_start_days (int): Where the analysis window begins,
                counted in 'days since the client enrolled'. Data
                collected outside the window is ignored.
            analysis_length_days (int): How many days the analysis window
                spans.
            enrollments_query_type (str): Which query type is used to
                find the experiment's enrollments. Ignored when
                custom_enrollments_query is supplied.
            custom_enrollments_query (str): Complete SQL query that is
                embedded in the main query as::

                    WITH raw_enrollments AS ({custom_enrollments_query})

        Returns:
            A pandas DataFrame of experiment data with one row per
            ``client_id``: a few metadata columns, one column per metric
            in ``metric_list``, and one column per sanity-check metric.
            Columns (order not guaranteed):

                * client_id (str): Not necessary for "happy path" analyses.
                * branch (str): The client's branch
                * other columns of ``enrollments``.
                * [metric 1]: The client's value for the first metric in
                  ``metric_list``.
                * ...
                * [metric n]: The client's value for the nth (final)
                  metric in ``metric_list``.
                * [sanity check 1]: The client's value for the first
                  sanity check metric for the first data source that
                  supports sanity checks.
                * ...
                * [sanity check n]: The client's value for the last
                  sanity check metric for the last data source that
                  supports sanity checks.

            This layout - the schema together with the guarantee of one
            row per enrolled client, whether or not the client has data
            in ``data_source`` - was agreed upon by the DS team and is
            the standard format for queried experimental data.
        """
        window_limits = TimeLimits.for_single_analysis_window(
            self.start_date,
            last_date_full_data,
            analysis_start_days,
            analysis_length_days,
            self.num_dates_enrollment,
        )

        query = self.build_query(
            metric_list,
            window_limits,
            enrollments_query_type,
            custom_enrollments_query,
        )

        # The table name embeds a hash of the SQL, so a different query
        # yields a different table name.
        name_parts = [last_date_full_data, self.experiment_slug, hash_ish(query)]
        results_table = sanitize_table_name_for_bq('_'.join(name_parts))

        query_result = bq_context.run_query(query, results_table)
        return query_result.to_dataframe()