Python DataSource示例，mozanalysis.metrics.DataSource Python示例

示例#1

0

显示文件

def test_process_metrics(spark):
    """Check that ``_process_metrics`` groups metrics by data source.

    Three metrics are built over two fixture data sources (two on
    ``ds_df_A``, one on ``ds_df_B``). The result is indexed by the
    underlying dataframes and is expected to hold one entry per data
    source, with the ``m3`` column aliased to the metric name.
    """
    enrollment_exp = Experiment('a-stub', '20190101', num_dates_enrollment=8)
    enrollments = enrollment_exp.get_enrollments(
        spark, _get_enrollment_view(slug="a-stub"))

    # Register both fixture tables, then wrap each one in a DataSource.
    df_a = register_data_source_fixture(spark, name='ds_df_A')
    df_b = register_data_source_fixture(spark, name='ds_df_B')
    source_a = DataSource.from_dataframe('ds_df_A', df_a)
    source_b = DataSource.from_dataframe('ds_df_B', df_b)

    metric_list = [
        Metric.from_col('m1', df_a.numeric_col, source_a),
        Metric.from_col('m2', df_a.bool_col, source_a),
        Metric.from_col('m3', df_b.numeric_col, source_b),
    ]

    # A second Experiment, created without num_dates_enrollment, does the
    # processing.
    grouped = Experiment('a-stub', '20190101')._process_metrics(
        enrollments, metric_list)

    assert len(grouped) == 2
    assert len(grouped[df_a]) == 2
    assert len(grouped[df_b]) == 1

    m3_repr = repr(grouped[df_b][0])
    assert 'numeric_col' in m3_repr
    assert '`m3`' in m3_repr
    assert m3_repr in {
        "Column<b'numeric_col AS `m3`'>",  # py3
        "Column<numeric_col AS `m3`>",  # py2
    }

示例#2

0

显示文件

def test_complains_about_template_without_default():
    """A ``from_expr`` containing ``{dataset}`` requires ``default_dataset``.

    Construction is expected to raise ``ValueError`` when no
    ``default_dataset`` is given, and to succeed when one is.
    """
    templated = {
        "name": "foo",
        "from_expr": "moz-fx-data-shared-prod.{dataset}.foo",
    }

    # No default_dataset: construction is expected to fail.
    with pytest.raises(ValueError):
        DataSource(**templated)

    # Same arguments plus a default: must construct without raising.
    DataSource(default_dataset="dataset", **templated)

示例#3

0

显示文件

文件： test_metrics.py 项目： mmccorks/mozanalysis

def test_datasource_constructor_fails(name, from_expr, experiments_column_type, error):
    """Expect ``DataSource`` construction with these arguments to raise ``error``.

    The arguments are not produced inside this function; presumably they
    come from a parametrization that is not visible here.
    """
    ctor_kwargs = dict(
        name=name,
        from_expr=from_expr,
        experiments_column_type=experiments_column_type,
    )
    with pytest.raises(error):
        DataSource(**ctor_kwargs)

示例#4

0

显示文件

def _get_metrics(spark):
    """Return a name -> ``Metric`` mapping over the sample data source.

    The only entry, ``'how_many_ones'``, applies ``agg_sum`` to the
    ``constant_one`` column of the dataframe from ``_get_data_source_df``.
    """
    source_df = _get_data_source_df(spark)
    source = DataSource.from_dataframe('bla_ds', source_df)

    metric_name = 'how_many_ones'
    summed_ones = agg_sum(source_df.constant_one)
    return {metric_name: Metric.from_col(metric_name, summed_ones, source)}

示例#5

0

显示文件

    def test_to_mozanalysis_metric(self):
        """Convert a ``Metric`` via ``to_mozanalysis_metric`` and check the result."""
        source = DataSource(name="test_data_source", from_expr="test.test")
        metric = Metric(
            name="test",
            data_source=source,
            select_expression="test",
            analysis_bases=[AnalysisBasis.EXPOSURES],
        )

        converted = metric.to_mozanalysis_metric()

        assert converted
        assert converted.name == metric.name
        # This checks the source metric's bases, not the converted object's.
        assert metric.analysis_bases == [AnalysisBasis.EXPOSURES]

示例#6

0

显示文件

def test_process_metrics_dupe_data_source(spark):
    """Two ``DataSource`` objects over one dataframe yield a single entry.

    Both metrics use separate ``DataSource`` instances built from the same
    fixture dataframe; ``_process_metrics`` is expected to return one entry,
    indexed by that dataframe, holding both metric columns.
    """
    enrollment_exp = Experiment('a-stub', '20190101', num_dates_enrollment=8)
    enrollments = enrollment_exp.get_enrollments(
        spark, _get_enrollment_view(slug="a-stub"))

    shared_df = register_data_source_fixture(spark, name='ds_df_A')

    # Same name and same dataframe, but two distinct DataSource instances.
    first_source = DataSource.from_dataframe('ds_df_A', shared_df)
    second_source = DataSource.from_dataframe('ds_df_A', shared_df)

    metric_list = [
        Metric.from_col('m1', shared_df.numeric_col, first_source),
        Metric.from_col('m2', shared_df.bool_col, second_source),
    ]

    grouped = Experiment('a-stub', '20190101')._process_metrics(
        enrollments, metric_list)

    assert len(grouped) == 1
    assert len(grouped[shared_df]) == 2

示例#7

0

显示文件

def _make_metric_list(report):
    metric_list = list()
    for metric in report['metrics']:
        try:
            metric_list.append(getattr(desktop, metric))
        except AttributeError:
            print(f'`{metric}` is not a pre-defined Metric. Will skip')
    if 'user_defined_metrics' in report:
        for data_source, data_source_metrics \
                in report['user_defined_metrics'].items():
            if not getattr(desktop, data_source, None):
                from_expr = report['user_defined_data_source'][data_source]
                data_source = DataSource(name=data_source,
                                         from_expr=from_expr,
                                         experiments_column_type='native')
            else:
                data_source = getattr(desktop, data_source)
            for key, select_expr in data_source_metrics.items():
                new_metric = Metric(name=key,
                                    data_source=data_source,
                                    select_expr=select_expr)
                metric_list.append(new_metric)

    return metric_list

示例#8

0

显示文件

文件： desktop.py 项目： felixlawrence/mozanalysis

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# from pyspark.sql import functions as F

from mozanalysis.metrics import Metric, DataSource, agg_sum, agg_any
from mozanalysis.utils import all_  # , any_

# Module-level DataSources, each created from the table name it is bound to.
clients_daily = DataSource.from_table_name('clients_daily')
main_summary = DataSource.from_table_name('main_summary')
search_clients_daily = DataSource.from_table_name('search_clients_daily')
events = DataSource.from_table_name('events')


@DataSource.from_func()
def telemetry_shield_study_parquet(spark, experiment):
    """DataSource commonly used with addon studies.

    Used when we need to collect experiment-specific telemetry. We
    filter to just include the data submitted by this experiment's
    addon.
    """
    tssp = spark.table('telemetry_shield_study_parquet')

    # Keep only rows whose payload.study_name equals this experiment's slug,
    # and rename the `submission` column to `submission_date_s3`.
    this_exp = tssp.filter(tssp.payload.study_name ==
                           experiment.experiment_slug).withColumnRenamed(
                               'submission', 'submission_date_s3')

    # No addon version configured: return the slug-filtered rows as they are.
    if experiment.addon_version is None:
        return this_exp
    # NOTE(review): when addon_version is set, nothing visible here returns a
    # value, so the function yields None. The rest of the body may be missing
    # from this excerpt; confirm against the full file.

示例#9

0

显示文件

文件： test_analysis_integration.py 项目： mozilla/jetstream

    def test_logging(self, monkeypatch, client, project_id, static_dataset,
                     temporary_dataset):
        """Run a mocked analysis with BigQuery logging and check an error row.

        Builds a two-branch experiment with one weekly ``active_hours``
        summary, runs it through ``self.analysis_mock_run`` with a
        ``LogConfiguration`` targeting the ``logs`` table of
        ``temporary_dataset``, then asserts that the table exists and that
        its first ``ERROR`` row mentions the failed ``bootstrap_mean``
        statistic.
        """
        experiment = Experiment(
            experimenter_slug="test-experiment",
            type="rollout",
            status="Live",
            start_date=dt.datetime(2020, 3, 30, tzinfo=pytz.utc),
            end_date=dt.datetime(2020, 6, 1, tzinfo=pytz.utc),
            proposed_enrollment=7,
            branches=[
                Branch(slug="branch1", ratio=0.5),
                Branch(slug="branch2", ratio=0.5)
            ],
            reference_branch="branch2",
            normandy_slug="test-experiment",
            is_high_population=False,
            app_name="firefox_desktop",
            app_id="firefox-desktop",
        )

        config = AnalysisSpec().resolve(experiment)

        # Point the metric at the test copy of clients_daily in this project.
        test_clients_daily = DataSource(
            name="clients_daily",
            from_expr=f"`{project_id}.test_data.clients_daily`",
        )

        test_active_hours = Metric(
            name="active_hours",
            data_source=test_clients_daily,
            select_expression=agg_sum("active_hours_sum"),
        )

        # NOTE(review): confidence_interval=10 is presumably an invalid value
        # chosen so the statistic fails and yields the ERROR log asserted
        # below. Confirm against BootstrapMean.
        config.metrics = {
            AnalysisPeriod.WEEK: [
                Summary(test_active_hours,
                        BootstrapMean(confidence_interval=10))
            ]
        }

        log_config = LogConfiguration(
            log_project_id=project_id,
            log_dataset_id=temporary_dataset,
            log_table_id="logs",
            log_to_bigquery=True,
            task_profiling_log_table_id="task_profiling_logs",
            task_monitoring_log_table_id="task_monitoring_logs",
            capacity=1,
        )
        self.analysis_mock_run(monkeypatch, config, static_dataset,
                               temporary_dataset, project_id, log_config)

        assert client.client.get_table(
            f"{project_id}.{temporary_dataset}.logs") is not None

        logs = list(
            client.client.list_rows(f"{project_id}.{temporary_dataset}.logs"))

        assert len(logs) >= 1
        # Keep only the rows logged at ERROR level; indexing [0] below raises
        # IndexError if there are none.
        error_logs = [log for log in logs if log.get("log_level") == "ERROR"]
        assert (
            "Error while computing statistic bootstrap_mean for metric active_hours"
            in error_logs[0].get("message"))
        assert error_logs[0].get("log_level") == "ERROR"

示例#10

0

显示文件

文件： test_analysis_integration.py 项目： mozilla/jetstream

    def test_with_segments(self, monkeypatch, client, project_id,
                           static_dataset, temporary_dataset):
        """Run a mocked analysis with one segment and check the output tables.

        Attaches a ``regular_user_v3`` segment to the experiment config, runs
        the weekly ``active_hours`` summary through
        ``self.analysis_mock_run``, then checks the week-1 enrollments rows
        (including the segment column), the existence of the weekly and
        statistics tables, and the per-branch ``count`` statistics.
        """
        experiment = Experiment(
            experimenter_slug="test-experiment",
            type="rollout",
            status="Live",
            start_date=dt.datetime(2020, 3, 30, tzinfo=pytz.utc),
            end_date=dt.datetime(2020, 6, 1, tzinfo=pytz.utc),
            proposed_enrollment=7,
            branches=[
                Branch(slug="branch1", ratio=0.5),
                Branch(slug="branch2", ratio=0.5)
            ],
            reference_branch="branch2",
            normandy_slug="test-experiment",
            is_high_population=False,
            app_name="firefox_desktop",
            app_id="firefox-desktop",
        )

        config = AnalysisSpec().resolve(experiment)

        # Point the metric at the test copy of clients_daily in this project.
        test_clients_daily = DataSource(
            name="clients_daily",
            from_expr=f"`{project_id}.test_data.clients_daily`",
        )

        test_active_hours = Metric(
            name="active_hours",
            data_source=test_clients_daily,
            select_expression=agg_sum("active_hours_sum"),
        )

        # Segment defined over the test clients_last_seen table; it becomes
        # the only segment on the experiment config.
        test_clients_last_seen = SegmentDataSource(
            "clients_last_seen", f"`{project_id}.test_data.clients_last_seen`")
        regular_user_v3 = Segment(
            "regular_user_v3",
            test_clients_last_seen,
            "COALESCE(LOGICAL_OR(is_regular_user_v3), FALSE)",
        )
        config.experiment.segments = [regular_user_v3]

        config.metrics = {
            AnalysisPeriod.WEEK: [Summary(test_active_hours, BootstrapMean())]
        }

        self.analysis_mock_run(monkeypatch, config, static_dataset,
                               temporary_dataset, project_id)

        # Read back the week-1 enrollments table, latest enrollment first.
        query_job = client.client.query(f"""
            SELECT
              *
            FROM `{project_id}.{temporary_dataset}.test_experiment_enrollments_week_1`
            ORDER BY enrollment_date DESC
        """)

        expected_metrics_results = [
            {
                "client_id": "bbbb",
                "branch": "branch2",
                "enrollment_date": datetime.date(2020, 4, 3),
                "num_enrollment_events": 1,
                "analysis_window_start": 0,
                "analysis_window_end": 6,
                "regular_user_v3": True,
            },
            {
                "client_id": "aaaa",
                "branch": "branch1",
                "enrollment_date": datetime.date(2020, 4, 2),
                "num_enrollment_events": 1,
                "analysis_window_start": 0,
                "analysis_window_end": 6,
                "regular_user_v3": False,
            },
        ]

        # Compare only the expected keys of each returned row, by position.
        # More rows than expected would raise IndexError; fewer rows are not
        # detected by this loop.
        for i, row in enumerate(query_job.result()):
            for k, v in expected_metrics_results[i].items():
                assert row[k] == v

        assert (client.client.get_table(
            f"{project_id}.{temporary_dataset}.test_experiment_enrollments_weekly"
        ) is not None)
        assert (client.client.get_table(
            f"{project_id}.{temporary_dataset}.statistics_test_experiment_week_1"
        ) is not None)

        stats = client.client.list_rows(
            f"{project_id}.{temporary_dataset}.statistics_test_experiment_week_1"
        ).to_dataframe()

        # Only one count per segment and branch, please
        assert (stats.query(
            "metric == 'identity' and statistic == 'count'").groupby(
                ["segment", "analysis_basis", "window_index",
                 "branch"]).size() == 1).all()

        # Counts for the 'all' segment, indexed by branch.
        count_by_branch = stats.query(
            "segment == 'all' and statistic == 'count'").set_index("branch")
        assert count_by_branch.loc["branch1", "point"] == 1.0
        assert count_by_branch.loc["branch2", "point"] == 1.0
        assert count_by_branch.loc["branch2",
                                   "analysis_basis"] == "enrollments"

示例#11

0

显示文件

    def test_metrics(self, client, project_id, static_dataset,
                     temporary_dataset):
        """Run a mocked weekly analysis and check the produced tables.

        Runs a single ``active_hours`` summary through
        ``self.analysis_mock_run``, then checks the week-1 rows, the
        existence of the weekly and statistics tables, and that each branch
        has a ``count`` statistic of 1.0.
        """
        experiment = Experiment(
            experimenter_slug="test-experiment",
            type="rollout",
            status="Live",
            start_date=dt.datetime(2020, 3, 30, tzinfo=pytz.utc),
            end_date=dt.datetime(2020, 6, 1, tzinfo=pytz.utc),
            proposed_enrollment=7,
            branches=[
                Branch(slug="branch1", ratio=0.5),
                Branch(slug="branch2", ratio=0.5)
            ],
            reference_branch="branch2",
            features=[],
            normandy_slug="test-experiment",
        )

        config = AnalysisSpec().resolve(experiment)

        # Point the metric at the test copy of clients_daily in this project.
        test_clients_daily = DataSource(
            name="clients_daily",
            from_expr=f"`{project_id}.test_data.clients_daily`",
        )

        test_active_hours = Metric(
            name="active_hours",
            data_source=test_clients_daily,
            select_expr=agg_sum("active_hours_sum"),
        )

        config.metrics = {
            AnalysisPeriod.WEEK: [Summary(test_active_hours, BootstrapMean())]
        }

        self.analysis_mock_run(config, static_dataset, temporary_dataset,
                               project_id)

        # Read back the week-1 table, latest enrollment first.
        query_job = client.client.query(f"""
            SELECT
              *
            FROM `{project_id}.{temporary_dataset}.test_experiment_week_1`
            ORDER BY enrollment_date DESC
        """)

        expected_metrics_results = [
            {
                "client_id": "bbbb",
                "branch": "branch2",
                "enrollment_date": datetime.date(2020, 4, 3),
                "num_enrollment_events": 1,
                "analysis_window_start": 0,
                "analysis_window_end": 6,
            },
            {
                "client_id": "aaaa",
                "branch": "branch1",
                "enrollment_date": datetime.date(2020, 4, 2),
                "num_enrollment_events": 1,
                "analysis_window_start": 0,
                "analysis_window_end": 6,
            },
        ]

        # Compare only the expected keys of each returned row, by position.
        # More rows than expected would raise IndexError; fewer rows are not
        # detected by this loop.
        for i, row in enumerate(query_job.result()):
            for k, v in expected_metrics_results[i].items():
                assert row[k] == v

        assert (client.client.get_table(
            f"{project_id}.{temporary_dataset}.test_experiment_weekly")
                is not None)
        assert (client.client.get_table(
            f"{project_id}.{temporary_dataset}.statistics_test_experiment_week_1"
        ) is not None)

        stats = client.client.list_rows(
            f"{project_id}.{temporary_dataset}.statistics_test_experiment_week_1"
        ).to_dataframe()

        # Count statistics, indexed by branch.
        count_by_branch = stats.query("statistic == 'count'").set_index(
            "branch")
        assert count_by_branch.loc["branch1", "point"] == 1.0
        assert count_by_branch.loc["branch2", "point"] == 1.0

        assert (client.client.get_table(
            f"{project_id}.{temporary_dataset}.statistics_test_experiment_weekly"
        ) is not None)

示例#12

0

显示文件

文件： desktop.py 项目： mozilla/mozanalysis

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
from textwrap import dedent

from mozanalysis.metrics import DataSource, Metric, agg_any, agg_sum

#: DataSource: The clients_daily table.
clients_daily = DataSource(
    name="clients_daily",
    from_expr="mozdata.telemetry.clients_daily",
)

#: DataSource: The `search_clients_engines_sources_daily`_ table.
#: This table unpacks search counts from the main ping;
#: it contains one row per (client_id, submission_date, engine, source).
#:
#: .. _`search_clients_engines_sources_daily`: https://docs.telemetry.mozilla.org/
#:    datasets/search/search_clients_engines_sources_daily/reference.html
search_clients_engines_sources_daily = DataSource(
    name="search_clients_engines_sources_daily",
    from_expr="mozdata.search.search_clients_engines_sources_daily",
    experiments_column_type=None,
)

#: DataSource: An alias of `search_clients_engines_sources_daily`
#: (the very same object, not a copy).
#: Exists for backwards compatibility; new uses should use the new name.
search_clients_daily = search_clients_engines_sources_daily

#: DataSource: The main_summary table.
main_summary = DataSource(name="main_summary",

示例#13

0

显示文件

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

from mozanalysis.metrics import Metric, DataSource, agg_sum, agg_any

# DataSource over the shared-prod telemetry.clients_daily table.
clients_daily = DataSource(
    name='clients_daily',
    from_expr="`moz-fx-data-shared-prod.telemetry.clients_daily`",
)

# DataSource over the shared-prod search.search_clients_daily table.
# NOTE(review): experiments_column_type=None presumably means the table has
# no experiments column. Confirm against DataSource.
search_clients_daily = DataSource(
    name='search_clients_daily',
    from_expr='`moz-fx-data-shared-prod.search.search_clients_daily`',
    experiments_column_type=None,
)

# DataSource over the shared-prod telemetry.main_summary table.
main_summary = DataSource(
    name='main_summary',
    from_expr="`moz-fx-data-shared-prod.telemetry.main_summary`")

# DataSource over the shared-prod telemetry.events table; its experiments
# column type is declared as 'native'.
events = DataSource(
    name='events',
    from_expr="`moz-fx-data-shared-prod.telemetry.events`",
    experiments_column_type='native',
)

# The telemetry.events table is clustered by event_category.
# Normandy accounts for about 10% of event volume, so this dramatically
# reduces bytes queried compared to counting rows from the generic events DataSource.
normandy_events = DataSource(

示例#14

0

显示文件

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

from mozanalysis.metrics import Metric, DataSource, agg_sum, agg_any

# DataSource over the shared-prod telemetry.clients_daily table.
clients_daily = DataSource(
    name='clients_daily',
    from_expr="`moz-fx-data-shared-prod.telemetry.clients_daily`",
)

# DataSource over the shared-prod search.search_clients_daily table.
# NOTE(review): experiments_column_type=None presumably means the table has
# no experiments column. Confirm against DataSource.
search_clients_daily = DataSource(
    name='search_clients_daily',
    from_expr='`moz-fx-data-shared-prod.search.search_clients_daily`',
    experiments_column_type=None,
)

# DataSource over the shared-prod telemetry.main_summary table.
main_summary = DataSource(
    name='main_summary',
    from_expr="`moz-fx-data-shared-prod.telemetry.main_summary`")

# DataSource over the shared-prod telemetry.events table; its experiments
# column type is declared as 'native'.
events = DataSource(
    name='events',
    from_expr="`moz-fx-data-shared-prod.telemetry.events`",
    experiments_column_type='native',
)

main = DataSource(
    name='main',
    from_expr="""(
                SELECT

示例#15

0

显示文件

def test_get_per_client_data_join(spark):
    """Exercise the enrollments/data-source join in ``get_per_client_data``.

    Eight enrollments are joined against a small data source with the
    arguments ``'20190114', 1, 3``. The result must keep one row per
    enrollment, report 0 for clients without usable data, and aggregate
    ``some_value`` for the rest. The same call is then repeated for a data
    source without the ``experiments`` column.
    """
    exp = Experiment('a-stub', '20190101')

    enrollment_columns = ["client_id", "branch", "enrollment_date"]
    enrollment_rows = [
        ['aaaa', 'control', '20190101'],
        ['bbbb', 'test', '20190101'],
        ['cccc', 'control', '20190108'],
        ['dddd', 'test', '20190109'],
        ['annie-nodata', 'control', '20190101'],
        ['bob-badtiming', 'test', '20190102'],
        ['carol-gooddata', 'test', '20190101'],
        ['derek-lateisok', 'control', '20190110'],
    ]
    enrollments = spark.createDataFrame(enrollment_rows, enrollment_columns)

    experiments_map = {'a-stub': 'fake-branch-lifes-too-short'}
    data_source_rows = [
        # bob-badtiming only has data before/after the analysis window,
        # yet `process_data_source` does not filter it out
        ['bob-badtiming', '20190102', experiments_map, 1],
        ['bob-badtiming', '20190106', experiments_map, 2],
        # carol-gooddata has data on two days (including a dupe day)
        ['carol-gooddata', '20190102', experiments_map, 3],
        ['carol-gooddata', '20190102', experiments_map, 2],
        ['carol-gooddata', '20190104', experiments_map, 6],
        # derek-lateisok has data before and during the analysis window
        ['derek-lateisok', '20190110', experiments_map, 1000],
        ['derek-lateisok', '20190111', experiments_map, 1],
        # TODO: exercise the last condition on the join
    ]
    data_source_df = spark.createDataFrame(
        data_source_rows,
        ["client_id", "submission_date_s3", "experiments", "some_value"],
    )

    ds = DataSource.from_dataframe('ds', data_source_df)
    metric = Metric.from_col(
        'some_value', agg_sum(data_source_df.some_value), ds)

    def per_client(metrics):
        # Same fixed arguments for both data-source variants.
        return exp.get_per_client_data(
            enrollments, metrics, '20190114', 1, 3, keep_client_id=True)

    res = per_client([metric])

    def only_row(client_id):
        # Each enrolled client must appear exactly once in the result.
        rows = res.filter(res.client_id == client_id)
        assert rows.count() == 1
        return rows.first()

    # The result has one row per enrollment.
    assert res.count() == enrollments.count()

    # Enrollments without any data are kept, with a value of 0.
    assert only_row('annie-nodata')['some_value'] == 0

    # Early and late data were ignored,
    # i.e. check the join, not just _process_data_source_df.
    assert only_row('bob-badtiming')['some_value'] == 0
    # _process_data_source_df on its own still keeps bob's rows, so it
    # didn't do the heavy lifting above.
    time_limits = TimeLimits.for_single_analysis_window(
        exp.start_date, '20190114', 1, 3, exp.num_dates_enrollment)
    pds = exp._process_data_source_df(data_source_df, time_limits)
    bob_total = pds.filter(pds.client_id == 'bob-badtiming').select(
        F.sum(pds.some_value).alias('agg_val')).first()['agg_val']
    assert bob_total == 3

    # Relevant data was included appropriately.
    assert only_row('carol-gooddata')['some_value'] == 11
    assert only_row('derek-lateisok')['some_value'] == 1

    # It still works for `data_source`s without an experiments map.
    ds_df_noexp = data_source_df.drop('experiments')
    ds_noexp = DataSource.from_dataframe('ds_noexp', ds_df_noexp)
    metric_noexp = Metric.from_col(
        'some_value', agg_sum(ds_df_noexp.some_value), ds_noexp)

    res2 = per_client([metric_noexp])

    assert res2.count() == enrollments.count()

示例#16

0

显示文件

文件： test_analysis_integration.py 项目： mozilla/jetstream

    def test_metrics_with_exposure(self, monkeypatch, client, project_id,
                                   static_dataset, temporary_dataset):
        """Run a mocked analysis on the exposures basis and check its tables.

        The ``active_hours`` metric is restricted to
        ``AnalysisBasis.EXPOSURES`` and the experiment config gets an
        ``ExposureSignal`` over the same data source. After
        ``self.analysis_mock_run`` the week-1 exposures rows and the
        existence of the weekly and statistics tables are checked.
        """
        experiment = Experiment(
            experimenter_slug="test-experiment",
            type="rollout",
            status="Live",
            start_date=dt.datetime(2020, 3, 30, tzinfo=pytz.utc),
            end_date=dt.datetime(2020, 6, 1, tzinfo=pytz.utc),
            proposed_enrollment=7,
            branches=[
                Branch(slug="branch1", ratio=0.5),
                Branch(slug="branch2", ratio=0.5)
            ],
            reference_branch="branch2",
            normandy_slug="test-experiment",
            is_high_population=False,
            app_name="firefox_desktop",
            app_id="firefox-desktop",
        )

        config = AnalysisSpec().resolve(experiment)

        # Point the metric at the test copy of clients_daily in this project.
        test_clients_daily = DataSource(
            name="clients_daily",
            from_expr=f"`{project_id}.test_data.clients_daily`",
        )

        test_active_hours = Metric(
            name="active_hours",
            data_source=test_clients_daily,
            select_expression=agg_sum("active_hours_sum"),
            analysis_bases=[AnalysisBasis.EXPOSURES],
        )

        config.metrics = {
            AnalysisPeriod.WEEK: [Summary(test_active_hours, BootstrapMean())]
        }
        # Exposure signal defined over the same test data source, selecting
        # on `active_hours_sum > 0`.
        config.experiment.exposure_signal = ExposureSignal(
            name="ad_exposure",
            data_source=test_clients_daily,
            select_expression="active_hours_sum > 0",
            friendly_name="Ad exposure",
            description="Clients have clicked on ad",
            window_start="enrollment_start",
            window_end="analysis_window_end",
        )

        self.analysis_mock_run(monkeypatch, config, static_dataset,
                               temporary_dataset, project_id)

        # Read back the week-1 exposures table, latest enrollment first.
        query_job = client.client.query(f"""
            SELECT
              *
            FROM `{project_id}.{temporary_dataset}.test_experiment_exposures_week_1`
            ORDER BY enrollment_date DESC
        """)

        expected_metrics_results = [
            {
                "client_id": "bbbb",
                "branch": "branch2",
                "enrollment_date": datetime.date(2020, 4, 3),
                "num_enrollment_events": 1,
                "analysis_window_start": 0,
                "analysis_window_end": 6,
            },
            {
                "client_id": "aaaa",
                "branch": "branch1",
                "enrollment_date": datetime.date(2020, 4, 2),
                "num_enrollment_events": 1,
                "analysis_window_start": 0,
                "analysis_window_end": 6,
            },
        ]

        r = query_job.result()

        # Compare only the expected keys of each returned row, by position.
        # More rows than expected would raise IndexError; fewer rows are not
        # detected by this loop.
        for i, row in enumerate(r):
            for k, v in expected_metrics_results[i].items():
                assert row[k] == v

        assert (client.client.get_table(
            f"{project_id}.{temporary_dataset}.test_experiment_exposures_weekly"
        ) is not None)
        assert (client.client.get_table(
            f"{project_id}.{temporary_dataset}.statistics_test_experiment_week_1"
        ) is not None)

        assert (client.client.get_table(
            f"{project_id}.{temporary_dataset}.statistics_test_experiment_weekly"
        ) is not None)

示例#17

0

显示文件

文件： test_metrics.py 项目： mmccorks/mozanalysis

def test_datasource_constructor_succeeds(experiments_column_type):
    """Construct a ``DataSource`` with the supplied ``experiments_column_type``.

    The test passes when construction does not raise.

    Args:
        experiments_column_type: Value forwarded to the ``DataSource``
            constructor; presumably supplied by a parametrization that is
            not visible here.
    """
    DataSource(
        name="foo",
        from_expr="my_table.name",
        # Forward the argument. A hard-coded ``None`` here would ignore it
        # and exercise the same single case on every invocation.
        experiments_column_type=experiments_column_type,
    )

示例#18

0

显示文件

文件： test_analysis_integration.py 项目： mozilla/jetstream

    def test_no_enrollments(self, monkeypatch, client, project_id,
                            static_dataset, temporary_dataset):
        """Run a mocked analysis for an experiment expected to have no rows.

        Uses the ``test-experiment-2`` slug. After
        ``self.analysis_mock_run`` the week-1 enrollments table must be
        empty, both branches must report a ``count`` statistic of 0.0, and
        the weekly statistics table must exist.
        """
        experiment = Experiment(
            experimenter_slug="test-experiment-2",
            type="rollout",
            status="Live",
            start_date=dt.datetime(2020, 3, 30, tzinfo=pytz.utc),
            end_date=dt.datetime(2020, 6, 1, tzinfo=pytz.utc),
            proposed_enrollment=7,
            branches=[
                Branch(slug="a", ratio=0.5),
                Branch(slug="b", ratio=0.5)
            ],
            reference_branch="a",
            normandy_slug="test-experiment-2",
            is_high_population=False,
            app_name="firefox_desktop",
            app_id="firefox-desktop",
        )

        config = AnalysisSpec().resolve(experiment)

        # Point the metric at the test copy of clients_daily in this project.
        test_clients_daily = DataSource(
            name="clients_daily",
            from_expr=f"`{project_id}.test_data.clients_daily`",
        )

        test_active_hours = Metric(
            name="active_hours",
            data_source=test_clients_daily,
            select_expression=agg_sum("active_hours_sum"),
        )

        config.metrics = {
            AnalysisPeriod.WEEK: [Summary(test_active_hours, BootstrapMean())]
        }

        self.analysis_mock_run(monkeypatch, config, static_dataset,
                               temporary_dataset, project_id)

        # The week-1 enrollments table for this experiment must be empty.
        query_job = client.client.query(f"""
            SELECT
              *
            FROM `{project_id}.{temporary_dataset}.test_experiment_2_enrollments_week_1`
            ORDER BY enrollment_date DESC
        """)

        assert query_job.result().total_rows == 0

        stats = client.client.list_rows(
            f"{project_id}.{temporary_dataset}.statistics_test_experiment_2_week_1"
        ).to_dataframe()

        # Count statistics, indexed by branch; both branches must be present
        # with a count of zero.
        count_by_branch = stats.query("statistic == 'count'").set_index(
            "branch")
        assert count_by_branch.loc["a", "point"] == 0.0
        assert count_by_branch.loc["b", "point"] == 0.0
        assert count_by_branch.loc["b", "analysis_basis"] == "enrollments"

        assert (client.client.get_table(
            f"{project_id}.{temporary_dataset}.statistics_test_experiment_2_weekly"
        ) is not None)

示例#19

0

显示文件

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

from mozanalysis.metrics import DataSource, Metric

#: DataSource: The baseline ping table.
#: ``from_expr`` is a subquery that selects every column of the
#: ``baseline`` table and adds ``submission_date``, derived from
#: ``submission_timestamp`` with ``DATE()``.
#: NOTE(review): the ``{dataset}`` placeholder is presumably filled from
#: ``default_dataset`` unless another dataset is supplied. Confirm against
#: ``DataSource``.
baseline = DataSource(
    name="baseline",
    from_expr="""(
                SELECT
                    p.*,
                    DATE(p.submission_timestamp) AS submission_date
                FROM `moz-fx-data-shared-prod.{dataset}.baseline` p
            )""",
    client_id_column="client_info.client_id",
    experiments_column_type="glean",
    default_dataset="org_mozilla_ios_firefox",
)


#: DataSource: Events table.
#: For convenience, this is exploded to one-row-per-event
#: like the ``telemetry.events`` dataset.
events = DataSource(
    name="events",
    from_expr="""(
                SELECT
                    p.* EXCEPT (events),
                    DATE(p.submission_timestamp) AS submission_date,
                    event

示例#20

0

显示文件

文件： test_analysis_integration.py 项目： ksiegler1/pensieve

    def test_metrics(self, client):
        """Run the analysis against the test project and check its tables.

        ``Experiment.build_query`` from mozanalysis is patched so the
        generated SQL points at ``self.project_id`` and
        ``self.static_dataset``. After ``analysis.run`` the week-1 rows and
        the existence of the weekly and statistics tables are checked.
        """
        experiment = Experiment(
            slug="test-experiment",
            type="rollout",
            start_date=dt.datetime(2020, 3, 30, tzinfo=pytz.utc),
            end_date=dt.datetime(2020, 6, 1, tzinfo=pytz.utc),
            proposed_enrollment=7,
            variants=[
                Variant(is_control=False, slug="branch1", ratio=0.5),
                Variant(is_control=True, slug="branch2", ratio=0.5),
            ],
            normandy_slug="test-experiment",
        )

        # Keep a reference to the unpatched query builder for the wrapper.
        orig = mozanalysis.experiment.Experiment.build_query

        def build_query_test_project(instance, *args, **kwargs):
            # to use the test project and dataset, we need to change the SQL query
            # generated by mozanalysis
            # Only the first four positional arguments are forwarded; any
            # further positional arguments and all kwargs are dropped.
            query = orig(instance, args[0], args[1], args[2], args[3])
            query = query.replace("moz-fx-data-shared-prod", self.project_id)
            # Every occurrence of the substring "telemetry" is replaced, not
            # only dataset names.
            query = query.replace("telemetry", self.static_dataset)
            return query

        config = AnalysisSpec().resolve(experiment)

        test_clients_daily = DataSource(
            name="clients_daily", from_expr=f"`{self.project_id}.test_data.clients_daily`",
        )

        test_active_hours = Metric(
            name="active_hours",
            data_source=test_clients_daily,
            select_expr=agg_sum("active_hours_sum"),
        )

        config.metrics = {
            AnalysisPeriod.WEEK: [
                Summary(test_active_hours, BootstrapMean(ref_branch_label="branch1"))
            ]
        }

        analysis = Analysis(self.project_id, self.test_dataset, config)

        # Run with the patched query builder in place for the whole run.
        with mock.patch.object(
            mozanalysis.experiment.Experiment, "build_query", new=build_query_test_project
        ):
            analysis.run(current_date=dt.datetime(2020, 4, 12), dry_run=False)

        # Read back the week-1 table, latest enrollment first.
        query_job = client.query(
            f"""
            SELECT
              *
            FROM `{self.project_id}.{self.test_dataset}.test_experiment_week_1`
            ORDER BY enrollment_date DESC
        """
        )

        expected_metrics_results = [
            {
                "client_id": "bbbb",
                "branch": "branch2",
                "enrollment_date": datetime.date(2020, 4, 3),
                "num_enrollment_events": 1,
                "analysis_window_start": 0,
                "analysis_window_end": 6,
            },
            {
                "client_id": "aaaa",
                "branch": "branch1",
                "enrollment_date": datetime.date(2020, 4, 2),
                "num_enrollment_events": 1,
                "analysis_window_start": 0,
                "analysis_window_end": 6,
            },
        ]

        # Compare only the expected keys of each returned row, by position.
        # More rows than expected would raise IndexError; fewer rows are not
        # detected by this loop.
        for i, row in enumerate(query_job.result()):
            for k, v in expected_metrics_results[i].items():
                assert row[k] == v

        assert (
            client.get_table(f"{self.project_id}.{self.test_dataset}.test_experiment_weekly")
            is not None
        )
        assert (
            client.get_table(
                f"{self.project_id}.{self.test_dataset}.statistics_test_experiment_week_1"
            )
            is not None
        )
        assert (
            client.get_table(
                f"{self.project_id}.{self.test_dataset}.statistics_test_experiment_weekly"
            )
            is not None
        )