def active_hours(cd):
    """Aggregate ``cd.active_hours_sum`` with ``agg_sum``.

    The measure comes from ``active_ticks``: at any given moment, a client
    is "active" if there was a keyboard or mouse interaction (click,
    scroll, move) in the previous 5 seconds.
    """
    hours_column = cd.active_hours_sum
    return agg_sum(hours_column)
def _get_metrics(spark):
    """Return a dict holding a single ``how_many_ones`` metric.

    The dataframe from ``_get_data_source_df(spark)`` is wrapped in a
    ``DataSource`` named ``'bla_ds'``; the metric is ``agg_sum`` applied to
    that dataframe's ``constant_one`` column.
    """
    source_df = _get_data_source_df(spark)
    data_source = DataSource.from_dataframe('bla_ds', source_df)
    ones_metric = Metric.from_col(
        'how_many_ones',
        agg_sum(source_df.constant_one),
        data_source,
    )
    return {'how_many_ones': ones_metric}
def test_agg_sum(spark):
    """Check ``mm.agg_sum`` over ``numeric_col`` grouped by ``client_id``."""
    fixture_df = register_fixture(spark)

    grouped = fixture_df.groupBy('client_id')
    aggregated = grouped.agg(
        mm.agg_sum(fixture_df.numeric_col).alias('metric_value')
    )
    per_client = aggregated.toPandas().set_index('client_id').metric_value

    # Expected aggregate for each client id present in the fixture.
    expected_totals = {'aaaa': 2, 'bb': 0, 'ccc': 5, 'dd': 0}
    for client_id, total in expected_totals.items():
        assert per_client[client_id] == total
def test_metric_from_col(spark):
    """Check a metric built with ``mm.Metric.from_col``.

    Verifies the metric's name, that its data source hands back the
    original dataframe, and that aggregating with ``metric.get_col``
    yields the expected per-client values.
    """
    source_df = register_fixture(spark)
    data_source = mm.DataSource.from_dataframe(
        'an_ordinary_data_source', source_df
    )
    summed_col = mm.agg_sum(source_df.numeric_col)
    metric = mm.Metric.from_col('a_special_metric', summed_col, data_source)

    assert metric.name == 'a_special_metric'
    assert metric.data_source.get_dataframe(spark, None) == source_df

    grouped = source_df.groupBy('client_id')
    aggregated = grouped.agg(metric.get_col(spark, None))
    per_client = aggregated.toPandas().set_index('client_id').a_special_metric

    expected_totals = {'aaaa': 2, 'bb': 0, 'ccc': 5, 'dd': 0}
    for client_id, total in expected_totals.items():
        assert per_client[client_id] == total
def search_count(scd):
    """Aggregate the ``sap`` column of ``scd`` with ``agg_sum``."""
    sap_column = scd.sap
    return agg_sum(sap_column)
def test_logging(self, monkeypatch, client, project_id, static_dataset,
                 temporary_dataset):
    """Run an analysis with BigQuery logging enabled and inspect the logs.

    The analysis is configured with a single ``active_hours`` metric
    summarised by ``BootstrapMean(confidence_interval=10)``. After the
    mocked run, the ``logs`` table must exist, contain at least one row,
    and its first ERROR row must mention the failing bootstrap_mean
    computation for ``active_hours``.
    """
    experiment = Experiment(
        experimenter_slug="test-experiment",
        type="rollout",
        status="Live",
        start_date=dt.datetime(2020, 3, 30, tzinfo=pytz.utc),
        end_date=dt.datetime(2020, 6, 1, tzinfo=pytz.utc),
        proposed_enrollment=7,
        branches=[
            Branch(slug="branch1", ratio=0.5),
            Branch(slug="branch2", ratio=0.5),
        ],
        reference_branch="branch2",
        normandy_slug="test-experiment",
        is_high_population=False,
        app_name="firefox_desktop",
        app_id="firefox-desktop",
    )
    config = AnalysisSpec().resolve(experiment)

    test_clients_daily = DataSource(
        name="clients_daily",
        from_expr=f"`{project_id}.test_data.clients_daily`",
    )
    test_active_hours = Metric(
        name="active_hours",
        data_source=test_clients_daily,
        select_expression=agg_sum("active_hours_sum"),
    )
    # NOTE(review): confidence_interval=10 is presumably what provokes the
    # error asserted below -- confirm against BootstrapMean.
    config.metrics = {
        AnalysisPeriod.WEEK: [
            Summary(test_active_hours, BootstrapMean(confidence_interval=10))
        ]
    }

    log_config = LogConfiguration(
        log_project_id=project_id,
        log_dataset_id=temporary_dataset,
        log_table_id="logs",
        log_to_bigquery=True,
        task_profiling_log_table_id="task_profiling_logs",
        task_monitoring_log_table_id="task_monitoring_logs",
        capacity=1,
    )

    self.analysis_mock_run(monkeypatch, config, static_dataset,
                           temporary_dataset, project_id, log_config)

    logs_table = f"{project_id}.{temporary_dataset}.logs"
    assert client.client.get_table(logs_table) is not None

    logs = list(client.client.list_rows(logs_table))
    assert len(logs) >= 1

    error_logs = [log for log in logs if log.get("log_level") == "ERROR"]
    # Fail with a readable message instead of an IndexError on
    # ``error_logs[0]`` when no ERROR row was written.
    assert error_logs, "expected at least one ERROR entry in the logs table"
    assert (
        "Error while computing statistic bootstrap_mean for metric active_hours"
        in error_logs[0].get("message"))
    assert error_logs[0].get("log_level") == "ERROR"
def test_with_segments(self, monkeypatch, client, project_id, static_dataset,
                       temporary_dataset):
    """Run an analysis with a ``regular_user_v3`` segment and check results.

    Checks the rows of the week-1 enrollments table (including the segment
    flag), the existence of the weekly enrollments and week-1 statistics
    tables, and the per-branch / per-segment counts in the statistics.
    """
    branches = [
        Branch(slug="branch1", ratio=0.5),
        Branch(slug="branch2", ratio=0.5),
    ]
    experiment = Experiment(
        experimenter_slug="test-experiment",
        type="rollout",
        status="Live",
        start_date=dt.datetime(2020, 3, 30, tzinfo=pytz.utc),
        end_date=dt.datetime(2020, 6, 1, tzinfo=pytz.utc),
        proposed_enrollment=7,
        branches=branches,
        reference_branch="branch2",
        normandy_slug="test-experiment",
        is_high_population=False,
        app_name="firefox_desktop",
        app_id="firefox-desktop",
    )
    config = AnalysisSpec().resolve(experiment)

    test_clients_daily = DataSource(
        name="clients_daily",
        from_expr=f"`{project_id}.test_data.clients_daily`",
    )
    test_active_hours = Metric(
        name="active_hours",
        data_source=test_clients_daily,
        select_expression=agg_sum("active_hours_sum"),
    )

    test_clients_last_seen = SegmentDataSource(
        "clients_last_seen", f"`{project_id}.test_data.clients_last_seen`")
    regular_user_v3 = Segment(
        "regular_user_v3",
        test_clients_last_seen,
        "COALESCE(LOGICAL_OR(is_regular_user_v3), FALSE)",
    )
    config.experiment.segments = [regular_user_v3]

    weekly_summaries = [Summary(test_active_hours, BootstrapMean())]
    config.metrics = {AnalysisPeriod.WEEK: weekly_summaries}

    self.analysis_mock_run(monkeypatch, config, static_dataset,
                           temporary_dataset, project_id)

    query_job = client.client.query(
        f""" SELECT * FROM `{project_id}.{temporary_dataset}.test_experiment_enrollments_week_1` ORDER BY enrollment_date DESC """
    )

    # Rows are expected newest enrollment first (ORDER BY ... DESC).
    expected_rows = [
        {
            "client_id": "bbbb",
            "branch": "branch2",
            "enrollment_date": datetime.date(2020, 4, 3),
            "num_enrollment_events": 1,
            "analysis_window_start": 0,
            "analysis_window_end": 6,
            "regular_user_v3": True,
        },
        {
            "client_id": "aaaa",
            "branch": "branch1",
            "enrollment_date": datetime.date(2020, 4, 2),
            "num_enrollment_events": 1,
            "analysis_window_start": 0,
            "analysis_window_end": 6,
            "regular_user_v3": False,
        },
    ]
    for position, row in enumerate(query_job.result()):
        expected_row = expected_rows[position]
        for column, value in expected_row.items():
            assert row[column] == value

    weekly_table = (
        f"{project_id}.{temporary_dataset}.test_experiment_enrollments_weekly"
    )
    assert client.client.get_table(weekly_table) is not None

    stats_table = (
        f"{project_id}.{temporary_dataset}.statistics_test_experiment_week_1"
    )
    assert client.client.get_table(stats_table) is not None

    stats = client.client.list_rows(stats_table).to_dataframe()

    # Only one count per segment and branch, please
    identity_counts = stats.query(
        "metric == 'identity' and statistic == 'count'")
    group_sizes = identity_counts.groupby(
        ["segment", "analysis_basis", "window_index", "branch"]).size()
    assert (group_sizes == 1).all()

    count_by_branch = stats.query(
        "segment == 'all' and statistic == 'count'").set_index("branch")
    assert count_by_branch.loc["branch1", "point"] == 1.0
    assert count_by_branch.loc["branch2", "point"] == 1.0
    assert count_by_branch.loc["branch2", "analysis_basis"] == "enrollments"
def test_no_enrollments(self, monkeypatch, client, project_id, static_dataset,
                        temporary_dataset):
    """Run an analysis for ``test-experiment-2`` and expect empty results.

    The week-1 enrollments table must have no rows, the per-branch counts
    in the week-1 statistics must be zero, and the weekly statistics table
    must exist.
    """
    branches = [
        Branch(slug="a", ratio=0.5),
        Branch(slug="b", ratio=0.5),
    ]
    experiment = Experiment(
        experimenter_slug="test-experiment-2",
        type="rollout",
        status="Live",
        start_date=dt.datetime(2020, 3, 30, tzinfo=pytz.utc),
        end_date=dt.datetime(2020, 6, 1, tzinfo=pytz.utc),
        proposed_enrollment=7,
        branches=branches,
        reference_branch="a",
        normandy_slug="test-experiment-2",
        is_high_population=False,
        app_name="firefox_desktop",
        app_id="firefox-desktop",
    )
    config = AnalysisSpec().resolve(experiment)

    test_clients_daily = DataSource(
        name="clients_daily",
        from_expr=f"`{project_id}.test_data.clients_daily`",
    )
    test_active_hours = Metric(
        name="active_hours",
        data_source=test_clients_daily,
        select_expression=agg_sum("active_hours_sum"),
    )
    weekly_summaries = [Summary(test_active_hours, BootstrapMean())]
    config.metrics = {AnalysisPeriod.WEEK: weekly_summaries}

    self.analysis_mock_run(monkeypatch, config, static_dataset,
                           temporary_dataset, project_id)

    query_job = client.client.query(
        f""" SELECT * FROM `{project_id}.{temporary_dataset}.test_experiment_2_enrollments_week_1` ORDER BY enrollment_date DESC """
    )
    assert query_job.result().total_rows == 0

    week_1_stats_table = (
        f"{project_id}.{temporary_dataset}.statistics_test_experiment_2_week_1"
    )
    stats = client.client.list_rows(week_1_stats_table).to_dataframe()

    count_by_branch = stats.query("statistic == 'count'").set_index("branch")
    assert count_by_branch.loc["a", "point"] == 0.0
    assert count_by_branch.loc["b", "point"] == 0.0
    assert count_by_branch.loc["b", "analysis_basis"] == "enrollments"

    weekly_stats_table = (
        f"{project_id}.{temporary_dataset}.statistics_test_experiment_2_weekly"
    )
    assert client.client.get_table(weekly_stats_table) is not None
def organic_search_count(scd):
    """Aggregate the ``organic`` column of ``scd`` with ``agg_sum``."""
    organic_column = scd.organic
    return agg_sum(organic_column)
def a_lovely_metric(df):
    """Hi there!"""
    # The docstring text is left untouched: it is available at runtime as
    # ``__doc__`` and callers may read it.
    numeric_column = df.numeric_col
    return mm.agg_sum(numeric_column)
from_expr="""( SELECT p.*, DATE(p.submission_timestamp) AS submission_date FROM `moz-fx-data-shared-prod.{dataset}.metrics` p )""", client_id_column="client_info.client_id", experiments_column_type="glean", default_dataset="org_mozilla_firefox", ) #: Metric: ... uri_count = Metric( name="uri_count", data_source=baseline, select_expr=agg_sum("metrics.counter.events_total_uri_count"), friendly_name="URIs visited", description="Counts the number of URIs each client visited", ) #: Metric: ... user_reports_site_issue_count = Metric( name="user_reports_site_issue_count", data_source=events, select_expr="COUNTIF(event.name = 'browser_menu_action' AND " + "mozfun.map.get_key('event.extra', 'item') = 'report_site_issue')", friendly_name="Site issues reported", description= "Counts the number of times clients reported an issue with a site.", )
) cfr = DataSource( name='cfr', from_expr="""( SELECT *, DATE(submission_timestamp) AS submission_date FROM `moz-fx-data-derived-datasets`.messaging_system.cfr )""", experiments_column_type="native", ) active_hours = Metric(name='active_hours', data_source=clients_daily, select_expr=agg_sum('active_hours_sum')) uri_count = Metric(name='uri_count', data_source=clients_daily, select_expr=agg_sum( 'scalar_parent_browser_engagement_total_uri_count_sum')) search_count = Metric(name='search_count', data_source=search_clients_daily, select_expr=agg_sum('sap')) tagged_search_count = Metric(name='tagged_search_count', data_source=search_clients_daily, select_expr=agg_sum('tagged_sap')) tagged_follow_on_search_count = Metric(name='tagged_follow_on_search_count',
crash = DataSource( name='crash', from_expr="""( SELECT *, DATE(submission_timestamp) AS submission_date, environment.experiments FROM `moz-fx-data-shared-prod`.telemetry.crash )""", experiments_column_type="native", ) active_hours = Metric(name='active_hours', data_source=clients_daily, select_expr=agg_sum('active_hours_sum')) uri_count = Metric(name='uri_count', data_source=clients_daily, select_expr=agg_sum( 'scalar_parent_browser_engagement_total_uri_count_sum')) search_count = Metric(name='search_count', data_source=search_clients_daily, select_expr=agg_sum('sap')) ad_clicks = Metric(name='ad_clicks', data_source=search_clients_daily, select_expr=agg_sum('ad_click')) organic_search_count = Metric(name='organic_search_count',
def test_get_per_client_data_join(spark):
    """Exercise the enrollment/data-source join in ``get_per_client_data``.

    Builds a small enrollments frame and a data-source frame, then checks
    that every enrollment yields exactly one result row and that each
    client's ``some_value`` matches the expected aggregate. The same call
    is repeated for a data source without an ``experiments`` column.
    """
    exp = Experiment('a-stub', '20190101')

    enrollment_rows = [
        ['aaaa', 'control', '20190101'],
        ['bbbb', 'test', '20190101'],
        ['cccc', 'control', '20190108'],
        ['dddd', 'test', '20190109'],
        ['annie-nodata', 'control', '20190101'],
        ['bob-badtiming', 'test', '20190102'],
        ['carol-gooddata', 'test', '20190101'],
        ['derek-lateisok', 'control', '20190110'],
    ]
    enrollments = spark.createDataFrame(
        enrollment_rows,
        ["client_id", "branch", "enrollment_date"],
    )

    ex_d = {'a-stub': 'fake-branch-lifes-too-short'}
    data_rows = [
        # bob-badtiming only has data before/after analysis window
        # but missed by `process_data_source`
        ['bob-badtiming', '20190102', ex_d, 1],
        ['bob-badtiming', '20190106', ex_d, 2],
        # carol-gooddata has data on two days (including a dupe day)
        ['carol-gooddata', '20190102', ex_d, 3],
        ['carol-gooddata', '20190102', ex_d, 2],
        ['carol-gooddata', '20190104', ex_d, 6],
        # derek-lateisok has data before and during the analysis window
        ['derek-lateisok', '20190110', ex_d, 1000],
        ['derek-lateisok', '20190111', ex_d, 1],
        # TODO: exercise the last condition on the join
    ]
    data_source_df = spark.createDataFrame(
        data_rows,
        ["client_id", "submission_date_s3", "experiments", "some_value"],
    )

    ds = DataSource.from_dataframe('ds', data_source_df)
    metric = Metric.from_col(
        'some_value', agg_sum(data_source_df.some_value), ds)

    res = exp.get_per_client_data(
        enrollments, [metric], '20190114', 1, 3, keep_client_id=True)

    def only_row(client_id):
        # Each client must appear exactly once in the joined result.
        matches = res.filter(res.client_id == client_id)
        assert matches.count() == 1
        return matches.first()

    # Check that the dataframe has the correct number of rows
    assert res.count() == enrollments.count()

    # Check that dataless enrollments are handled correctly
    assert only_row('annie-nodata')['some_value'] == 0

    # Check that early and late data were ignored
    # i.e. check the join, not just _process_data_source_df
    assert only_row('bob-badtiming')['some_value'] == 0

    # Check that _process_data_source_df didn't do the
    # heavy lifting above
    time_limits = TimeLimits.for_single_analysis_window(
        exp.start_date, '20190114', 1, 3, exp.num_dates_enrollment)
    pds = exp._process_data_source_df(data_source_df, time_limits)
    bob_processed = pds.filter(pds.client_id == 'bob-badtiming')
    bob_total = bob_processed.select(
        F.sum(pds.some_value).alias('agg_val')).first()['agg_val']
    assert bob_total == 3

    # Check that relevant data was included appropriately
    assert only_row('carol-gooddata')['some_value'] == 11
    assert only_row('derek-lateisok')['some_value'] == 1

    # Check that it still works for `data_source`s without an experiments map
    ds_df_noexp = data_source_df.drop('experiments')
    ds_noexp = DataSource.from_dataframe('ds_noexp', ds_df_noexp)
    metric_noexp = Metric.from_col(
        'some_value', agg_sum(ds_df_noexp.some_value), ds_noexp)

    res2 = exp.get_per_client_data(
        enrollments, [metric_noexp], '20190114', 1, 3, keep_client_id=True)
    assert res2.count() == enrollments.count()
def ad_clicks(scd):
    """Aggregate the ``ad_click`` column of ``scd`` with ``agg_sum``."""
    ad_click_column = scd.ad_click
    return agg_sum(ad_click_column)
activity_stream_events = DataSource( name="activity_stream_events", from_expr="""( SELECT *, DATE(submission_timestamp) AS submission_date FROM mozdata.activity_stream.events )""", experiments_column_type="native", ) #: Metric: ... active_hours = Metric( name="active_hours", data_source=clients_daily, select_expr=agg_sum("active_hours_sum"), friendly_name="Active hours", description=dedent("""\ Measures the amount of time (in 5-second increments) during which Firefox received user input from a keyboard or mouse. The Firefox window does not need to be focused. """), ) #: Metric: ... uri_count = Metric( name="uri_count", data_source=clients_daily, select_expr=agg_sum( "scalar_parent_browser_engagement_total_uri_count_sum"), friendly_name="URIs visited",
def uri_count(cd):
    """Aggregate the total-URI-count column of ``cd`` with ``agg_sum``.

    The column read is
    ``scalar_parent_browser_engagement_total_uri_count_sum``.
    """
    uri_column = cd.scalar_parent_browser_engagement_total_uri_count_sum
    return agg_sum(uri_column)
def test_metrics_with_exposure(self, monkeypatch, client, project_id,
                               static_dataset, temporary_dataset):
    """Run an analysis whose metric uses the exposures analysis basis.

    An ``ExposureSignal`` is attached to the experiment. After the mocked
    run, the week-1 exposures table rows are compared with the expected
    values, and the weekly exposures table and both statistics tables must
    exist.
    """
    branches = [
        Branch(slug="branch1", ratio=0.5),
        Branch(slug="branch2", ratio=0.5),
    ]
    experiment = Experiment(
        experimenter_slug="test-experiment",
        type="rollout",
        status="Live",
        start_date=dt.datetime(2020, 3, 30, tzinfo=pytz.utc),
        end_date=dt.datetime(2020, 6, 1, tzinfo=pytz.utc),
        proposed_enrollment=7,
        branches=branches,
        reference_branch="branch2",
        normandy_slug="test-experiment",
        is_high_population=False,
        app_name="firefox_desktop",
        app_id="firefox-desktop",
    )
    config = AnalysisSpec().resolve(experiment)

    test_clients_daily = DataSource(
        name="clients_daily",
        from_expr=f"`{project_id}.test_data.clients_daily`",
    )
    test_active_hours = Metric(
        name="active_hours",
        data_source=test_clients_daily,
        select_expression=agg_sum("active_hours_sum"),
        analysis_bases=[AnalysisBasis.EXPOSURES],
    )
    weekly_summaries = [Summary(test_active_hours, BootstrapMean())]
    config.metrics = {AnalysisPeriod.WEEK: weekly_summaries}

    config.experiment.exposure_signal = ExposureSignal(
        name="ad_exposure",
        data_source=test_clients_daily,
        select_expression="active_hours_sum > 0",
        friendly_name="Ad exposure",
        description="Clients have clicked on ad",
        window_start="enrollment_start",
        window_end="analysis_window_end",
    )

    self.analysis_mock_run(monkeypatch, config, static_dataset,
                           temporary_dataset, project_id)

    query_job = client.client.query(
        f""" SELECT * FROM `{project_id}.{temporary_dataset}.test_experiment_exposures_week_1` ORDER BY enrollment_date DESC """
    )

    # Rows are expected newest enrollment first (ORDER BY ... DESC).
    expected_rows = [
        {
            "client_id": "bbbb",
            "branch": "branch2",
            "enrollment_date": datetime.date(2020, 4, 3),
            "num_enrollment_events": 1,
            "analysis_window_start": 0,
            "analysis_window_end": 6,
        },
        {
            "client_id": "aaaa",
            "branch": "branch1",
            "enrollment_date": datetime.date(2020, 4, 2),
            "num_enrollment_events": 1,
            "analysis_window_start": 0,
            "analysis_window_end": 6,
        },
    ]

    result_rows = query_job.result()
    for position, row in enumerate(result_rows):
        expected_row = expected_rows[position]
        for column, value in expected_row.items():
            assert row[column] == value

    # Each of these tables must have been created by the run; the lookups
    # happen in the same order as the table names are listed.
    for table_id in (
        f"{project_id}.{temporary_dataset}.test_experiment_exposures_weekly",
        f"{project_id}.{temporary_dataset}.statistics_test_experiment_week_1",
        f"{project_id}.{temporary_dataset}.statistics_test_experiment_weekly",
    ):
        assert client.client.get_table(table_id) is not None
def test_metrics(self, client, project_id, static_dataset, temporary_dataset):
    """Run an analysis with one ``active_hours`` metric and check outputs.

    Compares the week-1 result rows with the expected values, checks that
    the weekly table and both statistics tables exist, and checks that the
    ``count`` statistic is 1.0 for each branch.
    """
    branches = [
        Branch(slug="branch1", ratio=0.5),
        Branch(slug="branch2", ratio=0.5),
    ]
    experiment = Experiment(
        experimenter_slug="test-experiment",
        type="rollout",
        status="Live",
        start_date=dt.datetime(2020, 3, 30, tzinfo=pytz.utc),
        end_date=dt.datetime(2020, 6, 1, tzinfo=pytz.utc),
        proposed_enrollment=7,
        branches=branches,
        reference_branch="branch2",
        features=[],
        normandy_slug="test-experiment",
    )
    config = AnalysisSpec().resolve(experiment)

    test_clients_daily = DataSource(
        name="clients_daily",
        from_expr=f"`{project_id}.test_data.clients_daily`",
    )
    test_active_hours = Metric(
        name="active_hours",
        data_source=test_clients_daily,
        select_expr=agg_sum("active_hours_sum"),
    )
    weekly_summaries = [Summary(test_active_hours, BootstrapMean())]
    config.metrics = {AnalysisPeriod.WEEK: weekly_summaries}

    self.analysis_mock_run(config, static_dataset, temporary_dataset,
                           project_id)

    query_job = client.client.query(
        f""" SELECT * FROM `{project_id}.{temporary_dataset}.test_experiment_week_1` ORDER BY enrollment_date DESC """
    )

    # Rows are expected newest enrollment first (ORDER BY ... DESC).
    expected_rows = [
        {
            "client_id": "bbbb",
            "branch": "branch2",
            "enrollment_date": datetime.date(2020, 4, 3),
            "num_enrollment_events": 1,
            "analysis_window_start": 0,
            "analysis_window_end": 6,
        },
        {
            "client_id": "aaaa",
            "branch": "branch1",
            "enrollment_date": datetime.date(2020, 4, 2),
            "num_enrollment_events": 1,
            "analysis_window_start": 0,
            "analysis_window_end": 6,
        },
    ]
    for position, row in enumerate(query_job.result()):
        expected_row = expected_rows[position]
        for column, value in expected_row.items():
            assert row[column] == value

    weekly_table = f"{project_id}.{temporary_dataset}.test_experiment_weekly"
    assert client.client.get_table(weekly_table) is not None

    week_1_stats_table = (
        f"{project_id}.{temporary_dataset}.statistics_test_experiment_week_1"
    )
    assert client.client.get_table(week_1_stats_table) is not None

    stats = client.client.list_rows(week_1_stats_table).to_dataframe()
    count_by_branch = stats.query("statistic == 'count'").set_index("branch")
    assert count_by_branch.loc["branch1", "point"] == 1.0
    assert count_by_branch.loc["branch2", "point"] == 1.0

    weekly_stats_table = (
        f"{project_id}.{temporary_dataset}.statistics_test_experiment_weekly"
    )
    assert client.client.get_table(weekly_stats_table) is not None
def test_metrics(self, client):
    """Run an analysis against the test project and check its outputs.

    ``mozanalysis.experiment.Experiment.build_query`` is patched so the
    generated SQL points at ``self.project_id`` / ``self.static_dataset``.
    The week-1 result rows are then compared with the expected values and
    the weekly and statistics tables must exist.
    """
    variants = [
        Variant(is_control=False, slug="branch1", ratio=0.5),
        Variant(is_control=True, slug="branch2", ratio=0.5),
    ]
    experiment = Experiment(
        slug="test-experiment",
        type="rollout",
        start_date=dt.datetime(2020, 3, 30, tzinfo=pytz.utc),
        end_date=dt.datetime(2020, 6, 1, tzinfo=pytz.utc),
        proposed_enrollment=7,
        variants=variants,
        normandy_slug="test-experiment",
    )

    orig = mozanalysis.experiment.Experiment.build_query

    def build_query_test_project(instance, *args, **kwargs):
        # to use the test project and dataset, we need to change the SQL query
        # generated by mozanalysis
        # NOTE(review): only the first four positional arguments are
        # forwarded to the original; ``kwargs`` are not passed on.
        generated = orig(instance, args[0], args[1], args[2], args[3])
        retargeted = generated.replace(
            "moz-fx-data-shared-prod", self.project_id)
        return retargeted.replace("telemetry", self.static_dataset)

    config = AnalysisSpec().resolve(experiment)

    test_clients_daily = DataSource(
        name="clients_daily",
        from_expr=f"`{self.project_id}.test_data.clients_daily`",
    )
    test_active_hours = Metric(
        name="active_hours",
        data_source=test_clients_daily,
        select_expr=agg_sum("active_hours_sum"),
    )
    weekly_summaries = [
        Summary(test_active_hours, BootstrapMean(ref_branch_label="branch1"))
    ]
    config.metrics = {AnalysisPeriod.WEEK: weekly_summaries}

    analysis = Analysis(self.project_id, self.test_dataset, config)

    with mock.patch.object(
        mozanalysis.experiment.Experiment,
        "build_query",
        new=build_query_test_project,
    ):
        analysis.run(current_date=dt.datetime(2020, 4, 12), dry_run=False)

    query_job = client.query(
        f""" SELECT * FROM `{self.project_id}.{self.test_dataset}.test_experiment_week_1` ORDER BY enrollment_date DESC """
    )

    # Rows are expected newest enrollment first (ORDER BY ... DESC).
    expected_rows = [
        {
            "client_id": "bbbb",
            "branch": "branch2",
            "enrollment_date": datetime.date(2020, 4, 3),
            "num_enrollment_events": 1,
            "analysis_window_start": 0,
            "analysis_window_end": 6,
        },
        {
            "client_id": "aaaa",
            "branch": "branch1",
            "enrollment_date": datetime.date(2020, 4, 2),
            "num_enrollment_events": 1,
            "analysis_window_start": 0,
            "analysis_window_end": 6,
        },
    ]
    for position, row in enumerate(query_job.result()):
        expected_row = expected_rows[position]
        for column, value in expected_row.items():
            assert row[column] == value

    # Each of these tables must have been created by the run; the lookups
    # happen in the same order as the table names are listed.
    for table_id in (
        f"{self.project_id}.{self.test_dataset}.test_experiment_weekly",
        f"{self.project_id}.{self.test_dataset}.statistics_test_experiment_week_1",
        f"{self.project_id}.{self.test_dataset}.statistics_test_experiment_weekly",
    ):
        assert client.get_table(table_id) is not None