def test_get_enrollments_debug_dupes(spark):
    """Passing debug_dupes=True should expose a 'num_events' column."""
    experiment = Experiment('a-stub', '20190101')
    view = _get_enrollment_view("a-stub")

    # Default call: no debug column present.
    plain = experiment.get_enrollments(spark, view)
    assert 'num_events' not in plain.columns

    # Debug call: column is added, and with clean fixture data every
    # enrollment should correspond to exactly one event.
    debug = experiment.get_enrollments(spark, view, debug_dupes=True)
    assert 'num_events' in debug.columns

    debug_pd = debug.toPandas()
    assert (debug_pd['num_events'] == 1).all()
def test_process_metrics(spark):
    """_process_metrics should group metric columns by their data source."""
    enroll_exp = Experiment('a-stub', '20190101', num_dates_enrollment=8)
    enrollments = enroll_exp.get_enrollments(
        spark, _get_enrollment_view(slug="a-stub"))

    # Two distinct data-source dataframes; three metrics split 2/1 between them.
    ds_df_A = register_data_source_fixture(spark, name='ds_df_A')
    ds_df_B = register_data_source_fixture(spark, name='ds_df_B')

    ds_A = DataSource.from_dataframe('ds_df_A', ds_df_A)
    ds_B = DataSource.from_dataframe('ds_df_B', ds_df_B)

    metric_list = [
        Metric.from_col('m1', ds_df_A.numeric_col, ds_A),
        Metric.from_col('m2', ds_df_A.bool_col, ds_A),
        Metric.from_col('m3', ds_df_B.numeric_col, ds_B),
    ]

    exp = Experiment('a-stub', '20190101')
    grouped = exp._process_metrics(enrollments, metric_list)

    # One entry per dataframe, holding that dataframe's metric columns.
    assert len(grouped) == 2
    assert len(grouped[ds_df_A]) == 2
    assert len(grouped[ds_df_B]) == 1

    m3_repr = repr(grouped[ds_df_B][0])
    assert 'numeric_col' in m3_repr
    assert '`m3`' in m3_repr
    # Column reprs differ between Python versions; accept either form.
    assert m3_repr in {
        "Column<b'numeric_col AS `m3`'>",  # py3
        "Column<numeric_col AS `m3`>",  # py2
    }
def test_get_time_series_data_lazy_daily(spark):
    """A daily lazy time series over 7 days yields 7 per-day dataframes."""
    experiment = Experiment('a-stub', '20190101', 8)
    enrollments = experiment.get_enrollments(
        spark, _get_enrollment_view(slug="a-stub"))

    metrics = _get_metrics(spark)
    how_many_ones = metrics['how_many_ones']

    res = experiment.get_time_series_data_lazy(
        enrollments, [how_many_ones], '20190114',
        time_series_period='daily', keep_client_id=True,
    )
    assert len(res) == 7

    # Every daily slice carries the same three clients with identical values.
    for lazy_df in res.values():
        day = lazy_df.toPandas()
        assert day.client_id.nunique() == 3
        assert len(day) == 3

        day = day.set_index('client_id')
        assert day.loc['aaaa', 'how_many_ones'] == 1
        assert day.loc['bbbb', 'how_many_ones'] == 1
        assert day.loc['cccc', 'how_many_ones'] == 0
        # Sanity columns from the fixture data source must stay zero.
        assert (day['bla_ds_has_contradictory_branch'] == 0).all()
        assert (day['bla_ds_has_non_enrolled_data'] == 0).all()
def test_process_enrollments(spark):
    """_process_enrollments filters by time limits and aliases the frame."""
    exp = Experiment('a-stub', '20190101')
    enrollments = exp.get_enrollments(
        spark, _get_enrollment_view(slug="a-stub"))
    assert enrollments.count() == 4

    # With final data collected on '20190114', we have 7 dates of data
    # for 'cccc' enrolled on '20190108' but not for 'dddd' enrolled on
    # '20190109'.
    tl = TimeLimits.for_single_analysis_window(
        first_enrollment_date=exp.start_date,
        last_date_full_data='20190114',
        analysis_start_days=0,
        analysis_length_dates=7,
        num_dates_enrollment=exp.num_dates_enrollment,
    )
    assert tl.last_enrollment_date == '20190108'
    assert len(tl.analysis_windows) == 1
    assert tl.analysis_windows[0].end == 6

    # 'dddd' falls outside the enrollment window and is dropped.
    processed = exp._process_enrollments(enrollments, tl)
    assert processed.count() == 3

    # Even a pre-aliased input must come back under the 'enrollments'
    # alias, with the old alias no longer resolvable.
    processed = exp._process_enrollments(enrollments.alias('main_summary'), tl)
    assert processed.select(F.col('enrollments.enrollment_date'))
    with pytest.raises(AnalysisException):
        assert processed.select(F.col('main_summary.enrollment_date'))
def test_add_analysis_windows_to_enrollments(spark):
    """Each enrollment row is replicated once per analysis window."""
    exp = Experiment('a-stub', '20190101', num_dates_enrollment=8)
    enrollments = exp.get_enrollments(
        spark, _get_enrollment_view(slug="a-stub"))
    assert enrollments.count() == 3

    tl = TimeLimits.for_ts(
        first_enrollment_date=exp.start_date,
        last_date_full_data='20190114',
        time_series_period='daily',
        num_dates_enrollment=exp.num_dates_enrollment,
    )
    assert len(tl.analysis_windows) == 7

    expanded = exp._add_analysis_windows_to_enrollments(enrollments, tl)
    expanded_pd = expanded.toPandas()

    # Cross product: every client appears once per window.
    assert len(expanded_pd) == enrollments.count() * len(tl.analysis_windows)

    client_a = expanded_pd[expanded_pd['client_id'] == 'aaaa']
    assert len(client_a) == len(tl.analysis_windows)

    # Daily windows: starts and ends are both 0..6 (each window is one day).
    expected = np.arange(len(tl.analysis_windows))
    assert (
        client_a.mozanalysis_analysis_window_start.sort_values() == expected
    ).all()
    assert (
        client_a.mozanalysis_analysis_window_end.sort_values() == expected
    ).all()
def test_get_per_client_data_doesnt_crash(spark):
    """Smoke test: get_per_client_data runs end-to-end without raising."""
    experiment = Experiment('a-stub', '20190101', 8)
    enrollments = experiment.get_enrollments(
        spark, _get_enrollment_view(slug="a-stub"))
    how_many_ones = _get_metrics(spark)['how_many_ones']

    # No assertions on the result — this only guards against regressions
    # that make the call blow up.
    experiment.get_per_client_data(
        enrollments, [how_many_ones], '20190114', 0, 3)
def test_get_enrollments(spark):
    """get_enrollments respects slug, start date and enrollment window."""
    # All four fixture clients enroll in the first study.
    exp = Experiment('a-stub', '20190101')
    view_method = _get_enrollment_view("a-stub")
    assert exp.get_enrollments(spark, view_method).count() == 4

    # A later-starting study only picks up the later enrollments.
    exp2 = Experiment('a-stub2', '20190102')
    view_method2 = _get_enrollment_view("a-stub2")
    enrl2 = exp2.get_enrollments(spark, study_type=view_method2)
    assert enrl2.count() == 2
    earliest = enrl2.select(
        F.min(enrl2.enrollment_date).alias('b')).first()['b']
    assert earliest == '20190108'

    # An 8-day enrollment window caps the latest admissible enrollment date.
    exp_8d = Experiment('experiment-with-8-day-cohort', '20190101', 8)
    view_method_8d = _get_enrollment_view("experiment-with-8-day-cohort")
    enrl_8d = exp_8d.get_enrollments(spark, view_method_8d)
    assert enrl_8d.count() == 3
    latest = enrl_8d.select(
        F.max(enrl_8d.enrollment_date).alias('b')).first()['b']
    assert latest == '20190108'
def test_get_time_series_data(spark):
    """A weekly time series over 4 weeks of data yields 3 full windows.

    Checks the first (offset 0) and third (offset 14) weekly windows:
    each contains the same three clients, with per-week metric values
    that differ between the first and later weeks of the fixture data.
    """
    exp = Experiment('a-stub', '20190101', 8)
    enrollments = exp.get_enrollments(
        spark, _get_enrollment_view(slug="a-stub"))

    metrics = _get_metrics(spark)
    metric__how_many_ones = metrics['how_many_ones']

    res = exp.get_time_series_data(
        enrollments, [metric__how_many_ones], '20190128',
        time_series_period='weekly', keep_client_id=True,
    )
    assert len(res) == 3

    # First weekly window (days 0-6 after enrollment): a full week of data.
    df = res[0]
    assert df.client_id.nunique() == 3
    assert len(df) == 3
    df = df.set_index('client_id')
    assert df.loc['aaaa', 'how_many_ones'] == 7
    assert df.loc['bbbb', 'how_many_ones'] == 7
    assert df.loc['cccc', 'how_many_ones'] == 0
    # Sanity columns from the fixture data source must stay zero.
    assert (df['bla_ds_has_contradictory_branch'] == 0).all()
    assert (df['bla_ds_has_non_enrolled_data'] == 0).all()

    # Third weekly window (days 14-20): only one matching event per client.
    df = res[14]
    assert df.client_id.nunique() == 3
    assert len(df) == 3
    df = df.set_index('client_id')
    assert df.loc['aaaa', 'how_many_ones'] == 1
    assert df.loc['bbbb', 'how_many_ones'] == 1
    assert df.loc['cccc', 'how_many_ones'] == 0
    assert (df['bla_ds_has_contradictory_branch'] == 0).all()
    assert (df['bla_ds_has_non_enrolled_data'] == 0).all()
def test_process_metrics_dupe_data_source(spark):
    """Two DataSource wrappers over one dataframe collapse to one group."""
    enroll_exp = Experiment('a-stub', '20190101', num_dates_enrollment=8)
    enrollments = enroll_exp.get_enrollments(
        spark, _get_enrollment_view(slug="a-stub"))

    ds_df = register_data_source_fixture(spark, name='ds_df_A')

    # Same underlying dataframe, wrapped twice with the same name.
    ds_1 = DataSource.from_dataframe('ds_df_A', ds_df)
    ds_2 = DataSource.from_dataframe('ds_df_A', ds_df)

    metric_list = [
        Metric.from_col('m1', ds_df.numeric_col, ds_1),
        Metric.from_col('m2', ds_df.bool_col, ds_2),
    ]

    exp = Experiment('a-stub', '20190101')
    grouped = exp._process_metrics(enrollments, metric_list)

    # Deduplication: both metrics land under the single shared dataframe.
    assert len(grouped) == 1
    assert len(grouped[ds_df]) == 2