Exemplo n.º 1
0
def test_get_enrollments_debug_dupes(spark):
    """``debug_dupes=True`` adds a ``num_events`` column to the enrollments."""
    experiment = Experiment('a-stub', '20190101')
    enrollment_view = _get_enrollment_view("a-stub")

    # Without the flag, the column must be absent.
    plain_enrollments = experiment.get_enrollments(spark, enrollment_view)
    assert 'num_events' not in plain_enrollments.columns

    # With the flag, the column must be present...
    debug_enrollments = experiment.get_enrollments(
        spark, enrollment_view, debug_dupes=True
    )
    assert 'num_events' in debug_enrollments.columns

    # ...and every row of the fixture data is expected to report one event.
    event_counts = debug_enrollments.toPandas()['num_events']
    assert (event_counts == 1).all()
Exemplo n.º 2
0
def test_process_metrics(spark):
    """``_process_metrics`` groups metric columns under their data source.

    Three metrics are defined over two data sources; the result is expected
    to hold two entries, with two columns for source A and one for source B.
    """
    enrolling_exp = Experiment('a-stub', '20190101', num_dates_enrollment=8)
    enrollments = enrolling_exp.get_enrollments(
        spark, _get_enrollment_view(slug="a-stub")
    )

    ds_df_A = register_data_source_fixture(spark, name='ds_df_A')
    ds_df_B = register_data_source_fixture(spark, name='ds_df_B')

    ds_A = DataSource.from_dataframe('ds_df_A', ds_df_A)
    ds_B = DataSource.from_dataframe('ds_df_B', ds_df_B)

    metric_list = [
        Metric.from_col('m1', ds_df_A.numeric_col, ds_A),
        Metric.from_col('m2', ds_df_A.bool_col, ds_A),
        Metric.from_col('m3', ds_df_B.numeric_col, ds_B),
    ]

    # The metrics are processed by a fresh experiment built with the
    # default enrollment period, not by the one used to fetch enrollments.
    exp = Experiment('a-stub', '20190101')
    processed = exp._process_metrics(enrollments, metric_list)

    assert len(processed) == 2

    columns_for_a = processed[ds_df_A]
    columns_for_b = processed[ds_df_B]
    assert len(columns_for_a) == 2
    assert len(columns_for_b) == 1

    # The single column for source B must be `numeric_col` aliased to `m3`.
    m3_repr = repr(columns_for_b[0])
    assert 'numeric_col' in m3_repr
    assert '`m3`' in m3_repr
    assert m3_repr in {
        "Column<b'numeric_col AS `m3`'>",  # py3
        "Column<numeric_col AS `m3`>",  # py2
    }
Exemplo n.º 3
0
def test_get_time_series_data_lazy_daily(spark):
    """Daily lazy time series yields one Spark DataFrame per analysis window.

    With data through '20190114', seven windows are expected, each holding
    one row per enrolled client.
    """
    exp = Experiment('a-stub', '20190101', 8)
    enrollments = exp.get_enrollments(
        spark, _get_enrollment_view(slug="a-stub")
    )
    how_many_ones = _get_metrics(spark)['how_many_ones']

    res = exp.get_time_series_data_lazy(
        enrollments,
        [how_many_ones],
        '20190114',
        time_series_period='daily',
        keep_client_id=True,
    )

    assert len(res) == 7

    # Expected per-window metric value for each client.
    expected_ones = (('aaaa', 1), ('bbbb', 1), ('cccc', 0))

    for window_df in res.values():
        window_pdf = window_df.toPandas()
        assert window_pdf.client_id.nunique() == 3
        assert len(window_pdf) == 3

        by_client = window_pdf.set_index('client_id')

        for client_id, ones in expected_ones:
            assert by_client.loc[client_id, 'how_many_ones'] == ones

        # The sanity-check columns must be all zero.
        assert (by_client['bla_ds_has_contradictory_branch'] == 0).all()
        assert (by_client['bla_ds_has_non_enrolled_data'] == 0).all()
Exemplo n.º 4
0
def test_process_enrollments(spark):
    """Check the filtering and aliasing done by ``_process_enrollments``.

    The processed enrollments are expected to drop clients enrolled after
    the last enrollment date of the time limits, and their columns are
    expected to resolve under the ``enrollments`` alias whatever alias the
    input DataFrame carried.
    """
    exp = Experiment('a-stub', '20190101')
    enrollments = exp.get_enrollments(spark,
                                      _get_enrollment_view(slug="a-stub"))
    assert enrollments.count() == 4

    # With final data collected on '20190114', we have 7 dates of data
    # for 'cccc' enrolled on '20190108' but not for 'dddd' enrolled on
    # '20190109'.
    tl = TimeLimits.for_single_analysis_window(
        first_enrollment_date=exp.start_date,
        last_date_full_data='20190114',
        analysis_start_days=0,
        analysis_length_dates=7,
        num_dates_enrollment=exp.num_dates_enrollment)
    assert tl.last_enrollment_date == '20190108'
    assert len(tl.analysis_windows) == 1
    assert tl.analysis_windows[0].end == 6

    pe = exp._process_enrollments(enrollments, tl)
    assert pe.count() == 3

    pe = exp._process_enrollments(enrollments.alias('main_summary'), tl)

    # The calls themselves are the checks: a DataFrame is always truthy, so
    # wrapping ``select`` in ``assert`` verified nothing. The first select
    # must resolve without raising; the second must fail to resolve the
    # caller's original alias.
    pe.select(F.col('enrollments.enrollment_date'))
    with pytest.raises(AnalysisException):
        pe.select(F.col('main_summary.enrollment_date'))
Exemplo n.º 5
0
def test_add_analysis_windows_to_enrollments(spark):
    """Each enrollment row is repeated once per analysis window.

    For daily windows, one client's window start and end offsets are each
    expected to enumerate ``0 .. n_windows - 1``.
    """
    exp = Experiment('a-stub', '20190101', num_dates_enrollment=8)
    enrollments = exp.get_enrollments(
        spark, _get_enrollment_view(slug="a-stub")
    )
    assert enrollments.count() == 3

    tl = TimeLimits.for_ts(
        first_enrollment_date=exp.start_date,
        last_date_full_data='20190114',
        time_series_period='daily',
        num_dates_enrollment=exp.num_dates_enrollment,
    )
    n_windows = len(tl.analysis_windows)
    assert n_windows == 7

    expanded = exp._add_analysis_windows_to_enrollments(enrollments, tl)
    expanded_pdf = expanded.toPandas()

    # One output row per (enrollment, window) pair.
    assert len(expanded_pdf) == enrollments.count() * n_windows

    client_rows = expanded_pdf[expanded_pdf['client_id'] == 'aaaa']
    assert len(client_rows) == n_windows

    expected_offsets = np.arange(n_windows)
    sorted_starts = client_rows['mozanalysis_analysis_window_start'].sort_values()
    sorted_ends = client_rows['mozanalysis_analysis_window_end'].sort_values()
    assert (sorted_starts == expected_offsets).all()
    assert (sorted_ends == expected_offsets).all()
Exemplo n.º 6
0
def test_get_per_client_data_doesnt_crash(spark):
    """Smoke test: ``get_per_client_data`` runs without raising."""
    exp = Experiment('a-stub', '20190101', 8)
    enrollments = exp.get_enrollments(
        spark, _get_enrollment_view(slug="a-stub")
    )
    how_many_ones = _get_metrics(spark)['how_many_ones']

    # No assertion on the result; the test only requires the call to succeed.
    exp.get_per_client_data(enrollments, [how_many_ones], '20190114', 0, 3)
Exemplo n.º 7
0
def test_get_enrollments(spark):
    """``get_enrollments`` returns the expected rows for three stub experiments."""
    # Default experiment, view passed positionally.
    exp = Experiment('a-stub', '20190101')
    stub_enrollments = exp.get_enrollments(spark, _get_enrollment_view("a-stub"))
    assert stub_enrollments.count() == 4

    # Second experiment, view passed through the `study_type` keyword.
    exp2 = Experiment('a-stub2', '20190102')
    enrl2 = exp2.get_enrollments(
        spark, study_type=_get_enrollment_view("a-stub2")
    )
    assert enrl2.count() == 2
    earliest = enrl2.select(F.min(enrl2.enrollment_date).alias('b')).first()
    assert earliest['b'] == '20190108'

    # Experiment with an 8-day cohort.
    exp_8d = Experiment('experiment-with-8-day-cohort', '20190101', 8)
    enrl_8d = exp_8d.get_enrollments(
        spark, _get_enrollment_view("experiment-with-8-day-cohort")
    )
    assert enrl_8d.count() == 3
    latest = enrl_8d.select(F.max(enrl_8d.enrollment_date).alias('b')).first()
    assert latest['b'] == '20190108'
Exemplo n.º 8
0
def test_get_time_series_data(spark):
    """Weekly time series returns one frame per analysis window.

    With data through '20190128', three weekly windows are expected. The
    frames keyed by window start 0 and 14 are checked for one row per
    client and for the expected ``how_many_ones`` values.
    """
    exp = Experiment('a-stub', '20190101', 8)
    enrollments = exp.get_enrollments(spark,
                                      _get_enrollment_view(slug="a-stub"))
    metrics = _get_metrics(spark)
    metric__how_many_ones = metrics['how_many_ones']

    res = exp.get_time_series_data(
        enrollments,
        [metric__how_many_ones],
        '20190128',
        time_series_period='weekly',
        keep_client_id=True,
    )

    assert len(res) == 3

    # Expected `how_many_ones` per client, keyed by analysis window start.
    expected_by_window = {
        0: {'aaaa': 7, 'bbbb': 7, 'cccc': 0},
        14: {'aaaa': 1, 'bbbb': 1, 'cccc': 0},
    }

    for window_start, expected_ones in expected_by_window.items():
        df = res[window_start]
        assert df.client_id.nunique() == 3
        assert len(df) == 3

        df = df.set_index('client_id')

        for client_id, ones in expected_ones.items():
            assert df.loc[client_id, 'how_many_ones'] == ones

        # The sanity-check columns must be all zero.
        assert (df['bla_ds_has_contradictory_branch'] == 0).all()
        assert (df['bla_ds_has_non_enrolled_data'] == 0).all()
Exemplo n.º 9
0
def test_process_metrics_dupe_data_source(spark):
    """Two ``DataSource`` objects over one DataFrame collapse to one entry."""
    enrolling_exp = Experiment('a-stub', '20190101', num_dates_enrollment=8)
    enrollments = enrolling_exp.get_enrollments(
        spark, _get_enrollment_view(slug="a-stub")
    )

    ds_df = register_data_source_fixture(spark, name='ds_df_A')

    # Distinct DataSource instances with the same name and the same DataFrame.
    ds_1 = DataSource.from_dataframe('ds_df_A', ds_df)
    ds_2 = DataSource.from_dataframe('ds_df_A', ds_df)

    metric_list = [
        Metric.from_col('m1', ds_df.numeric_col, ds_1),
        Metric.from_col('m2', ds_df.bool_col, ds_2),
    ]

    # The metrics are processed by a fresh experiment built with the
    # default enrollment period, not by the one used to fetch enrollments.
    exp = Experiment('a-stub', '20190101')
    processed = exp._process_metrics(enrollments, metric_list)

    # Both metrics must land under the single shared DataFrame.
    assert len(processed) == 1
    assert len(processed[ds_df]) == 2