# Example 1
def main(
    date,
    input_bucket,
    input_prefix,
    output_bucket,
    output_prefix,
    output_version,
    lag_days,
):
    """
    Aggregate by (client_id, experiment_id, day).

    Note that the target day will actually be `lag-days` days before
    the supplied date. In other words, if you pass in 2017-01-20 and
    set `lag-days` to 5, the aggregation will be processed for
    day 2017-01-15 (the resulting data will cover submission dates
    including the activity day itself plus 5 days of lag for a total
    of 6 days).

    :param date: target date string; the activity day is `lag_days`
        earlier (see note above)
    :param input_bucket: S3 bucket holding the experiments summary
    :param input_prefix: key prefix of the input parquet data
    :param output_bucket: S3 bucket to write aggregates to
    :param output_prefix: key prefix for the output
    :param output_version: version number embedded in the output path
    :param lag_days: number of days of submission lag to include
    """
    spark = SparkSession.builder.appName("experiments_daily").getOrCreate()
    parquet_path = format_spark_path(input_bucket, input_prefix)
    frame = load_experiments_summary(spark, parquet_path)
    day_frame, start_date = extract_submission_window_for_activity_day(
        frame, date, lag_days)
    # Fix: extract search counts from the day-windowed frame, not the
    # full input frame -- otherwise `day_frame` is unused and the
    # lag-day window documented above has no effect on the results.
    searches_frame = extract_search_counts(day_frame)
    results = to_experiment_profile_day_aggregates(searches_frame)
    spark.conf.set(
        "mapreduce.fileoutputcommitter.marksuccessfuljobs", "false"
    )  # Don't write _SUCCESS files, which interfere w/ReDash discovery
    output_base_path = "{}/v{}/activity_date_s3={}".format(
        format_spark_path(output_bucket, output_prefix),
        output_version,
        start_date.strftime("%Y-%m-%d"),
    )
    results.write.mode("overwrite").parquet(output_base_path)
def test_extract_search_counts(spark):
    """The summed search_count over the extracted frame matches the fixture."""
    from mozetl.clientsdaily import rollup

    frame = make_frame(spark)
    extracted = rollup.extract_search_counts(frame)
    row = extracted.agg({'search_count': 'sum'}).collect()[0]
    # Fix: dict.values() returns a non-subscriptable view in Python 3;
    # materialize it before indexing (the dict has a single entry).
    total = list(row.asDict().values())[0]
    assert total == EXPECTED_INTEGER_VALUES['search_count_sum']
# Example 3
def test_rollup(sample_data):
    """End-to-end check: search extraction, filtering, and day aggregates."""
    from mozetl.experimentsdaily import rollup
    from mozetl.clientsdaily.rollup import extract_search_counts

    total_rows = sample_data.count()
    assert total_rows == 100

    # Every row in the fixture carries a client_id.
    assert sample_data.where("client_id is not null").count() == total_rows

    # Only a subset of rows report any search activity.
    assert sample_data.where("search_counts is not null").count() == 23

    searches_frame = extract_search_counts(sample_data)

    # Two rows are skipped for containing only unknown SAPs:
    #  833c2828-e84d-42d4-b245-8ea5783fdace
    #  1a3c4318-a5d9-42a4-9cae-7a68eec6e1eb
    assert searches_frame.count() == 98

    filtered = (
        searches_frame.where("subsession_start_date LIKE '2017-09-08%'").where(
            "experiment_id = 'pref-flip-searchcomp1-pref3-1390584'").where(
                "search_count_all > 0").orderBy("client_id"))

    assert filtered.count() == 4
    collected = filtered.collect()
    assert len(collected) == 4

    # Expected (client_id, search_count_all) pairs, in client_id order.
    expected_pairs = [
        ("259b7010-90b0-4edb-a0fb-510330e172ea", 1),
        ("ae7e68d4-7fb6-4f2b-b195-2095dea77bec", 1),
        ("bc627386-e4a8-448f-bbc5-f9dcf716cc1f", 9),
        ("e5d896cd-97a7-4b96-8944-3ec74c2fede3", 1),
    ]
    for actual, (expected_id, expected_count) in zip(collected, expected_pairs):
        assert actual.client_id == expected_id
        assert actual.search_count_all == expected_count

    results = rollup.to_experiment_profile_day_aggregates(searches_frame)
    sum_exprs = ["sum({})".format(column) for column in EXPECTED_INTEGER_VALUES]
    summed = results.selectExpr(*sum_exprs).collect()[0].asDict()
    for column in sorted(EXPECTED_INTEGER_VALUES.keys()):
        assert EXPECTED_INTEGER_VALUES[column] == summed["sum({})".format(column)]
def main_summary_with_search(main_summary):
    """Return the main_summary frame with search counts extracted."""
    extracted = cd.extract_search_counts(main_summary)
    return extracted
def make_frame_with_extracts(spark):
    """Build the test frame and run search-count extraction over it."""
    from mozetl.clientsdaily import rollup

    base_frame = make_frame(spark)
    extracted = rollup.extract_search_counts(base_frame)
    return extracted