from pyspark.sql import SparkSession


def main(
    date,
    input_bucket,
    input_prefix,
    output_bucket,
    output_prefix,
    output_version,
    lag_days,
):
    """
    Aggregate by (client_id, experiment_id, day).

    Note that the target day will actually be `lag-days` days before
    the supplied date. In other words, if you pass in 2017-01-20 and
    set `lag-days` to 5, the aggregation will be processed for
    day 2017-01-15 (the resulting data will cover submission dates
    including the activity day itself plus 5 days of lag, for a total
    of 6 days).
    """
    spark = SparkSession.builder.appName("experiments_daily").getOrCreate()
    parquet_path = format_spark_path(input_bucket, input_prefix)
    frame = load_experiments_summary(spark, parquet_path)
    day_frame, start_date = extract_submission_window_for_activity_day(
        frame, date, lag_days
    )
    # Use the windowed frame for the activity day, not the full summary frame.
    searches_frame = extract_search_counts(day_frame)
    results = to_experiment_profile_day_aggregates(searches_frame)
    spark.conf.set(
        "mapreduce.fileoutputcommitter.marksuccessfuljobs", "false"
    )  # Don't write _SUCCESS files, which interfere with ReDash discovery
    output_base_path = "{}/v{}/activity_date_s3={}".format(
        format_spark_path(output_bucket, output_prefix),
        output_version,
        start_date.strftime("%Y-%m-%d"),
    )
    results.write.mode("overwrite").parquet(output_base_path)
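# A minimal sketch of the lag arithmetic described in the docstring above,
# using only the standard library. This reproduces the documented example and
# is not the module's own extract_submission_window_for_activity_day logic.
from datetime import datetime, timedelta

submission_date = datetime.strptime("2017-01-20", "%Y-%m-%d")
lag_days = 5
activity_day = submission_date - timedelta(days=lag_days)
assert activity_day.strftime("%Y-%m-%d") == "2017-01-15"
# The aggregation covers submissions from the activity day through the supplied
# date: the activity day itself plus 5 days of lag, 6 submission dates in all.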
def test_extract_search_counts(spark):
    from mozetl.clientsdaily import rollup

    frame = make_frame(spark)
    extracted = rollup.extract_search_counts(frame)
    row = extracted.agg({'search_count': 'sum'}).collect()[0]
    # Wrap in list() so the single aggregate value is indexable on Python 3.
    total = list(row.asDict().values())[0]
    assert total == EXPECTED_INTEGER_VALUES['search_count_sum']
def test_rollup(sample_data):
    from mozetl.experimentsdaily import rollup
    from mozetl.clientsdaily.rollup import extract_search_counts

    assert sample_data.count() == 100
    client_id_count = sample_data.where("client_id is not null").count()
    assert client_id_count == sample_data.count()
    search_count_count = sample_data.where("search_counts is not null").count()
    assert search_count_count == 23

    searches_frame = extract_search_counts(sample_data)
    # Two rows are skipped for containing only unknown SAPs:
    #   833c2828-e84d-42d4-b245-8ea5783fdace
    #   1a3c4318-a5d9-42a4-9cae-7a68eec6e1eb
    assert searches_frame.count() == 98

    filtered = (
        searches_frame
        .where("subsession_start_date LIKE '2017-09-08%'")
        .where("experiment_id = 'pref-flip-searchcomp1-pref3-1390584'")
        .where("search_count_all > 0")
        .orderBy("client_id")
    )
    assert filtered.count() == 4
    f_collected = filtered.collect()
    assert len(f_collected) == 4
    assert f_collected[0].client_id == "259b7010-90b0-4edb-a0fb-510330e172ea"
    assert f_collected[0].search_count_all == 1
    assert f_collected[1].client_id == "ae7e68d4-7fb6-4f2b-b195-2095dea77bec"
    assert f_collected[1].search_count_all == 1
    assert f_collected[2].client_id == "bc627386-e4a8-448f-bbc5-f9dcf716cc1f"
    assert f_collected[2].search_count_all == 9
    assert f_collected[3].client_id == "e5d896cd-97a7-4b96-8944-3ec74c2fede3"
    assert f_collected[3].search_count_all == 1

    results = rollup.to_experiment_profile_day_aggregates(searches_frame)
    aggers = ["sum({})".format(k) for k in EXPECTED_INTEGER_VALUES]
    agged = results.selectExpr(*aggers).collect()
    row = agged[0].asDict()
    for k in sorted(EXPECTED_INTEGER_VALUES.keys()):
        assert EXPECTED_INTEGER_VALUES[k] == row["sum({})".format(k)]
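# The final aggregate check in test_rollup can equivalently be written with
# pyspark.sql.functions column expressions instead of selectExpr strings; a
# sketch assuming the same `results` frame and EXPECTED_INTEGER_VALUES mapping.
from pyspark.sql import functions as F

agg_exprs = [F.sum(F.col(k)).alias(k) for k in EXPECTED_INTEGER_VALUES]
row = results.agg(*agg_exprs).collect()[0].asDict()
for k, expected in EXPECTED_INTEGER_VALUES.items():
    assert row[k] == expected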
def main_summary_with_search(main_summary):
    # `cd` is the clientsdaily rollup module, imported at module level
    # (e.g. `from mozetl.clientsdaily import rollup as cd`).
    return cd.extract_search_counts(main_summary)
def make_frame_with_extracts(spark):
    from mozetl.clientsdaily import rollup

    frame = make_frame(spark)
    return rollup.extract_search_counts(frame)