Example #1
def main(
    date,
    input_bucket,
    input_prefix,
    output_bucket,
    output_prefix,
    output_version,
    lag_days,
):
    """
    Aggregate by (client_id, experiment_id, day).

    Note that the target day will actually be `lag-days` days before
    the supplied date. In other words, if you pass in 2017-01-20 and
    set `lag-days` to 5, the aggregation will be processed for
    day 2017-01-15 (the resulting data will cover submission dates
    including the activity day itself plus 5 days of lag for a total
    of 6 days).
    """
    spark = SparkSession.builder.appName("experiments_daily").getOrCreate()
    parquet_path = format_spark_path(input_bucket, input_prefix)
    frame = load_experiments_summary(spark, parquet_path)
    day_frame, start_date = extract_submission_window_for_activity_day(
        frame, date, lag_days)
    searches_frame = extract_search_counts(day_frame)
    results = to_experiment_profile_day_aggregates(searches_frame)
    spark.conf.set(
        "mapreduce.fileoutputcommitter.marksuccessfuljobs", "false"
    )  # Don't write _SUCCESS files, which interfere w/ReDash discovery
    output_base_path = "{}/v{}/activity_date_s3={}".format(
        format_spark_path(output_bucket, output_prefix),
        output_version,
        start_date.strftime("%Y-%m-%d"),
    )
    results.write.mode("overwrite").parquet(output_base_path)
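
The `extract_submission_window_for_activity_day` helper is not shown on this page. Below is a minimal sketch of the lag-days arithmetic described in the docstring, assuming the `date` argument arrives as `YYYYMMDD` and the frame carries a `submission_date_s3` column in the same form; the signature matches the call above, but the internals here are an assumption, not the original implementation.

from datetime import datetime, timedelta


def extract_submission_window_for_activity_day(frame, date, lag_days):
    # e.g. date="20170120", lag_days=5 -> activity day 2017-01-15,
    # keeping submissions from 20170115 through 20170120 (6 days total).
    end_date = datetime.strptime(date, "%Y%m%d")
    start_date = end_date - timedelta(days=lag_days)
    day_frame = frame.where(
        (frame.submission_date_s3 >= start_date.strftime("%Y%m%d"))
        & (frame.submission_date_s3 <= end_date.strftime("%Y%m%d"))
    )
    return day_frame, start_date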
Example #2
def main(input_bucket, input_prefix, output_bucket, output_prefix):
    s3client = boto3.client('s3', 'us-west-2')
    transferer = S3Transfer(s3client)
    last_rollup_basename = get_last_rollup(transferer)
    if last_rollup_basename:
        since, carryover = parse_last_rollup(last_rollup_basename)
        logging.info("Generating counts since {}".format(since))
    else:
        since, carryover = None, []
        logging.info("Generating counts since beginning")
    spark = (
        SparkSession
        .builder
        .appName("maudau")
        .getOrCreate()
    )
    path = U.format_spark_path(input_bucket, input_prefix)
    logging.info("Loading main_summary from {}".format(path))
    main_summary = spark.read.option("mergeSchema", "true").parquet(path)
    updates = generate_counts(main_summary, since)
    logging.info("Generated counts for {} days".format(len(updates)))
    results = carryover + updates
    output_basename = write_locally(results)
    publish_to_s3(s3client, output_bucket, output_prefix, output_basename)
    if not DEVELOPMENT:
        logging.info("Published to S3; done.")
Example #3
def main(date, input_bucket, input_prefix, output_bucket, output_prefix,
         output_version, sample_id, lag_days):
    """
    Aggregate by (client_id, day) for a given day.

    Note that the target day will actually be `lag-days` days before
    the supplied date. In other words, if you pass in 2017-01-20 and
    set `lag-days` to 5, the aggregation will be processed for
    day 2017-01-15 (the resulting data will cover submission dates
    including the activity day itself plus 5 days of lag for a total
    of 6 days).
    """
    spark = (SparkSession.builder.appName("clients_daily").getOrCreate())
    # Per https://issues.apache.org/jira/browse/PARQUET-142 ,
    # don't write _SUCCESS files, which interfere w/ReDash discovery
    spark.conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
    main_summary = load_main_summary(spark, input_bucket, input_prefix)
    day_frame, start_date = extract_submission_window_for_activity_day(
        main_summary, date, lag_days)
    if sample_id:
        day_frame = day_frame.where("sample_id = '{}'".format(sample_id))
    with_searches = extract_search_counts(day_frame)
    results = to_profile_day_aggregates(with_searches)
    partition_count = get_partition_count_for_writing(bool(sample_id))
    output_base_path = "{}/v{}/".format(
        format_spark_path(output_bucket, output_prefix), output_version)
    write_one_activity_day(results, start_date, output_base_path,
                           partition_count)
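
The `write_one_activity_day` helper invoked above is not shown on this page; Example #4 below covers the month-spanning variant. A sketch of what the single-day writer could look like, assuming the same `activity_date_s3=` partition layout used in Examples #1 and #4 (the overwrite mode and the exact layout are assumptions):

def write_one_activity_day(results, date, output_base_path, partition_count):
    # output_base_path already ends with "/" (see main above).
    output_path = "{}activity_date_s3={}".format(
        output_base_path, date.strftime("%Y-%m-%d"))
    results.coalesce(partition_count).write.mode("overwrite").parquet(output_path)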
Example #4
def write_by_activity_day(results, day_pointer, output_bucket, output_prefix,
                          partition_count):
    month = day_pointer.month
    prefix_template = os.path.join(output_prefix, 'activity_date_s3={}')
    while day_pointer.month == month:
        isoday = day_pointer.isoformat()
        prefix = prefix_template.format(isoday)
        output_path = format_spark_path(output_bucket, prefix)
        data_for_date = results.where(results.activity_date == isoday)
        data_for_date.coalesce(partition_count).write.parquet(output_path)
        day_pointer += DT.timedelta(1)
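
A hypothetical call, assuming `DT` is the `datetime` module and `results` is a DataFrame with an ISO-formatted `activity_date` column; the loop stops at the month boundary, so starting on 2017-01-30 writes the 30th and 31st and then returns. Bucket, prefix, and partition count below are placeholders.

import datetime as DT

write_by_activity_day(
    results,                      # DataFrame with an activity_date column
    DT.date(2017, 1, 30),         # writes 2017-01-30 and 2017-01-31, then stops
    "example-output-bucket",
    "clients_daily/v6",
    partition_count=25,
)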
Example #5
def main(local, submission_date_s3, input_bucket, input_prefix, output_bucket,
         output_prefix):
    # print argument information
    for k, v in locals().items():
        print("{}: {}".format(k, v))

    print("Python version: {}".format(sys.version_info))
    spark = SparkSession.builder.getOrCreate()
    print("Spark version: {}".format(spark.version))

    # run a basic count over a sample of `main_summary` from 2 days ago
    if not local:
        ds_nodash = submission_date_s3
        input_path = format_spark_path(input_bucket, input_prefix)
        output_path = format_spark_path(output_bucket, output_prefix)

        print(
            "Reading data for {ds_nodash} from {input_path} and writing to {output_path}"
            .format(ds_nodash=ds_nodash,
                    input_path=input_path,
                    output_path=output_path))

        path = "{}/submission_date_s3={}/sample_id={}".format(
            input_path, ds_nodash, 1)
        subset = spark.read.parquet(path)
        print("Saw {} documents".format(subset.count()))

        summary = subset.select("memory_mb", "cpu_cores",
                                "subsession_length").describe()
        summary.show()

        summary.write.parquet(output_path +
                              "/submission_date_s3={}/".format(ds_nodash),
                              mode="overwrite")

    stop_session_safely(spark)
    print("Done!")
Example #6
def load_main_summary(spark, input_bucket, input_prefix):
    main_summary_path = format_spark_path(input_bucket, input_prefix)
    return spark.read.option("mergeSchema", "true").parquet(main_summary_path)