Example #1
def write_raw_dataset(df, ctx, spark):
    logger.info("Caching {} data (version: {})".format(ctx.app["name"],
                                                       ctx.dataset_version))
    # Wrap df so rows are counted (via a Spark accumulator) as a side
    # effect of the write action below.
    acc, df = spark_util.accumulate_count(df, spark)
    df.write.mode("overwrite").parquet(
        aws.s3a_path(ctx.bucket, ctx.raw_dataset["key"]))
    # The accumulator holds the row count once the write has executed.
    return acc.value
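spark_util.accumulate_count is a project-specific helper and not shown here. A minimal sketch of one plausible implementation, assuming it threads each row through a pass-through UDF that increments a Spark accumulator, so the count becomes available only after an action (such as the parquet write above) forces evaluation:

from pyspark.sql import functions as F

def accumulate_count(df, spark):
    # Hypothetical sketch: count rows as a side effect of a later action.
    acc = spark.sparkContext.accumulator(0)
    first = df.schema[0]  # wrap an arbitrary existing column

    def _count(val):
        acc.add(1)
        return val

    count_udf = F.udf(_count, first.dataType)
    return acc, df.withColumn(first.name, count_udf(F.col(first.name)))

Since Spark may re-execute tasks, a count gathered this way is best treated as approximate.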
Example #2
def write_training_data(model_name, df, ctx):
    model = ctx.models[model_name]
    training_dataset = model["dataset"]

    # Keep only the columns this model needs: raw features, the target
    # column, and any transformed training features.
    feature_names = model["features"] + [model["target"]] + model["training_features"]
    df = df.select(*feature_names)

    # Record the dataset size alongside the data for later reference.
    metadata = {"dataset_size": df.count()}
    aws.upload_json_to_s3(metadata, training_dataset["metadata_key"], ctx.bucket)

    # Split into training and evaluation sets according to the
    # configured partition ratios.
    train_ratio = model["data_partition_ratio"]["training"]
    eval_ratio = model["data_partition_ratio"]["evaluation"]
    train_df, eval_df = df.randomSplit([train_ratio, eval_ratio])

    # Write both splits to S3 as TFRecords of tf.Example records.
    train_df.write.mode("overwrite").format("tfrecords").option(
        "recordType", "Example"
    ).save(aws.s3a_path(ctx.bucket, training_dataset["train_key"]))
    eval_df.write.mode("overwrite").format("tfrecords").option(
        "recordType", "Example"
    ).save(aws.s3a_path(ctx.bucket, training_dataset["eval_key"]))

    return df
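The "tfrecords" format writes each row as a serialized tf.Example and relies on the spark-tensorflow-connector package being on Spark's classpath. A hedged sketch of reading one of the written splits back in TensorFlow, with a made-up feature spec and file name standing in for the real model columns and S3 objects:

import tensorflow as tf

# Hypothetical feature spec; the real columns come from model["features"],
# model["target"], and model["training_features"].
feature_spec = {
    "age": tf.io.FixedLenFeature([], tf.float32),
    "label": tf.io.FixedLenFeature([], tf.int64),
}

def parse_example(serialized):
    return tf.io.parse_single_example(serialized, feature_spec)

dataset = (
    tf.data.TFRecordDataset(["part-r-00000"])  # illustrative file name
    .map(parse_example)
    .batch(32)
)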
Example #3
def write_raw_dataset(df, ctx):
    # Overwrite the raw dataset snapshot in S3 as Parquet.
    df.write.mode("overwrite").parquet(
        aws.s3a_path(ctx.bucket, ctx.raw_dataset_key))
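aws.s3a_path is another project helper. Presumably it just joins the bucket and key into the s3a:// URL scheme that Spark's Hadoop S3A connector expects; a minimal sketch under that assumption:

def s3a_path(bucket, key):
    # Hypothetical sketch: build an s3a:// URL for Spark's S3A connector.
    return "s3a://{}/{}".format(bucket, key)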
Example #4
def read_raw_dataset(ctx, spark):
    # Read back the Parquet snapshot written by write_raw_dataset above.
    return spark.read.parquet(aws.s3a_path(ctx.bucket, ctx.raw_dataset_key))
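Reading s3a:// paths requires the hadoop-aws package and AWS credentials to be visible to Spark. A minimal usage sketch, assuming a ctx object that supplies bucket and raw_dataset_key:

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("read-raw-dataset")  # illustrative app name
    .config("spark.hadoop.fs.s3a.aws.credentials.provider",
            "com.amazonaws.auth.DefaultAWSCredentialsProviderChain")
    .getOrCreate()
)

df = read_raw_dataset(ctx, spark)
df.show(5)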