Example #1
def upload_resource_status_end(self, exit_code, *resources):
    timestamp = util.now_timestamp_rfc_3339()
    for resource in resources:
        status = self.get_resource_status(resource)
        # Skip resources whose status has already been finalized
        if status.get("end") is not None:
            continue
        status["end"] = timestamp
        status["exit_code"] = exit_code
        key = self.resource_status_key(resource)
        aws.upload_json_to_s3(status, key, self.bucket)
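The aws.upload_json_to_s3 helper that every example on this page calls is not shown. A minimal sketch of what such a helper could look like, assuming boto3 and plain JSON serialization; this is an illustration, not the project's actual implementation:

import json

import boto3

def upload_json_to_s3(obj, key, bucket):
    # Serialize the object and store it as a single JSON object in S3
    s3 = boto3.client("s3")
    s3.put_object(
        Bucket=bucket,
        Key=key,
        Body=json.dumps(obj).encode("utf-8"),
        ContentType="application/json",
    )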
Example #2
def drop_null_and_write(ingest_df, ctx, spark):
    full_dataset_size = ingest_df.count()
    logger.info("Dropping any rows that contain null values")
    ingest_df = ingest_df.dropna()
    written_count = write_raw_dataset(ingest_df, ctx, spark)
    metadata = {"dataset_size": written_count}
    aws.upload_json_to_s3(metadata, ctx.raw_dataset["metadata_key"],
                          ctx.bucket)
    logger.info("{} rows read, {} rows dropped, {} rows ingested".format(
        full_dataset_size, full_dataset_size - written_count, written_count))
Example #3
def upload_resource_status_start(self, *resources):
    timestamp = util.now_timestamp_rfc_3339()
    for resource in resources:
        key = self.resource_status_key(resource)
        # Record when processing of this resource began
        status = {
            "resource_id": resource["id"],
            "resource_type": resource["resource_type"],
            "workload_id": resource["workload_id"],
            "app_name": self.app["name"],
            "start": timestamp,
        }
        aws.upload_json_to_s3(status, key, self.bucket)
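Examples #1 and #3 pair up as start/end markers around a workload. A minimal usage sketch under that assumption; ctx, resources, and run_workload() are placeholders, not names from the project shown here:

# Hypothetical usage: bracket a workload with start/end status uploads
ctx.upload_resource_status_start(*resources)
exit_code = run_workload()  # hypothetical stand-in for the actual work
ctx.upload_resource_status_end(exit_code, *resources)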
Example #4
def ingest_raw_dataset(spark, ctx, cols_to_validate, should_ingest):
    # When ingesting, validate every column in rf_id_map
    if should_ingest:
        cols_to_validate = list(ctx.rf_id_map.keys())

    if len(cols_to_validate) == 0:
        logger.info("Reading {} data (version: {})".format(ctx.app["name"], ctx.dataset_version))
        return spark_util.read_raw_dataset(ctx, spark)

    col_resources_to_validate = [ctx.rf_id_map[f] for f in cols_to_validate]
    ctx.upload_resource_status_start(*col_resources_to_validate)
    try:
        if should_ingest:
            data_config = ctx.environment["data"]

            logger.info("Ingesting")
            logger.info("Ingesting {} data from {}".format(ctx.app["name"], data_config["path"]))
            ingest_df = spark_util.ingest(ctx, spark)

            full_dataset_size = ingest_df.count()

            if data_config.get("drop_null"):
                logger.info("Dropping any rows that contain null values")
                ingest_df = ingest_df.dropna()

            if ctx.environment.get("limit"):
                ingest_df = limit_dataset(full_dataset_size, ingest_df, ctx.environment["limit"])

            written_count = write_raw_dataset(ingest_df, ctx, spark)
            metadata = {"dataset_size": written_count}
            aws.upload_json_to_s3(metadata, ctx.raw_dataset["metadata_key"], ctx.bucket)
            if written_count != full_dataset_size:
                logger.info(
                    "{} rows read, {} rows dropped, {} rows ingested".format(
                        full_dataset_size, full_dataset_size - written_count, written_count
                    )
                )
            else:
                logger.info("{} rows ingested".format(written_count))

        logger.info("Reading {} data (version: {})".format(ctx.app["name"], ctx.dataset_version))
        raw_df = spark_util.read_raw_dataset(ctx, spark)
        validate_dataset(ctx, raw_df, cols_to_validate)
    except:
        # Mark the column resources as failed, then re-raise the original error
        ctx.upload_resource_status_failed(*col_resources_to_validate)
        raise
    ctx.upload_resource_status_success(*col_resources_to_validate)
    logger.info("First {} samples:".format(3))
    show_df(raw_df, ctx, 3)

    return raw_df
Example #5
def ingest_raw_dataset(spark, ctx, cols_to_validate, should_ingest):
    if should_ingest:
        cols_to_validate = list(ctx.rf_id_map.keys())

    if len(cols_to_validate) == 0:
        logger.info("Reading {} data (version: {})".format(
            ctx.app["name"], ctx.dataset_version))
        return spark_util.read_raw_dataset(ctx, spark)

    col_resources_to_validate = [ctx.rf_id_map[f] for f in cols_to_validate]
    ctx.upload_resource_status_start(*col_resources_to_validate)
    try:
        if should_ingest:
            logger.info("Ingesting")
            logger.info("Ingesting {} data from {}".format(
                ctx.app["name"], ctx.environment["data"]["path"]))
            ingest_df = spark_util.ingest(ctx, spark)

            if ctx.environment["data"].get("drop_null"):
                drop_null_and_write(ingest_df, ctx, spark)
            else:
                written_count = write_raw_dataset(ingest_df, ctx, spark)
                metadata = {"dataset_size": written_count}
                aws.upload_json_to_s3(metadata,
                                      ctx.raw_dataset["metadata_key"],
                                      ctx.bucket)
                logger.info("{} rows ingested".format(written_count))

        logger.info("Reading {} data (version: {})".format(
            ctx.app["name"], ctx.dataset_version))
        raw_df = spark_util.read_raw_dataset(ctx, spark)
        validate_dataset(ctx, raw_df, cols_to_validate)
    except:
        ctx.upload_resource_status_failed(*col_resources_to_validate)
        raise
    ctx.upload_resource_status_success(*col_resources_to_validate)
    logger.info("First {} samples:".format(3))
    show_df(raw_df, ctx, 3)

    return raw_df
Example #6
def write_training_data(model_name, df, ctx):
    model = ctx.models[model_name]
    training_dataset = model["dataset"]
    feature_names = model["features"] + [model["target"]] + model["training_features"]

    df = df.select(*feature_names)

    metadata = {"dataset_size": df.count()}
    aws.upload_json_to_s3(metadata, training_dataset["metadata_key"],
                          ctx.bucket)

    train_ratio = model["data_partition_ratio"]["training"]
    eval_ratio = model["data_partition_ratio"]["evaluation"]
    [train_df, eval_df] = df.randomSplit([train_ratio, eval_ratio])
    # Write the train/eval splits to S3 as TFRecord Example protos
    train_df.write.mode("overwrite").format("tfrecords").option("recordType", "Example").save(
        aws.s3a_path(ctx.bucket, training_dataset["train_key"])
    )
    eval_df.write.mode("overwrite").format("tfrecords").option("recordType", "Example").save(
        aws.s3a_path(ctx.bucket, training_dataset["eval_key"])
    )
    return df
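For a quick sanity check, the written splits can presumably be read back through the same TFRecord data source the write path uses (the "tfrecords" format with recordType "Example"); spark, ctx, and training_dataset are reused from the example above:

# Read the training split back from S3 and peek at a few records
train_df = (
    spark.read.format("tfrecords")
    .option("recordType", "Example")
    .load(aws.s3a_path(ctx.bucket, training_dataset["train_key"]))
)
train_df.show(3)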