Пример #1
0
def run_train(data: PathStr, model_path: PathStr):
    """Train the customer model from a CSV file using a Spark session.

    Reads ``data`` as a comma-separated CSV with a header row (schema
    inferred by Spark), logs the loaded frame under the name ``"ingest"``
    with histograms, and passes it to ``train_model_for_customer_spark``.

    Args:
        data: Location of the input CSV; also reported as the logged
            dataframe's ``path``.
        model_path: Forwarded to the training routine as ``saved_model``.
    """
    spark = SparkSession.builder.appName("train_data_spark").getOrCreate()
    try:
        data_df = spark.read.csv(data, inferSchema=True, header=True, sep=",")

        log_dataframe("ingest", data_df, path=data, with_histograms=True)
        train_model_for_customer_spark(data_df, saved_model=model_path)
    finally:
        # Stop the session even when reading, logging or training raises,
        # so a failed run does not leave the Spark session running.
        spark.stop()
Пример #2
0
def run_ingest(data: PathStr, output_path: PathStr):
    """Ingest a customer CSV with Spark and write the result back as CSV.

    Reads ``data`` as a comma-separated CSV with a header row (schema
    inferred by Spark), runs it through ``ingest_customer_data``, logs the
    result under the name ``"published"`` as a write operation, and writes
    it to ``output_path`` with a header row.

    Args:
        data: Location of the input CSV.
        output_path: Destination for the resulting CSV; also reported as the
            logged dataframe's ``path``.
    """
    spark = SparkSession.builder.appName("ingest_data_spark").getOrCreate()
    try:
        data_df = spark.read.csv(data, inferSchema=True, header=True, sep=",")

        result_df = ingest_customer_data(data_df)

        log_dataframe(
            "published",
            result_df,
            path=output_path,
            with_histograms=True,
            operation_type=DbndTargetOperationType.write,
        )
        result_df.write.csv(str(output_path), header=True)
    finally:
        # Stop the session even when ingest, logging or the write raises,
        # so a failed run does not leave the Spark session running.
        spark.stop()
Пример #3
0
def create_report(data: DataFrame) -> DataFrame:
    """Log summary metrics for ``data`` and return the same frame.

    Reports the number of columns, logs the frame itself (with histograms)
    under the name ``"ready_data"``, and reports a score metric derived from
    the ``score_label`` column passed through ``chaos_float``.

    Args:
        data: Frame to report on; must contain a ``score_label`` column.

    Returns:
        The ``data`` argument, unmodified.
    """
    column_count = len(data.columns)
    log_metric("Column Count", column_count)
    log_dataframe("ready_data", data, with_histograms=True)

    # NOTE(review): the metric is labelled "Avg Score" but the aggregation
    # requested here is a sum -- confirm which one is intended.
    aggregated = data.agg({"score_label": "sum"})
    first_row = aggregated.collect()[0]
    score_total = first_row[0]
    log_metric("Avg Score", chaos_float(score_total))
    return data
Пример #4
0
 def run(self):
     """Log the store's ``features`` and expose them as ``advanced_features``."""
     store = self.store
     log_dataframe("features", store.features)
     self.advanced_features = store.features