def run_train(data: PathStr, model_path: PathStr):
    """Train a customer model from a CSV dataset using Spark.

    Args:
        data: Path to the input CSV file (read with header and inferred schema).
        model_path: Destination path where the trained model is saved.
    """
    spark = SparkSession.builder.appName("train_data_spark").getOrCreate()
    try:
        data_df = spark.read.csv(data, inferSchema=True, header=True, sep=",")
        # Track the ingested dataset (with per-column histograms) for lineage.
        log_dataframe("ingest", data_df, path=data, with_histograms=True)
        train_model_for_customer_spark(data_df, saved_model=model_path)
    finally:
        # Release the Spark session even if reading/logging/training raises.
        spark.stop()
def _test() -> None:
    """Run the doctests for ``pyspark.pandas.indexes.multi`` on a local Spark session."""
    import os
    import doctest
    import sys

    import numpy
    from pyspark.sql import SparkSession
    import pyspark.pandas.indexes.multi

    # The doctest examples expect the Spark distribution root as the CWD.
    os.chdir(os.environ["SPARK_HOME"])

    # Module globals plus the short aliases the examples rely on.
    globs = pyspark.pandas.indexes.multi.__dict__.copy()
    globs["np"] = numpy
    globs["ps"] = pyspark.pandas

    session = (
        SparkSession.builder.master("local[4]")
        .appName("pyspark.pandas.indexes.multi tests")
        .getOrCreate()
    )
    failures, _ = doctest.testmod(
        pyspark.pandas.indexes.multi,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
    )
    session.stop()
    if failures:
        sys.exit(-1)
def run_ingest(data: PathStr, output_path: PathStr):
    """Ingest customer data from a CSV file and publish the result as CSV.

    Args:
        data: Path to the input CSV file (read with header and inferred schema).
        output_path: Path where the transformed dataset is written as CSV.
    """
    spark = SparkSession.builder.appName("ingest_data_spark").getOrCreate()
    try:
        data_df = spark.read.csv(data, inferSchema=True, header=True, sep=",")
        result_df = ingest_customer_data(data_df)
        # Record the published dataset (with histograms) as a write operation for lineage.
        log_dataframe(
            "published",
            result_df,
            path=output_path,
            with_histograms=True,
            operation_type=DbndTargetOperationType.write,
        )
        result_df.write.csv(str(output_path), header=True)
    finally:
        # Release the Spark session even if reading/transforming/writing raises.
        spark.stop()