Example No. 1
from dbnd import log_dataframe
from pyspark.sql import SparkSession

# PathStr and train_model_for_customer_spark are project helpers assumed
# to be defined elsewhere in this example's codebase.
def run_train(data: PathStr, model_path: PathStr):
    spark = SparkSession.builder.appName("train_data_spark").getOrCreate()
    # Column names come from the header row; types are inferred from the data.
    data_df = spark.read.csv(data, inferSchema=True, header=True, sep=",")

    # Track the ingested DataFrame (schema, stats, histograms) with dbnd.
    log_dataframe("ingest", data_df, path=data, with_histograms=True)
    train_model_for_customer_spark(data_df, saved_model=model_path)
    spark.stop()
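For context, a minimal sketch of how this entry point could be invoked, assuming PathStr is a plain str/Path alias (the original project defines its own; the alias and paths below are hypothetical):

from pathlib import Path
from typing import Union

# Hypothetical alias standing in for the project's own PathStr.
PathStr = Union[str, Path]

if __name__ == "__main__":
    # Illustrative paths only.
    run_train("data/train.csv", "models/customer_model")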
Example No. 2
def _test() -> None:
    import os
    import doctest
    import sys
    import numpy
    from pyspark.sql import SparkSession
    import pyspark.pandas.indexes.multi

    # The doctests expect to run from the Spark source root.
    os.chdir(os.environ["SPARK_HOME"])

    # Build the doctest namespace from the module's globals, plus the
    # np/ps shorthands the docstrings use.
    globs = pyspark.pandas.indexes.multi.__dict__.copy()
    globs["np"] = numpy
    globs["ps"] = pyspark.pandas
    spark = (
        SparkSession.builder.master("local[4]")
        .appName("pyspark.pandas.indexes.multi tests")
        .getOrCreate()
    )
    (failure_count, test_count) = doctest.testmod(
        pyspark.pandas.indexes.multi,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
    )
    spark.stop()
    # Propagate doctest failures as a nonzero exit code.
    if failure_count:
        sys.exit(-1)
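In the PySpark sources, this doctest runner is triggered by the standard entry-point guard at the bottom of the module:

if __name__ == "__main__":
    _test()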
Example No. 3
from dbnd import log_dataframe
from dbnd._core.constants import DbndTargetOperationType
from pyspark.sql import SparkSession

# PathStr and ingest_customer_data are project helpers assumed to be
# defined elsewhere in this example's codebase.
def run_ingest(data: PathStr, output_path: PathStr):
    spark = SparkSession.builder.appName("ingest_data_spark").getOrCreate()
    # Column names come from the header row; types are inferred from the data.
    data_df = spark.read.csv(data, inferSchema=True, header=True, sep=",")

    result_df = ingest_customer_data(data_df)

    # Track the published DataFrame with dbnd, tagged as a write operation.
    log_dataframe(
        "published",
        result_df,
        path=output_path,
        with_histograms=True,
        operation_type=DbndTargetOperationType.write,
    )
    result_df.write.csv(str(output_path), header=True)
    spark.stop()
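One thing to note: Spark's DataFrameWriter defaults to the errorifexists save mode, so rerunning this job against an existing output_path will fail. A small variation that makes the write idempotent by overwriting prior output:

# Overwrite any previous output instead of raising on rerun.
result_df.write.mode("overwrite").csv(str(output_path), header=True)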