import os
import shutil
import tempfile
import time

import pytest
from pyspark.sql import Row
from pyspark.sql.types import IntegerType, StructField, StructType

import mlflow
import mlflow.keras
import mlflow.sklearn
import mlflow.spark

# Fixtures (spark_session, data_format, file_path, format_to_file_path) and
# helpers (_get_or_create_spark_session, _fit_keras_model, _fit_sklearn_model,
# _assert_spark_data_logged, _assert_spark_data_not_logged) are defined
# elsewhere in the test suite.


def test_spark_keras_autologging_context_provider(spark_session, data_format, file_path):
    mlflow.spark.autolog()
    mlflow.keras.autolog()

    df = (
        spark_session.read.format(data_format)
        .option("header", "true")
        .option("inferSchema", "true")
        .load(file_path)
        .select("number1", "number2")
    )
    pandas_df = df.toPandas()

    # DF info should be logged to the first run (it should be added to our context provider after
    # the toPandas() call above & then logged here)
    with mlflow.start_run():
        run = _fit_keras_model(pandas_df, epochs=1)
    _assert_spark_data_logged(run, file_path, data_format)

    with mlflow.start_run():
        pandas_df2 = df.filter("number1 > 0").toPandas()
        run2 = _fit_keras_model(pandas_df2, epochs=1)
    assert run2.info.run_id != run.info.run_id
    _assert_spark_data_logged(run2, file_path, data_format)
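    # Give the asynchronous Spark listener a moment to deliver its events
    # before checking that no run is left active.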
    time.sleep(1)
    assert mlflow.active_run() is None
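

# Toggling autologging off and back on should take effect immediately: reads
# performed while disabled are not logged; reads after re-enabling are.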
def test_autologging_disabled_then_enabled(spark_session, format_to_file_path):
    mlflow.spark.autolog(disable=True)
    data_format = list(format_to_file_path.keys())[0]
    file_path = format_to_file_path[data_format]
    df = (
        spark_session.read.format(data_format)
        .option("header", "true")
        .option("inferSchema", "true")
        .load(file_path)
    )
    # Logging is disabled here.
    with mlflow.start_run():
        run_id = mlflow.active_run().info.run_id
        df.collect()
        time.sleep(1)
    run = mlflow.get_run(run_id)
    _assert_spark_data_not_logged(run=run)

    # Logging is enabled here.
    mlflow.spark.autolog(disable=False)
    with mlflow.start_run():
        run_id = mlflow.active_run().info.run_id
        df.filter("number1 > 0").collect()
        time.sleep(1)
    run = mlflow.get_run(run_id)
    _assert_spark_data_logged(run=run, path=file_path, data_format=data_format)
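

# DataFrames derived from a datasource (temp views, SQL queries, filters,
# selects, and limits) should all log the original datasource's path and format.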
def test_autologging_of_datasources_with_different_formats(
        spark_session, format_to_file_path):
    mlflow.spark.autolog()
    for data_format, file_path in format_to_file_path.items():
        base_df = (
            spark_session.read.format(data_format)
            .option("header", "true")
            .option("inferSchema", "true")
            .load(file_path)
        )
        base_df.createOrReplaceTempView("temptable")
        table_df0 = spark_session.table("temptable")
        table_df1 = spark_session.sql("SELECT number1, number2 FROM temptable LIMIT 5")
        dfs = [
            base_df,
            table_df0,
            table_df1,
            base_df.filter("number1 > 0"),
            base_df.select("number1"),
            base_df.limit(2),
            base_df.filter("number1 > 0").select("number1").limit(2),
        ]

        for df in dfs:
            with mlflow.start_run():
                run_id = mlflow.active_run().info.run_id
                df.collect()
                time.sleep(1)
            run = mlflow.get_run(run_id)
            _assert_spark_data_logged(run=run,
                                      path=file_path,
                                      data_format=data_format)
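

# Repeated reads of the same datasource within a single run should be recorded
# once rather than duplicated.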
def test_autologging_dedups_multiple_reads_of_same_datasource(
        spark_session, format_to_file_path):
    mlflow.spark.autolog()
    data_format = list(format_to_file_path.keys())[0]
    file_path = format_to_file_path[data_format]
    df = (
        spark_session.read.format(data_format)
        .option("header", "true")
        .option("inferSchema", "true")
        .load(file_path)
    )
    with mlflow.start_run():
        run_id = mlflow.active_run().info.run_id
        df.collect()
        df.filter("number1 > 0").collect()
        df.limit(2).collect()
        df.collect()
        time.sleep(1)
    run = mlflow.get_run(run_id)
    _assert_spark_data_logged(run=run, path=file_path, data_format=data_format)
    # Test context provider flow
    df.filter("number1 > 0").collect()
    df.limit(2).collect()
    df.collect()
    with mlflow.start_run():
        run_id2 = mlflow.active_run().info.run_id
    time.sleep(1)
    run2 = mlflow.get_run(run_id2)
    _assert_spark_data_logged(run=run2,
                              path=file_path,
                              data_format=data_format)
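

# Enabling (or disabling) autologging before any SparkSession exists should
# still behave correctly once a session is created and a datasource is read.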
@pytest.mark.parametrize("disable", [False, True])
def test_enabling_autologging_before_spark_session_works(disable):
    mlflow.spark.autolog(disable=disable)

    # creating spark session AFTER autolog was enabled
    spark_session = _get_or_create_spark_session()

    rows = [Row(100)]
    schema = StructType([StructField("number2", IntegerType())])
    rdd = spark_session.sparkContext.parallelize(rows)
    df = spark_session.createDataFrame(rdd, schema)
    tempdir = tempfile.mkdtemp()
    filepath = os.path.join(tempdir, "test-data")
    df.write.option("header", "true").format("csv").save(filepath)

    read_df = (
        spark_session.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(filepath)
    )

    with mlflow.start_run():
        run_id = mlflow.active_run().info.run_id
        read_df.collect()
        time.sleep(1)

    run = mlflow.get_run(run_id)
    if disable:
        _assert_spark_data_not_logged(run=run)
    else:
        _assert_spark_data_logged(run=run, path=filepath, data_format="csv")

    shutil.rmtree(tempdir)
    spark_session.stop()
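

# With no explicitly managed run, the datasource info gathered during the
# toPandas() call should be attached, via the run context provider, to the run
# that Keras autologging creates.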
def test_spark_autologging_with_keras_autologging(spark_session, data_format,
                                                  file_path):
    assert mlflow.active_run() is None
    mlflow.spark.autolog()
    mlflow.keras.autolog()
    df = (
        spark_session.read.format(data_format)
        .option("header", "true")
        .option("inferSchema", "true")
        .load(file_path)
        .select("number1", "number2")
    )
    pandas_df = df.toPandas()
    run = _fit_keras_model(pandas_df, epochs=1)
    _assert_spark_data_logged(run, file_path, data_format)
    assert mlflow.active_run() is None
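

# When runs are explicitly managed with mlflow.start_run(), each run should
# receive the datasource info for the reads performed inside it.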
def test_spark_and_keras_autologging_all_runs_managed(spark_session,
                                                      tracking_uri_mock,
                                                      data_format, file_path):
    # pylint: disable=unused-argument
    mlflow.spark.autolog()
    mlflow.keras.autolog()
    for _ in range(2):
        with mlflow.start_run():
            df = (
                spark_session.read.format(data_format)
                .option("header", "true")
                .option("inferSchema", "true")
                .load(file_path)
                .select("number1", "number2")
            )
            pandas_df = df.toPandas()
            run = _fit_keras_model(pandas_df, epochs=1)
        _assert_spark_data_logged(run, file_path, data_format)
    assert mlflow.active_run() is None
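

# Same flow as the Keras test above, with sklearn autologging handling the
# model-fitting run instead.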
def test_spark_and_sklearn_autologging_all_runs_managed(
        spark_session, data_format, file_path):
    mlflow.spark.autolog()
    mlflow.sklearn.autolog()
    for _ in range(2):
        with mlflow.start_run():
            df = (
                spark_session.read.format(data_format)
                .option("header", "true")
                .option("inferSchema", "true")
                .load(file_path)
                .select("number1", "number2")
            )
            pandas_df = df.toPandas()
            run = _fit_sklearn_model(pandas_df)
        _assert_spark_data_logged(run, file_path, data_format)
    assert mlflow.active_run() is None
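

# A datasource read performed before any run starts should be propagated to
# each subsequently created run by the context provider.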
def test_autologging_multiple_runs_same_data(spark_session,
                                             format_to_file_path):
    mlflow.spark.autolog()
    data_format = list(format_to_file_path.keys())[0]
    file_path = format_to_file_path[data_format]
    df = (
        spark_session.read.format(data_format)
        .option("header", "true")
        .option("inferSchema", "true")
        .load(file_path)
    )
    df.collect()

    for _ in range(2):
        with mlflow.start_run():
            time.sleep(1)
            run_id = mlflow.active_run().info.run_id
            run = mlflow.get_run(run_id)
            _assert_spark_data_logged(run=run,
                                      path=file_path,
                                      data_format=data_format)
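

# The assertion helpers referenced throughout are defined elsewhere in the test
# suite. Below is a minimal sketch of the check they perform, assuming datasource
# info is recorded under a "sparkDatasourceInfo" run tag (the tag name and the
# substring matching here are assumptions for illustration, not MLflow's
# confirmed internals; the real helpers may compare exact expected tag values).
def _assert_spark_data_logged_sketch(run, path, data_format):
    # Hypothetical re-implementation: the datasource path and format should
    # appear in the run's datasource-info tag.
    tag_value = run.data.tags.get("sparkDatasourceInfo", "")
    assert path in tag_value
    assert data_format in tag_value


def _assert_spark_data_not_logged_sketch(run):
    # Hypothetical counterpart: no datasource-info tag should be present.
    assert "sparkDatasourceInfo" not in run.data.tags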