def test_autologging_dedups_multiple_reads_of_same_datasource(spark_session, format_to_file_path):
    mlflow.spark.autolog()
    data_format = list(format_to_file_path.keys())[0]
    file_path = format_to_file_path[data_format]
    df = (
        spark_session.read.format(data_format)
        .option("header", "true")
        .option("inferSchema", "true")
        .load(file_path)
    )
    with mlflow.start_run():
        run_id = mlflow.active_run().info.run_id
        df.collect()
        df.filter("number1 > 0").collect()
        df.limit(2).collect()
        df.collect()
        time.sleep(1)
    run = mlflow.get_run(run_id)
    _assert_spark_data_logged(run=run, path=file_path, data_format=data_format)

    # Test context provider flow
    df.filter("number1 > 0").collect()
    df.limit(2).collect()
    df.collect()
    with mlflow.start_run():
        run_id2 = mlflow.active_run().info.run_id
    time.sleep(1)
    run2 = mlflow.get_run(run_id2)
    _assert_spark_data_logged(run=run2, path=file_path, data_format=data_format)

def test_autologging_of_datasources_with_different_formats(spark_session, format_to_file_path):
    mlflow.spark.autolog()
    for data_format, file_path in format_to_file_path.items():
        base_df = (
            spark_session.read.format(data_format)
            .option("header", "true")
            .option("inferSchema", "true")
            .load(file_path)
        )
        base_df.createOrReplaceTempView("temptable")
        table_df0 = spark_session.table("temptable")
        table_df1 = spark_session.sql("SELECT number1, number2 from temptable LIMIT 5")
        dfs = [
            base_df,
            table_df0,
            table_df1,
            base_df.filter("number1 > 0"),
            base_df.select("number1"),
            base_df.limit(2),
            base_df.filter("number1 > 0").select("number1").limit(2),
        ]
        for df in dfs:
            with mlflow.start_run():
                run_id = mlflow.active_run().info.run_id
                df.collect()
                time.sleep(1)
            run = mlflow.get_run(run_id)
            _assert_spark_data_logged(run=run, path=file_path, data_format=data_format)

def test_enabling_autologging_before_spark_session_works(disable):
    mlflow.spark.autolog(disable=disable)

    # creating spark session AFTER autolog was enabled
    spark_session = _get_or_create_spark_session()

    rows = [Row(100)]
    schema = StructType([StructField("number2", IntegerType())])
    rdd = spark_session.sparkContext.parallelize(rows)
    df = spark_session.createDataFrame(rdd, schema)
    tempdir = tempfile.mkdtemp()
    filepath = os.path.join(tempdir, "test-data")
    df.write.option("header", "true").format("csv").save(filepath)

    read_df = (
        spark_session.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(filepath)
    )

    with mlflow.start_run():
        run_id = mlflow.active_run().info.run_id
        read_df.collect()
        time.sleep(1)

    run = mlflow.get_run(run_id)
    if disable:
        _assert_spark_data_not_logged(run=run)
    else:
        _assert_spark_data_logged(run=run, path=filepath, data_format="csv")

    shutil.rmtree(tempdir)
    spark_session.stop()

def test_spark_autologging_with_sklearn_autologging(spark_session, data_format, file_path):
    assert mlflow.active_run() is None
    mlflow.spark.autolog()
    mlflow.sklearn.autolog()

    df = (
        spark_session.read.format(data_format)
        .option("header", "true")
        .option("inferSchema", "true")
        .load(file_path)
        .select("number1", "number2")
    )
    pandas_df = df.toPandas()

    run = _fit_sklearn_model(pandas_df)
    _assert_spark_data_logged(run, file_path, data_format)
    assert mlflow.active_run() is None

def test_autologging_multiple_runs_same_data(spark_session, format_to_file_path):
    mlflow.spark.autolog()
    data_format = list(format_to_file_path.keys())[0]
    file_path = format_to_file_path[data_format]
    df = (
        spark_session.read.format(data_format)
        .option("header", "true")
        .option("inferSchema", "true")
        .load(file_path)
    )
    df.collect()

    for _ in range(2):
        with mlflow.start_run():
            time.sleep(1)
            run_id = mlflow.active_run().info.run_id
            run = mlflow.get_run(run_id)
            _assert_spark_data_logged(run=run, path=file_path, data_format=data_format)

def test_autologging_disabled_then_enabled(spark_session, format_to_file_path):
    mlflow.spark.autolog(disable=True)
    data_format = list(format_to_file_path.keys())[0]
    file_path = format_to_file_path[data_format]
    df = (
        spark_session.read.format(data_format)
        .option("header", "true")
        .option("inferSchema", "true")
        .load(file_path)
    )

    # Logging is disabled here.
    with mlflow.start_run():
        run_id = mlflow.active_run().info.run_id
        df.collect()
        time.sleep(1)
    run = mlflow.get_run(run_id)
    _assert_spark_data_not_logged(run=run)

    # Logging is enabled here.
    mlflow.spark.autolog(disable=False)
    with mlflow.start_run():
        run_id = mlflow.active_run().info.run_id
        df.filter("number1 > 0").collect()
        time.sleep(1)
    run = mlflow.get_run(run_id)
    _assert_spark_data_logged(run=run, path=file_path, data_format=data_format)

def test_spark_sklearn_autologging_context_provider(spark_session, data_format, file_path):
    mlflow.spark.autolog()
    mlflow.sklearn.autolog()

    df = (
        spark_session.read.format(data_format)
        .option("header", "true")
        .option("inferSchema", "true")
        .load(file_path)
        .select("number1", "number2")
    )
    pandas_df = df.toPandas()

    # DF info should be logged to the first run (it should be added to our context provider after
    # the toPandas() call above & then logged here)
    with mlflow.start_run():
        run = _fit_sklearn_model(pandas_df)
        _assert_spark_data_logged(run, file_path, data_format)

    with mlflow.start_run():
        pandas_df2 = df.filter("number1 > 0").toPandas()
        run2 = _fit_sklearn_model(pandas_df2)
        assert run2.info.run_id != run.info.run_id
        _assert_spark_data_logged(run2, file_path, data_format)

    time.sleep(1)
    assert mlflow.active_run() is None
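

# The tests above depend on a `format_to_file_path` pytest fixture (defined elsewhere in this
# module alongside `spark_session`, `data_format`, and `file_path`) that maps a Spark datasource
# format name to a file written in that format with `number1`/`number2` columns. The sketch below
# is only an illustration of what such a fixture could look like; the exact formats, sample
# values, and path layout here are assumptions, not the module's actual definition.
#
# import pytest
#
# @pytest.fixture
# def format_to_file_path(spark_session, tmp_path):
#     # Write one small DataFrame per format and return {format_name: file_path}
#     df = spark_session.createDataFrame(
#         [(1, 10.0), (2, 20.0)], schema="number1 int, number2 double"
#     )
#     paths = {}
#     for data_format in ["csv", "parquet", "json"]:
#         path = str(tmp_path / f"test-data-{data_format}")
#         df.write.option("header", "true").format(data_format).save(path)
#         paths[data_format] = path
#     return paths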