def test_stratified_splitter(test_specs, spark_dataset):
    """Verify split sizes and user coverage of ``spark_stratified_split``.

    Two scenarios are exercised:

    1. A single ratio (``test_specs["ratio"]``) with ``filter_by="user"`` and
       ``min_rating=10``: the two resulting frames must hold roughly
       ``ratio`` and ``1 - ratio`` of ``test_specs["number_of_rows"]``, and
       both must contain exactly the same set of users.
    2. A list of ratios (``test_specs["ratios"]``): each of the first three
       resulting frames must hold roughly its requested share of the rows.

    Sizes are compared with ``pytest.approx`` using
    ``test_specs["tolerance"]`` as the relative tolerance.
    """

    def share_of_rows(frame):
        # Fraction of the full dataset that ended up in this split.
        return frame.count() / test_specs["number_of_rows"]

    def distinct_users(frame):
        # Collect the unique user ids present in this split.
        return (
            frame.select(DEFAULT_USER_COL)
            .distinct()
            .rdd.map(lambda row: row[0])
            .collect()
        )

    # --- Scenario 1: single ratio, two-way split -------------------------
    two_way = spark_stratified_split(
        spark_dataset, ratio=test_specs["ratio"], filter_by="user", min_rating=10
    )

    assert share_of_rows(two_way[0]) == pytest.approx(
        test_specs["ratio"], test_specs["tolerance"]
    )
    assert share_of_rows(two_way[1]) == pytest.approx(
        1 - test_specs["ratio"], test_specs["tolerance"]
    )

    # The split is stratified by user, so every user must show up on both sides.
    train_users = distinct_users(two_way[0])
    test_users = distinct_users(two_way[1])
    assert set(train_users) == set(test_users)

    # --- Scenario 2: list of ratios, three-way split ---------------------
    three_way = spark_stratified_split(spark_dataset, ratio=test_specs["ratios"])

    # Fixed indices (not zip) so that a missing split still raises IndexError.
    for position in range(3):
        assert share_of_rows(three_way[position]) == pytest.approx(
            test_specs["ratios"][position], test_specs["tolerance"]
        )
def test_stratified_splitter(spark_dataset):
    """Verify split sizes and user coverage of ``spark_stratified_split``.

    Uses the module-level constants ``RATIOS`` (list of split ratios),
    ``NUM_ROWS`` (row count the split sizes are divided by) and ``TOL``
    (relative tolerance handed to ``pytest.approx``).

    First a two-way split with ``RATIOS[0]`` is checked for size and for both
    sides containing the same users; then a multi-way split with the full
    ``RATIOS`` list is checked for the size of its first three parts.
    """

    def share_of_rows(frame):
        # Fraction of NUM_ROWS that landed in this split.
        return frame.count() / NUM_ROWS

    def distinct_users(frame):
        # Unique user ids present in this split.
        return (
            frame.select(DEFAULT_USER_COL)
            .distinct()
            .rdd.map(lambda row: row[0])
            .collect()
        )

    # Two-way split driven by the first ratio only.
    two_way = spark_stratified_split(
        spark_dataset, ratio=RATIOS[0], filter_by="user", min_rating=10
    )
    assert share_of_rows(two_way[0]) == pytest.approx(RATIOS[0], TOL)
    assert share_of_rows(two_way[1]) == pytest.approx(1 - RATIOS[0], TOL)

    # Stratification by user means both sides must cover the same users.
    train_users = distinct_users(two_way[0])
    test_users = distinct_users(two_way[1])
    assert set(train_users) == set(test_users)

    # Multi-way split driven by the whole ratio list.
    multi_way = spark_stratified_split(spark_dataset, ratio=RATIOS)

    # Fixed indices (not zip) so that a missing split still raises IndexError.
    for position in range(3):
        assert share_of_rows(multi_way[position]) == pytest.approx(
            RATIOS[position], TOL
        )
def test_timestamp_splitter(test_specs, spark_dataset):
    """Test timestamp splitter for Spark dataframes.

    The timestamp column is first cast to ``float``. Then:

    1. A two-way ``spark_timestamp_split`` with ``test_specs["ratio"]`` must
       yield frames holding roughly ``ratio`` and ``1 - ratio`` of
       ``test_specs["number_of_rows"]``.
    2. A multi-way ``spark_timestamp_split`` with ``test_specs["ratios"]``
       must yield three frames of roughly the requested sizes, and the
       ``_if_later`` helper must report that validation follows training and
       test follows validation.

    Sizes are compared with ``pytest.approx`` using
    ``test_specs["tolerance"]`` as the relative tolerance.
    """
    from pyspark.sql.functions import col

    dfs_rating = spark_dataset.withColumn(
        DEFAULT_TIMESTAMP_COL, col(DEFAULT_TIMESTAMP_COL).cast("float")
    )

    # Two-way split.
    splits = spark_timestamp_split(
        dfs_rating, ratio=test_specs["ratio"], col_timestamp=DEFAULT_TIMESTAMP_COL
    )

    assert splits[0].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratio"], test_specs["tolerance"]
    )
    assert splits[1].count() / test_specs["number_of_rows"] == pytest.approx(
        1 - test_specs["ratio"], test_specs["tolerance"]
    )

    # Test multi split.
    # This must call the timestamp splitter: the stratified splitter used here
    # previously left the multi-ratio path of spark_timestamp_split untested
    # and made the chronological checks below meaningless.
    splits = spark_timestamp_split(
        dfs_rating, ratio=test_specs["ratios"], col_timestamp=DEFAULT_TIMESTAMP_COL
    )

    assert splits[0].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][0], test_specs["tolerance"]
    )
    assert splits[1].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][1], test_specs["tolerance"]
    )
    assert splits[2].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][2], test_specs["tolerance"]
    )

    dfs_train = splits[0]
    dfs_valid = splits[1]
    dfs_test = splits[2]

    # NOTE(review): _if_later is defined outside this block; it is assumed to
    # return True when the second frame's records are not earlier than the
    # first frame's -- confirm against its definition.

    # if valid is later than train.
    all_later_1 = _if_later(dfs_train, dfs_valid, col_timestamp=DEFAULT_TIMESTAMP_COL)
    assert all_later_1

    # if test is later than valid.
    all_later_2 = _if_later(dfs_valid, dfs_test, col_timestamp=DEFAULT_TIMESTAMP_COL)
    assert all_later_2
# Train an implicit-feedback ALS recommender on a stratified train/test split
# and persist the fitted model with MLflow.
#
# NOTE(review): `ratings` and `mlflow` are used below but are not defined or
# imported in this snippet -- presumably provided earlier; confirm.
from reco_utils.dataset.spark_splitters import (
    spark_random_split,
    spark_chrono_split,
    spark_stratified_split,
    spark_timestamp_split
)
import pyspark.sql.functions as sql_func
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
from pyspark.ml.evaluation import RegressionEvaluator

# Split the ratings into training and test sets, stratified by user, with a
# fixed seed so the split is reproducible.
# NOTE(review): the col_user value looks masked/redacted, while ALS below uses
# "Medlemsnr_index" as its user column -- confirm both refer to the intended
# column.
training, test = spark_stratified_split(
    ratings,
    ratio=0.65,
    filter_by="user",
    col_user='******',
    col_item='Varenr',
    seed=42
)

# Build the recommendation model using ALS on the training data.
# Note we set cold start strategy to 'drop' to ensure we don't get NaN
# evaluation metrics.
als = ALS(
    alpha=10,
    rank=35,
    maxIter=15,
    regParam=0.01,
    userCol="Medlemsnr_index",
    itemCol="Varenr",
    ratingCol="Rating",
    coldStartStrategy="drop",
    implicitPrefs=True,
    seed=42
)
model = als.fit(training)

# Log the fitted model under the name "MyALSModel" inside a new MLflow run.
with mlflow.start_run():
    mlflow.spark.log_model(model, "MyALSModel")

# Additionally save a copy of the model to a fixed path.
modelpath = "/dbfs/ml/SparkModel/"
mlflow.spark.save_model(model, modelpath)