import os

from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.sql import SparkSession


def main():
    spark = SparkSession.builder.getOrCreate()

    # Load the train/test splits and keep the full ratings set for later use.
    data_path = os.getcwd()
    training_data_path = data_path + "/training.parquet"
    testing_data_path = data_path + "/testing.parquet"
    training_data = spark.read.parquet(training_data_path)
    testing_data = spark.read.parquet(testing_data_path)
    ratings = training_data.union(testing_data)

    # Restore the persisted ALS estimator and the fitted ALSModel.
    model_save_folder = os.getcwd()
    als_path = model_save_folder + "/als"
    als_model_path = model_save_folder + "/als_model"
    als_module = ALS.load(als_path)
    model = ALSModel.load(als_model_path)

    # Run the tests (helpers defined elsewhere in this project).
    evaluate_model(model=model, testing=testing_data)

    userRecs, movieRecs = generate_predictions_for_all(model=model)
    print(userRecs.head())
    print(movieRecs.head())

    userSubsetRecs, booksSubSetRecs = generate_prediction_group(
        als=als_module, model=model, ratings=ratings)
    print(userSubsetRecs.take(2))
    print(booksSubSetRecs.take(2))

    print("Finished!")
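# evaluate_model and the other helpers called above are not shown in this
# snippet. A minimal sketch of what evaluate_model might do, assuming a
# "rating" label column; the helper name and signature come from the call
# above, but the metric choice is an assumption:
from pyspark.ml.evaluation import RegressionEvaluator


def evaluate_model(model, testing):
    # Score the held-out set and report RMSE; rows with NaN predictions
    # (cold-start users/items) are dropped so the evaluator returns a number.
    predictions = model.transform(testing).dropna(subset=["prediction"])
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    print("RMSE:", evaluator.evaluate(predictions))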
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.mllib.evaluation import RankingMetrics
from pyspark.sql import functions as F


def main(spark, rank, regParam, path, fraction):
    TEMP_PATH = "/models/ALS_{}_{}_{}".format(rank, regParam, fraction)
    ALS_PATH = TEMP_PATH + "/als"
    MODEL_PATH = TEMP_PATH + "/als_model"

    print("Loading model...")
    als = ALS.load(path + ALS_PATH)
    model = ALSModel.load(path + MODEL_PATH)

    print("Loading data...")
    testing = spark.read.parquet(
        "{}/data/processed/testing_{}.parquet".format(path, fraction))
    testing.createOrReplaceTempView("testing")

    # RMSE on the held-out ratings
    predictions = model.transform(testing)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("RMSE:", rmse)

    # Ranking metrics: compare the top-500 recommendations per user against
    # the set of books each user actually interacted with.
    predictions = model.recommendForAllUsers(500)
    predictions.createOrReplaceTempView("predictions")
    groundtruth = testing.groupby("user_id").agg(
        F.collect_set("book_id").alias("groundtruth"))
    groundtruth.createOrReplaceTempView("groundtruth")
    total = spark.sql(
        "SELECT g.user_id, g.groundtruth AS groundtruth, "
        "p.recommendations AS predictions "
        "FROM groundtruth g JOIN predictions p ON g.user_id = p.user_id")
    total.createOrReplaceTempView("total")
    data = total.selectExpr("predictions.book_id", "groundtruth")

    print("df to rdd...")
    rdd = data.rdd.map(tuple)
    print("creating metrics...")
    metrics = RankingMetrics(rdd)
    print("meanAveragePrecision:", metrics.meanAveragePrecision)
    print("precision at 500:", metrics.precisionAt(500))
    print("ndcgAt 500:", metrics.ndcgAt(500))
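# For reference, RankingMetrics expects an RDD of
# (predicted_items, relevant_items) pairs, which is exactly what the
# rdd.map(tuple) above produces. A toy, self-contained illustration (the
# values are made up; an active SparkSession named spark is assumed):
from pyspark.mllib.evaluation import RankingMetrics

toy = spark.sparkContext.parallelize([
    ([1, 2, 3], [2, 3]),  # one user's ranked predictions vs. ground truth
    ([4, 5, 6], [5, 9]),
])
print(RankingMetrics(toy).precisionAt(2))  # 0.5: one hit in each top-2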
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName(
    'Load model for Recommendations using explicit interests').getOrCreate()

# Load saved als and model
als = ALS.load('../models/FlexiGYMExplicitRecommender_ALS')
model = ALSModel.load(
    '../models/FlexiGYMExplicitRecommender_ALSModel/bestModel')

print(als.getMaxIter())
print(model.userFactors.collect())
print(model.itemFactors.collect())
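# A hedged usage sketch: with the fitted ALSModel restored, top-N
# recommendations can be generated without refitting. The user column name
# ("user") is an assumption based on the ALS default userCol; adjust it to
# whatever column this model was trained with.
users = model.userFactors.selectExpr("id AS user").limit(3)
subset_recs = model.recommendForUserSubset(users, 10)
subset_recs.show(truncate=False)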
from pyspark.ml.recommendation import ALS, ALSModel

# "spark" (an active SparkSession) and the "train_set" view are assumed to
# have been set up earlier in the job.
sql_query = """
    SELECT uid, urlid, label
    FROM train_set
"""
ratings = spark.sql(sql_query)
print(ratings.head())
print("finished SQL load of uid, urlid, label")

(training, test) = ratings.randomSplit([0.99, 0.01])

# als = ALS(maxIter=10, regParam=0.01, numUserBlocks=20, numItemBlocks=100,
#           userCol="uid", itemCol="urlid", ratingCol="label")
# model = als.fit(ratings)
# model.setColdStartStrategy("drop")

# Save and load model
als_path = "gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/als_conf"
# als.save(als_path)
als = ALS.load(als_path)

model_path = "gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/als_model"
# model.save(model_path)
model = ALSModel.load(model_path)

user_recs = model.recommendForAllUsers(10)
print("user recs")
print(user_recs.head())
user_recs.rdd.saveAsTextFile(
    "gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/user_recs"
)

item_recs = model.recommendForAllItems(10)
print("item recs")
print(item_recs.head())
item_recs.rdd.saveAsTextFile(
# Group unique user : product combos and sum up clicks per combo
dt_new.createOrReplaceTempView('dt_new')
sqlquery = """
    SELECT user, product, product_name, SUM(click_counts) AS click
    FROM dt_new
    GROUP BY user, product, product_name
"""
dt2 = spark.sql(sqlquery)

# Min-max scale clicks into [0, 1]
dt2.createOrReplaceTempView("t")
dt3 = spark.sql("""
    SELECT t.*, (t.click - sub.mnm) / (sub.mxm - sub.mnm) AS scaledclicks
    FROM t
    CROSS JOIN (SELECT MIN(click) AS mnm, MAX(click) AS mxm FROM t) sub
""")

########################################################################################################################
# Load the saved ALS estimator for the best model; note that ALS.load
# restores the estimator (hyper-parameters), not a fitted ALSModel.
als = ALS.load(path_to_model)
########################################################################################################################

# Build the recommendation model using ALS on the training data
als = ALS(rank=10, maxIter=5, regParam=0.01, userCol="user",
          itemCol="product", ratingCol="scaledclicks",
          implicitPrefs=True, coldStartStrategy="drop")
model = als.fit(dt3)

# Predict
predictions = model.transform(dt3)
userRecs = model.recommendForAllUsers(20)

# Break the recommendations from array to columnar format (see sketch below)
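# A minimal sketch of the flattening step the last comment refers to: explode
# the recommendations array into one row per (user, product, score). The
# struct field names follow the recommendForAllUsers output for
# itemCol="product"; the result name userRecsFlat is hypothetical.
from pyspark.sql import functions as F

userRecsFlat = (userRecs
                .withColumn("rec", F.explode("recommendations"))
                .select("user",
                        F.col("rec.product").alias("product"),
                        F.col("rec.rating").alias("score")))
userRecsFlat.show(5)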
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

# "ratings" (uid, urlid, label) is assumed to have been loaded earlier.
(training, test) = ratings.randomSplit([0.8, 0.2])
als = ALS(maxIter=5, regParam=0.01, userCol="uid", itemCol="urlid",
          ratingCol="label")
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="label",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

predictions.head(5)
predictions.write.csv(
    "gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/predictions_test"
)
print("finished predictions")

# Save and load model
als_path = "gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/als_model_test"
als.save(als_path)
als2 = ALS.load(als_path)
print("finished load")
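# The round-trip above persists only the ALS estimator (its hyper-parameters).
# Persisting the fitted model as well is analogous; this path is hypothetical:
from pyspark.ml.recommendation import ALSModel

model_path = als_path + "_fitted"
model.save(model_path)
model2 = ALSModel.load(model_path)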