Example #1
import os

from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.sql import SparkSession

# evaluate_model, generate_predictions_for_all and generate_prediction_group
# are helper functions defined elsewhere in the original module.


def main():
    spark = SparkSession.builder.getOrCreate()

    data_path = os.getcwd()
    training_data_path = data_path + "/training.parquet"
    testing_data_path = data_path + "/testing.parquet"

    training_data = spark.read.parquet(training_data_path)
    testing_data = spark.read.parquet(testing_data_path)

    ratings = training_data.union(testing_data)

    model_save_folder = os.getcwd()
    als_path = model_save_folder + "/als"
    als_model_path = model_save_folder + "/als_model"

    # Reload the saved ALS estimator and the previously fitted model
    als_module = ALS.load(als_path)
    model = ALSModel.load(als_model_path)

    # Run the tests
    evaluate_model(model=model, testing=testing_data)

    userRecs, movieRecs = generate_predictions_for_all(model=model)

    print(userRecs.head())
    print(movieRecs.head())

    userSubsetRecs, booksSubSetRecs = generate_prediction_group(
        als=als_module, model=model, ratings=ratings)

    print(userSubsetRecs.take(2))
    print(booksSubSetRecs.take(2))
    print("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
    print("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
    print("Finished!!!!!")
    print("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
    print("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")
Example #2
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.mllib.evaluation import RankingMetrics
from pyspark.sql import functions as F


def main(spark, rank, regParam, path, fraction):
    TEMP_PATH = "/models/ALS_{}_{}_{}".format(rank, regParam, fraction)
    ALS_PATH = TEMP_PATH + "/als"
    MODEL_PATH = TEMP_PATH + "/als_model"
    print("Loading model...")
    als = ALS.load(path + ALS_PATH)
    model = ALSModel.load(path + MODEL_PATH)
    print("Loading data...")
    testing = spark.read.parquet("{}/data/processed/testing_{}.parquet".format(
        path, fraction))
    testing.createOrReplaceTempView("testing")

    # RMSE
    predictions = model.transform(testing)
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("RSME:", rmse)
    predictions = model.recommendForAllUsers(500)
    predictions.createOrReplaceTempView("predictions")
    groundtruth = testing.groupby("user_id").agg(
        F.collect_set("book_id").alias('groundtruth'))
    groundtruth.createOrReplaceTempView("groundtruth")
    total = spark.sql(
        "SELECT g.user_id, g.groundtruth AS groundtruth, p.recommendations AS predictions FROM groundtruth g JOIN predictions p ON g.user_id = p.user_id"
    )
    total.createOrReplaceTempView("total")

    data = total.selectExpr("predictions.book_id", "groundtruth")
    print("df to rdd...")
    rdd = data.rdd.map(tuple)
    print("creating metrics...")
    metrics = RankingMetrics(rdd)
    print("meanAveragePrecision:", metrics.meanAveragePrecision)
    print("precision at 500:", metrics.precisionAt(500))
    print("ndcgAt 500:", metrics.ndcgAt(500))
Example #3
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName(
    'Load model for Recommendations using explicit interests').getOrCreate()

# Load the saved ALS estimator and the fitted model
als = ALS.load('../models/FlexiGYMExplicitRecommender_ALS')
model = ALSModel.load(
    '../models/FlexiGYMExplicitRecommender_ALSModel/bestModel')

print(als.getMaxIter())
print(model.userFactors.collect())
print(model.itemFactors.collect())
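userFactors and itemFactors are the learned latent-factor matrices; a predicted
score is just the dot product of a user's and an item's factor vectors. A small
sketch of recovering one score by hand (the ids 1 and 10 are assumptions):

import numpy as np

u = model.userFactors.filter("id = 1").first()["features"]
v = model.itemFactors.filter("id = 10").first()["features"]
# should match model.transform() for this (user, item) pair
print(np.dot(u, v))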
Example #4
from pyspark.ml.recommendation import ALS, ALSModel

sql_query = """
SELECT uid, urlid, label
FROM train_set
"""
ratings = spark.sql(sql_query)
print(ratings.head())
print("finished SQL query for uid_urlid_label")
(training, test) = ratings.randomSplit([0.99, 0.01])
#als = ALS(maxIter=10, regParam=0.01, numUserBlocks=20, numItemBlocks=100, userCol="uid", itemCol="urlid", ratingCol="label")
#model = als.fit(ratings)
#model.setColdStartStrategy("drop");

# Save and load model
als_path = "gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/als_conf"
#als.save(als_path)
als = ALS.load(als_path)

model_path = "gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/als_model"
#model.save(model_path)
model = ALSModel.load(model_path)

user_recs = model.recommendForAllUsers(10)
print "user recs"
print user_recs.head()
user_recs.rdd.saveAsTextFile(
    "gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/user_recs"
)
item_recs = model.recommendForAllItems(10)
print "item recs"
print item_recs.head()
item_recs.rdd.saveAsTextFile(
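When recommendations are only needed for some users, recommendForUserSubset
avoids scoring everyone. A minimal sketch (the 100-user sample is an
assumption):

some_users = ratings.select("uid").distinct().limit(100)
subset_recs = model.recommendForUserSubset(some_users, 10)
print(subset_recs.head())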
Example #5
# Group unique (user, product) combos and sum up clicks per combo
dt_new.createOrReplaceTempView('dt_new')

sqlquery = "select user, product, product_name, sum(click_counts) as click from dt_new group by user, product, product_name"
dt2 = spark.sql(sqlquery)

# Min-max scale clicks to the [0, 1] range
dt2.createOrReplaceTempView("t")

dt3 = spark.sql("select t.*, (t.click - sub.mnm)/(sub.mxm - sub.mnm) as scaledclicks from t cross join (select min(click) as mnm, max(click) as mxm from t) sub")

########################################################################################################################

# Load the best saved model (ALS.load returns the unfitted estimator, so a
# fitted model must be loaded with ALSModel.load; path_to_model is defined
# elsewhere in the original script)

model = ALSModel.load(path_to_model)

########################################################################################################################

# Build the recommendation model using ALS on the scaled click data
als = ALS(rank = 10, maxIter=5, regParam=0.01, userCol="user", itemCol="product", ratingCol="scaledclicks", implicitPrefs=True,
          coldStartStrategy="drop")
model = als.fit(dt3)



# Predict
predictions = model.transform(dt3)
userRecs = model.recommendForAllUsers(20)

# Break the recommendations from array to columnar format
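The snippet ends here; a minimal sketch of the explode step the comment above
describes, assuming the standard recommendForAllUsers schema (user,
recommendations: array of (product, rating) structs):

from pyspark.sql import functions as F

flat = (userRecs
        .select("user", F.explode("recommendations").alias("rec"))
        .select("user",
                F.col("rec.product").alias("product"),
                F.col("rec.rating").alias("score")))
flat.show(5)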
Example #6
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

(training, test) = ratings.randomSplit([0.8, 0.2])

als = ALS(maxIter=5,
          regParam=0.01,
          userCol="uid",
          itemCol="urlid",
          ratingCol="label")
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="label",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

predictions.head(5)
predictions.write.csv(
    "gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/predictions_test"
)
print "finish predictions"

# Save and load model
als_path = "gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/als_model_test"
als.save(als_path)
als2 = ALS.load(als_path)
print "finish load"