def init():
    global recommendation_engine, sameModel
    recommendation_engine = RecommendationEngine(sc)
    model_path = "/home/ubuntu/efs/gamesRecommenderSpark/gamesRecommenderSpark/model"
    # Both branches load from the EFS path; the isdir check only gates the logging.
    if os.path.isdir("/home/ubuntu/gamesRecommenderSpark/gamesRecommenderSpark/model"):
        sameModel = MatrixFactorizationModel.load(sc, model_path)
    else:
        logger.debug("Local model directory not found; loading model from EFS")
        sameModel = MatrixFactorizationModel.load(sc, model_path)
        logger.debug("parent process")
    return 'Hello from Game Recommendation System!'
def load_model(path):
    """Load best model from given path in HDFS.

    Args:
        path (str): Path where best model is stored.
    """
    return MatrixFactorizationModel.load(listenbrainz_spark.context, path)
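For reference, a minimal usage sketch of this loader; the HDFS path and the user/item IDs below are illustrative assumptions, not values from the source.

# Hypothetical usage; path and IDs are placeholders.
model = load_model("hdfs:///data/best_model")
top_items = model.recommendProducts(42, 10)  # top 10 items for user 42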
def loadModel(sc):
    try:
        model = MatrixFactorizationModel.load(sc, Path + "ALSmodel")
        print("Loaded ALS model")
        return model
    except Exception:
        print("ALS model not found; please train it first")
        return None
def main(spark, model_file, test_file):
    # Load the trained ALS model and the test set.
    model = MatrixFactorizationModel.load(spark.sparkContext, model_file)
    test_df = spark.read.parquet(test_file)
    test_df = test_df.select('user_label', 'track_label', 'count')

    # Top-2 recommendations per user, flattened into one Rating per row.
    predictions = model.recommendProductsForUsers(2)
    prediction_flat = predictions.flatMap(lambda p: p[1])
    prediction_df = prediction_flat.toDF()

    # Keep only (user, track) pairs that also appear in the test set.
    intersections = prediction_df.join(
        test_df,
        (prediction_df.product == test_df.track_label) &
        (prediction_df.user == test_df.user_label),
        how='inner')
    predLabel = intersections.select('rating', 'count')

    # Rank predicted scores and ground-truth counts, then evaluate the ranking.
    from pyspark.sql import Window
    import pyspark.sql.functions as psf
    w_rating = Window.orderBy(psf.desc('rating'))
    w_count = Window.orderBy(psf.desc('count'))
    predLabel = (predLabel
                 .withColumn('rating_rank', psf.dense_rank().over(w_rating))
                 .withColumn('count_rank', psf.dense_rank().over(w_count)))
    predLabel = predLabel.select('rating_rank', 'count_rank')
    predLabel_rdd = predLabel.rdd
    metrics = RankingMetrics(predLabel_rdd)
    print(metrics.meanAveragePrecision)
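The dense-rank pairing above is an unusual way to feed RankingMetrics, which is normally given one (predicted item list, ground-truth item list) pair per user. A hedged sketch of that conventional form, reusing names from the snippet (predictions and test_df are carried over as assumptions):

from pyspark.mllib.evaluation import RankingMetrics

# Per-user predicted track lists, ordered as returned by the model.
pred_lists = predictions.map(lambda p: (p[0], [r.product for r in p[1]]))
# Per-user ground-truth track lists from the test set.
truth_lists = (test_df.rdd
               .map(lambda row: (row['user_label'], row['track_label']))
               .groupByKey()
               .mapValues(list))
# Pair each user's predictions with their ground truth and evaluate.
metrics = RankingMetrics(pred_lists.join(truth_lists).map(lambda kv: kv[1]))
print(metrics.meanAveragePrecision)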
def read(self, version):
    als_model = MatrixFactorizationModel.load(self._sc, self._url)
    model = Model(sc=self._sc, als_model=als_model, version=version,
                  data_version=1)
    return model
def loadModel(sc):
    try:
        model = MatrixFactorizationModel.load(sc, Path + "output/ALSmodel")
        print("Loaded ALS model")
        return model
    except Exception:
        print("ALS model not found; please train it first")
        return None
def predict(userID):
    # Movie catalogue, minus the CSV header.
    completeMovies = sc.textFile('datasets/ml-latest-small/movies.csv')
    header2 = completeMovies.first()
    completeMovies = (completeMovies
                      .filter(lambda line: line != header2)
                      .map(lambda line: line.split(",")))

    model = MatrixFactorizationModel.load(sc, "target/model")

    # Ratings as (user, movie, rating) triples, minus the CSV header.
    completeRDD = sc.textFile('datasets/ml-latest-small/ratings.csv')
    header = completeRDD.first()
    completeRDD = (completeRDD
                   .filter(lambda line: line != header)
                   .map(lambda line: line.split(","))
                   .map(lambda line: (line[0], line[1], line[2])))

    # Exclude movies the user has already rated.
    userRatedMovies = (completeRDD
                       .filter(lambda line: line[0] == userID)
                       .map(lambda line: line[1])
                       .collect())
    userUnrated = (completeMovies
                   .filter(lambda line: line[0] not in userRatedMovies)
                   .map(lambda line: (userID, line[0])))

    # Predict scores for unrated movies and print the top-15 titles.
    predict = (model.predictAll(userUnrated)
               .map(lambda line: (str(line[1]), line[2]))
               .sortBy(lambda line: line[1], ascending=False))
    movies = predict.join(completeMovies)
    output = movies.map(lambda line: line[1][1]).take(15)
    for i in output:
        print(i)
def main():
    import configspark
    sc = configspark.SPARK_CONTEXT

    # user/song string ID to int ID mappings
    full_text = sc.textFile(config.MSD_DATA)
    full_raw = full_text.map(msd_parse.parse_line)
    users, songs, _ = msd_parse.get_user_song_maps(full_raw)

    print("\nLoading MSD test dataset\n")
    test_parsed = (
        sc.textFile(config.MSD_TEST)
        .map(msd_parse.parse_line))
    test_prepped = msd_parse.replace_raw_ids(test_parsed, users, songs)
    test = test_prepped.map(msd_parse.rating_convert)

    if os.path.exists(config.MSD_MODEL):
        print("\n\nLoading existing recommendation model from %s\n\n"
              % config.MSD_MODEL)
        model = MatrixFactorizationModel.load(sc, config.MSD_MODEL)
    else:
        raise RuntimeError("Failed to load ALS model from %s" % config.MSD_MODEL)

    mse, rmse = evaluate.evaluate_model(model, test)
    print("\nMSD ALS model performance: MSE=%0.3f RMSE=%0.3f\n" % (mse, rmse))
def process(time, rdd):
    trainedModel = MatrixFactorizationModel.load(
        sc, "target/model/myCollaborativeFilter1")
    print("========= %s =========" % str(time))
    print("Processing RDD")
    sqs = SimpleQueueService()
    if rdd.count() > 0:
        predictions = (trainedModel.predictAll(rdd)
                       .map(lambda r: ((r[0], r[1]), r[2])))
        # Keep the two highest-scoring (user, offer) predictions.
        recommendations = predictions.takeOrdered(2, key=lambda x: -x[1])
        for rec in recommendations:
            userNo = str(rec[0][0])
            offerNo = str(rec[0][1])
            resp = get(
                "http://54.173.234.214:5000/api/helperclass/offermapping=true"
                "?userNo=" + userNo + "&offerNo=" + offerNo).json()
            sqs.addMessageToQueue(resp)
    print("OKDone")
def loadModel(sc):
    try:
        model = MatrixFactorizationModel.load(sc, Path + "ALSmodel")
        print("load ALSModel")
        return model
    except Exception:
        print("Cannot find the model; please train it first.")
        return None
def loadModel(sc):
    try:
        model = MatrixFactorizationModel.load(sc, Path + "ALSmodel")
        print("Loaded the model")
        return model
    except Exception:
        print("Cannot find the model; please train on the data first")
        return None
def loadModel(sc):
    try:
        model = MatrixFactorizationModel.load(sc, Path + "ALSmodel")
        print("Loaded ALS model")
        return model
    except Exception:
        print("ALS model not found; please train it first!")
        return None
def load_model(spark_context):
    base_dir = "/Users/hpnhxxwn/Desktop/proj/DE/yelp/review_out3/"
    city_name = "Charlotte"
    model_file_name = "business_recomm_model_for_{}".format(city_name)
    model_full_path = os.path.join(base_dir, city_name, "mf_based_models",
                                   model_file_name)
    model = MatrixFactorizationModel.load(spark_context, model_full_path)
    return model
def recommend(self, user, topK):
    # TODO: decide on a result type (DTO) for the returned data
    model = MatrixFactorizationModel.load(
        self.sc, Constants.RESOURCE_FOLDER + Constants.MODEL_FILE)
    return model.recommendProducts(user, topK)
def load_als_model(file_name):
    """
    loads ALS model based on state
    :param file_name: file
    :return: als model
    """
    return MatrixFactorizationModel.load(sc=spark.sparkContext,
                                         path='als-models/' + file_name)
def load_model(spark_context):
    try:
        model = MatrixFactorizationModel.load(spark_context, '../datas/asl-model')
        print(model)
        return model
    except Exception:
        print("Error loading the model")
        return None
def loadModel(sc):
    try:
        model = MatrixFactorizationModel.load(sc, path + "ALSmodel")
        print("Loaded ALS model")
        return model
    except Exception:
        print("Model not found")
        return None
def load_model(spark_context):
    base_dir = "/Users/sundeepblue/Bootcamp/allweek/week9/capstone/data/yelp_data/split_business_data_by_city/"
    city_name = "us_charlotte"
    model_file_name = "business_recomm_model_for_{}".format(city_name)
    model_full_path = os.path.join(base_dir, city_name, "mf_based_models",
                                   model_file_name)
    model = MatrixFactorizationModel.load(spark_context, model_full_path)
    return model
def load_model(sc):
    try:
        print("Loading model...")
        model = MatrixFactorizationModel.load(sc, Path + "ALSmodel")
        return model
    except Exception:
        print("Model does not exist!")
        return None
def LoadModel(sc):
    try:
        model = MatrixFactorizationModel.load(sc, '/models/ALSmodel')
    except Exception:
        print("Failed to load model")
        return None
    else:
        print("model loaded")
        return model
def LoadModel(sc, path):
    try:
        ALS_model = MatrixFactorizationModel.load(sc, path)
        print("Model loaded successfully")
        return ALS_model
    except Exception:
        print("Model does not exist; please train it first")
        return None
def loadModel(sc):
    """Load the trained recommendation model."""
    try:
        model = MatrixFactorizationModel.load(sc, Path + "ALSmodel")
        print("Model loaded successfully")
        return model
    except Exception:
        print("Model does not exist; please train it first")
        return None
def loadModel(sc):
    """Load the trained model."""
    try:
        model = MatrixFactorizationModel.load(sc, Path + "ALSmodel")
        print("success...")
        return model
    except Exception:
        print("Failed!!")
        return None
def load_model(self):
    try:
        model = MatrixFactorizationModel.load(self.sc, self.path_model)
        print(model)
        return model
    except Exception:
        print("Error loading the model")
        return {}
def loadModel(sc):
    try:
        alsmodel = os.path.join(Path, 'data', 'ALSmodel')
        model = MatrixFactorizationModel.load(sc, alsmodel)
        print("Loaded ALS model")
        return model
    except Exception:
        print("ALS model not found; please train it first")
        return None
def trainALS(self):
    try:
        self.model = MatrixFactorizationModel.load(self.sc, "als_final")
    except Exception:
        # Loading failed (e.g. no saved model yet): train and persist a new one.
        rank = 4
        numIterations = 20
        self.model = ALS.trainImplicit(self.implicit_ratings, rank,
                                       numIterations)
        self.model.save(self.sc, "als_final")
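One caveat: MLlib's save() raises if the target directory already exists, so if the load fails for a reason other than a missing model (e.g. a corrupt directory), the save at the end would also fail. A hedged sketch of clearing the path first, assuming the model lives on the local filesystem and this runs inside trainALS:

import os
import shutil

# Remove any stale model directory before saving (local filesystem assumed).
if os.path.exists("als_final"):
    shutil.rmtree("als_final")
self.model.save(self.sc, "als_final")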
def load_model(sc, model_path):
    try:
        model = MatrixFactorizationModel.load(sc, os.path.join(PATH, model_path))
        print("Model loaded")
        return model
    except Exception:
        print("Model not found")
        exit(-1)
def __load_model(self):
    """Load the ALS model for the current dataset."""
    logger.info("Loading the ALS model...")
    model_path = os.path.join("file:///usr/local/spark/mycode/recommender",
                              'models', 'movie_lens_als')
    self.model = MatrixFactorizationModel.load(self.sc, model_path)
    logger.info("ALS model loaded!")
def load_model(self):
    sc = self.spark.sparkContext
    try:
        model = MatrixFactorizationModel.load(sc, "./model_data/als-model")
        print('Model loaded successfully!')
        return model
    except Exception:
        print("Model does not exist; training one now")
        return self.train()
def __train_model(self, sc):
    """Load a previously trained ALS model for the current dataset."""
    logger.info("Loading the ALS model from file...")
    model_path = os.path.join(
        'file:///Users/yumi.zhang/Desktop/recommendation',
        'models', 'movie_lens_als')
    self.model = MatrixFactorizationModel.load(sc, model_path)
    logger.info("ALS model loaded successfully!")
def add_rating(user_id, movie_id, rating):
    if not recommendation_engine.addRating(user_id, movie_id, rating):
        return "Rating for user %s and movie %s already exists" % (user_id,
                                                                   movie_id)
    result = rerun_modeling.delay()
    result.wait()
    recommendation_engine.bestRankModel = MatrixFactorizationModel.load(
        spark.sparkContext, "modelAppended")
    return "Rating has been added"
def trainModel(limit, data, rank, num_iterations):
    save_file = ("models/" + str(limit) + "rank" + str(rank) +
                 "iterations" + str(num_iterations))
    if isdir(save_file):
        print("Rank " + str(rank) + " and Iterations " + str(num_iterations) +
              " Model already exists, loading...")
        model = MatrixFactorizationModel.load(sc, save_file)
    else:
        print("Model does not exist, training ALS with rank " + str(rank) +
              " and " + str(num_iterations) + " iterations")
        model = ALS.train(data, rank, num_iterations)
        print("Saving new model")
        model.save(sc, save_file)
    return model
def hello():
    if request.method == 'POST':
        conf = SparkConf().setAppName("yelp_recommendation-server")
        sc = SparkContext(conf=conf)
        global sameModel
        sameModel = MatrixFactorizationModel.load(
            sc,
            "/Users/xx/Desktop/big_data/project/Yelp-Personalized-Recommendation/flask/dataset/myCollaborativeFilter"
        )
        return redirect(url_for('send'))
    return render_template('hello.html')
def saveModel(self):
    # Save the model, clearing any previous copy, then reload it as a check.
    path = os.path.dirname(os.path.realpath(__file__))
    if os.path.exists(path + '/Model'):
        shutil.rmtree(path + '/Model')
    path = 'file:///' + path + '/Model'
    print('\n', 20 * '-', 'MODEL SAVED at', 20 * '-')
    print(path)
    print(50 * '-')
    self.model.save(self.ctx, path)
    sameModel = MatrixFactorizationModel.load(self.ctx, path)
def main(argv):
    Conf = SparkConf().setAppName("test")
    sc = SparkContext(conf=Conf)
    sqlContext = SQLContext(sc)
    dirPath = 'hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/model'
    sameModel = MatrixFactorizationModel.load(sc, dirPath)
    # Note: recommendProductsForUsers takes the number of products to
    # recommend per user, not a user ID.
    b = sameModel.recommendProductsForUsers(193667506)
    print(b.take(10))
    sc.stop()
def __train_model(self):
    """Load the ALS model, training (and saving) a new one if loading fails."""
    try:
        logger.info("Loading the ALS model...")
        self.model = MatrixFactorizationModel.load(self.sc, "als_model.data")
    except Exception:
        logger.info("Training the ALS model...")
        self.model = ALS.train(self.ratings_RDD, self.rank, seed=self.seed,
                               iterations=self.iterations,
                               lambda_=self.regularization_parameter)
        self.model.save(self.sc, "als_model.data")
    logger.info("ALS model built!")
def load_data(self):
    # Model must be already trained and saved to the recommendation folder
    model_path = os.path.join(os.path.dirname(__file__),
                              '%s/trained_model' % c.RECOMMENDATION_DIR)
    self.model = MatrixFactorizationModel.load(self.sc, model_path)

    user_path = os.path.join(os.path.dirname(__file__),
                             '%s/user.json' % c.RECOMMENDATION_DIR)
    with open(user_path, 'r') as f:
        self.user_lookup = json.load(f)

    course_path = os.path.join(os.path.dirname(__file__),
                               '%s/course.json' % c.RECOMMENDATION_DIR)
    with open(course_path, 'r') as f:
        self.course_lookup = json.load(f)
def prepare_model(sc, filename, user_id, ratings_train):
    if filename is None and os.path.exists(config.MSD_MODEL):
        # load the trained model
        print("\n\nLoading existing recommendation model from %s\n\n"
              % config.MSD_MODEL)
        model = MatrixFactorizationModel.load(sc, config.MSD_MODEL)
    else:
        # train a new model
        print("\n\nRetraining recommendation model for User %s\n\n" % user_id)
        rank, lambda_val = (
            evaluate.load_best_params(config.MSD_BEST_PARAMS_FILE))
        rank, lambda_val = int(rank), float(lambda_val)
        model = ALS.trainImplicit(ratings_train, rank, evaluate.ITERATIONS,
                                  lambda_val, nonnegative=True)
    return model
def __train_model(self):
    """Load a pre-trained ALS model and persist its factor matrices."""
    logger.info("Loading the pre-trained ALS model...")
    self.model = MatrixFactorizationModel.load(
        self.sc, "s3n://patricks3db/modelsComplete200r20i003a")

    userFeatures = self.model.userFeatures().repartition(1)
    userFeatures.persist(StorageLevel.MEMORY_AND_DISK_SER)
    print(userFeatures)
    productFeatures = self.model.productFeatures().repartition(1)
    productFeatures.persist(StorageLevel.MEMORY_AND_DISK_SER)
    print(productFeatures)
    logger.info("ALS model built!")
def main():
    import configspark
    sc = configspark.SPARK_CONTEXT

    print("\nLoading MovieLens test dataset\n")
    test_text = sc.textFile(config.ML_RATINGS_TEST)
    ratings_test = (
        test_text.map(ml_parse.parse_line).map(ml_parse.rating_convert))

    if os.path.exists(config.ML_MODEL):
        print("\n\nLoading existing recommendation model from %s\n\n"
              % config.ML_MODEL)
        model = MatrixFactorizationModel.load(sc, config.ML_MODEL)
    else:
        raise RuntimeError("Failed to load ALS model from %s" % config.ML_MODEL)

    mse, rmse = evaluate.evaluate_model(model, ratings_test)
    print("\nML ALS model performance: MSE=%0.3f RMSE=%0.3f\n" % (mse, rmse))
def __init__(self, sc,
             datapath='/media/psf/Home/CS/GIT_HUB/Movie-Recommendation-Project/frontend/',
             rating_file='ratings_small.csv',
             complete_rating_file='ratings.csv',
             movie_file='movies.csv',
             detail_file='modified.csv',
             model='movielens_small'):
    self.sc = sc
    self.start = True
    self.rating_file = datapath + rating_file
    self.complete_rating_file = datapath + complete_rating_file
    self.movie_file = datapath + movie_file
    self.detail_file = datapath + detail_file
    self.integration_folder = datapath
    self.svd = SVD(filename=datapath + model)
    self.svd.load_data(filename=self.rating_file, sep=',',
                       format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    self.svd.create_matrix()
    self.ia = imdb.IMDb(accessSystem='http')
    # ALS model and supporting DataFrames
    self.sqlContext = SQLContext(self.sc)
    self.movie_data = self.sc.textFile(self.movie_file)
    self.ratings_data = (self.sc.textFile(self.complete_rating_file)
                         .map(lambda line: line.split(","))
                         .map(lambda x: (int(x[0]), int(x[1]), float(x[2]))))
    self.als_model_path = datapath + 'Model_Collaborative_Filtering'
    self.als_model = MatrixFactorizationModel.load(sc, self.als_model_path)
    self.movie_df = self.sqlContext.read.load(datapath + 'tables/movies')
    self.detail_df = self.sqlContext.read.load(datapath + 'tables/detail')
    self.rating_df = self.sqlContext.read.load(datapath + 'tables/ratings')
sc = SparkContext(conf=conf)

# Load and parse the data
data = sc.textFile(input)
ratings = (data.map(lambda l: l.split(','))
           .filter(lambda a: len(a) == 4)
           .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))))

# Build the recommendation model using Alternating Least Squares
rank = 10
numIterations = 20
model = ALS.train(ratings, rank, numIterations)

# Evaluate the model on training data
testdata = ratings.map(lambda p: (p[0], p[1]))
predictions = (model.predictAll(testdata)
               .map(lambda r: ((r[0], r[1]), r[2])))
predictions.foreach(my_print)
ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("mse = " + str(MSE) + ", model=" + output)

# Save and load model
model.save(sc, output)
resume_model = MatrixFactorizationModel.load(sc, output)
preds = (resume_model.predictAll(testdata)
         .map(lambda r: ((r[0], r[1]), r[2])))
preds.foreach(my_print)
sc.stop()
ratings = data.map(lambda l: l.split(',')).map(
    lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

# Build the recommendation model using Alternating Least Squares
rank = 10
numIterations = 10
model = ALS.trainImplicit(ratings, rank, numIterations)

# Evaluate the model on training data
testdata = ratings.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))

# Save and load model
# The save is commented out because the model already exists on HDFS;
# uncomment it when you are ready to persist a newly trained model.
# model.save(sc, "target/tmp/myCollaborativeFilter")
sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")

# Parse the AskForRecsFor.csv file and write recommendations for each
# (user, count) row, using the model loaded above.
f = open('AskForRecsForShort.csv')
fp = open("reccomendFile2.txt", "w")
csv_f = csv.reader(f)
for row in csv_f:
    a = row[0]
    b = row[1]
    recommendation = sameModel.recommendProducts(int(a), int(b))
    fp.write(str(recommendation))
fp.close()
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.mllib.recommendation import MatrixFactorizationModel

conf = SparkConf().setAppName("miniproject").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf)
sqlContext = SQLContext(sc)

model = MatrixFactorizationModel.load(sc, '/tmp/model')
# model.productFeatures().cache()
# model.userFeatures().cache()
os.environ['SPARK_HOME'] = "../spark-1.5.1"
sys.path.append("../spark-1.5.1/python/")
sys.path.append("../spark-1.5.1/python/lib/py4j-0.8.2.1-src.zip")
sys.path.append("../spark-1.5.1/python/lib/py4j/")

try:
    from pyspark import SparkContext, SparkConf
    from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
    print("Apache-Spark v1.5.1 said, \"All modules found and imported successfully.\"")
except ImportError as e:
    print("Couldn't import Spark Modules", e)
    sys.exit(1)

# SETTING CONFIGURATION PARAMETERS
config = SparkConf()
sc = SparkContext(conf=config)

test = sc.textFile("../data/testdata10MB")
model = MatrixFactorizationModel.load(sc, "../TrainedModel")

ratings = test.map(lambda line: array([float(x) for x in line.split('\t')]))
testdata = ratings.map(lambda p: (int(p[0]), int(p[1])))
predictionsRDD = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
predictions = predictionsRDD.collect()
print(predictions)

'''
ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictionsRDD)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).reduce(lambda x, y: x + y) / ratesAndPreds.count()
print("Mean Squared Error = " + str(MSE))
'''
def load_model(sc):
    return MatrixFactorizationModel.load(sc, MODEL_PATH)
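A short usage sketch of this loader; MODEL_PATH is whatever constant the module defines, and the user ID and top-5 count below are illustrative:

# Hypothetical usage; user ID 1 and top-5 are placeholders.
model = load_model(sc)
for rating in model.recommendProducts(1, 5):
    print(rating.user, rating.product, rating.rating)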
def __load_model(self):
    # Load the ALS model
    logger.info("Loading the ALS model...")
    self.model = MatrixFactorizationModel.load(
        self.sc, os.path.join("models/ALSBeerFixed"))
    logger.info("ALS model loaded!")
# In[24]:

# recommendations.take(5)

# In[25]:

path = "s3n://patricks3db/modelsComplete200r20i003a"
model.save(sc, path)

# In[26]:

sameModel = MatrixFactorizationModel.load(sc, path)

# In[27]:

sameModel.recommendProducts(1, 5)

# In[28]:

sameModel.userFeatures().persist(StorageLevel.MEMORY_AND_DISK_SER)

# In[29]:

sameModel.recommendProducts(100000, 5)
def getRecommendation(arg0, arg1, arg2):
    # set up environment
    conf = SparkConf() \
        .setAppName("MovieLensALS") \
        .set("spark.executor.memory", "2g") \
        .set("spark.app.id", '2015Project')
    sc = SparkContext(conf=conf)

    # load personal ratings
    myRatings = loadRatings(arg2)
    myRatingsRDD = sc.parallelize(myRatings, 1)
    myRatings1 = myRatings

    # load ratings and movie titles
    movieLensHomeDir = arg1
    # ratings is an RDD of (last digit of timestamp, (userId, movieId, rating))
    ratings = sc.textFile(join(movieLensHomeDir, "ratings.dat")).map(parseRating)
    # movies is an RDD of (movieId, movieTitle)
    movies = dict(sc.textFile(join(movieLensHomeDir, "movies.dat"))
                  .map(parseMovie).collect())

    modelPath = "/home/hduser/Downloads/spark-training-master/machine-learning/python/model"
    if not os.path.isdir(modelPath):
        # Split by last digit of timestamp: <6 train, 6-7 validation, >=8 test.
        numPartitions = 4
        training = (ratings.filter(lambda x: x[0] < 6)
                    .values()
                    .union(myRatingsRDD)
                    .repartition(numPartitions)
                    .cache())
        validation = (ratings.filter(lambda x: x[0] >= 6 and x[0] < 8)
                      .values()
                      .repartition(numPartitions)
                      .cache())
        test = ratings.filter(lambda x: x[0] >= 8).values().cache()
        numValidation = validation.count()

        # Grid-search rank/lambda/iterations, keeping the model with the
        # lowest validation RMSE, then persist it.
        ranks = [8, 12]
        lambdas = [1.0, 10.0]
        numIters = [10, 20]
        bestModel = None
        bestValidationRmse = float("inf")
        bestRank = 0
        bestLambda = -1.0
        bestNumIter = -1
        for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters):
            model = ALS.train(training, rank, numIter, lmbda)
            validationRmse = computeRmse(model, validation, numValidation)
            print("RMSE (validation) = %f for the model trained with "
                  "rank = %d, lambda = %.1f, and numIter = %d."
                  % (validationRmse, rank, lmbda, numIter))
            if validationRmse < bestValidationRmse:
                bestModel = model
                bestValidationRmse = validationRmse
                bestRank = rank
                bestLambda = lmbda
                bestNumIter = numIter
        bestModel.save(sc, modelPath)
    else:
        bestModel = MatrixFactorizationModel.load(sc, modelPath)

    # Recommend the 10 highest-scoring movies the user has not rated yet.
    myRatedMovieIds = set([x[1] for x in myRatings1])
    candidates = sc.parallelize([m for m in movies if m not in myRatedMovieIds])
    predictions = bestModel.predictAll(candidates.map(lambda x: (0, x))).collect()
    recommendations = sorted(predictions, key=lambda x: x[2], reverse=True)[:10]

    outputBuffer = "Movies recommended for you:\n"
    for i in range(len(recommendations)):
        outputBuffer += ("%2d: %s\n" % (i + 1, movies[recommendations[i][1]])
                         ).encode('ascii', 'ignore').decode('ascii')
    sc.stop()
    return outputBuffer
def collaborative_filter(train_dataFile, test_dataFile):
    conf = SparkConf() \
        .setAppName("Collaborative Filter") \
        .set("spark.executor.memory", "5g")
    sc = SparkContext(conf=conf)

    train_ratings = get_ratings(sc, train_dataFile)
    ratings_valid = train_ratings.sample(False, 0.1, 12345)
    ratings_train = train_ratings.subtract(ratings_valid)

    print(20 * '-', 'TRAINING STARTED', 20 * '-')
    # Grid-search lambda (rank and iteration count fixed), keeping the model
    # with the lowest validation MSE.
    ranks = [8]
    lambdas = [1.0, 10.0, 5.0]
    numIters = [10]
    bestModel = None
    bestValidationMSE = float("inf")
    bestRank = 0
    bestLambda = -1.0
    bestNumIter = -1
    for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters):
        print(rank, lmbda, numIter)
        model = ALS.train(ratings_train, rank, numIter, lmbda)
        testdata = ratings_valid.map(lambda p: (p[0], p[1]))
        predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
        ratesAndPreds = ratings_valid.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
        MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
        if MSE < bestValidationMSE:
            bestModel = model
            bestValidationMSE = MSE
            bestRank = rank
            bestLambda = lmbda
            bestNumIter = numIter
    print(20 * '-', 'TRAINING FINISHED', 20 * '-')

    # Evaluate the best model on the test set.
    print(20 * '-', 'TESTING STARTED', 20 * '-')
    test_ratings = get_ratings(sc, test_dataFile)
    testdata = test_ratings.map(lambda p: (p[0], p[1]))
    predictions = bestModel.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = test_ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    MAE = ratesAndPreds.map(lambda r: abs(abs(r[1][0]) - abs(r[1][1]))).mean()
    print("Mean Squared Error = " + str(MSE))
    print("Mean Absolute Error = " + str(MAE))
    print("Root Mean Square Error = ", str(MSE**.5))
    print(20 * '-', 'TESTING FINISHED', 20 * '-')

    # Save the best model and reload it as a sanity check.
    path = os.path.dirname(os.path.realpath(__file__))
    if os.path.exists(path + '/myModelPath'):
        shutil.rmtree(path + '/myModelPath')
    path = 'file:///' + path + '/myModelPath'
    print('\n', 20 * '-', 'MODEL SAVED at', 20 * '-')
    print(path)
    print(50 * '-')
    bestModel.save(sc, path)  # save the best model, not the last one trained
    sameModel = MatrixFactorizationModel.load(sc, path)
# http://spark.apache.org/docs/latest/mllib-collaborative-filtering.html
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

# Load and parse the data
data = sc.textFile("data/behavior-ml-score-ints.csv")
ratings = data.map(lambda l: l.split(',')).map(
    lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
training_RDD, test_RDD = ratings.randomSplit([7, 3], seed=0)

# Build the recommendation model using Alternating Least Squares
rank = 10
numIterations = 20
model = ALS.train(training_RDD, rank, numIterations)

# Evaluate the model on the held-out test data
testdata = test_RDD.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))

# Save and load model
model.save(sc, "myModelPath")
sameModel = MatrixFactorizationModel.load(sc, "myModelPath")
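Beyond recomputing MSE, the reloaded model supports the other standard MatrixFactorizationModel calls. A hedged sketch (the user and product IDs are illustrative, not from the dataset above):

# Predict a single (user, product) score.
score = sameModel.predict(1, 10)
# Top-5 product recommendations for user 1, as a list of Rating tuples.
top5 = sameModel.recommendProducts(1, 5)
# Top-3 recommendations for every user, as an RDD of (user, [Rating]).
per_user = sameModel.recommendProductsForUsers(3)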
sq = SQLContext(sc)

# load songs data
df_songs = sq.read.parquet(database + r"\songs_data2")
# load user play data i.e. triplets
df_triplets = sq.read.parquet(database + r"\triplets_data2")
# load song_id and its hash_id
df_song_hash_id = sq.read.parquet(database + r"\song_hash_id")
# load song similarity data
df_song_sim = sq.read.parquet(database + r"\song_similarity2")
# load the ALS model
ALSmodel = MatrixFactorizationModel.load(sc, args.modelpath)

# Take an input from user
# e.g. '3cd99bb95d2baac1e910a5c847e58388d5e9b3c1'
userid = raw_input("\n Enter an UserID: ")  # use input() under Python 3
# Convert the userid into its hash value
user_hash = int(hash(userid) & 0xfffffff)

# Display user statistics: find the songs listened to by the user
played_songs = (df_triplets.filter(df_triplets.user_id == userid)
                .select(df_triplets.song_id, df_triplets.play_count)
                .cache())
# Use the input RDD, new_user_unrated_movies_RDD, with
# new_ratings_model.predictAll() to predict new ratings for the movies
new_user_recommendations_RDD = new_ratings_model.predictAll(new_user_unrated_movies_RDD)

# Transform new_user_recommendations_RDD into pairs of the form
# (Movie ID, Predicted Rating)
new_user_recommendations_rating_RDD = new_user_recommendations_RDD.map(
    lambda x: (x.product, x.rating))
new_user_recommendations_rating_title_and_count_RDD = \
    new_user_recommendations_rating_RDD.join(complete_movies_titles).join(movie_rating_counts_RDD)

# flatten data, make it readable
new_user_recommendations_rating_title_and_count_RDD = \
    new_user_recommendations_rating_title_and_count_RDD.map(
        lambda r: (r[1][0][1], r[1][0][0], r[1][1]))

# get the top 25 rated movies
top_movies = (new_user_recommendations_rating_title_and_count_RDD
              .filter(lambda r: r[2] >= 25)
              .takeOrdered(25, key=lambda x: -x[1]))
print('TOP recommended movies (with more than 25 reviews):\n%s'
      % '\n'.join(map(str, top_movies)))

# How to get an individual rating: predict for a single (user, movie) pair
my_movie = sc.parallelize([(0, 500)])  # Quiz Show (1994)
individual_movie_rating_RDD = new_ratings_model.predictAll(my_movie)
# print(individual_movie_rating_RDD.take(1))

from pyspark.mllib.recommendation import MatrixFactorizationModel

model_path = os.path.join('movie_lens_als')
# Save and load model
model.save(sc, model_path)
same_model = MatrixFactorizationModel.load(sc, model_path)
""" Recommend for a user usermovHistDict: (user:([movie], [rating])) """ userUnratedRDD = movieRDD.flatMap( lambda (movID, movName): [(user, movID)] if movID not in usermovHistDict[user][0] else []).cache() # if not cache userUnratedRDD, might have IOError, could not find the /tmp/blablabla directory or file... # don't know why yet... predUserRDD = model.predictAll(userUnratedRDD).map(lambda x: (x[1], x[2]))# (Movie ID, Predicted Rating) # after join in the expression below, we get something like: (40962, (2.184925882635273, (u'"Yours', 3))) # we want to get (Predicted Rating, Movie Name, number of ratings) return predUserRDD.join(movIDNameCntRDD).map( lambda (x1, x2): (x2[0], x2[1][0], x2[1][1])).filter( lambda x: x[2] > ratedThreshold).takeOrdered(nRec, key=lambda x: -x[0]) # load model if necessary from pyspark.mllib.recommendation import MatrixFactorizationModel modelPath = os.path.join('models', 'movie_ALS') try: bestModel except NameError: bestModel = MatrixFactorizationModel.load(sc, modelPath) print ('Recommend for user %s (movies with more than 20 ratings):\n%s' % (2, '\n'.join(map(str, recommendALS(2, bestModel, moviesRDD, userMovieHistBC.value, movIDNameCntRDD, 30, 20))))) # In[ ]: