Example #1
def init():
    global recommendation_engine, sameModel
    recommendation_engine = RecommendationEngine(sc)
    # The saved model lives on EFS; the local directory check only
    # determines whether a warning is logged before loading.
    if not os.path.isdir(
            "/home/ubuntu/gamesRecommenderSpark/gamesRecommenderSpark/model"):
        logger.debug("Local model directory not found; loading from EFS")
    sameModel = MatrixFactorizationModel.load(
        sc,
        "/home/ubuntu/efs/gamesRecommenderSpark/gamesRecommenderSpark/model"
    )
    logger.debug("parent process")
    return 'Hello from Game Recommendation System!'
Example #2
def load_model(path):
    """ Load best model from given path in HDFS.

        Args:
            path (str): Path where best model is stored.
    """
    return MatrixFactorizationModel.load(listenbrainz_spark.context, path)
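
For context, a hedged usage sketch of a loader like this one: the path and user id below are made up, and recommendProducts(user, num) is the standard MatrixFactorizationModel call for the top-num items.

# Sketch only: assumes listenbrainz_spark.context is an active SparkContext
# and the path holds a model previously written with model.save().
model = load_model('/data/listenbrainz/best_model')  # hypothetical path
for rec in model.recommendProducts(42, 10):  # top-10 recordings for user 42
    print(rec.user, rec.product, rec.rating)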
Example #3
def loadModel(sc):
    try:
        model = MatrixFactorizationModel.load(sc, Path + "ALSmodel")
        print "Loaded the ALSModel"
        return model
    except Exception:
        print "ALSModel not found; please train it first"
        return None
Example #4
def main(spark, model_file, test_file):

    model = MatrixFactorizationModel.load(spark.sparkContext, model_file)
    #model = MatrixFactorizationModel.load(spark.sparkContext, 'hdfs:/user/xx852/als_model')
    test_df = spark.read.parquet(test_file)
    #test_df = spark.read.parquet('hdfs:/user/xx852/cf_test_small.parquet')
    test_df = test_df.select('user_label', 'track_label', 'count')

    #predictions = model.recommendProductsForUsers(500)
    predictions = model.recommendProductsForUsers(2)
    prediction_flat = predictions.flatMap(lambda p: p[1])
    prediction_df = prediction_flat.toDF()
    intersections = prediction_df.join(test_df, (prediction_df.product == test_df.track_label)&
                                      (prediction_df.user == test_df.user_label), how = 'inner')
    predLabel = intersections.select('rating', 'count')

    from pyspark.sql import Window
    import pyspark.sql.functions as psf
    w_rating = Window.orderBy(psf.desc('rating'))
    w_count = Window.orderBy(psf.desc('count'))
    predLabel = predLabel.withColumn('rating_rank', \
                psf.dense_rank().over(w_rating)).withColumn('count_rank', \
                psf.dense_rank().over(w_count))

    predLabel = predLabel.select('rating_rank', 'count_rank')
    predLabel_rdd = predLabel.rdd
    metrics = RankingMetrics(predLabel_rdd)
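
One caveat: ranking the rating and count columns independently, as above, only loosely approximates a ranking evaluation. RankingMetrics is documented to expect an RDD of (predicted items, relevant items) pairs, one per user. A hedged sketch of that shape, reusing the example's names:

from pyspark.mllib.evaluation import RankingMetrics

# Build one (predicted_tracks, actual_tracks) pair per user.
pred_by_user = prediction_flat.map(lambda r: (r.user, r.product)).groupByKey()
truth_by_user = test_df.rdd.map(lambda r: (r['user_label'], r['track_label'])).groupByKey()
per_user_pairs = pred_by_user.join(truth_by_user).mapValues(
    lambda lists: (list(lists[0]), list(lists[1]))).values()
print(RankingMetrics(per_user_pairs).meanAveragePrecision)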
Example #5
 def read(self, version):
     als_model = MatrixFactorizationModel.load(self._sc, self._url)
     model = Model(sc=self._sc,
                   als_model=als_model,
                   version=version,
                   data_version=1)
     return model
Example #6
def loadModel(sc):
    try:
        model = MatrixFactorizationModel.load(sc, Path + "output/ALSmodel")
        print("Loaded the ALSModel")
        return model
    except Exception:
        print("ALSModel not found; please train it first")
        return None
Example #7
def predict(userID):

    completeMovies = sc.textFile('datasets/ml-latest-small/movies.csv')
    header2 = completeMovies.first()
    completeMovies = completeMovies.filter(lambda line : line != header2)\
        .map(lambda line : line.split(","))

    model = MatrixFactorizationModel.load(sc, "target/model")

    completeRDD = sc.textFile('datasets/ml-latest-small/ratings.csv')
    header = completeRDD.first()

    completeRDD = completeRDD.filter(lambda line : line != header)\
    .map(lambda line : line.split(","))\
    .map(lambda line : (line[0],line[1],line[2]))

    userRatedMovies = completeRDD.filter(lambda line: line[0] == userID).map(
        lambda line: line[1]).collect()

    userUnrated = completeMovies.filter(lambda line: line[
        0] not in userRatedMovies).map(lambda line: (userID, line[0]))

    preds = model.predictAll(userUnrated).map(
        lambda line: (str(line[1]), line[2]))

    movies = preds.join(completeMovies.map(lambda line: (line[0], line[1])))

    # Sort by predicted rating after the join (a join does not preserve order).
    output = (movies.sortBy(lambda line: line[1][0], ascending=False)
              .map(lambda line: line[1][1]).take(15))

    for i in output:
        print(i)
Example #8
def main():
    import configspark
    sc = configspark.SPARK_CONTEXT

    # user/song string ID to int ID mappings
    full_text = sc.textFile(config.MSD_DATA)
    full_raw = full_text.map(msd_parse.parse_line)
    users, songs, _ = msd_parse.get_user_song_maps(full_raw)

    print("\nLoading MSD test dataset\n")
    test_parsed = (
        sc.textFile(config.MSD_TEST)
        .map(msd_parse.parse_line))
    test_prepped = msd_parse.replace_raw_ids(test_parsed, users, songs)
    test = test_prepped.map(msd_parse.rating_convert)

    if os.path.exists(config.MSD_MODEL):
        print("\n\nLoading existing recommendation model from %s\n\n"
              % config.MSD_MODEL)
        model = MatrixFactorizationModel.load(sc, config.MSD_MODEL)
    else:
        raise RuntimeError("Failed to load ALS model from %s"
                           % config.MSD_MODEL)

    mse, rmse = evaluate.evaluate_model(model, test)
    print("\nMSD ALS model performance: MSE=%0.3f RMSE=%0.3f\n" % (mse, rmse))
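
evaluate.evaluate_model is project-local and not shown on this page; a minimal sketch of an MSE/RMSE evaluator matching the call above (an assumption about its contract, not the project's actual code):

def evaluate_model(model, ratings):
    """Return (MSE, RMSE) of `model` on an RDD of Rating tuples."""
    pairs = ratings.map(lambda r: (r.user, r.product))
    preds = model.predictAll(pairs).map(lambda r: ((r.user, r.product), r.rating))
    truth = ratings.map(lambda r: ((r.user, r.product), r.rating))
    mse = truth.join(preds).map(lambda kv: (kv[1][0] - kv[1][1]) ** 2).mean()
    return mse, mse ** 0.5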
Example #9
def process(time, rdd):

    trainedModel = MatrixFactorizationModel.load(
        sc, "target/model/myCollaborativeFilter1")
    # trainedModel.cache()
    print("========= %s =========" % str(time))
    print("Processing RDD")
    sqs = SimpleQueueService()
    if rdd.count() > 0:
        # print(rdd.collect())
        # print("RDD Printed")
        predictions = trainedModel.predictAll(rdd).map(lambda r:
                                                       ((r[0], r[1]), r[2]))
        # print(predictions.collect())
        # print("PredictionsDone")
        recommendations = predictions.takeOrdered(2, key=lambda x: -x[1])
        # print(recommendations)
        for rec in recommendations:
            userNo = str(rec[0][0])
            offerNo = str(rec[0][1])
            resp = (get(
                "http://54.173.234.214:5000/api/helperclass/offermapping=true?userNo="
                + userNo + "&offerNo=" + offerNo).json())
            sqs.addMessageToQueue(resp)
        print("OKDone")
Example #10
def loadModel(sc):
    try:
        model = MatrixFactorizationModel.load(sc, Path + "ALSmodel")
        print("Loaded the ALS model")
        return model
    except Exception:
        print("Cannot find the model; please train it first.")
        return None
Example #11
def loadModel(sc):
    try:
        model = MatrixFactorizationModel.load(sc, Path + "ALSmodel")
        print("Model loaded")
        return model
    except Exception:
        print("Cannot find the model; please train it first")
        return None
Example #12
def loadModel(sc):
    try:
        model = MatrixFactorizationModel.load(sc, Path + "ALSmodel")
        print("Loaded the ALSmodel")
        return model
    except Exception:
        print("ALSmodel not found; please train it first!")
Example #13
def load_model(spark_context):
    base_dir = "/Users/hpnhxxwn/Desktop/proj/DE/yelp/review_out3/"
    city_name = "Charlotte"
    model_file_name = "business_recomm_model_for_{}".format(city_name)
    model_full_path = os.path.join(base_dir, city_name, "mf_based_models",
                                   model_file_name)
    model = MatrixFactorizationModel.load(spark_context, model_full_path)
    return model
Example #14
	def recommend(self, user, topK):
		
		# load the model and return the top-K recommendations for the user
		model = MatrixFactorizationModel.load(self.sc, Constants.RESOURCE_FOLDER + Constants.MODEL_FILE)
		# TODO: decide on a response format for the recommendations
		return model.recommendProducts(user, topK)
		
Example #15
def load_als_model(file_name):
    """
    loads ALS model based on state
    :param file_name: file
    :return: als model
    """
    return MatrixFactorizationModel.load(sc=spark.sparkContext,
                                         path='als-models/' + file_name)
Example #16
def load_model(spark_context):
    try:
        model = MatrixFactorizationModel.load(spark_context,
                                              '../datas/asl-model')
        print(model)
        return model
    except Exception:
        print("Error loading the model")
Example #17
def loadModel(sc):
    try:
        model = MatrixFactorizationModel.load(sc, path + "ALSmodel")
        print("Loaded the ALSModel")
        return model
    except Exception:
        print("Model not found")
        return None
Example #18
def load_model(spark_context):
    base_dir = "/Users/sundeepblue/Bootcamp/allweek/week9/capstone/data/yelp_data/split_business_data_by_city/"
    city_name = "us_charlotte"
    model_file_name = "business_recomm_model_for_{}".format(city_name)
    model_full_path = os.path.join(base_dir, city_name, "mf_based_models",
                                   model_file_name)
    model = MatrixFactorizationModel.load(spark_context, model_full_path)
    return model
Example #19
def load_model(sc):
    try:
        print("Loading model...")
        model = MatrixFactorizationModel.load(sc, Path + "ALSmodel")
        return model
    except Exception:
        print("Model does not exist!")
        return None
Example #20
def LoadModel(sc):
    try:
        model = MatrixFactorizationModel.load(sc, '/models/ALSmodel')
    except Exception:
        print("Failed to load model")
        return None
    else:
        print("model loaded")
        return model
Example #21
def LoadModel(sc, path):
    try:
        ALS_model = MatrixFactorizationModel.load(sc, path)
        print("Model loaded successfully")
        return ALS_model
    except Exception:
        print("Model does not exist; please train the model first")
        return None
Example #22
def loadModel(sc):
    """Load the trained recommendation model."""
    try:
        model = MatrixFactorizationModel.load(sc, Path + "ALSmodel")
        print("Model loaded successfully")
        return model
    except Exception:
        print("Model does not exist; please train the model first")
        return None
Example #23
def loadModel(sc):
    """Load the model."""
    try:
        model = MatrixFactorizationModel.load(sc, Path + "ALSmodel")
        print("success...")
        return model
    except Exception:
        print("Failed!!")
        return None
Example #24
 def load_model(self):
     try:
         model = MatrixFactorizationModel.load(self.sc, self.path_model)
         print(model)
         return model
     except Exception:
         print("Error loading the model")
         return {}
Example #25
def loadModel(sc):
    try:
        alsmodel = os.path.join(Path, 'data', 'ALSmodel')
        model = MatrixFactorizationModel.load(sc, alsmodel)
        print "Loaded the ALSModel"
        return model
    except Exception:
        print "ALSModel not found; please train it first"
        return None
Example #26
 def trainALS(self):
     try:
         self.model = MatrixFactorizationModel.load(self.sc, "als_final")
     except (RuntimeError, TypeError, NameError) as e:
         rank = 4
         numIterations = 20
         self.model = ALS.trainImplicit(self.implicit_ratings, rank,
                                        numIterations)
         self.model.save(self.sc, "als_final")
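
For reference, ALS.trainImplicit in the fallback branch fits confidence-weighted factors from interaction strengths rather than explicit ratings. A self-contained toy sketch, assuming an active SparkContext sc; the data is made up:

from pyspark.mllib.recommendation import ALS, Rating

# Toy implicit feedback: (user, item, interaction strength such as a play count).
implicit = sc.parallelize([Rating(0, 1, 3.0), Rating(0, 2, 1.0),
                           Rating(1, 1, 5.0), Rating(1, 3, 2.0)])
model = ALS.trainImplicit(implicit, rank=4, iterations=20)
print(model.recommendProducts(0, 2))  # top-2 items for user 0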
Example #27
def load_model(sc, model_path):
    try:
        model = MatrixFactorizationModel.load(sc,
                                              os.path.join(PATH, model_path))
        print("Model loaded")
    except Exception:
        print("Model not found")
        exit(-1)
    return model
Example #28
 def __load_model(self):
     """Load the ALS model with the current dataset
     """
     logger.info("Loading the ALS model...")
     model_path = os.path.join("file:///usr/local/spark/mycode/recommender", 'models', 'movie_lens_als')
     # /home/201500130058/task5
     # file:///usr/local/spark/mycode/recommender
     self.model = MatrixFactorizationModel.load(self.sc, model_path)
     logger.info("ALS model loaded!")
Example #29
 def load_model(self):
     sc = self.spark.sparkContext
     try:
         model = MatrixFactorizationModel.load(sc, "./model_data/als-model")
         print('Loaded successfully!')
         return model
     except Exception:
         print("Model does not exist; please train the model first")
         return self.train()
Example #30
 def __train_model(self, sc):
     """Load the trained ALS model from file.
     """
     #load the trained model
     logger.info("Loading the ALS model from file...")
     model_path = os.path.join(
         'file:///Users/yumi.zhang/Desktop/recommendation', 'models',
         'movie_lens_als')
     self.model = MatrixFactorizationModel.load(sc, model_path)
     logger.info("ALS model loaded successfully!")
Example #31
def add_rating(user_id, movie_id, rating):
    if not recommendation_engine.addRating(user_id, movie_id, rating):
        return "Rating for user %s and movie %s already exists" % (user_id,
                                                                   movie_id)

    result = rerun_modeling.delay()
    result.wait()
    recommendation_engine.bestRankModel = MatrixFactorizationModel.load(
        spark.sparkContext, "modelAppended")
    return "Rating has been added"
Example #32
def trainModel(limit,data,rank,num_iterations):
	save_file = "models/"+str(limit)+"rank"+str(rank)+"iterations"+str(num_iterations)
	if isdir(save_file):
		print("Rank "+str(rank)+" and Iterations "+str(num_iterations)+" Model already exists, loading...")
		model = MatrixFactorizationModel.load(sc, save_file)
	else:
		print("Model does not exist, training ALS with rank "+str(rank)+" and "+str(num_iterations)+" iterations")
		model = ALS.train(data, rank, num_iterations)
		print("Saving new model")
		model.save(sc,save_file)
	return model
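
A possible call site for the helper above; ratings and the argument values are hypothetical:

# Sketch: `ratings` is an RDD of Rating(user, product, rating) parsed elsewhere.
model = trainModel(limit=100000, data=ratings, rank=12, num_iterations=10)
print(model.recommendProducts(42, 5))  # top-5 products for user 42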
Example #33
def hello():
    if request.method == 'POST':
        conf = SparkConf().setAppName("yelp_recommendation-server")
        sc = SparkContext(conf=conf)
        global sameModel
        sameModel = MatrixFactorizationModel.load(
            sc,
            "/Users/xx/Desktop/big_data/project/Yelp-Personalized-Recommendation/flask/dataset/myCollaborativeFilter"
        )
        return redirect(url_for('send'))
    return render_template('hello.html')
Example #34
	def saveModel(self):
		# Save the trained model, then reload it as a sanity check
		path = os.path.dirname(os.path.realpath(__file__))
		if os.path.exists(path+'/Model'):
			shutil.rmtree(path+'/Model')
		path = 'file:///' + path + '/Model'
		print('\n',20*'-','MODEL SAVED at',20*'-')
		print(path)
		print(50*'-')
		self.model.save(self.ctx, path)
		sameModel = MatrixFactorizationModel.load(self.ctx, path)
Example #35
def main(argv):
    Conf = (SparkConf().setAppName("test"))
    sc = SparkContext(conf=Conf)
    sqlContext = SQLContext(sc)
    dirPath = 'hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/model'
    sameModel = MatrixFactorizationModel.load(sc, dirPath)

    
    # recommendProducts(user, num) returns the top-num products for one user
    b = sameModel.recommendProducts(193667506, 10)
    print b

    sc.stop()
Example #36
 def __train_model(self):
     """Load the ALS model if it exists; otherwise train and save it.
     """
     try:
         logger.info("Loading the ALS model...")
         self.model = MatrixFactorizationModel.load(self.sc, "als_model.data")
     except Exception:
         logger.info("Training the ALS model...")
         self.model = ALS.train(self.ratings_RDD, self.rank, seed=self.seed,
                            iterations=self.iterations, lambda_=self.regularization_parameter)
         self.model.save(self.sc, "als_model.data")
     logger.info("ALS model built!")
Example #37
File: engine.py  Project: JGulbronson/rmc
    def load_data(self):
        # Model must be already trained and saved to recommendation folder
        model_path = os.path.join(os.path.dirname(__file__),
                                  '%s/trained_model' % c.RECOMMENDATION_DIR)
        self.model = MatrixFactorizationModel.load(self.sc, model_path)

        user_path = os.path.join(os.path.dirname(__file__),
                                 '%s/user.json' % c.RECOMMENDATION_DIR)
        with open(user_path, 'r') as f:
            self.user_lookup = json.load(f)

        course_path = os.path.join(os.path.dirname(__file__),
                                   '%s/course.json' % c.RECOMMENDATION_DIR)
        with open(course_path, 'r') as f:
            self.course_lookup = json.load(f)
Example #38
def prepare_model(sc, filename, user_id, ratings_train):
    if filename is None and os.path.exists(config.MSD_MODEL):
        # load the trained model
        print("\n\nLoading existing recommendation model from %s\n\n"
              % config.MSD_MODEL)
        model = MatrixFactorizationModel.load(sc, config.MSD_MODEL)
    else:
        # train a new model
        print("\n\nRetraining recommendation model for User %s\n\n" % user_id)
        rank, lambda_val = (
            evaluate.load_best_params(config.MSD_BEST_PARAMS_FILE))
        rank, lambda_val = int(rank), float(lambda_val)
        model = ALS.trainImplicit(ratings_train, rank, evaluate.ITERATIONS,
                                  lambda_val, nonnegative=True)

    return model
Example #39
 def __train_model(self):
     """Load the pretrained ALS model and cache its factor RDDs.
     """
     logger.info("Loading the ALS model...")
     # earlier model paths: s3n://patricks3db/modelsComplete and .../modelsComplete100r20i
     self.model = MatrixFactorizationModel.load(self.sc, "s3n://patricks3db/modelsComplete200r20i003a")
     userFeatures = self.model.userFeatures().repartition(1)
     userFeatures.persist(StorageLevel.MEMORY_AND_DISK_SER)
     print userFeatures
     productFeatures = self.model.productFeatures().repartition(1)
     productFeatures.persist(StorageLevel.MEMORY_AND_DISK_SER)
     print productFeatures
     logger.info("ALS model loaded!")
Example #40
def main():
    import configspark
    sc = configspark.SPARK_CONTEXT

    print("\nLoading MovieLens test dataset\n")
    test_text = sc.textFile(config.ML_RATINGS_TEST)
    ratings_test = (
        test_text.map(ml_parse.parse_line).map(ml_parse.rating_convert))

    if os.path.exists(config.ML_MODEL):
        print("\n\nLoading existing recommendation model from %s\n\n"
              % config.ML_MODEL)
        model = MatrixFactorizationModel.load(sc, config.ML_MODEL)
    else:
        raise RuntimeError("Failed to load ALS model from %s" % config.ML_MODEL)

    mse, rmse = evaluate.evaluate_model(model, ratings_test)
    print("\nML ALS model performance: MSE=%0.3f RMSE=%0.3f\n" % (mse, rmse))
Example #41
    def __init__(self, sc, datapath='/media/psf/Home/CS/GIT_HUB/Movie-Recommendation-Project/frontend/', rating_file='ratings_small.csv', complete_rating_file='ratings.csv', movie_file='movies.csv', detail_file='modified.csv', model='movielens_small'):
        self.sc = sc
        self.start = True
        self.rating_file = datapath+rating_file
        self.complete_rating_file = datapath+complete_rating_file
        self.movie_file = datapath+movie_file
        self.detail_file = datapath+detail_file
        self.integration_folder = datapath
        self.svd = SVD(filename=datapath+model)
        self.svd.load_data(filename=self.rating_file, sep=',', format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
        self.svd.create_matrix()
        self.ia = imdb.IMDb(accessSystem='http')

        # als stuff
        self.sqlContext = SQLContext(self.sc)
        self.movie_data = self.sc.textFile(self.movie_file)
        self.ratings_data = self.sc.textFile(self.complete_rating_file).map(lambda line: line.split(",")).map(lambda x: (int(x[0]), int(x[1]), float(x[2])))
        self.als_model_path = datapath + 'Model_Collaborative_Filtering'
        self.als_model = MatrixFactorizationModel.load(sc, self.als_model_path)
        self.movie_df = self.sqlContext.read.load(datapath+'tables/movies')
        self.detail_df = self.sqlContext.read.load(datapath+'tables/detail')
        self.rating_df = self.sqlContext.read.load(datapath+'tables/ratings')
Example #42
    sc = SparkContext(conf=conf)
    # Load and parse the data
    data = sc.textFile(input)
    ratings = data.map(lambda l: l.split(',')).filter(lambda a:len(a)==4)\
        .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

    # Build the recommendation model using Alternating Least Squares
    rank = 10
    numIterations = 20
    model = ALS.train(ratings, rank, numIterations)

    # Evaluate the model on training data
    testdata = ratings.map(lambda p: (p[0], p[1]))

    predictions = model.predictAll(testdata)\
        .map(lambda r: ((r[0], r[1]), r[2]))
    predictions.foreach(my_print)

    ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    print("mse = " + str(MSE)+", model="+output)

    # Save and load model
    model.save(sc, output)
    resume_model = MatrixFactorizationModel.load(sc, output)
    preds = resume_model.predictAll(testdata)\
        .map(lambda r: ((r[0], r[1]), r[2]))
    preds.foreach(my_print)

    sc.stop()
Example #43
ratings = data.map(lambda l: l.split(',')).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

# Build the recommendation model using Alternating Least Squares
rank = 10
numIterations = 10
model = ALS.trainImplicit(ratings, rank, numIterations)

# Evaluate the model on training data
testdata = ratings.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))

#Save and load model
#commented out the save for now because the model already exists on hdfs
#uncomment this when you are ready to train a new model!
#model.save(sc, "target/tmp/myCollaborativeFilter")
sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")
#parse the AskForRecsFor.csv file
f = open('AskForRecsForShort.csv')
fp = open("reccomendFile2.txt","w")
csv_f = csv.reader(f)
#next(csv_f, None) 
for row in csv_f:
   a = row[0]
   b = row[1]
   recommendation = sameModel.recommendProducts(int(a),int(b))
   fp.write(str(recommendation))
fp.close()
Example #44
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.mllib.recommendation import MatrixFactorizationModel


conf = SparkConf().setAppName("miniproject").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf)
sqlContext = SQLContext(sc)
model = MatrixFactorizationModel.load(sc, '/tmp/model')
# model.productFeatures().cache()
# model.userFeatures().cache()
Example #45
os.environ['SPARK_HOME'] = "../spark-1.5.1"
sys.path.append("../spark-1.5.1/python/")
sys.path.append("../spark-1.5.1/python/lib/py4j-0.8.2.1-src.zip")
sys.path.append("../spark-1.5.1/python/lib/py4j/")

try:
    from pyspark import SparkContext, SparkConf
    from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
    print ("Apache-Spark v1.5.1 said, \"All modules found and imported successfully.\"")

except ImportError as e:
    print ("Couldn't import Spark Modules", e)
    sys.exit(1)

# SETTING CONFIGURATION PARAMETERS
config = SparkConf()
sc = SparkContext(conf=config)

test = sc.textFile("../data/testdata10MB")
model = MatrixFactorizationModel.load(sc, "../TrainedModel")

from numpy import array  # needed by the line parser below

ratings = test.map(lambda line: array([float(x) for x in line.split('\t')]))
testdata = ratings.map(lambda p: (int(p[0]), int(p[1])))
predictionsRDD = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))

predictions = predictionsRDD.collect()
print (predictions)
'''
ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictionsRDD)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).reduce(lambda x, y: x + y)/ratesAndPreds.count()
print("Mean Squared Error = " + str(MSE))
'''
Example #46
def load_model(sc):
    return MatrixFactorizationModel.load(sc, MODEL_PATH)
Example #47
    def __load_model(self):
        # Load the ALS model

        logger.info("Loading the ALS model...")
        self.model = MatrixFactorizationModel.load(self.sc, os.path.join("models/ALSBeerFixed"))
        logger.info("ALS model loaded!")
Example #48

# In[24]:

#recommendations.take(5)


# In[25]:

path = "s3n://patricks3db/modelsComplete200r20i003a"
model.save(sc, path)


# In[26]:

sameModel = MatrixFactorizationModel.load(sc, path)


# In[27]:

sameModel.recommendProducts(1,5)


# In[28]:

sameModel.userFeatures().persist(StorageLevel.MEMORY_AND_DISK_SER)


# In[29]:

sameModel.recommendProducts(100000,5)
Example #49
def getRecommendation(arg0, arg1, arg2):
    # set up environment
    conf = SparkConf() \
      .setAppName("MovieLensALS") \
      .set("spark.executor.memory", "2g") \
      .set("spark.app.id", '2015Project')
    sc = SparkContext(conf=conf)

    # load personal ratings
    myRatings = loadRatings(arg2)
    myRatingsRDD = sc.parallelize(myRatings, 1)
    myRatings1 = myRatings 
    # myRatingsRDD1 = myRatingsRDD

    # load ratings and movie titles

    movieLensHomeDir = arg1

    # ratings is an RDD of (last digit of timestamp, (userId, movieId, rating))
    ratings = sc.textFile(join(movieLensHomeDir, "ratings.dat")).map(parseRating)

    # movies is an RDD of (movieId, movieTitle)
    movies = dict(sc.textFile(join(movieLensHomeDir, "movies.dat")).map(parseMovie).collect())

    # your code here
    # myRatings = ratings.count()
    # myUsers = ratings.values().map(lambda r: r[0]).distinct().count()
    # myMovies = ratings.values().map(lambda r: r[1]).distinct().count()
    # myRDDCount = myRatingsRDD.count()

    modelPath = "/home/hduser/Downloads/spark-training-master/machine-learning/python/model"
    if not os.path.isdir(modelPath):
        numPartitions = 4
        training = ratings.filter(lambda x: x[0] < 6) \
          .values() \
          .union(myRatingsRDD) \
          .repartition(numPartitions) \
          .cache()

        validation = ratings.filter(lambda x: x[0] >= 6 and x[0] < 8) \
          .values() \
          .repartition(numPartitions) \
          .cache()

        test = ratings.filter(lambda x: x[0] >= 8).values().cache()

        numValidation = validation.count()
        # numTraining = training.count()
        # numTest = test.count()
        # print "Training: %d, validation: %d, test: %d" % (numTraining, numValidation, numTest)
        # # print "Got %d ratings from %d users on %d movies." % (myRatings, myUsers, myMovies)
        # print "Got %d ratings from %d users on %d movies. rdd %d, train %d" % (myRatings, myUsers, myMovies, myRDDCount, training.count())
        
    # 3
        ranks = [8, 12]
        lambdas = [1.0, 10.0]
        numIters = [10, 20]
        
        bestModel = None
        bestValidationRmse = float("inf")
        bestRank = 0
        bestLambda = -1.0
        bestNumIter = -1

        for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters):
            model = ALS.train(training, rank, numIter, lmbda)
            validationRmse = computeRmse(model, validation, numValidation)
            print "RMSE (validation) = %f for the model trained with " % validationRmse + \
                  "rank = %d, lambda = %.1f, and numIter = %d." % (rank, lmbda, numIter)
            if (validationRmse < bestValidationRmse):
                bestModel = model
                bestValidationRmse = validationRmse
                bestRank = rank
                bestLambda = lmbda
                bestNumIter = numIter

        bestModel.save(sc, modelPath)
    else:
        bestModel = MatrixFactorizationModel.load(sc, modelPath)
    # testRmse = computeRmse(bestModel, test, numTest)
    # evaluate the best model on the test set
    # print "The best model was trained with rank = %d and lambda = %.1f, " % (bestRank, bestLambda) \
    #   + "and numIter = %d, and its RMSE on the test set is %f." % (bestNumIter, testRmse)
# 4
    myRatedMovieIds = set([x[1] for x in myRatings1])
    candidates = sc.parallelize([m for m in movies if m not in myRatedMovieIds])
    predictions = bestModel.predictAll(candidates.map(lambda x: (0, x))).collect()
    recommendations = sorted(predictions, key=lambda x: x[2], reverse=True)[:10]

    outputBuffer =  "Movies recommended for you:\n"
    for i in xrange(len(recommendations)):
        outputBuffer += ("%2d: %s\n" % (i + 1, movies[recommendations[i][1]])).encode('ascii', 'ignore')
    sc.stop()
    return outputBuffer
Example #50
def collaborative_filter(train_dataFile, test_dataFile):

    conf = SparkConf() \
        .setAppName("Collaborative Filter") \
        .set("spark.executor.memory", "5g")
    sc = SparkContext(conf=conf)

    train_ratings = get_ratings(sc, train_dataFile)

    ratings_valid = train_ratings.sample(False, 0.1, 12345)
    ratings_train = train_ratings.subtract(ratings_valid)


    print(20*'-','TRAINING STARTED',20*'-')
    ranks = [8]
    lambdas = [1.0, 10.0, 5.0]
    numIters = [10]
    bestModel = None
    bestValidationMSE = float("inf")
    bestRank = 0
    bestLambda = -1.0
    bestNumIter = -1
    for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters):
        print(rank, lmbda, numIter)
        model = ALS.train(ratings_train, rank, numIter, lmbda)
        testdata = ratings_valid.map(lambda p: (p[0], p[1]))
        predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
        ratesAndPreds = ratings_valid.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
        MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
        if (MSE < bestValidationMSE):
            bestModel = model
            bestValidationMSE = MSE
            bestRank = rank
            bestLambda = lmbda
            bestNumIter = numIter
    # evaluate the best model on the test set
    #model = ALS.train(ratings, rank, numIterations)
    print(20*'-','TRAINING FINISHED',20*'-')



    # #             TESTING             # #
    # # Evaluate the model on testing data
    print(20*'-','TESTING STARTED',20*'-')
    test_ratings = get_ratings(sc, test_dataFile)

    testdata = test_ratings.map(lambda p: (p[0], p[1]))
    predictions = bestModel.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = test_ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    MAE = ratesAndPreds.map(lambda r: abs(r[1][0] - r[1][1])).mean()

    print("Mean Squared Error = " + str(MSE))
    print("Mean Absolute Error = " + str(MAE))
    print("Root Mean Square Error = ", str(MSE**.5))
    print(20*'-','TESTING FINISHED',20*'-')


    # Save and load model
    path = os.path.dirname(os.path.realpath(__file__))
    if os.path.exists(path+'/myModelPath'):
        shutil.rmtree(path+'/myModelPath')
    path = 'file:///' + path + '/myModelPath'
    print('\n',20*'-','MODEL SAVED at',20*'-')
    print(path)
    print(50*'-')
    bestModel.save(sc, path)
    sameModel = MatrixFactorizationModel.load(sc, path)
Example #51


# http://spark.apache.org/docs/latest/mllib-collaborative-filtering.html

from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

# Load and parse the data
data = sc.textFile("data/behavior-ml-score-ints.csv")
ratings = data.map(lambda l: l.split(',')).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

training_RDD, test_RDD = ratings.randomSplit([7, 3], seed=0)

# Build the recommendation model using Alternating Least Squares
rank = 10
numIterations = 20
model = ALS.train(training_RDD, rank, numIterations)

# Evaluate the model on the held-out test data
testdata = test_RDD.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = test_RDD.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))

# Save and load model
model.save(sc, "myModelPath")
sameModel = MatrixFactorizationModel.load(sc, "myModelPath")
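
As a side note, randomSplit([7, 3], ...) normalizes its weights, so this is a 70/30 split. A short hedged follow-up that turns the MSE above into RMSE and spot-checks one prediction with the reloaded model (the user/product pair is a placeholder):

from math import sqrt

RMSE = sqrt(MSE)
print("Root Mean Squared Error = " + str(RMSE))

# Spot-check a single (user, product) pair; any pair from test_RDD works.
print(sameModel.predict(1, 1))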


Example #52
    sq = SQLContext(sc)

    # load songs data
    df_songs = sq.read.parquet(database + r"\songs_data2")

    # load user play data i.e triplets
    df_triplets = sq.read.parquet(database + r"\triplets_data2")

    # load song_id and its hash_id
    df_song_hash_id = sq.read.parquet(database + r"\song_hash_id")

    # load song similarity data
    df_song_sim = sq.read.parquet(database + r"\song_similarity2")

    # load the ALS model
    ALSmodel = MatrixFactorizationModel.load(sc, args.modelpath)

    # Take an input from user
    # userid = '3cd99bb95d2baac1e910a5c847e58388d5e9b3c1'
    # userid = 'ef484f5d1c2bfe2eac0098ae460b793833b5acbc'

    userid = raw_input("\n Enter an UserID: ")

    # Convert the userid into its hash value
    user_hash = int(hash(userid) & 0xfffffff)

    #### display user statistcs
    # Find the songs listened by the user

    played_songs = df_triplets.filter(df_triplets.user_id == userid) \
        .select(df_triplets.song_id, df_triplets.play_count).cache()
Example #53
# Use the input RDD, new_user_unrated_movies_RDD, with new_ratings_model.predictAll() to predict new ratings for the movies
new_user_recommendations_RDD = new_ratings_model.predictAll(new_user_unrated_movies_RDD)

# Transform new_user_recommendations_RDD into pairs of the form (Movie ID, Predicted Rating)
new_user_recommendations_rating_RDD = new_user_recommendations_RDD.map(lambda x: (x.product, x.rating))
new_user_recommendations_rating_title_and_count_RDD = \
    new_user_recommendations_rating_RDD.join(complete_movies_titles).join(movie_rating_counts_RDD)

# flatten data, make it readable
new_user_recommendations_rating_title_and_count_RDD = \
    new_user_recommendations_rating_title_and_count_RDD.map(lambda r: (r[1][0][1], r[1][0][0], r[1][1]))

# get top 25 rated movies
top_movies = new_user_recommendations_rating_title_and_count_RDD.filter(lambda r: r[2]>=25).takeOrdered(25, key=lambda x: -x[1])

print ('TOP recommended movies (with more than 25 reviews):\n%s' %
        '\n'.join(map(str, top_movies)))

# How to get individual rating
my_movie = sc.parallelize([(0, 500)]) # Quiz Show (1994)
individual_movie_rating_RDD = new_ratings_model.predictAll(my_movie)
# print individual_movie_rating_RDD.take(1)

from pyspark.mllib.recommendation import MatrixFactorizationModel

model_path = os.path.join('movie_lens_als')

# Save and load model
model.save(sc, model_path)
same_model = MatrixFactorizationModel.load(sc, model_path)
    """ Recommend for a user
        usermovHistDict: (user:([movie], [rating]))
    """
    userUnratedRDD = movieRDD.flatMap(
        lambda (movID, movName): [(user, movID)] if movID not in usermovHistDict[user][0] else []).cache()
    # if not cache userUnratedRDD, might have IOError, could not find the /tmp/blablabla directory or file...
    # don't know why yet...
    predUserRDD = model.predictAll(userUnratedRDD).map(lambda x: (x[1], x[2]))# (Movie ID, Predicted Rating)
    # after join in the expression below, we get something like: (40962, (2.184925882635273, (u'"Yours', 3)))
    # we want to get (Predicted Rating, Movie Name, number of ratings)
    return predUserRDD.join(movIDNameCntRDD).map(
        lambda (x1, x2): (x2[0], x2[1][0], x2[1][1])).filter(
        lambda x: x[2] > ratedThreshold).takeOrdered(nRec, key=lambda x: -x[0])

# load model if necessary
from pyspark.mllib.recommendation import MatrixFactorizationModel
modelPath = os.path.join('models', 'movie_ALS')
try:
    bestModel
except NameError:
    bestModel = MatrixFactorizationModel.load(sc, modelPath)

print ('Recommend for user %s (movies with more than 20 ratings):\n%s' % (2, 
        '\n'.join(map(str, recommendALS(2, bestModel, moviesRDD, userMovieHistBC.value, movIDNameCntRDD, 30, 20)))))

