Example #1
from time import time

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import collect_list
from pyspark.ml.recommendation import ALS
from pyspark.mllib.evaluation import RankingMetrics

def als_grid_search_ranking(train, val, maxIters, regParams, ranks):
    '''
    Grid search for ALS.

    Params:
        train: training set
        val: validation set
        maxIters: list of maxIter values
        regParams: list of regParam values
        ranks: list of rank values

    Returns:
        models: dict of models, keyed by (rank, maxIter, regParam), with the fitted ALS model as value
        precision_at_k_scores: dict of precision-at-k scores
        maps: dict of mean average precision scores
        NDCGs: dict of NDCG scores
        times: dict of wall-clock times to run the different models
    '''
    models = {}
    precision_at_k_scores = {}
    maps = {}
    NDCGs = {}
    times = {}
    sc = SparkContext.getOrCreate()
    spark = SparkSession.builder.appName("try").getOrCreate()
    # build the validation ground truth once, outside the grid search:
    # a per-user list of relevant book_ids, ordered by rating
    # (ORDER BY replaces the original SORT BY, which only sorts within
    # partitions; even so, collect_list order after the groupBy shuffle
    # is not strictly guaranteed by Spark)
    val.createOrReplaceTempView('val_raw')
    truth = spark.sql('SELECT user_id, book_id FROM val_raw ORDER BY rating DESC')
    truth = truth.groupBy('user_id').agg(
        collect_list('book_id').alias('book_id_val'))
    truth.createOrReplaceTempView('val')

    # grid search
    for r in ranks:
        for Iter in maxIters:
            for reg in regParams:
                st = time()
                # initialize and train model
                model = ALS(rank=r, maxIter=Iter, regParam=reg,
                            userCol='user_id', itemCol='book_id',
                            ratingCol='rating', coldStartStrategy='drop',
                            nonnegative=True)
                model = model.fit(train)
                models[(r, Iter, reg)] = model

                # evaluate on validation
                preds = model.recommendForAllUsers(500)
                preds.createOrReplaceTempView('preds')
                predAndTruth = spark.sql(
                    'SELECT preds.recommendations, val.book_id_val '
                    'FROM val JOIN preds ON preds.user_id = val.user_id'
                )
                predAndTruth = predAndTruth.collect()
                final_predAndTruth = []
                for item in predAndTruth:
                    truth = item[1]
                    pred = [i.book_id for i in item[0]]
                    final_predAndTruth += [(pred, truth)]

                final_predAndTruth = sc.parallelize(final_predAndTruth)

                ranking_obj = RankingMetrics(final_predAndTruth)
                precision_at_k_scores[(r, Iter,
                                       reg)] = ranking_obj.precisionAt(500)
                maps[(r, Iter, reg)] = ranking_obj.meanAveragePrecision
                NDCGs[(r, Iter, reg)] = ranking_obj.ndcgAt(500)
                times[(r, Iter, reg)] = round(time() - st, 5)

                print('Model with maxIter = {}, reg = {}, rank = {} complete'.
                      format(Iter, reg, r))
    return models, precision_at_k_scores, maps, NDCGs, times
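
# Usage sketch (an assumption, not part of the original snippet): build tiny
# in-memory train/validation DataFrames, run the grid search over a one-point
# grid, then pick the configuration with the best mean average precision.
spark = SparkSession.builder.appName("try").getOrCreate()
cols = ['user_id', 'book_id', 'rating']
train_df = spark.createDataFrame(
    [(1, 10, 5.0), (1, 11, 3.0), (2, 10, 4.0), (2, 12, 2.0)], cols)
val_df = spark.createDataFrame([(1, 12, 4.0), (2, 11, 5.0)], cols)

models, p_at_k, maps, ndcgs, times = als_grid_search_ranking(
    train_df, val_df, maxIters=[5], regParams=[0.1], ranks=[4])
print('Best (rank, maxIter, regParam) by MAP:', max(maps, key=maps.get))
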
Example #2
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

#Reading the preprocessed products ratings file (the original snippet used
#`pathToFile` here without defining it first; this file name is an assumption)
pathToFile = 'Products_preprocessed.csv'
ProductsDF = spark.read.csv(pathToFile, inferSchema = True, header = True)
ProductsDF.printSchema()

#Reading the metadata file
pathToFile = 'Products_meta_preprocessed.csv'
metadataDF = spark.read.csv(pathToFile, inferSchema = True, header = True)
metadataDF.printSchema()
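
#The snippet below calls `cvs.fit` without ever defining `cvs`. A minimal
#sketch of what it might be, assuming a CrossValidator over an ALS estimator
#(the user_id / product_id / rating column names are assumptions, not from
#the original):
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

als = ALS(userCol='user_id', itemCol='product_id', ratingCol='rating',
          coldStartStrategy='drop', nonnegative=True)
grid = (ParamGridBuilder()
        .addGrid(als.rank, [8, 12])
        .addGrid(als.regParam, [0.05, 0.1])
        .build())
cvs = CrossValidator(estimator=als,
                     estimatorParamMaps=grid,
                     evaluator=RegressionEvaluator(metricName='rmse',
                                                   labelCol='rating',
                                                   predictionCol='prediction'),
                     numFolds=3)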

#cross validation on the products ratings data
cvs_model = cvs.fit(ProductsDF)

#defining best model 
recsys = cvs_model.bestModel

#Getting top 10 recommendations for all users
userRecs = recsys.recommendForAllUsers(10)
userRecs.show()

#printing schema for the recommendation engine
userRecs.printSchema()

#iterating over the list of lists of recommendations
df = userRecs.toPandas()
recs_list = df.values.tolist()  # renamed from `list` to avoid shadowing the builtin
print(recs_list[3])

#Converting the list of lists to a dataframe
import pandas as pd  # pd is used below; the original imported numpy, which was unused

new = []
dfi = pd.DataFrame(columns=['a', 'b', 'c'])
for i in recs_list:
    # the loop body is truncated in the original snippet; it presumably
    # unpacked each row of recommendations into the columns of dfi
    pass


from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

class RecommendationEngineALS:
    def __init__(self):
        self.spark = SparkSession \
            .builder \
            .master("local") \
            .appName("Recommend") \
            .config('spark.mongodb.input.uri', 'mongodb+srv://Monika:[email protected]/test?retryWrites=true&w=majority&authSource=admin') \
            .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.4.1') \
            .getOrCreate()
        self.ratingDF = self.__load_to_df().select('reviewer_id', 'listing_id',
                                                   'rating')
        self.model = ALS(seed=1,
                         nonnegative=True,
                         userCol="reviewer_id",
                         itemCol="listing_id",
                         ratingCol="rating",
                         coldStartStrategy="drop")

    def __load_to_df(self):
        ratings_df = self.spark.read \
            .format("com.mongodb.spark.sql.DefaultSource") \
            .option("spark.mongodb.input.uri",
                    "mongodb+srv://Monika:[email protected]/test?retryWrites=true&w=majority&authSource=admin") \
            .option("database", "recommendation_system") \
            .option("collection", "reviews") \
            .load()
        return ratings_df

    def tune_ALS(self, model, train_data, validation_data, maxIter, regParams,
                 ranks):
        """
        grid search function to select the best model based on RMSE of
        validation data
        Parameters
        ----------
        model: spark ML model, ALS
        train_data: spark DF with columns ['reviewer_id', 'listing_id', 'rating']
        validation_data: spark DF with columns ['reviewer_id', 'listing_id', 'rating']
        maxIter: int, max number of learning iterations
        regParams: list of float, one dimension of hyper-param tuning grid
        ranks: list of int, one dimension of hyper-param tuning grid
        Return
        ------
        The best fitted ALS model with lowest RMSE score on validation data
        """
        # initial
        min_error = float('inf')
        best_rank = -1
        best_regularization = 0
        best_model = None
        for rank in ranks:
            for reg in regParams:
                # get ALS model
                als = model.setMaxIter(maxIter).setRank(rank).setRegParam(reg)

                # train ALS model
                model_f = als.fit(train_data)
                # evaluate the model by computing the RMSE on the validation data
                predictions = model_f.transform(validation_data)
                evaluator = RegressionEvaluator(metricName="rmse",
                                                labelCol="rating",
                                                predictionCol="prediction")
                rmse = evaluator.evaluate(predictions)
                print('{} latent factors and regularization = {}: '
                      'validation RMSE is {}'.format(rank, reg, rmse))
                if rmse < min_error:
                    min_error = rmse
                    best_rank = rank
                    best_regularization = reg
                    best_model = model_f
        print('\nThe best model has {} latent factors and '
              'regularization = {}'.format(best_rank, best_regularization))
        return best_model

    def tune_model(self,
                   maxIter,
                   regParams,
                   ranks,
                   split_ratio=[0.8, 0.1, 0.1]):
        """
        Hyperparameter tuning for ALS model
        Parameters
        ----------
        maxIter: int, max number of learning iterations
        regParams: list of float, regularization parameter
        ranks: list of int, number of latent factors per candidate model
        split_ratio: list of float, (train, validation, test) proportions
        """
        # split the data into training, validation, and test sets
        (trainingData, validationData,
         testData) = self.ratingDF.randomSplit(split_ratio, seed=1)
        self.model = self.tune_ALS(self.model, trainingData, validationData,
                                   maxIter, regParams, ranks)

        prediction = self.model.transform(testData)
        evaluator = RegressionEvaluator(metricName="rmse",
                                        labelCol="rating",
                                        predictionCol="prediction")
        evaluator2 = RegressionEvaluator(metricName="mae",
                                         labelCol="rating",
                                         predictionCol="prediction")
        rmse = evaluator.evaluate(prediction)
        mae = evaluator2.evaluate(prediction)
        print('RMSE is %s' % rmse)
        print('MAE is %s' % mae)

    def get_top3_recommendations(self,
                                 maxIter=12,
                                 regParams=[0.1, 0.2],
                                 ranks=[8, 12, 16]):

        self.tune_model(maxIter, regParams, ranks)

        userRecommendations = self.model.recommendForAllUsers(3).toPandas()

        return userRecommendations
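
#A minimal usage sketch (an assumption, not part of the original snippet):
#it requires the MongoDB cluster configured above to be reachable, with the
#`reviews` collection holding reviewer_id / listing_id / rating documents.
if __name__ == '__main__':
    engine = RecommendationEngineALS()
    top3 = engine.get_top3_recommendations(maxIter=12,
                                           regParams=[0.1, 0.2],
                                           ranks=[8, 12, 16])
    print(top3.head())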