def benchmark_spark(ratings, factors, iterations=5):
    """Time Spark's implicit-feedback ALS fit for every rank in ``factors``.

    A local Spark context/session is created for the duration of the call
    and the session is stopped afterwards, even if training raises.

    Parameters
    ----------
    ratings : input handed to ``convert_sparse_to_dataframe`` to build the
        Spark DataFrame with ``row`` / ``col`` / ``data`` columns.
    factors : iterable of ranks to benchmark.
    iterations : ``maxIter`` for ALS; also the divisor of the elapsed time.

    Returns
    -------
    dict mapping each rank to the wall-clock fit time divided by
    ``iterations``.
    """
    spark_conf = SparkConf()
    spark_conf = spark_conf.setAppName("implicit_benchmark")
    spark_conf = spark_conf.setMaster('local[*]')
    spark_conf = spark_conf.set('spark.driver.memory', '16G')

    context = SparkContext(conf=spark_conf)
    spark = SparkSession(context)

    times = {}
    try:
        ratings = convert_sparse_to_dataframe(spark, context, ratings)
        for rank in factors:
            estimator = ALS(
                rank=rank,
                maxIter=iterations,
                alpha=1,
                implicitPrefs=True,
                userCol="row",
                itemCol="col",
                ratingCol="data",
            )
            started_at = time.time()
            estimator.fit(ratings)
            # Report the average cost of a single ALS iteration.
            per_iteration = (time.time() - started_at) / iterations
            times[rank] = per_iteration
            print("spark. factors=%i took %.3f" % (rank, per_iteration))
    finally:
        spark.stop()

    return times
def test_storage_levels(self):
    """Check ALS storage-level params on both the Python and the JVM object.

    The defaults are verified after a first fit, then both levels are
    overridden and verified again after a second fit.
    """
    rows = [
        (0, 0, 4.0),
        (0, 1, 2.0),
        (1, 1, 3.0),
        (1, 2, 4.0),
        (2, 1, 1.0),
        (2, 2, 5.0),
    ]
    df = self.spark.createDataFrame(rows, ["user", "item", "rating"])
    als = ALS().setMaxIter(1).setRank(1)

    def check_levels(intermediate, final):
        # Same assertion order as before: intermediate (py, jvm), final (py, jvm).
        self.assertEqual(als.getIntermediateStorageLevel(), intermediate)
        self.assertEqual(als._java_obj.getIntermediateStorageLevel(), intermediate)
        self.assertEqual(als.getFinalStorageLevel(), final)
        self.assertEqual(als._java_obj.getFinalStorageLevel(), final)

    # Default params.
    als.fit(df)
    check_levels("MEMORY_AND_DISK", "MEMORY_AND_DISK")

    # Non-default params.
    als.setIntermediateStorageLevel("MEMORY_ONLY_2")
    als.setFinalStorageLevel("DISK_ONLY")
    als.fit(df)
    check_levels("MEMORY_ONLY_2", "DISK_ONLY")
#os.chdir("/Users/kponnambalam/Dropbox/V2Maestros/Courses/Spark n X - Do Big Data Analytics and ML/Python")
#os.curdir

# Load the ratings file; each record is a "user,item,rating" text line.
ratingsData = SpContext.textFile("UserItemData.txt")
ratingsData.collect()

# Split every record and cast the three fields to (int, int, float).
ratingVector = (
    ratingsData
    .map(lambda record: record.split(','))
    .map(lambda fields: (int(fields[0]), int(fields[1]), float(fields[2])))
)

# Wrap the tuples in a SQL DataFrame using the column names ALS defaults to.
ratingsDf = SpSession.createDataFrame(ratingVector, ["user", "item", "rating"])

# Fit the ALS model.
from pyspark.ml.recommendation import ALS
als = ALS(rank=10, maxIter=5)
model = als.fit(ratingsDf)
model.userFactors.orderBy("id").collect()

# (user, item) pairs for which a rating should be predicted.
testDf = SpSession.createDataFrame(
    [(1001, 9003), (1001, 9004), (1001, 9005)],
    ["user", "item"],
)

# Predict and pull the results back to the driver.
predictions = model.transform(testDf).collect()
predictions
# ## Recommender system

# In[10]:

from pyspark.ml.recommendation import ALS

# The seed is written as a plain integer literal: the former ``L`` suffix is
# Python-2-only syntax and a SyntaxError on Python 3, while the plain literal
# denotes the same value on both.
als = ALS(maxIter=15,
          regParam=0.1,
          userCol='reviewerIndex',
          itemCol='asinIndex',
          ratingCol='label',
          rank=24,
          seed=1800009193)

# ## Evaluating the model

# In[14]:

recommender_system = als.fit(train_reviews)

# In[15]:

predictions = recommender_system.transform(test)

# In[16]:

# Rows with a NaN prediction are filtered out before the evaluator runs.
# NOTE(review): presumably these come from users/items unseen in training
# (no coldStartStrategy is set) -- confirm.
evaluation = evaluator.evaluate(
    predictions.filter(col('prediction') != float('nan')))

print('The RMSE of the recommender system is {0}'.format(evaluation))
ratings = spark.read.format("csv")\ .options(header='false') \ .option("delimiter","\\t") \ .schema(schema) \ .load("resources/sample_movies_users.data") print(ratings.describe().toPandas().transpose()) (training, test) = ratings.randomSplit([0.8, 0.2]) als = ALS(rank=10, maxIter=10, userCol='userId', itemCol='movieId', ratingCol='rating', regParam=0.1, coldStartStrategy="drop") alsModel = als.fit(training) predictions = alsModel.transform(test) evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") rmse = evaluator.evaluate(predictions) print("Root-mean-square error = " + str(rmse)) testDF = spark.createDataFrame([(0, 50, -1), (0, 172, -1), (0, 133, -1)], ["userId", "movieId", "rating"]) predictionDF = alsModel.transform(testDF) predictionDF.show(5) tvs = TrainValidationSplit( estimator=pipeline,
def post(self):
    """Retrain ALS with the posted ratings and return the top predictions.

    The JSON request body is turned into a Spark DataFrame of new ratings,
    unioned with the existing ratings and used to fit a fresh ALS model.
    The model scores every movie the user did not rate; the result is
    restricted to movies with more than 30 reviews, cut to 20 rows, joined
    with the titles and returned as a pandas-generated JSON string.

    Side effects: rebinds the module globals ``small_ratings_DF_upd``,
    ``new_user_unrated_movies_DF`` and ``new_user_recommendations_DF`` and
    appends the unioned ratings to ``./GenDataCollection/ratings_upd.csv``.
    """
    global als_m, model_tr
    global small_ratings_DF
    global small_ratings_DF_upd
    global new_user_unrated_movies_DF
    global new_user_recommendations_DF

    # Request body -> pandas -> Spark DataFrame of the new ratings.
    payload = request.get_json(force=True)
    incoming_ratings = sqlContext.createDataFrame(pd.DataFrame.from_dict(payload))

    # Hyper-parameters stored on disk.
    with open('./GenDataCollection/best_params.json') as params_file:
        best_params = json.load(params_file)

    # Untrained estimator configured with those hyper-parameters.
    estimator = ALS(maxIter=3,
                    regParam=best_params['regParam'],
                    rank=best_params['rank'],
                    userCol="userId",
                    itemCol="movieId",
                    ratingCol="rating",
                    coldStartStrategy="drop")

    # The user id is read from the first posted row.
    new_user_id = incoming_ratings.first().userId

    # Existing ratings plus the newly received ones.
    small_ratings_DF_upd = small_ratings_DF.union(incoming_ratings)

    # Ids of the movies the user has already rated.
    rated_movie_ids = (incoming_ratings.select('movieId')
                       .rdd.map(list)
                       .map(lambda fields: fields[0])
                       .collect())

    # All movies minus the rated ones, reshaped to (userId, movieId).
    not_yet_rated = small_movies_data_DF.filter(
        ~small_movies_data_DF.movieId.isin(rated_movie_ids))
    new_user_unrated_movies_DF = (not_yet_rated
                                  .drop('title')
                                  .withColumn('userId', lit(new_user_id))
                                  .select('userId', 'movieId'))

    # Train on every previous rating plus the new ones.
    fitted = estimator.fit(small_ratings_DF_upd)

    # Score the unrated movies and attach the per-movie review counts.
    scored = fitted.transform(new_user_unrated_movies_DF)
    with_counts = scored.join(
        rati_count,
        scored.movieId == rati_count.movieId).drop(rati_count.movieId)

    # Highest predictions first, then keep movies with more than 30 reviews.
    ranked = with_counts.orderBy(with_counts.prediction.desc())
    new_user_recommendations_DF = ranked.filter(ranked.TotalReviews > 30)

    top = new_user_recommendations_DF.na.drop(subset=["prediction"]).limit(20)

    # Attach the titles and order the final answer.
    titled = top.join(
        small_movies_data_DF,
        top.movieId == small_movies_data_DF.movieId).drop(
            small_movies_data_DF.movieId)
    titled = titled.select('userId', 'title', 'prediction')
    titled = titled.orderBy(titled.prediction.desc())

    # Spark DataFrame -> pandas DataFrame -> JSON string.
    resp_json = titled.toPandas().to_json()

    # Append the unioned ratings to the CSV output (single partition).
    small_ratings_DF_upd.repartition(1).write.csv(
        path="./GenDataCollection/ratings_upd.csv",
        mode="append",
        header=True)

    return resp_json
# Five equally weighted parts (seed 111) used for a manual 5-fold evaluation.
splits = df.randomSplit([1.0, 1.0, 1.0, 1.0, 1.0], 111)


def _union_in_order(frames, order):
    """Union ``frames[i]`` for every index in ``order``, left to right."""
    merged = frames[order[0]]
    for position in order[1:]:
        merged = merged.union(frames[position])
    return merged


# Each fold trains on four parts and tests on the remaining one.
training1, test1 = _union_in_order(splits, (0, 1, 2, 3)), splits[4]
training2, test2 = _union_in_order(splits, (0, 1, 2, 4)), splits[3]
training3, test3 = _union_in_order(splits, (0, 1, 4, 3)), splits[2]
training4, test4 = _union_in_order(splits, (0, 4, 2, 3)), splits[1]
training5, test5 = _union_in_order(splits, (4, 1, 2, 3)), splits[0]

# ALS V1
als = ALS(maxIter=10,
          userCol="userId",
          itemCol="movieId",
          ratingCol="rating",
          coldStartStrategy="drop")
rmse = RegressionEvaluator(metricName="rmse",
                           labelCol="rating",
                           predictionCol="prediction")
mae = RegressionEvaluator(metricName="mae",
                          labelCol="rating",
                          predictionCol="prediction")

# Fit one model per fold, in fold order.
model1, model2, model3, model4, model5 = map(
    als.fit, (training1, training2, training3, training4, training5))

# Score each fold's held-out part with its own model.
predictions1, predictions2, predictions3, predictions4, predictions5 = (
    fitted.transform(held_out)
    for fitted, held_out in ((model1, test1), (model2, test2),
                             (model3, test3), (model4, test4),
                             (model5, test5)))

rmse1 = rmse.evaluate(predictions1)
mae1 = mae.evaluate(predictions1)
print("RMSE = " + str(rmse1) + " MAE = " + str(mae1))
rmse2 = rmse.evaluate(predictions2)
def get_als_model(df,
                  rank,
                  regParam=1,
                  split=[0.8, 0.2],
                  model='ALS',
                  evaluator='Regression',
                  use_cache=True):
    """Fit an ALS recommender on ``df`` and report RMSE and item coverage.

    Args:
        df: pandas DataFrame with ``user_id`` and ``business_id`` columns;
            both columns are label-encoded IN PLACE on the passed object.
            The ALS estimator reads the rating from a ``rating`` column.
        rank: ALS rank.
        regParam: ALS regularization parameter.
        split: weights passed to ``randomSplit`` (seed 1) for train/test.
        model: only the string ``'ALS'`` builds an estimator here.
        evaluator: only the string ``'Regression'`` builds an evaluator here.
        use_cache: when true and the cache file exists, results are read
            from it instead of being computed.

    Returns:
        Tuple ``(predictions, model, rmse_train, rmse_test, coverage_train,
        coverage_test, running_time, train, test)``.
    """
    cache_path = os.path.join(CACHE_PATH, f'get_als_model.msgpack')
    if use_cache and os.path.exists(cache_path):
        print(f'Loading from {cache_path}')
        # NOTE(review): pandas deprecated the msgpack readers/writers in 0.25
        # and removed them in 1.0 -- confirm the pinned pandas version.
        (predictions, model, rmse_train, rmse_test, coverage_train,
         coverage_test, running_time, train,
         test) = pd.read_msgpack(cache_path)
        print(f'Loaded from {cache_path}')
    else:
        # Replace the user ids with integer codes (mutates the caller's df).
        le1 = LabelEncoder()
        le1.fit(df['user_id'])
        df['user_id'] = le1.transform(df['user_id'])
        print(len(df['user_id']))
        # Same for the business ids.
        le2 = LabelEncoder()
        le2.fit(df['business_id'])
        df['business_id'] = le2.transform(df['business_id'])
        print(len(df['business_id']))
        df = pandas_to_spark(df)
        train, test = df.randomSplit(split, seed=1)
        # Distinct business ids per split, as pandas ``.values`` arrays.
        total_unique_businessids_train = train.select(
            'business_id').distinct().toPandas().values
        total_unique_businessids_test = test.select(
            'business_id').distinct().toPandas().values
        # The string parameters are rebound to the actual objects.
        # NOTE(review): any other value leaves a plain string behind, so the
        # ``.fit`` / ``.evaluate`` calls below would fail -- confirm callers
        # only ever pass the defaults.
        if model == 'ALS':
            model = ALS(maxIter=5,
                        regParam=regParam,
                        rank=rank,
                        userCol="user_id",
                        itemCol="business_id",
                        ratingCol="rating",
                        coldStartStrategy="drop",
                        nonnegative=True)
        if evaluator == 'Regression':
            evaluator = RegressionEvaluator(metricName="rmse",
                                            labelCol="rating",
                                            predictionCol="prediction")
        # Only the fit itself is timed.
        start = time()
        model = model.fit(train)
        running_time = time() - start
        predictions = model.transform(test)
        rmse_test = evaluator.evaluate(model.transform(test))
        rmse_train = evaluator.evaluate(model.transform(train))
        # presumably the business ids the model can recommend; the helper is
        # defined outside this block -- verify.
        pred_unique_businessids = calculate_coverage(model)
        # Keep the predicted ids that also occur in each split.
        subset_pred_train = [
            i for i in pred_unique_businessids
            if i in total_unique_businessids_train
        ]
        subset_pred_test = [
            i for i in pred_unique_businessids
            if i in total_unique_businessids_test
        ]
        # Coverage = matched predicted ids / distinct ids in the split.
        coverage_train = len(subset_pred_train) / len(
            total_unique_businessids_train)
        coverage_test = len(subset_pred_test) / len(
            total_unique_businessids_test)
        # NOTE(review): the cache write below is disabled, so this function
        # never creates the file it tries to load above, although the
        # "Dumping to" message is still printed.
        # pd.to_msgpack(cache_path, (predictions, model, rmse_train, rmse_test, coverage_train,
        #                            coverage_test, running_time, train, test))
        print(f'Dumping to {cache_path}')
    # breakpoint()
    return (predictions, model, rmse_train, rmse_test, coverage_train,
            coverage_test, running_time, train, test)
def fit(self, tensor, timer=False):
    """Factorize ``tensor`` mode by mode with Spark ALS.

    For every name in ``self.dimensions_col`` the tensor is unfolded along
    the matching axis, the unfolded matrix is turned into
    (row, col, rating) triples, and an ALS model is fitted on them. The
    user factors of each model are stored as a dense array in
    ``self.features[mode]``. When ``self.model`` is "tucker"
    (case-insensitive) a core tensor ``self.W`` is also computed.

    Args:
        tensor: object exposing ``shape``, ``unfold(axis)``, ``vals`` and
            ``ttm(matrix, mode=...)`` (assumed sparse tensor type -- TODO
            confirm).
        timer: when true, print the fit time of each mode and the longest.
    """
    # add check that each dimensions_col start at 0
    self.tensor = tensor
    # Size of every named dimension, taken from the tensor shape.
    self.dims = dict.fromkeys(self.dimensions_col)
    for col in self.dimensions_col:
        self.dims[col] = self.tensor.shape[self.dimensions_col.index(col)]
    #==============================================================================
    # recovery of the (user, item, rate) triples of the unfolded matrix
    #==============================================================================
    unfolded_matrix = dict.fromkeys(self.dimensions_col)
    datas = dict.fromkeys(self.dimensions_col)
    for dim in self.dimensions_col:
        ind = self.dimensions_col.index(dim)
        unfolded_matrix[dim] = csr_matrix(self.tensor.unfold(ind))
        y = list(unfolded_matrix[dim].indices)
        indptr = unfolded_matrix[dim].indptr
        r = list(unfolded_matrix[dim].data)
        # Number of stored entries in each CSR row.
        tmp = indptr[1:len(indptr)] - indptr[0:(len(indptr) - 1)]
        # Expand the CSR row pointer into one explicit row index per entry.
        x = []
        for i in np.arange(len(tmp)):
            x.extend(np.repeat(i, tmp[i]))
        datas[dim] = pd.DataFrame({'row': x, 'col': y, 'rating': r})
    #==============================================================================
    # Factorization
    #==============================================================================
    res = dict.fromkeys(self.dimensions_col)
    self.features = dict.fromkeys(self.dimensions_col)
    # NOTE(review): features_star is never read in the visible code.
    features_star = dict.fromkeys(self.dimensions_col)
    if timer:
        times = []
    for mode in self.dimensions_col:
        print("\t Start " + mode + " learning")
        ind = self.dimensions_col.index(mode)
        local_dataset = sqlContext.createDataFrame(datas[mode])
        # Build the recommendation model using Alternating Least Squares
        if timer:
            t0 = time.time()
        # NOTE(review): this comparison is case-sensitive while the core
        # tensor check at the end uses .lower() -- confirm which is intended.
        if self.model == 'tucker':
            rank = self.get(mode, 'ranks')
        else:
            rank = self.get('rank', None)
        local_als = ALS(rank=rank,
                        maxIter=self.get('maxIter'),
                        regParam=self.get('lbda'),
                        alpha=self.get('alpha'),
                        implicitPrefs=self.implicitPrefs,
                        userCol='row',
                        itemCol='col',
                        ratingCol='rating',
                        seed=self.seed)
        res[mode] = local_als.fit(local_dataset)
        if timer:
            t1 = time.time()
            delta = t1 - t0
            print('\t \t time :', delta, "seconds")
            times.append(delta)
        latentFactors = res[mode].userFactors  #.orderBy("id")
        # NOTE(review): ids and features are collected by two separate
        # toPandas() calls on an unordered DataFrame; the index assignment
        # below assumes both return rows in the same order -- confirm.
        latentFactors_index = latentFactors.select('id').toPandas()
        latentFactors = latentFactors.select('features')
        # Spread the feature array into one column per latent factor.
        for k in range(rank):
            latentFactors = latentFactors.withColumn(
                'factor' + str(k), latentFactors.features[k])
        latentFactors = latentFactors.drop('features')
        latentFactors = latentFactors.toPandas()
        latentFactors.index = latentFactors_index['id']
        # Indices of this mode that got no factors receive an all-zero row.
        unknowns = list(
            set(range(self.dims[mode])) - set(latentFactors_index['id']))
        for unknown in unknowns:
            latentFactors.loc[unknown] = 0
        latentFactors = latentFactors.sort_index()
        self.features[mode] = np.array(latentFactors)
    if timer:
        print('\t \t longest mode time :', np.max(times), "seconds")
    if self.model.lower() == "tucker":
        print("\t Get core tensor")
        # get W
        if self.implicitPrefs:
            # Implicit feedback: every stored value is replaced by 1
            # (this overwrites the values of the tensor that was passed in).
            self.tensor.vals = np.repeat(1, len(self.tensor.vals))
        self.W = deepcopy(self.tensor)
        # Multiply along each mode by the pseudo-inverse of its factors.
        for mode in self.dimensions_col:
            ind = self.dimensions_col.index(mode)
            self.W = self.W.ttm(np.linalg.pinv(self.features[mode]),
                                mode=ind)
return False @staticmethod def rmse(dataset,predictionCol,targetCol): return sqrt(dataset.dropna().map(lambda x: (x[targetCol] - x[predictionCol]) ** 2).reduce(add) / float(dataset.count())) lr1 = ALS() grid1 = ParamGridBuilder().addGrid(lr1.regParam, [1.0,0.5,2.0]).build() evaluator1 = MiEvaluador(predictionCol=lr1.getPredictionCol(),targetCol=lr1.getRatingCol()) cv1 = CrossValidator(estimator=lr1, estimatorParamMaps=grid1, evaluator=evaluator1, numFolds=2) cvModel1 = cv1.fit(dfRatings) a=cvModel1.transform(dfRatings) error_cross_validation=MiEvaluador.rmse(a,lr1.getPredictionCol(),lr1.getRatingCol()) print ('ERROR de validacion: {}'.format(error_cross_validation)) error_models=[] for reg_param in (1.0,0.5,2.0): lr = ALS(regParam=reg_param) model = lr.fit(dfRatings) error=MiEvaluador.rmse(model.transform(dfRatings),lr.getPredictionCol(),lr.getRatingCol()) error_models.append(error) print ('reg_param: {}, rmse: {}'.format(reg_param,error)) import numpy as np if np.isclose(error_models[np.argmin(error_models)],error_cross_validation): print("***\nFunciona correctamente pyspark\n****") else: raise RuntimeError("Deberia coincidir con el modulo donde reg_param = 0.5")
def train(self):
    """Fit one ALS model per media type and store per-user recommendations.

    Does nothing when ``self.check_if_necessary()`` returns ``False``.
    GAME, SERIE and MOVIE content types are skipped. For every other media
    class in ``self.__media__`` an ALS model is fitted on its
    (user_id, item id, rating) data, recommendations are produced for the
    users returned by ``User.get()``, this engine's previous rows for the
    user are deleted and the new rows inserted, and the run is logged and
    dated.
    """
    if self.check_if_necessary() is False:
        return
    for media in self.__media__:
        st_time = datetime.utcnow()
        m = media(logger=self.logger)
        if m.content_type in [
                ContentType.GAME,  # no ratings
                ContentType.SERIE,  # too much ratings
                ContentType.MOVIE  # too much ratings
        ]:
            continue
        sqlContext = SQLContext(sc)
        df = m.get_meta(cols=['user_id', m.id, 'rating'])
        # Convert Pandas DF to PySpark DF
        sparkDF = sqlContext.createDataFrame(df)
        als = ALS(userCol="user_id",
                  itemCol=m.id,
                  ratingCol="rating",
                  coldStartStrategy="drop")
        model = als.fit(sparkDF)
        user_df = User.get()
        # Check if is empty
        if user_df.shape[0] == 0:
            continue
        # Up to ``self.max_nb_elem`` recommendations for each listed user.
        modelGest = model.recommendForUserSubset(
            sqlContext.createDataFrame(user_df), self.max_nb_elem)
        len_values = 0
        for user in modelGest.collect():
            # Do not recommend already recommended content
            already_recommended_media = []
            with db as session:
                # NOTE(review): the SQL text is built with % interpolation
                # instead of bound parameters -- confirm every interpolated
                # value is trusted.
                result = session.execute(
                    'SELECT %s FROM "%s" WHERE user_id = \'%s\' AND engine <> \'%s\''
                    % (m.id, m.tablename_recommended, user.user_id,
                       self.__class__.__name__))
                already_recommended_media = [
                    dict(row)[m.id] for row in result
                ]
            values = []
            for rating in user.recommendations:
                # NOTE(review): ``id`` shadows the builtin of the same name.
                id = int(rating[m.id])
                # Skip items another engine has already recommended.
                if id in already_recommended_media:
                    continue
                values.append({
                    "user_id": int(user.user_id),
                    m.id: id,
                    # divide by 5 to get a score between 0 and 1
                    "score": float(rating.rating / 5),
                    "engine": self.__class__.__name__,
                    "engine_priority": self.__engine_priority__,
                })
            len_values += len(values)
            with db as session:
                # Reset list of recommended `media` for this engine
                session.execute(
                    text(
                        'DELETE FROM "%s" WHERE user_id = %s AND engine = \'%s\' AND content_type = \'%s\''
                        % (m.tablename_recommended, user.user_id,
                           self.__class__.__name__, str(
                               m.content_type).upper())))
                if len(values) > 0:
                    # Named markers are filled from the dicts in ``values``.
                    markers = ':user_id, :%s, :score, :engine, :engine_priority' % m.id
                    ins = 'INSERT INTO {tablename} VALUES ({markers}) ON CONFLICT ON CONSTRAINT recommended_content_pkey DO NOTHING'
                    ins = ins.format(tablename=m.tablename_recommended,
                                     markers=markers)
                    session.execute(ins, values)
        self.logger.info(
            "%s recommendation from collaborative filtering performed in %s (%s lines)"
            % (m.content_type, datetime.utcnow() - st_time, len_values))
        self.store_date(m.content_type)
tolerance = 0.02 min_error = float('inf') best_rank = -1 best_iteration = -1 training_df, validation_df, test_df = ratings_df.randomSplit([.6, .2, .2], seed=42) for rank in ranks: als = ALS(maxIter=iterations, regParam=regularization_parameter, rank=rank, userCol="userId", itemCol="movieId", ratingCol="rating") model = als.fit(training_df) predictions = model.transform(validation_df) new_predictions = predictions.filter(col('prediction') != np.nan) evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") rmse = evaluator.evaluate(new_predictions) errors.append(rmse) print('For rank %s the RMSE is %s' % (rank, rmse)) if rmse < min_error: min_error = rmse best_rank = rank print('The best model was trained with rank %s' % best_rank) final_als = ALS(maxIter=iterations,
movieName = loadMovieNames()

# spark.read.text() gives a DataFrame; its .rdd is what parseInput is mapped
# over.
lines = spark.read.text('hdfs:///user/maria_dev/ml-100k/u.data').rdd
ratingsRDD = lines.map(parseInput)

# Cached so Spark does not rebuild the DataFrame every time it is used.
ratings = spark.createDataFrame(ratingsRDD)
ratings = ratings.cache()

# Fit the ALS model on the ratings DataFrame.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol='userID',
    itemCol='movieID',
    ratingCol='rating',
)
model = als.fit(ratings)

# Print out ratings from user 6:
print('\nRatings for userID 6:')
userRatings = ratings.filter('userID=6')
for rating in userRatings.collect():
    print(movieName[rating['movieID']], rating['rating'])

print('\nTop 20 recommendations:')
# Candidates: movies rated more than 100 times, each paired with user 6.
ratingCounts = ratings.groupBy('movieID').count().filter('count>100')
popularMovies = ratingCounts.select('movieID').withColumn('userID', lit(6))

# Predict user 6's rating for every candidate and keep the 20 highest.
recommendations = model.transform(popularMovies)
by_prediction_desc = recommendations.prediction.desc()
topRecommendations = recommendations.sort(by_prediction_desc).take(20)
# Create an RMSE evaluator using the label and predicted columns
reg_eval = RegressionEvaluator(predictionCol="prediction",
                               labelCol="rating",
                               metricName="rmse")

tolerance = 0.03
ranks = [4, 8, 12]
errors = [0, 0, 0]
models = [0, 0, 0]
err = 0
min_error = float('inf')
# NOTE: despite its name, best_rank holds the INDEX into ``ranks`` /
# ``models`` of the best run, not the rank value itself.
best_rank = -1
for rank in ranks:
    # Set the rank here:
    als.setRank(rank)
    # Create the model with these parameters.
    model = als.fit(training_df)
    # Run the model to create a prediction. Predict against the validation_df.
    predict_df = model.transform(validation_df)

    # Remove NaN values from prediction (due to SPARK-14489)
    predicted_ratings_df = predict_df.filter(
        predict_df.prediction != float('nan'))

    # Run the previously created RMSE evaluator, reg_eval, on the
    # predicted_ratings_df DataFrame
    error = reg_eval.evaluate(predicted_ratings_df)
    errors[err] = error
    models[err] = model
    # Single-argument call form: prints the same text whether ``print`` is
    # the Python 2 statement or the Python 3 function (the bare statement
    # form used before is a SyntaxError on Python 3).
    print('For rank %s the RMSE is %s' % (rank, error))
    if error < min_error:
        min_error = error
        best_rank = err
    err += 1
def train_model(training_df, rank):
    """Fit an implicit-feedback ALS model of the given rank.

    The estimator always runs 10 iterations and keeps ALS's default column
    names; the fitted model is returned.
    """
    estimator = ALS(rank=rank, maxIter=10, implicitPrefs=True)
    fitted = estimator.fit(training_df)
    return fitted
def generate_predictions(training_df, prediction_df, rank, model=None):
    """Score ``prediction_df`` with an ALS model and drop incomplete rows.

    Args:
        training_df: data used to fit a new model; only read when ``model``
            is ``None``.
        prediction_df: rows to score.
        rank: ALS rank for a newly fitted model.
        model: optional already-fitted model exposing ``transform``.

    Returns:
        ``model.transform(prediction_df)`` with rows containing nulls/NaNs
        removed via ``dropna()``.
    """
    # Identity check: ``== None`` would go through the model's own ``__eq__``.
    # The estimator is only built when it is actually needed.
    if model is None:
        iterations = 10
        als = ALS(rank=rank, maxIter=iterations, implicitPrefs=True)
        model = als.fit(training_df)
    return model.transform(prediction_df).dropna()
def train_als(params, data):
    """Fit ``ALS(**params)`` on ``data`` inside a ``Timer`` context.

    Returns a ``(fitted_model, timer)`` pair, where ``timer`` is the object
    bound by ``with Timer() as ...``.
    """
    estimator = ALS(**params)
    with Timer() as elapsed:
        fitted = estimator.fit(data)
    return fitted, elapsed
# Ratings: one parsed record per line of ratings.dat.
lines = spark.read.text('ratings.dat').rdd
ratingsRDD = lines.map(parse_rating)

# Users: parse_user yields pairs that are collected into a dict.
lines = spark.read.text('gender.dat').rdd
users = dict(lines.map(parse_user).collect())

ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])

num_training = training.count()
num_validation = test.count()

print('Training: %d' % num_training)
print('Validation: %d' % num_validation)

# setup ALS
rank = 8
num_iterations = 8
lambda_ = 0.1

# Fixes: ``maxIter`` previously referenced the misspelled (undefined) name
# ``num_interations``, and the ``rank`` configured above was never handed to
# the estimator, so ALS silently used its default rank.
als = ALS(rank=rank,
          maxIter=num_iterations,
          regParam=lambda_,
          userCol="userID",
          itemCol="profileID",
          ratingCol="rating")
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
# NOTE(review): no coldStartStrategy is set, so users/items that only occur
# in the test split may yield NaN predictions and a NaN RMSE -- confirm.
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

spark.stop()
def build_recommendation_model(self):
    """Index ids, grid-search implicit ALS, then fit and score the best one.

    Steps:
      1. Map ``user_id`` / ``item_id`` to integer ids with StringIndexer
         and join them back onto ``self.df``.
      2. Split ``self.df`` 60/20/20 into training/validation/test.
      3. Fit one ALS model per (rank, regParam) pair and keep the pair with
         the lowest Mean Percentile Ranking on the validation split.
      4. Refit on the training split with that pair, store the model in
         ``self.model`` and report RMSE and MPR on the test split.

    Every progress message goes to ``logging.info`` and ``print_with_time``.
    """

    def announce(message):
        # Same message to the log and to the timestamped console output.
        logging.info(message)
        print_with_time(str(message))

    def make_als(rank, reg):
        # Estimator settings shared by the grid search and the final fit.
        return ALS(rank=rank,
                   regParam=reg,
                   nonnegative=True,
                   implicitPrefs=True,
                   userCol="user_id_no",
                   itemCol="item_id_no",
                   checkpointInterval=-1,
                   coldStartStrategy="drop",
                   ratingCol="rating")

    announce("getting distinct users")
    distinct_users = self.df.select(["user_id"]).distinct()
    announce("getting distinct items")
    distinct_items = self.df.select(["item_id"]).distinct()

    announce("mapping user_id to number")
    user_indexer = StringIndexer(inputCol="user_id", outputCol="user_id_no")
    self.user_indexed = user_indexer.fit(distinct_users).transform(distinct_users)
    self.user_indexed = self.user_indexed.select(
        self.user_indexed.user_id.cast("string"),
        self.user_indexed.user_id_no.cast("int"))

    announce("mapping item_id to number")
    item_indexer = StringIndexer(inputCol="item_id", outputCol="item_id_no")
    self.item_indexed = item_indexer.fit(distinct_items).transform(distinct_items)
    self.item_indexed = self.item_indexed.select(
        self.item_indexed.item_id.cast("string"),
        self.item_indexed.item_id_no.cast("int"))

    announce("joining df with user_indexed rdd")
    self.df = self.df.join(self.user_indexed, ["user_id"], 'inner')
    announce("joining df with item_indexed rdd")
    self.df = self.df.join(self.item_indexed, ["item_id"], 'inner')
    self.df = self.df.select(["item_id_no", "user_id_no", "rating"])

    announce("splitting dataset into training and testing")
    training, validation, test = self.df.randomSplit([0.6, 0.2, 0.2])

    # ---- grid search over (rank, regParam) ----
    ranks = [25, 50, 100]
    regParam = [0.1, 0.01, 0.001]
    candidates = [(rank, reg) for rank in ranks for reg in regParam]

    min_mpr = float('inf')
    best_rank = -1
    best_reg = -1
    for iteration_no, (rank, reg) in enumerate(candidates):
        announce(iteration_no)
        announce("rank=%s, reg=%s " % (rank, reg))
        self.model = make_als(rank, reg).fit(training)

        announce("transforming the validation set")
        predictions = self.model.transform(validation)

        announce("getting rmse on validation set")
        evaluator = RegressionEvaluator(metricName="rmse",
                                        labelCol="rating",
                                        predictionCol="prediction")
        rmse = evaluator.evaluate(predictions)
        announce("Root-mean-square error = " + str(rmse))

        announce("getting MPR on validation set")
        ev = RankBasedEvaluator2("user_id_no", "rating", "prediction")
        mpr = ev.evaluate(sqlContext, predictions)
        announce("Mean Percentile Ranking = " + str(mpr))

        # Lower MPR wins.
        if mpr < min_mpr:
            min_mpr, best_rank, best_reg = mpr, rank, reg

    announce('The best model was trained with rank %s and reg %s' %
             (best_rank, best_reg))

    # ---- final fit with the selected hyper-parameters ----
    announce("starting model training")
    self.model = make_als(best_rank, best_reg).fit(training)

    announce("transforming the test set")
    predictions = self.model.transform(test)

    announce("getting rmse on test set")
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    announce("Root-mean-square error = " + str(rmse))

    announce("getting MPR on test set")
    ev = RankBasedEvaluator2("user_id_no", "rating", "prediction")
    mpr = ev.evaluate(sqlContext, predictions)
    announce("Mean Percentile Ranking = " + str(mpr))
for column in list(set(df.columns)) ] pipeline = Pipeline(stages=indexers) ratings = pipeline.fit(df).transform(df) train, validation, test = ratings.randomSplit([0.6, 0.2, 0.2], seed=427471138) als_model = ALS(userCol='user_index', itemCol='hotel id', ratingCol='ratings', nonnegative=True, regParam=0.1, rank=10) recommender = als_model.fit(train) # Build a single row DataFrame data = [(1, 100)] columns = ('user', 'movie') one_row_spark_df = spark.createDataFrame(data, columns) user_factor_df = recommender.userFactors.filter('id = 1') item_factor_df = recommender.itemFactors.filter('id = 100') user_factors = user_factor_df.collect()[0]['features'] item_factors = item_factor_df.collect()[0]['features'] # Get the recommender's prediction recommender.transform(one_row_spark_df).show()
def train_ALS(train, test, evaluator, num_iters, reg_params, ranks, alphas):
    """
    Grid Search Function to select the best model based on RMSE of hold-out data

    Inspired by https://github.com/KevinLiao159/MyDataSciencePortfolio/blob/master
        /movie_recommender/movie_recommendation_using_ALS.ipynb

    Every combination of the four hyper-parameter lists is fitted on
    ``train`` and scored on ``test``; the model with the lowest error is
    returned.

    Parameters
    ----------
    train : pyspark dataframe with training data
    test : pyspark dataframe with test data
    evaluator : object whose ``evaluate(predictions)`` returns the error
        (lower is better)
    num_iters: list of iterations to test
    reg_params: list of regularization parameters to test
    ranks: list of # of latent factors to test
    alphas: list of alphas to test

    Returns
    -------
    (best_model, params_errs) : the fitted model with the lowest error
        (``None`` when there is no combination to try) and a list of
        ``[num_iter, reg, rank, alpha, error]`` entries, one per
        combination, in iteration order.
    """
    min_error = float('inf')
    best_model = None
    params_errs = []

    for num_iter, reg, rank, alpha in itertools.product(
            num_iters, reg_params, ranks, alphas):
        # train ALS model
        als = ALS(
            maxIter=num_iter,
            rank=rank,
            userCol='account_id',
            itemCol='comic_id',
            ratingCol='bought',
            implicitPrefs=True,
            regParam=reg,
            alpha=alpha,
            coldStartStrategy='drop',  # Just for CV
            seed=41916)
        model = als.fit(train)

        # Generate predictions on Test
        predictions = model.transform(test)
        predictions.persist()
        try:
            error = evaluator.evaluate(predictions)
        finally:
            # Release the cached predictions; otherwise every combination
            # of the grid leaves one more persisted DataFrame behind.
            predictions.unpersist()

        print('{} iterations, '.format(num_iter) +
              '{} latent factors, regularization='.format(rank) +
              '{}, and alpha @ {} : '.format(reg, alpha) +
              'validation error is {:.4f}'.format(error))

        # Save best model to date. The running minimum must be updated as
        # well: without it every finite error beats ``inf`` and the last
        # model fitted is returned instead of the best one.
        if error < min_error:
            min_error = error
            best_model = model

        # Record the combination together with its error.
        params_errs.append([num_iter, reg, rank, alpha, error])

    return best_model, params_errs
lines = spark.read.text("hdfs:///user/maria_dev/ml-100k/u.data").rdd

# Convert it to a RDD of Row objects with (userID, movieID, rating)
ratingsRDD = lines.map(parseInput)

# Convert to a DataFrame and cache it: it is used more than once below.
ratings = spark.createDataFrame(ratingsRDD).cache()

# Create an ALS collaborative filtering model from the complete data set
als = ALS(maxIter=5,
          regParam=0.01,
          userCol="userID",
          itemCol="movieID",
          ratingCol="rating")
model = als.fit(ratings)  # train

# A user 0 is fabricated in u.data: likes science fiction, dislikes
# historical drama. The goal is to recommend movies to this user, i.e. to
# predict user 0's rating for each movie he has not seen.

# Print out ratings from user 0:
print("\nRatings for user ID 0:")
userRatings = ratings.filter("userID = 0")
for rating in userRatings.collect():
    # Single pre-formatted argument: same "<title> <rating>" output whether
    # ``print`` is the Python 2 statement or the Python 3 function (the bare
    # statement form used before is a SyntaxError on Python 3).
    print("%s %s" % (movieNames[rating['movieID']], rating['rating']))

print("\nTop 20 recommendations:")
# Only predict user 0's rating on movies with more than 100 ratings, so that
# there is a reasonable amount of data behind each prediction.
# Find movies rated more than 100 times
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
# ALTERNATIVE
# from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
# from pyspark.mllib.recommendation import ALS, Rating

spark = SparkSession.builder.master("local").appName("SQL").getOrCreate()

# Tiny in-memory ratings set using ALS's default column names.
print("\033[36mInitial data\033[0m")
columns = ["user", "item", "rating"]
data = [
    (0, 0, 4.0),
    (0, 1, 2.0),
    (1, 1, 3.0),
    (1, 2, 4.0),
    (2, 1, 1.0),
    (2, 2, 5.0),
]
df = spark.createDataFrame(data, columns)
df.show()

# Fit ALS with all-default parameters.
print("\033[36mTraining model...\033[0m")
als = ALS()
model = als.fit(df)

# Persist the fitted model, replacing whatever is already at that path.
output_model_path = "data/peliculas0_trained_model"
print("\033[36mSaving model to '{}'...\033[0m".format(output_model_path))
model_writer = model.write().overwrite()
model_writer.save(output_model_path)

# Score a few (user, item) pairs; user 3 does not occur in the training data.
print("\033[36mTesting some user/item pairs...:\033[0m")
test_pairs = [(0, 2), (1, 0), (2, 0), (3, 0)]
test = spark.createDataFrame(test_pairs, ["user", "item"])
model.transform(test).show()
test.show(10)
print("Train data loaded")

# ---------------------------------------------------------------------------
# Question 2.A.2 ALS with lab settings
# ---------------------------------------------------------------------------
myseed = 200206518
als_50 = ALS(
    userCol="userId",
    itemCol="movieId",
    seed=myseed,
    coldStartStrategy="drop",
)

# Training the model
model_50 = als_50.fit(train)

# Predictions
predictions_50 = model_50.transform(test)

print("Evaluation for 50/50 split")

## Question 2.A.3 for time-split 50%
evaluator_rmse = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
rmse_50 = evaluator_rmse.evaluate(predictions_50)
print("Root-mean-square error = " + str(rmse_50))

evaluator_mse = RegressionEvaluator(
    metricName="mse",
    labelCol="rating",
    predictionCol="prediction",
)
from pyspark.ml.recommendation import ALS
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
import pandas as pd

sc = SparkContext()
sql_sc = SQLContext(sc)

# Read the ratings with pandas, then hand them to Spark.
pd_df_ratings = pd.read_csv('./ratings_small.csv')
pyspark_df_ratings = sql_sc.createDataFrame(pd_df_ratings)
pyspark_df_ratings = pyspark_df_ratings.drop('Timestamp')
#print(pyspark_df_ratings.show(5, truncate=False))

# Create the ALS model
als = ALS(
    rank=3,
    maxIter=10,
    regParam=0.1,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
)
model = als.fit(pyspark_df_ratings)

# Top-N recommendation for userId=100
recommendations = model.recommendForAllUsers(5)
recs_for_user_100 = recommendations.where(recommendations.userId == 100)
print(recs_for_user_100.collect())
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

# Build (user_id, item_id, rating) triples from the retail features table:
# invoice number -> user, stock code -> item, quantity -> rating. Rows whose
# ids cannot be cast to INT become NULL and are filtered out.
ratings_df = (spark.read.table("retail_features").selectExpr(
    "CAST(invoice_num AS INT) as user_id",
    "CAST(stock_code AS INT) as item_id",
    "CAST(quantity AS INT) as rating").where(
        "user_id is NOT NULL AND item_id is NOT NULL"))
#ratings_df.display()

(train_df, test_df) = ratings_df.randomSplit([0.7, 0.3])

als = ALS(maxIter=3,
          regParam=0.03,
          userCol="user_id",
          itemCol="item_id",
          ratingCol="rating",
          coldStartStrategy="drop")
als_model = als.fit(train_df)

# Fix: score the test split with the model fitted above. The previous code
# called ``model.transform``, but no ``model`` is defined in this script.
predictions = als_model.transform(test_df)

# RegressionEvaluator is imported at the top of this script; it was used
# here without any visible import.
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# Top 5 items per user and top 5 users per item.
user_recs = als_model.recommendForAllUsers(5)
user_recs.display()
item_recs = als_model.recommendForAllItems(5)
item_recs.display()
class RecommendationEngine:
    """A movie recommendation engine backed by a Spark ML ALS model.

    The ratings and movie tables are read from CSV files at construction
    time and the model is trained immediately.
    """

    def __init__(self, spark_session, dataset_path):
        """Load ``ratings.csv`` and ``items.csv`` from *dataset_path* and train.

        Rows containing nulls are dropped from both tables, and the
        ``timestamp`` column is removed from the ratings.
        """
        logger.info("Starting up the Recommendation Engine: ")
        self.spark_session = spark_session

        # Load ratings data for later use
        logger.info("Loading Ratings data...")
        ratings_file_path = os.path.join(dataset_path, 'ratings.csv')
        self.ratingsdf = spark_session.read.csv(
            ratings_file_path, header=True, inferSchema=True).na.drop()
        self.ratingsdf = self.ratingsdf.drop("timestamp")

        # Load movies data for later use
        logger.info("Loading Movies data...")
        movies_file_path = os.path.join(dataset_path, 'items.csv')
        self.moviesdf = spark_session.read.csv(
            movies_file_path, header=True, inferSchema=True).na.drop()

        # Train the model
        self.__train_model()

    def __train_model(self):
        """Train the ALS model with the current ratings dataset."""
        logger.info("Training the ALS model...")
        self.als = ALS(maxIter=5, regParam=0.01, userCol="userId",
                       itemCol="movieId", ratingCol="rating",
                       coldStartStrategy="drop")
        self.model = self.als.fit(self.ratingsdf)
        logger.info("ALS model built!")

    def __recs_to_json(self, recs, key_col, rec_field):
        """Flatten a recommendations frame, join movie details, return JSON.

        *recs* holds one row per *key_col* with an array column
        ``recommendations``; each array element is exploded into its own
        row and only its *rec_field* is kept (the predicted rating is
        discarded).  The result is inner-joined with the movies table on
        ``movieId`` and serialised through pandas ``to_json``.
        """
        flat = recs.withColumn("recommendations", explode("recommendations"))
        flat = flat.select(
            func.col(key_col),
            func.col('recommendations')[rec_field].alias(rec_field))
        joined = flat.join(self.moviesdf, "movieId", 'inner')
        return joined.toPandas().to_json()

    def get_top_ratings(self, user_id, movies_count):
        """Recommend up to *movies_count* top-scored movies to *user_id*.

        Returns a JSON string.  No filtering of movies the user has already
        rated is performed here.
        """
        users = self.ratingsdf.select(self.als.getUserCol())
        users = users.filter(users.userId == user_id)
        user_subset_recs = self.model.recommendForUserSubset(users, movies_count)
        return self.__recs_to_json(user_subset_recs, 'userId', 'movieId')

    def get_top_movie_recommend(self, movie_id, user_count):
        """Recommend up to *user_count* top-scored users for *movie_id*.

        Returns a JSON string in which each recommended user is paired with
        the movie's details.
        """
        movies = self.ratingsdf.select(self.als.getItemCol())
        movies = movies.filter(movies.movieId == movie_id)
        movie_subset_recs = self.model.recommendForItemSubset(movies, user_count)
        return self.__recs_to_json(movie_subset_recs, 'movieId', 'userId')
predicted_ratings_df = predict_df.filter(predict_df.prediction != float('nan')) # Run the previously created RMSE evaluator, reg_eval, on the predicted_ratings_df DataFrame error = reg_eval.evaluate(predicted_ratings_df) errors[err] = error models[err] = model print('For rank %s the RMSE is %s' % (rank, error)) if error < min_error: min_error = error best_rank = err err += 1 """ als.setRank(12) my_model = als.fit(training_df) predict_df = my_model.transform(validation_df) predicted_ratings_df = predict_df.filter(predict_df.prediction != float('nan')) error = reg_eval.evaluate(predicted_ratings_df) #print('The best model was trained with rank %s' % ranks[best_rank]) #my_model = models[best_rank] # Run the best model with test dataset predict_test_df = my_model.transform(test_df) # Remove NaN values from prediction (due to SPARK-14489) predicted_test_df = predict_df.filter(predict_test_df.prediction != float('nan')) # Run the previously created RMSE evaluator, reg_eval, on the predicted_test_df DataFrame test_RMSE = reg_eval.evaluate(predicted_test_df)
.config('spark.executor.memoryOverhead', memory) \
.config("spark.sql.broadcastTimeout", "36000") \
.config("spark.storage.memoryFraction","0") \
.config("spark.memory.offHeap.enabled","true") \
.config("spark.memory.offHeap.size",memory).getOrCreate()

# NOTE(review): train and val are both read from sys.argv[1], so every
# "validation" figure below is computed on the training data -- confirm
# whether val was meant to come from a second path.
train = spark.read.parquet(sys.argv[1])
val = spark.read.parquet(sys.argv[1])
results = []

# Grid search over rank and regularisation strength.
# NOTE(review): the original indentation is not recoverable from this view;
# everything below is assumed to belong to the inner loop because it uses
# the per-iteration ``model`` -- confirm against the full file.
for rank in [2, 5, 10, 20]:
    for reg in [0.05, 0.1, 0.5, 1]:
        als = ALS(rank=rank, maxIter=10, regParam=reg, seed=seed)
        # Both frames are renamed positionally to (user, item, rating).
        model = als.fit(train.toDF('user', 'item', 'rating'))
        predictions_val = model.transform(val.toDF('user', 'item', 'rating'))
        evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
        rmse = evaluator.evaluate(predictions_val)
        print('Rank = ', rank, ' regParam = ', reg)
        print("Root-mean-square error = " + str(rmse))

        # Ground truth per user: items rated above 2.
        # NOTE(review): this query reads ``user``/``book`` from the
        # un-renamed ``val`` frame while the predictions use ``item`` --
        # verify the column names of the parquet input.
        val.createOrReplaceTempView('val')
        val_true = spark.sql('select user, book from val where rating > 2 sort by rating desc')
        labels = val_true.groupby('user').agg(collect_list('book'))

        # Top-500 recommendations for those users, flattened to one item
        # list per user.
        val_recommendations = model.recommendForUserSubset(labels.select('user'), 500)
        preds = val_recommendations.withColumn('recommendations', explode('recommendations')).select('user', 'recommendations.item').groupBy('user').agg(collect_list('item'))
def _fit_and_score_map(als, train_df, val_grouped, num_recs):
    """Fit *als* on *train_df* and return ``(model, MAP)`` against *val_grouped*.

    The top *num_recs* items are recommended for every user, joined with the
    per-user validation item lists, and scored with ``RankingMetrics``.
    """
    als_model = als.fit(train_df)
    predictions = als_model.recommendForAllUsers(num_recs)
    # Keep only the recommended item ids (first field of each struct).
    prediction_df = predictions.rdd.map(
        lambda r: (r.user_label, [i[0] for i in r.recommendations])).toDF()
    prediction_df = prediction_df.selectExpr('_1 as user_label',
                                             '_2 as recommendations')

    # Join table
    val_pred = val_grouped.join(prediction_df, 'user_label', 'inner')
    rdd = val_pred.select('recommendations', 'track_label').rdd
    return als_model, RankingMetrics(rdd).meanAveragePrecision


def main(spark, train_file, val_file, model_file):
    """Tune an implicit-feedback ALS model by MAP and save the best one.

    Args:
        spark: active SparkSession.
        train_file: parquet path of the training interactions.
        val_file: parquet path of the validation interactions.
        model_file: destination path for the best fitted model.
    """
    train_df = spark.read.parquet(train_file)
    val_df = spark.read.parquet(val_file)

    train_df = train_df.select('user_label', 'track_label', 'count')
    val_df = val_df.select('user_label', 'track_label', 'count')

    val_grouped = val_df.groupBy('user_label').agg(
        F.collect_list(F.col('track_label')).alias('track_label'))

    # Baseline: ALS for implicit feedback with fixed settings.
    als = ALS(maxIter=5, regParam=0.01, implicitPrefs=True,
              userCol='user_label', itemCol='track_label', ratingCol='count')
    _, baseline_map = _fit_and_score_map(als, train_df, val_grouped, 10)
    print('Before tuning, MAP = %s' % baseline_map)

    # hyperparameter tuning
    ranks = [10, 20]
    reg_params = [0.001]
    alphas = [0.10, 0.20, 0.40]

    best_rank = None
    best_reg_param = None
    best_alpha = None
    best_model = None
    best_map = 0

    for rank_i, alpha_i, reg_param_i in itertools.product(
            ranks, alphas, reg_params):
        print('Running on rank:', rank_i)
        print('Running on alpha:', alpha_i)
        print('Running on reg:', reg_param_i)

        als = ALS(maxIter=5, regParam=reg_param_i, implicitPrefs=True,
                  alpha=alpha_i, rank=rank_i, userCol='user_label',
                  itemCol='track_label', ratingCol='count')
        als_model, map_ = _fit_and_score_map(als, train_df, val_grouped, 100)
        print('MAP:', map_)

        # Always accept the first candidate so that a grid whose MAP is 0
        # everywhere still yields a model to save instead of ``None``.
        if best_model is None or map_ > best_map:
            best_rank = rank_i
            best_reg_param = reg_param_i
            best_alpha = alpha_i
            best_model = als_model
            best_map = map_

    print('Best rank:', best_rank)
    print('Best regParam:', best_reg_param)
    print('Best alpha:', best_alpha)
    print('Best map:', best_map)

    # save the best model
    best_model.save(model_file)
# -----------------------------------------------------------------------------
# Modeling
# -----------------------------------------------------------------------------

# Imports
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.mllib.evaluation import RankingMetrics

# Fit ALS on the encoded user/song ids with play counts as the rating.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="user_id_encoded",
    itemCol="song_id_encoded",
    ratingCol="plays",
)
als_model = als.fit(training)

# Score the test set and order the rows for inspection.
predictions = als_model.transform(test).orderBy(
    col("user_id"), col("song_id"), col("prediction").desc())
predictions.cache()
predictions.show(50, False)

# Sample output:
# +------------------+----------------------------------------+-----+---------------+---------------+-----------+
# |song_id           |user_id                                 |plays|user_id_encoded|song_id_encoded|prediction |
# +------------------+----------------------------------------+-----+---------------+---------------+-----------+
# |SORDKNX12A8C13A45F|00000b722001882066dff9d2da8a775658053ea0|1    |856763.0       |50622.0        |0.63414586 |
# |SOBFEDK12A8C13BB25|00001638d6189236866af9bbf309ae6c2347ffdc|1    |859779.0       |17821.0        |-1.0087988 |
# |SOLOYFG12A8C133391|00001638d6189236866af9bbf309ae6c2347ffdc|1    |859779.0       |19812.0        |-0.74704367|
# |SOOEPEG12A6D4FC7CA|00001638d6189236866af9bbf309ae6c2347ffdc|1    |859779.0       |4703.0         |-0.5360813 |
# |SOWOTHK12A67AD818B|00001638d6189236866af9bbf309ae6c2347ffdc|24   |859779.0       |192657.0       |0.38297927 |
als = ALS(userCol='user_id', itemCol='book_id', ratingCol='rating',
          coldStartStrategy='drop')
# The evaluator does not depend on the hyper-parameters, so build it once
# instead of once per grid point.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

# Exhaustive sweep over RANKS x MAX_ITERS x REG_PARAMS; one CSV row with the
# validation RMSE is appended per combination.
for rank in RANKS:
    for maxIter in MAX_ITERS:
        for regParam in REG_PARAMS:
            # ALS expects integer rank / maxIter; the grid values are cast
            # here in case they are not already ints.
            rank = int(rank)
            maxIter = int(maxIter)
            print("Running for " + str((rank, maxIter, regParam)))

            als.setParams(rank=rank, maxIter=maxIter, regParam=regParam)
            model = als.fit(interactions_train)

            predictions = model.transform(interactions_val)
            rmse = evaluator.evaluate(predictions)
            print("Root-mean-square error = " + str(rmse))

            # ``with`` guarantees the handle is closed even if the write
            # raises.
            with open("validation_errors.csv", "a+") as f:
                f.write(
                    ",".join(str(v) for v in (rank, maxIter, regParam, rmse))
                    + "\n")

            print("Finshed running for " + str((rank, maxIter, regParam)))
# ALS needs integer ids, so pair every distinct user id with a unique int.
# The lambdas index into the (row, id) pair instead of using tuple
# parameters, which are a syntax error on Python 3 (PEP 3113).
user_ids = ratings.select("userid").distinct().rdd.zipWithUniqueId()
user_map = user_ids.map(
    lambda pair: Row(userid=pair[0].userid, userid_int=pair[1])).toDF().cache()

# same as above - this is a UUID/int mapping
video_ids = ratings.select("videoid").distinct().rdd.zipWithUniqueId().cache()
video_map = video_ids.map(
    lambda pair: Row(videoid=pair[0].videoid, videoid_int=pair[1])).toDF().cache()

# Single-argument print call: identical output on Python 2 and Python 3.
print("Recommending based on {0} users and {1} videos.".format(
    user_map.count(), video_map.count()))

# Attach the integer ids to every rating.
training_data = ratings.join(user_map, ratings.userid == user_map.userid).\
    join(video_map, ratings.videoid == video_map.videoid).\
    select(user_map.userid, user_map.userid_int,
           video_map.videoid, video_map.videoid_int, "rating")

# Create ALS transformer and train with the ratings from our C* table
als = ALS(rank=10, maxIter=10).setUserCol("userid_int").setItemCol("videoid_int").setRatingCol("rating")
model = als.fit(training_data)

# Pull the user list to the driver; the cached mapping is no longer needed.
users = user_map.collect()
user_map.unpersist()

count = 0
length = len(users)

# For each user, score every video, keep the 30 highest predictions and
# append them to the Cassandra recommendations table.
for user in users:
    videos_and_user = video_map.withColumn("userid", lit(user.userid)).\
        withColumn("userid_int", lit(user.userid_int))
    model.transform(videos_and_user).\
        sort("prediction", ascending=False).limit(30).\
        select("videoid", "userid", col("prediction").alias("rating")).\
        write.format("org.apache.spark.sql.cassandra").\
        options(keyspace="killrvideo", table="video_recommendations_by_video").\
        save(mode="append")
def main(input_1, input_2, input_3):
    """Train, evaluate and persist an ALS model, then show one user's recommendations.

    Args:
        input_1: path of the business JSON data.
        input_2: path of the user JSON data.
        input_3: path of the review JSON data.

    The string ids are mapped to integer ids, a cross-validated ALS model
    and a hand-tuned ALS model are evaluated by RMSE, the hand-tuned model
    is saved to ``'als_model'`` and reloaded, and the recommendations for a
    hard-coded user id are displayed.
    """
    business_df = spark.read.json(input_1)
    user_df = spark.read.json(input_2)
    review_df = spark.read.json(input_3)

    # The ALS columns configured below are integer ids, so each string
    # user_id is paired with its zipWithIndex position as a new userId.
    user_df_schema = StructType([
        StructField("user_id", StringType(), True),
        StructField("userId", IntegerType(), True)
    ])
    user_id = user_df.select('user_id')
    user_newid_df = spark.createDataFrame(
        user_id.rdd.map(lambda x: x[0]).zipWithIndex(), user_df_schema)

    # Add the new userId column to the user dataframe.
    user_new_df = user_df.join(user_newid_df, 'user_id', 'inner').select('userId', 'user_id', 'name')

    # Same string -> integer id mapping for businesses.
    bus_df_schema = StructType([
        StructField("business_id", StringType(), True),
        StructField("businessId", IntegerType(), True)
    ])
    bus_id = business_df.select('business_id')
    business_newid_df = spark.createDataFrame(
        bus_id.rdd.map(lambda x: x[0]).zipWithIndex(), bus_df_schema)
    business_new_df = business_df.join(business_newid_df, 'business_id', 'inner').select('businessId', 'business_id', 'name', 'categories', 'latitude', 'longitude')

    # Map the new userId and businessId into the review dataframe.
    review_df = review_df.select('user_id', 'business_id', 'stars')
    review_userId_df = review_df.join(user_newid_df, "user_id", 'inner').select('business_id', 'userId', 'user_id', 'stars')

    # Map the businessId.
    review_userId_df = review_userId_df.join(business_newid_df, "business_id", 'inner').select(
        'user_id', 'business_id', 'stars', 'userId', 'businessId')

    # Create the rating dataframe required by the ALS model.
    rating_df = review_userId_df.select(
        'userId', 'businessId', review_userId_df.stars.cast('float').alias('rating'))
    rating_df.cache()
    # Debug aid: print(' Rating matrix no. of rows :', rating_df.count())

    (train, test) = rating_df.randomSplit([0.8, 0.2], seed=123)

    # Cross validation over rank and maxIter (3 x 3 grid, 5 folds).
    als = ALS(userCol="userId", itemCol="businessId", ratingCol="rating", coldStartStrategy="drop")
    param_grid = ParamGridBuilder().addGrid(als.rank, [10, 15, 20]).addGrid(
        als.maxIter, [10, 15, 20]).build()
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating")
    cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5, seed=123)
    cv_als_model = cv.fit(train)

    # Evaluate the best cross-validated model by computing the RMSE on the
    # held-out test split.
    als_predictions = cv_als_model.bestModel.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
    rmse = evaluator.evaluate(als_predictions)
    print("Root-mean-square error" + str(rmse))
    # Recorded from an earlier run: rmse = 1.559099, best rank 20, best
    # maxIter 20.  They were inspected with:
    #   best_model = cv_als_model.bestModel
    #   best_model.rank
    #   best_model._java_obj.parent().getMaxIter()

    # Hand-tuned ALS model; coldStartStrategy="drop" is set so that NaN
    # predictions do not reach the evaluator.
    als = ALS(rank=20, maxIter=20, regParam=0.3, userCol="userId", itemCol="businessId", ratingCol="rating", coldStartStrategy="drop", seed=123)
    alsb_model = als.fit(train)
    alsb_predictions = alsb_model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
    rmse = evaluator.evaluate(alsb_predictions)

    # Save the ALS model (overwriting any previous copy).
    alsb_model.write().overwrite().save('als_model')
    print("alsb_model Root-mean-square error = " + str(rmse))
    # Recorded from an earlier run: rmse is 1.45023

    # Reload the model that was just saved.
    alsn_model = ALSModel.load('als_model')

    # Generate the top 10 business recommendations for each user and attach
    # the original string user_id.
    userRecoms = alsn_model.recommendForAllUsers(10)
    all_userRecoms = userRecoms.join(user_newid_df, 'userId', 'inner').select('userId', 'recommendations', 'user_id')
    all_userRecoms.cache()

    # Test and show recommendations for one hard-coded user: p[1] is the
    # 'recommendations' array, flattened to one row per recommendation.
    u_id = 'ZWD8UH1T7QXQr0Eq-mcWYg'
    userFlatRec = spark.createDataFrame(
        all_userRecoms.filter(
            col('user_id') == u_id).rdd.flatMap(lambda p: p[1]))
    # Sample of userFlatRec from an earlier run:
    # +----------+------------------+
    # |businessId|            rating|
    # +----------+------------------+
    # |    171476|5.4555559158325195|
    # |     25624|5.3495965003967285|
    # |     14049| 5.271500110626221|

    # Show the recommended restaurants' details.
    # NOTE(review): collab_df is built but not used below; the displayed
    # result comes from getCollabRecom instead.
    collab_df = business_new_df.join(userFlatRec, 'businessId', 'inner').drop('businessId')
    result = getCollabRecom(u_id, all_userRecoms, business_new_df)
    result.show()
def _fit_als(als_input, rank, nonnegative):
    """Fit an ALS model of the given *rank* on *als_input*."""
    als = ALS(rank=rank, maxIter=15, regParam=0.01, userCol="MASV1",
              itemCol="F_MAMH_index", ratingCol="TKET",
              coldStartStrategy="drop", nonnegative=nonnegative)
    return als.fit(als_input)


def run_train_and_validation(spark, train, test_input, test_output, rank_list):
    """Train ALS / IBCF hybrids for every rank and collect their errors.

    For each rank in *rank_list* an unconstrained and a non-negative ALS
    model are fitted on ``train`` plus ``test_input`` and evaluated on
    ``test_output``.  For each of them, and for every IBCF rank, an
    ``IBCFWithItemFactor`` model built on the ALS item factors and an
    ``ALSIBCFMeanModel`` combining both are evaluated as well.

    Returns:
        ``(error_models, best_models)``: ``error_models`` maps a model
        family name to a dict of errors keyed by rank (or
        ``"<rank>_<ibcf_rank>"``); ``best_models`` is the accumulator
        threaded through ``put_best_model``.
    """
    evaluators = [
        RegressionEvaluator(metricName=metric, labelCol="TKET",
                            predictionCol="prediction")
        for metric in ("rmse", "mse", "mae")
    ]
    ibcf_ranks = [3, 5, 7, 9, 11, 13, 15, 17, 20, 23, 25]
    key_cols = ["MASV1", "F_MAMH_index"]

    # Keys are created up front in the order the results are reported.
    error_models = {
        "als": {},
        "als_nn": {},
        "als_ibcf": {},
        "als_nn_ibcf": {},
        "als_ibcf_mean": {},
        "als_nn_ibcf_mean": {},
    }
    best_models = {}

    # Loop-invariant inputs: the ALS training frame and the test rows
    # without their ground-truth grade.
    als_input = train.unionAll(test_input)
    unlabeled_output = test_output.drop("TKET")

    for rank in rank_list:
        # Plain ALS, without and with the non-negativity constraint.
        als_model = _fit_als(als_input, rank, nonnegative=False)
        predict_als = als_model.transform(test_output)
        als_nn_model = _fit_als(als_input, rank, nonnegative=True)
        predict_als_nn = als_nn_model.transform(test_output)

        error_als = evaluate(predict_als, evaluators)
        error_als_nn = evaluate(predict_als_nn, evaluators)
        error_models["als"][rank] = error_als
        error_models["als_nn"][rank] = error_als_nn
        best_models = put_best_model(
            best_models, "als",
            Model_Error_Wrapper("als_{}".format(rank), als_model, error_als[0]))
        best_models = put_best_model(
            best_models, "als_nn",
            Model_Error_Wrapper("als_nn_{}".format(rank), als_nn_model, error_als_nn[0]))

        # Hybrids: IBCF on the ALS item factors, and the ALS/IBCF mean.
        for ibcf_rank in ibcf_ranks:
            combo_key = "{}_{}".format(rank, ibcf_rank)
            for prefix, base_model in (("als", als_model), ("als_nn", als_nn_model)):
                ibcf_name = prefix + "_ibcf"
                mean_name = prefix + "_ibcf_mean"

                ibcf_model = IBCFWithItemFactor(spark, base_model.itemFactors) \
                    .setUserCol("MASV1") \
                    .setItemCol("F_MAMH_index") \
                    .setValueCol("TKET") \
                    .setRank(ibcf_rank)
                # Re-attach the ground truth so the evaluators can score it.
                predict_ibcf = ibcf_model.transform(test_input, unlabeled_output) \
                    .join(test_output, key_cols)
                error_ibcf = evaluate(predict_ibcf, evaluators)
                error_models[ibcf_name][combo_key] = error_ibcf
                best_models = put_best_model(
                    best_models, ibcf_name,
                    Model_Error_Wrapper(
                        "{}_{}_{}".format(ibcf_name, rank, ibcf_rank),
                        ibcf_model, error_ibcf[0]))

                mean_model = ALSIBCFMeanModel(spark, ibcf_model, base_model)
                combined = mean_model.transform(test_input, unlabeled_output) \
                    .join(test_output, key_cols)
                error_mean = evaluate(combined, evaluators)
                error_models[mean_name][combo_key] = error_mean
                best_models = put_best_model(
                    best_models, mean_name,
                    Model_Error_Wrapper(
                        "{}_{}_{}".format(mean_name, rank, ibcf_rank),
                        mean_model, error_mean[0]))

    return error_models, best_models
# Load the train and test CSVs into pandas.
train_raw = pd.read_csv(train_input)
test_raw = pd.read_csv(test_input)

# Strip the columns the model does not use.
test_raw = test_raw.drop(columns=['date'])
train_raw = train_raw.drop(columns=['train_id', 'date'])

# Hand both tables over to Spark.
train = spark.createDataFrame(train_raw)
test = spark.createDataFrame(test_raw)

# Fit ALS on the training ratings and score the test rows.
als = ALS(userCol="user_id", itemCol="business_id", ratingCol="rating")
model = als.fit(train)
predictions = model.transform(test)

# Collect (test_id, prediction), ordered by test_id, into pandas.
ordered_predictions = (
    predictions.select('test_id', 'prediction')
    .coalesce(1)
    .orderBy('test_id')
)
predict_df = ordered_predictions.toPandas()

### Scaling the model to range 1-5
# max_value / min_value are module-level names; presumably ``normalize``
# reads them -- it is defined outside this snippet.
max_value = predict_df['prediction'].max()
min_value = predict_df['prediction'].min()
predict_df['rating'] = predict_df['prediction'].apply(normalize)
predict_df = predict_df[['test_id', 'rating']]

# Write the submission file for Kaggle.
predict_df.sort_values('test_id').to_csv(output, index=False)
def main(spark, data_file_train, data_file_val):
    """Grid-search ALS by MAP on the validation set and save the best model.

    Args:
        spark: active SparkSession.
        data_file_train: parquet path of the training ratings.
        data_file_val: parquet path of the validation ratings.

    The best model's item factors, user factors and the model itself are
    written under ``recom/``.
    """
    start = time.time()

    # reading training and validation files
    df_train = spark.read.parquet(data_file_train)
    df_val = spark.read.parquet(data_file_val)

    window_user_ordered = Window.partitionBy('user_id').orderBy('rating')
    window_user = Window.partitionBy('user_id')

    # Collect each user's books over a rating-ordered window, then keep one
    # list per user via max().
    actual_df_val = df_val.withColumn(
        'actual_books',
        F.collect_list('book_id').over(window_user_ordered)).groupBy(
            'user_id').agg(F.max('actual_books').alias('actual_books'))

    print("Datasets loaded | Time taken: {}".format(time.time() - start))

    ranks = [10, 15, 25, 50, 100]
    regParam = [1, 0.1, 0.01, 0.001]

    max_score = 0.0
    best_model = None
    best_rank = None
    best_reg = None

    for r in ranks:
        for reg in regParam:
            start = time.time()
            als = ALS(maxIter=10, regParam=reg, userCol="user_id",
                      itemCol="book_id", ratingCol="rating", rank=r)
            model = als.fit(df_train)
            print("Done with model fitting | Time taken: {}".format(
                time.time() - start))

            start = time.time()
            # Top-500 recommendations per validation user, flattened back
            # into one list of book ids per user.
            recommendations = model.recommendForUserSubset(df_val, 500)
            userPredictions = recommendations.select(
                'user_id', F.explode('recommendations.book_id')).withColumn(
                    'pred_books',
                    F.collect_list('col').over(window_user)).groupBy(
                        'user_id').agg(
                            F.max('pred_books').alias('pred_books'))

            predAndLabels = userPredictions.join(
                actual_df_val, on='user_id').select('pred_books',
                                                    'actual_books')
            metrics = RankingMetrics(predAndLabels.rdd)
            score = metrics.meanAveragePrecision

            print('Regularization: {} | Rank: {} | MAP: {}'.format(
                reg, r, score))
            print('Time taken: {}'.format(time.time() - start))

            # Accept the first candidate unconditionally so a grid whose
            # MAP is 0 everywhere still produces a model to save.
            if best_model is None or score > max_score:
                max_score = score
                best_model = model
                best_rank = r
                best_reg = reg

    best_model.itemFactors.rdd.saveAsTextFile('recom/iF')
    best_model.userFactors.rdd.saveAsTextFile('recom/uF')
    best_model.save("recom/best_model")

    # Report the tracked winners; the previous code referenced the
    # undefined names ``best_r`` and ``best_score`` here.
    print('Best Regularization: {} | Best Rank: {} | Best MAP: {}'.format(
        best_reg, best_rank, max_score))
def main(spark, data_file, validation_file, test_file, model_file, tuning=False):
    """Train implicit-feedback ALS models and save the one with the lowest RMSE.

    Args:
        spark: active SparkSession.
        data_file: parquet path of the training data.
        validation_file: parquet path of the validation data.
        test_file: parquet path of the test data (used only to select the
            users kept in the training set).
        model_file: destination path for the best fitted model.
        tuning: when true, search a hyper-parameter grid; otherwise fit a
            single default configuration (regParam=0.1, alpha=1, rank=100).
    """
    indexer_path = './home/hj1325/final-project-final-project/model_indexer'

    # load data and create dataframes
    train_df = spark.read.parquet(data_file)
    train_df.createOrReplaceTempView('train_df')
    validation_df = spark.read.parquet(validation_file)
    validation_df.createOrReplaceTempView('validation_df')
    test_df = spark.read.parquet(test_file)
    test_df.createOrReplaceTempView('test_df')

    # Keep only ratings from users that appear in the validation or test
    # data, and discard zero ratings.
    train_df = spark.sql(
        "SELECT DISTINCT(user_id), book_id, rating FROM train_df "
        "WHERE user_id IN ((SELECT user_id FROM validation_df) UNION (SELECT user_id FROM test_df)) AND rating!=0"
    )

    # sub sample 60% of data
    train_df, _ = train_df.randomSplit([0.6, 0.4], seed=20)
    print('data has been preprocessed. ')

    # Load the saved indexer pipeline; build and save it when loading fails.
    print('load Model Indexer')
    try:
        model_indexer = PipelineModel.load(indexer_path)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # interpreter shutdown are not mistaken for a missing indexer.
        print('create Model Indexer')
        user_indexer = StringIndexer(
            inputCol='user_id', outputCol='user_label').setHandleInvalid('skip')
        book_indexer = StringIndexer(
            inputCol='book_id', outputCol='book_label').setHandleInvalid('skip')
        training_pipeline = Pipeline(stages=[user_indexer, book_indexer])
        model_indexer = training_pipeline.fit(train_df)
        model_indexer.write().overwrite().save(indexer_path)
        print('Model indexer has been created.')

    # use indexer to transform dataframe for training and validation
    train_df = model_indexer.transform(train_df)
    validation_df = model_indexer.transform(validation_df)
    print('Training and Validation dataframe have been transformed.')

    # Set tuning to true to search the grid; by default a single
    # configuration is used to save running time.
    if tuning:
        RegParam = [0.1, 1, 10, 100]
        Alpha = [0.1, 1]
        Rank = [10, 100]
    else:
        RegParam = [0.1]
        Alpha = [1]
        Rank = [100]

    # The evaluator does not depend on the hyper-parameters: build it once.
    evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating',
                                    predictionCol='prediction')

    lowest_rmse = None
    best_model = None
    best_als_model = None
    count = 0
    total = len(RegParam) * len(Alpha) * len(Rank)

    for a in RegParam:
        for b in Alpha:
            for c in Rank:
                print('currently using model with regParam =' + str(a) +
                      ', Alpha =' + str(b) + ', Rank =' + str(c))

                # use train_df to fit ALS model
                als_train = ALS(maxIter=10, regParam=a, alpha=b, rank=c,
                                userCol='user_label', itemCol='book_label',
                                ratingCol='rating', coldStartStrategy='drop',
                                implicitPrefs=True)
                als_model = als_train.fit(train_df)

                # evaluate model
                predict = als_model.transform(validation_df)
                rmse = evaluator.evaluate(predict)

                # ``<=`` keeps the later candidate on an exact tie, as the
                # former rmse-keyed dictionary did.
                if lowest_rmse is None or rmse <= lowest_rmse:
                    lowest_rmse = rmse
                    best_model = als_model
                    best_als_model = als_train

                count += 1
                print(str(count) + 'of the total ' + str(total) + ' finished.')
                print(' RMSE value= ' + str(rmse) + ' RegParam= ' + str(a) +
                      ' Alpha= ' + str(b) + ' rank= ' + str(c))

    # store model with the best root mean square error statistic
    best_model.write().overwrite().save(model_file)

    # The getters must be called and converted to ``str``; concatenating
    # the former ``{estimator.getX}`` set literals raised TypeError.
    print('Best model with the root mean square error of ' + str(lowest_rmse) +
          ' and the regParam of ' + str(best_als_model.getRegParam()) +
          ' Alpha of ' + str(best_als_model.getAlpha()) +
          ' Rank of ' + str(best_als_model.getRank()))