def main(spark, log_comp=False, drop_low=False, drop_thr=0):
    '''
    Grid-search an implicit-feedback ALS model and print ranking metrics.

    Reads the train/validation/test parquet files from fixed HDFS paths,
    keeps every training user that also appears in validation plus a random
    20% of the remaining training users, indexes user and track ids, then
    fits one ALS model per (rank, regParam, alpha) combination and prints
    MAP, NDCG@500 and precision@500 for the validation users.

    Parameters
    ----------
    spark : SparkSession object
    log_comp : bool, if true train on log(1 + count) instead of the raw count
    drop_low : bool, if true drop interactions whose count is <= drop_thr
    drop_thr : number, threshold used when drop_low is true
    '''
    ## Load in datasets
    train_path = 'hdfs:/user/bm106/pub/project/cf_train.parquet'
    val_path = 'hdfs:/user/bm106/pub/project/cf_validation.parquet'
    test_path = 'hdfs:/user/bm106/pub/project/cf_test.parquet'

    train = spark.read.parquet(train_path)
    val = spark.read.parquet(val_path)
    test = spark.read.parquet(test_path)

    ## Downsample the data
    # Users present in the training set
    user_train = set(row['user_id'] for row in train.select('user_id').distinct().collect())
    # Users present in the validation set
    user_val = set(row['user_id'] for row in val.select('user_id').distinct().collect())
    # Users that appear in training but not in validation
    user_prev = list(user_train - user_val)
    # Random sampling to keep 20% of those users
    k = int(0.2 * len(user_prev))
    user_prev_filtered = random.sample(user_prev, k)
    train = train.where(train.user_id.isin(user_prev_filtered + list(user_val)))

    ## Create StringIndexer (fitted on the downsampled training data only;
    ## ids unseen in training are skipped when transforming val/test)
    indexer_user = StringIndexer(inputCol="user_id", outputCol="user_id_indexed", handleInvalid='skip')
    indexer_user_model = indexer_user.fit(train)
    indexer_track = StringIndexer(inputCol="track_id", outputCol="track_id_indexed", handleInvalid='skip')
    indexer_track_model = indexer_track.fit(train)

    train = indexer_user_model.transform(train)
    train = indexer_track_model.transform(train)
    val = indexer_user_model.transform(val)
    val = indexer_track_model.transform(val)
    test = indexer_user_model.transform(test)
    test = indexer_track_model.transform(test)

    ## ALS hyper-parameter grid
    rank_ = [5, 10, 20]
    regParam_ = [0.1, 1, 10]
    alpha_ = [1, 5, 10]
    param_grid = it.product(rank_, regParam_, alpha_)

    ## Pick out users from validation set and their ground-truth tracks
    user_id = val.select('user_id_indexed').distinct()
    true_label = val.select('user_id_indexed', 'track_id_indexed')\
        .groupBy('user_id_indexed')\
        .agg(expr('collect_list(track_id_indexed) as true_item'))

    ## Log-Compression
    ## count -> log(1+count)
    if log_comp:
        train = train.select('*', F.log1p('count').alias('count_log1p'))
        val = val.select('*', F.log1p('count').alias('count_log1p'))
        rateCol = "count_log1p"
    else:
        rateCol = "count"

    ## Drop interactions that have counts lower than specified threshold
    # NOTE(review): user_id / true_label above were built before this filter
    # and `val` is not read again below, so the validation ground truth is
    # unaffected by drop_low -- confirm that this is intended.
    if drop_low:
        train = train.filter(train['count'] > drop_thr)
        val = val.filter(val['count'] > drop_thr)

    for params in param_grid:
        rank, reg_param, alpha = params
        print('Start Training for {}'.format(params))
        als = ALS(rank=rank, maxIter=10, regParam=reg_param,
                  userCol="user_id_indexed", itemCol="track_id_indexed",
                  ratingCol=rateCol, implicitPrefs=True,
                  alpha=alpha, nonnegative=True, coldStartStrategy="drop")
        model = als.fit(train)
        print('Finish Training for {}'.format(params))

        # Make top 500 recommendations for users in validation set
        res = model.recommendForUserSubset(user_id, 500)
        pred_label = res.select('user_id_indexed', 'recommendations.track_id_indexed')

        # (predicted tracks, true tracks) pairs, one per user
        pred_true_rdd = pred_label.join(F.broadcast(true_label), 'user_id_indexed', 'inner') \
            .rdd \
            .map(lambda row: (row[1], row[2]))

        print('Start Evaluating for {}'.format(params))
        metrics = RankingMetrics(pred_true_rdd)
        map_ = metrics.meanAveragePrecision
        ndcg = metrics.ndcgAt(500)
        precision = metrics.precisionAt(500)
        # The third metric is precision@500; it was previously printed with a
        # second 'map score' label.
        print(params, 'map score: ', map_, 'ndcg score: ', ndcg, 'precision score: ', precision)
# In[ ]: # partition dataframe (training, test) = sp_df_1000.randomSplit([0.8, 0.2]) # ### Model # 1 # Build the recommendation model using ALS on the training data # In[ ]: # build ALS recommendation model als = ALS( maxIter=5, regParam=0.01, rank=10, # number of latent topics- ME-10? alpha=30, implicitPrefs=True, # # implicitPrefs=True b/c ratings are implicit userCol="patent_firstnamed_assignee_id", itemCol="patent_number", ratingCol="rating", coldStartStrategy="nan") # coldStartStrategy="nan" to retain NaNs # In[ ]: # fit ALS model to the training set model = als.fit(training) # #### Model #1 - Evaluation - Compare to naive baseline # Compare model evaluation result with naive baseline model that only outputs (for explicit - the average rating (or you may try one that outputs the average rating per movie). # #### Model #1 - Optimize model
final_indexed_save = os.path.join('dataset', 'review_vegas_als.parquet') reviewDF = ss.read.parquet(final_indexed_save) logger.error('Number of reviews for Las Vegas is {}'.format( reviewDF.count())) #Split data into training and testing sets training_set, testing_set = reviewDF.randomSplit([0.97, 0.03]) logger.error('Size of Training set is {}'.format(training_set.count())) logger.error('Size of Testing set is {}'.format(testing_set.count())) logger.error('{} seconds has elapsed'.format(time.time() - start_time)) #build ALS learning model als = ALS(rank=8, maxIter=20, regParam=0.25, userCol="user_id_int", itemCol="business_id_int", ratingCol="stars_long", coldStartStrategy="drop") model = als.fit(training_set) logger.error('Model fitting done') logger.error('{} seconds has elapsed'.format(time.time() - start_time)) #save model to file model_save = os.path.join('dataset', 'als_model_vegas.parquet') #model.write().mode('overwrite').parquet(model_save) #Evaluate the modle using RMSE on test set predictions = model.transform(testing_set) logger.error('Model transformation done') logger.error('{} seconds has elapsed'.format(time.time() - start_time)) logger.error('Calculating RMSE value') evaluator = RegressionEvaluator(metricName="rmse",
def tune_ALS_map(train_read, val_read, val_true_list, iteration, regParams,
                 current_rank):
    """
    Grid search over ALS rank and regularization, tracking the best model by
    validation RMSE and, separately, by validation MAP.

    For every (rank, regParam) pair an ALS model is fitted on ``train_read``;
    its predictions on ``val_read`` are unioned with the module-level
    ``predictions_unread`` DataFrame, the top 500 predictions per user are
    kept, and RMSE and MAP are computed on that selection. One row per grid
    point is appended to 'train01_read_eval.csv'.

    Parameters
    ----------
    train_read: spark DF with columns 'user_id', 'book_id', 'rating'
    val_read: spark DF with the same columns, used for evaluation
    val_true_list: spark DF joined on 'user_id'; its first non-key column is
        used as the ground-truth item list for RankingMetrics
    iteration: int, max number of ALS iterations
    regParams: list of float, one dimension of the hyper-param tuning grid
    current_rank: list/tuple of ranks to try. Any other value falls back to
        the module-level ``ranks`` list, which is what the loop always used
        before.

    Return
    ------
    (best_model_rmse, best_model_map): the fitted ALS model with the lowest
    RMSE and the fitted ALS model with the highest MAP (either may be None
    if no grid point improved on the initial value).
    """
    # Use the caller-supplied ranks when given; previously the argument was
    # silently shadowed by the loop variable and the global list was used.
    if isinstance(current_rank, (list, tuple)):
        rank_candidates = list(current_rank)
    else:
        rank_candidates = ranks

    # initial
    min_error = float('inf')
    best_iter1 = -1
    best_rank1 = -1
    best_regularization1 = 0
    best_model_rmse = None

    max_map = 0.0
    best_iter2 = -1
    best_rank2 = -1
    best_regularization2 = 0
    best_model_map = None

    # DataFrame.checkpoint() returns the checkpointed DataFrame; keep it and
    # do it once instead of discarding the result on every grid point.
    train_read = train_read.checkpoint()

    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction")

    for rank_value in rank_candidates:
        for reg in regParams:
            # get ALS model
            als = ALS(maxIter=iteration,
                      regParam=reg,
                      rank=rank_value,
                      userCol='user_id',
                      itemCol='book_id',
                      ratingCol='rating',
                      coldStartStrategy="drop",
                      nonnegative=True)
            # train ALS model
            model_read = als.fit(train_read)

            # predict on the validation read data
            predictions_read = model_read.transform(val_read)
            # combine predictions on read and unread data
            predictions_all = predictions_read.union(predictions_unread)

            # select top 500 books for each user to evaluate
            window = Window.partitionBy(predictions_all['user_id']).orderBy(
                predictions_all['prediction'].desc())
            val_pred_order = predictions_all.select(
                '*',
                rank().over(window).alias('rank')).filter(col('rank') <= 500)

            # evaluate the model by computing the RMSE on the selection
            rmse = evaluator.evaluate(val_pred_order)
            if rmse < min_error:
                min_error = rmse
                best_rank1 = rank_value
                best_regularization1 = reg
                best_iter1 = iteration
                best_model_rmse = model_read

            # evaluate the model by computing the MAP on the validation data
            val_pred_list = val_pred_order.select(
                'user_id', 'book_id').groupBy('user_id').agg(
                    expr('collect_list(book_id) as books'))
            val_RDD = val_pred_list.join(
                val_true_list,
                'user_id').rdd.map(lambda row: (row[1], row[2]))
            val_RDD.checkpoint()
            rankingMetrics = RankingMetrics(val_RDD)
            current_map = rankingMetrics.meanAveragePrecision
            if current_map > max_map:
                max_map = current_map
                best_rank2 = rank_value
                best_regularization2 = reg
                best_iter2 = iteration
                best_model_map = model_read

            print('{} latent factors and regularization = {} with maxIter {}: '
                  'validation RMSE is {}, '
                  'validation MAP is {}'.format(rank_value, reg, iteration,
                                                rmse, current_map))

            # append this grid point to the evaluation log
            with open('train01_read_eval.csv', 'ab') as f:
                np.savetxt(f, [
                    np.array([iteration, rank_value, reg, rmse, current_map])
                ],
                           delimiter=",")

    print('\nThe best model select by RMSE has {} latent factors and '
          'regularization = {} '
          'with maxIter = {}'.format(best_rank1, best_regularization1,
                                     best_iter1))
    print('\nThe best model select by MAP has {} latent factors and '
          'regularization = {} '
          'with maxIter = {}'.format(best_rank2, best_regularization2,
                                     best_iter2))

    return best_model_rmse, best_model_map
# Map the "item" and "user" columns to the numeric "itemIndex" / "userIndex"
# columns that are passed to ALS below.
stringIndexer = StringIndexer(inputCols=["item","user"], outputCols=["itemIndex","userIndex"])
model = stringIndexer.fit(df)
df_indexed = model.transform(df)
df_indexed.show(n=5)

# In[5]:

# split the data into training and testing set (80% / 20%)
(training, test) = df_indexed.randomSplit([0.8, 0.2])

# training the model
# define the model parameters
als = ALS(maxIter=5, implicitPrefs=False, userCol="userIndex", itemCol="itemIndex", ratingCol="rating", coldStartStrategy="drop")

# train the model
# NOTE: `model` is rebound here from the fitted StringIndexer to the fitted ALS model.
model = als.fit(training)

# In[6]:

# predict using the testing dataset
predictions = model.transform(test)
predictions.show()

# In[8]:
|summary| movie_id| Customer_id| Rating| Release_Year| Movie_Title| +-------+------------------+------------------+------------------+------------------+-------------------+ | count| 4810288| 4810288| 4810288| 4810288| 4810288| | mean|2308.3037520830353|1321990.7265673075| 3.599072030614383|1994.4039521126385| Infinity| | stddev| 1303.490951113763| 764565.5373457455|1.0861053257733961|12.602776332955534| NaN| | min| 1| 6| 1| 1915|'N Sync: 'N the Mix| | max| 4499| 2649429| 5| 2005| s-Cry-ed| +-------+------------------+------------------+------------------+------------------+-------------------+ """ # In[11]: ## Building the Recommendation Model (ALS) using Collaborative Filtering # using coldStartStrategy= drop, to avoid NaN value of RMSE Recommendation = ALS(userCol="Customer_id", itemCol="movie_id", ratingCol="Rating", coldStartStrategy="drop") Recommendation_model = Recommendation.fit(training) # In[12]: ##Evaluating the model predictions = Recommendation_model.transform(test) predictions.select('Rating', 'prediction').show() evaluator = RegressionEvaluator(metricName="rmse", labelCol="Rating", predictionCol="prediction") rmse = evaluator.evaluate(predictions) print("") print("Root-mean-square error (RMSE) = " + str(rmse))
options(header='true', \ delimiter='\t',\ inferschema='true').\ load("u.data",header=False) #Renaming the column header df2 = df.withColumnRenamed("_c0","userid").withColumnRenamed("_c1","itemid").withColumnRenamed("_c2","rating").withColumnRenamed("_c3","timestamp") #Displaying the source file df2.show() """Step 2: Build a recommendation model using Alternating Least Squares""" # split training and testing (training, test) = df2.randomSplit([0.8, 0.2]) # Build the recommendation model using ALS on the training data als = ALS(userCol="userid", itemCol="itemid", ratingCol="rating", nonnegative=True) model = als.fit(training) # Evaluate the model by computing the RMSE on the test data predictions = model.transform(test) evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") rmse = evaluator.evaluate(predictions) print("Root-mean-square error = " + str(rmse)) # Model results in RMSE = nan due to cold start problem """RMSE for first model is nan due to cold start problem Step 4 : Resolving Cold start problem and improving performance by cross validation
items, item_map = id_map(df, 'itemID') def mapper(row): return Row(userID=user_map[row['userID']], itemID=item_map[row['itemID']], rating=row['rating_cat']) df_mapped = df.map(mapper).toDF() training, validation, test = df_mapped.randomSplit([0.6, 0.2, 0.2], seed=12345) numTraining = training.count() numValidation = validation.count() numTest = test.count() als = ALS(rank=50, userCol="userId", itemCol="itemId", ratingCol="rating") model = als.fit(training) valPredictions = model.transform(validation) val_predictions = valPredictions\ .withColumn("rating", valPredictions.rating.cast("str"))\ .withColumn("prediction", valPredictions.prediction.cast("str")) evaluator = MulticlassClassificationEvaluator(predictionCol="prediction") validation_score = evaluator.evaluate(val_predictions) testPredictions = model.transform(test) test_predictions = testPredictions\ .withColumn("rating", testPredictions.rating.cast("str"))\ .withColumn("prediction", testPredictions.prediction.cast("str")) test_score = evaluator.evaluate(test_predictions) print "Training: %d, validation: %d, test: %d" % (numTraining,
def create_spark_model_vectors_df(
        self, df: DataFrame) -> _UserResourceFeatureVectorMapping:
    """
    Configure an ALS estimator from this object's settings, run
    ``self._train_cf`` with it, and wrap the resulting user/resource mapping
    DataFrames in a ``_UserResourceFeatureVectorMapping``.

    When ``self.separate_tenants`` is set, ``_train_cf`` is called once per
    distinct tenant value (in ascending tenant order) on the rows of that
    tenant, and the per-tenant results are unioned; otherwise it is called
    once on the whole of ``df``.

    :param df: input DataFrame containing the tenant column
    :return: the mapping object holding the user and resource mapping DataFrames
    """
    tenant_col = self.tenant_col
    tenant_values = df.select(tenant_col).distinct().cache()
    tenant_count = tenant_values.count()
    per_tenant = self.separate_tenants

    # Block count: explicit setting wins; otherwise 10 when training per
    # tenant, or one block per tenant when training on everything at once.
    if self.num_blocks is not None:
        block_count = self.num_blocks
    elif per_tenant:
        block_count = 10
    else:
        block_count = tenant_count

    als = ALS(rank=self.rank_param,
              maxIter=self.max_iter,
              regParam=self.reg_param,
              numUserBlocks=block_count,
              numItemBlocks=block_count,
              implicitPrefs=self.apply_implicit_cf,
              userCol=self.indexed_user_col,
              itemCol=self.indexed_res_col,
              ratingCol=self.scaled_likelihood_col,
              nonnegative=True,
              coldStartStrategy='drop')

    alpha = self.alpha_param
    if alpha is not None:
        als.setAlpha(alpha)

    user_mapping_df: Optional[DataFrame] = None
    res_mapping_df: Optional[DataFrame] = None

    if not per_tenant:
        user_mapping_df, res_mapping_df = self._train_cf(als, df)
    else:
        ordered_rows = tenant_values.orderBy(tenant_col).collect()
        for tenant_row in ordered_rows:
            tenant_df = df.filter(
                f.col(tenant_col) == tenant_row[tenant_col]).cache()
            tenant_users, tenant_resources = self._train_cf(als, tenant_df)

            # accumulate the per-tenant mappings
            if user_mapping_df is None:
                user_mapping_df = tenant_users
            else:
                user_mapping_df = user_mapping_df.union(tenant_users)

            if res_mapping_df is None:
                res_mapping_df = tenant_resources
            else:
                res_mapping_df = res_mapping_df.union(tenant_resources)

    assert user_mapping_df is not None and res_mapping_df is not None

    return _UserResourceFeatureVectorMapping(
        tenant_col, self.indexed_user_col, self.user_vec_col,
        self.indexed_res_col, self.res_vec_col, None, None, None,
        user_mapping_df, res_mapping_df)
# MAGIC Using the ML Pipeline's [CrossValidator](http://spark.apache.org/docs/1.6.2/api/python/pyspark.ml.html#pyspark.ml.tuning.CrossValidator) with ALS is thus problematic, because cross validation involves dividing the training data into a set of folds (e.g., three sets) and then using those folds for testing and evaluating the parameters during the parameter grid search process. It is likely that some of the folds will contain users that are not in the other folds, and, as a result, ALS produces NaN values for those new users. When the CrossValidator uses the Evaluator (RMSE) to compute an error metric, the RMSE algorithm will return NaN. This will make *all* of the parameters in the parameter grid appear to be equally good (or bad). # MAGIC # MAGIC You can read the discussion on [Spark JIRA 14489](https://issues.apache.org/jira/browse/SPARK-14489) about this issue. There are proposed workarounds of having ALS provide default values or having RMSE drop NaN values. Both introduce potential issues. We have chosen to have RMSE drop NaN values. While this does not solve the underlying issue of ALS not predicting a value for a new user, it does provide some evaluation value. We manually implement the parameter grid search process using a for loop (below) and remove the NaN values before using RMSE. # MAGIC # MAGIC For a production application, you would want to consider the tradeoffs in how to handle new users. # MAGIC # MAGIC **Note**: This cell will likely take a couple of minutes to run. 
# COMMAND ---------- # TODO: Replace <FILL IN> with appropriate code # This step is broken in ML Pipelines: https://issues.apache.org/jira/browse/SPARK-14489 from pyspark.ml.recommendation import ALS # Let's initialize our ALS learner als = ALS() # Now we set the parameters for the method als.setMaxIter(5)\ .setSeed(seed)\ .setRegParam(0.1)\ .setItemCol("movieId")\ .setUserCol("userId")\ .setRatingCol("rating") # Now let's compute an evaluation metric for our test dataset from pyspark.ml.evaluation import RegressionEvaluator # Create an RMSE evaluator using the label and predicted columns reg_eval = RegressionEvaluator(predictionCol="prediction", labelCol="rating",
"countScaled").rdd.map(extract).toDF( ['StreetName', 'ViolationCode', 'count']) dfTest = dfTest.select("StreetName", "ViolationCode", "countScaled").rdd.map(extract).toDF( ['StreetName', 'ViolationCode', 'count']) # indexModel = indexer.setHandleInvalid("skip").fit(dfTrain) dfTrain = indexModel.transform(dfTrain) dfTest = indexModel.transform(dfTest) #Building the recommendation model using ALS on the training data als = ALS(rank=200, maxIter=2, regParam=0.01, userCol="StreetCode", itemCol="ViolationCode", ratingCol="count", coldStartStrategy="drop", implicitPrefs=True) recommModel = als.fit(dfTrain) #Evaluate the model by computing the RMSE on the test data predictions = recommModel.transform(dfTest) evaluator = RegressionEvaluator(metricName="rmse", labelCol="count", predictionCol="prediction") RMSE = evaluator.evaluate(predictions) print("RMSE of Test:", RMSE) #Observed value: 0.44057324770617523 #Let's now predict the possible Violation codes for 10 users using the model built streetWithPossibleViolations = recommModel.recommendForAllUsers(10)
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import RankingMetrics
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr

spark = SparkSession.builder.appName('Recommendation_system').getOrCreate()

df_training = spark.read.parquet('hdfs:/user/tb2517/pub/goodreads/training_sample_10p.parquet')
df_validation = spark.read.parquet('hdfs:/user/tb2517/pub/goodreads/validation_sample_10p.parquet')
df_test = spark.read.parquet('hdfs:/user/tb2517/pub/goodreads/testing_sample_10p.parquet')

# coldStartStrategy="drop" keeps NaN predictions (users/items unseen in a
# fold) out of the RMSE used for model selection.
als = ALS(userCol="user_id", itemCol="book_id", ratingCol="rating",
          coldStartStrategy="drop", nonnegative=True)

# 3 x 3 x 3 grid over rank, maxIter and regParam
param_grid = (ParamGridBuilder()
              .addGrid(als.rank, [15, 25, 35])
              .addGrid(als.maxIter, [5, 8, 10])
              .addGrid(als.regParam, [0.08, 0.09, 0.10])
              .build())

# RegressionEvaluator was used here without being imported (NameError).
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid,
                    evaluator=evaluator, numFolds=3)

model = cv.fit(df_training)
best_model = model.bestModel

# RMSE is a lower-is-better metric, so the winning param map is the one with
# the smallest average cross-validation metric. This uses the public
# avgMetrics attribute instead of reaching into best_model._java_obj.
best_index = min(range(len(param_grid)), key=lambda idx: model.avgMetrics[idx])
best_params = param_grid[best_index]

print("Tuned Hyperparameters:-------------")
print("Rank: ", best_params[als.rank])
print("MaxIter: ", best_params[als.maxIter])
print("RegParam: ", best_params[als.regParam])

print("Recommendations: ------------------------------")
# Read in data (only the three columns ALS needs)
df = pd.read_csv('downloads/ml-20m/ratings.csv', sep=',', usecols=['userId', 'movieId', 'rating'])
# Work on a random 8000-row development sample
dev = df.sample(n=8000)

# Enable Arrow-based columnar data transfers
spark = SparkSession.builder.appName('pandasToSparkDF').getOrCreate()
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

# Create a Spark DataFrame from a pandas DataFrame using Arrow
ratings = spark.createDataFrame(dev)
(training, test) = ratings.randomSplit([0.8, 0.2])

# ALS (explicit feedback: implicitPrefs is left at its default)
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(training)

# Grid search
# als.alpha is intentionally not part of the grid: alpha only applies to the
# implicit-feedback variant of ALS, so with this explicit-feedback estimator
# it tripled the number of fits without changing any metric.
paramGrid = ParamGridBuilder()\
    .addGrid(als.rank, [4, 8, 12]) \
    .addGrid(als.regParam, [0.1, 1, 10])\
    .addGrid(als.maxIter, [5, 10, 15])\
    .build()

# Tune hyper param
# NOTE(review): `rmse` is not defined in the visible part of this script; it
# is presumably a RegressionEvaluator created elsewhere -- confirm.
tvs = TrainValidationSplit(estimator=als, estimatorParamMaps=paramGrid, evaluator=rmse, trainRatio=0.8)
# for rank in ranks: # for alpha in alphas: # model = ALS(implicitPrefs=True, userCol="userId", itemCol="artistId", ratingCol="count", rank=rank, seed=seed, maxIter=iterations,regParam=lambda_,alpha=alpha).fit(training) # print('for rank {0} at alpha: {1} and lambda: {2},'.format(rank, alpha, lambda_)) # print('meanAUC =', areaUnderCurve(test, bTopItemIDs, model.transform), 'for ALS-PREDICTION') # ### Now using the best parameters obtained by gridsearch to calculate meanAUC for all artists # In[12]: # Model 1- rank= 40 ; alpha= 20; lambda= 0.01 model1 = ALS(implicitPrefs=True, userCol="userId", itemCol="artistId", ratingCol="count", rank=40, seed=seed, maxIter=iterations, regParam=0.01, alpha=20).fit(training) print('meanAUC =', areaUnderCurve(test, bAllItemIDs, model1.transform), 'for ALS-PREDICTION for Model 1') # Model 2- rank= 40; alpha= 20; lambda= 0.1 model2 = ALS(implicitPrefs=True, userCol="userId", itemCol="artistId", ratingCol="count", rank=40, seed=seed, maxIter=iterations,
ratingsRDD = lines.map(parse_rating)
print(ratingsRDD.count())

# `lines` is rebound to the raw rows of gender.dat, parsed into a dict
lines = spark.read.text('gender.dat').rdd
users = dict(lines.map(parse_user).collect())

ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])

num_training = training.count()
num_validation = test.count()

print('Training: %d' % num_training)
print('Validation: %d' % num_validation)

# setup ALS
rank = 8
num_iterations = 8
lambda_ = 0.1

# Fixes: `num_interations` was a typo (NameError), and `rank` was defined but
# never handed to ALS. coldStartStrategy="drop" removes rows for users/items
# that are absent from the training split, whose NaN predictions would
# otherwise turn the RMSE below into NaN.
als = ALS(rank=rank, maxIter=num_iterations, regParam=lambda_,
          userCol="userID", itemCol="profileID", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

spark.stop()
from pyspark.ml.recommendation import ALS # split the data into training and test datatset train, test = indexed.randomSplit([0.75, 0.25]) # count number of records in train set train.count() # count number of records in test set test.count() # Training the recommender model using train datatset rec = ALS(maxIter=10, regParam=0.01, userCol='userId', itemCol='title_new', ratingCol='rating', nonnegative=True, coldStartStrategy="drop") # fit the model on train set rec_model = rec.fit(train) # making predictions on test set predicted_ratings = rec_model.transform(test) # columns in predicted ratings dataframe predicted_ratings.printSchema() # predicted vs actual ratings for test set predicted_ratings.orderBy(rand()).show(10)
books_file = os.path.join(os.getcwd(), 'csv', 'books.csv')
books_file_converted = os.path.join(os.getcwd(), 'csv', 'books_converted.csv')

# Re-write the comma-separated books file with ';' as the delimiter.
# Both files are opened in `with` blocks so the converted file is flushed and
# closed before Spark reads it below (the handles were previously never closed).
with open(books_file, "r", newline="") as source_file:
    reader = list(csv.reader(source_file, delimiter=','))
with open(books_file_converted, 'w', newline="") as converted_file:
    writer = csv.writer(converted_file, delimiter=';')
    writer.writerows(reader)

# Load the books dataset
books_df = spark.read.load(books_file_converted, format="csv", sep=";", inferSchema="true", header="true").na.drop()

# Just take book_id and title columns
books_df.createOrReplaceTempView("books")
books_df_selected = spark.sql("SELECT `book_id`, `title` \
    FROM books")

# Check number of data
print("There are " + str(ratings_df.count()) + " ratings in this dataset")
print("There are " + str(books_df_selected.count()) + " books in this dataset")

### ALS Algorithm
# Split ratings data into a training set and a test set
(training, test) = ratings_df.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="book_id", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))
def getALSReco(UID):
    """
    Train an ALS model on the movie score CSV and return the top
    recommendations for one user.

    The CSV is read without a header or schema inference, so all columns
    arrive as strings; the first three are used as UID, MID and score. UID
    and MID are indexed with StringIndexer, the model is fitted on a random
    80% split and its RMSE on the remaining 20% is printed.

    Parameters
    ----------
    UID : user id to recommend for; compared as ``str(UID)`` against the UID
        column.

    Returns
    -------
    pandas.DataFrame with columns 'MID' and 'rating' (predicted score), in
    the order returned by ALS. Empty when the user is not in the data or the
    model has no recommendations for them (e.g. the user fell entirely into
    the test split).
    """
    spark = SparkSession.builder.appName('Recommendation_system').getOrCreate()
    # Raw string: the previous literal relied on the invalid escape "\H".
    df = spark.read.csv(r"C:\Users\HP ITFAC\Desktop\FYP\datasets\MovieManytoMany_dataframe.csv")
    # Only the first three columns are named; renaming a fourth column to
    # "score" as well would create an ambiguous duplicate column.
    df = (df.withColumnRenamed("_c0", "UID")
            .withColumnRenamed("_c1", "MID")
            .withColumnRenamed("_c2", "score"))
    nd = df.select(df['UID'], df['MID'], df['score'])
    nd.show()

    # transform the id columns to numeric index values
    indexer = [StringIndexer(inputCol=column, outputCol=column + "_index")
               for column in ("UID", "MID")]
    pipeline = Pipeline(stages=indexer)
    transformed = pipeline.fit(nd).transform(nd)
    transformed = transformed.withColumn("score", transformed["score"].cast(DoubleType()))
    transformed.printSchema()
    transformed.show()

    # split training and test dataset
    (training, test) = transformed.randomSplit([0.8, 0.2])

    # fit dataset to als
    als = ALS(maxIter=5, regParam=0.09, rank=25, userCol="UID_index", itemCol="MID_index",
              ratingCol="score", coldStartStrategy="drop", nonnegative=True)
    model = als.fit(training)

    # evaluate model
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="score", predictionCol="prediction")
    predictions = model.transform(test)
    rmse = evaluator.evaluate(predictions)
    print("RMSE=" + str(rmse))
    predictions.show()

    Finalresult = pd.DataFrame({'MID': [], 'rating': []})

    # Look up the index assigned to the requested user. Previously the
    # filtered frames were discarded and the first row of the merged result
    # was used, i.e. the recommendations of an arbitrary user.
    user_rows = (transformed.filter(transformed["UID"] == str(UID))
                 .select("UID_index").limit(1).collect())
    recos = []
    if user_rows:
        uid_index = user_rows[0]["UID_index"]
        all_recs = model.recommendForAllUsers(50)
        rec_rows = all_recs.filter(all_recs["UID_index"] == uid_index).collect()
        if rec_rows:
            # list of Row(MID_index=..., rating=...); read the fields directly
            # instead of parsing the string representation
            recos = rec_rows[0]["recommendations"]

    if recos:
        recoDF = pd.DataFrame({
            "MID_index": [int(rec["MID_index"]) for rec in recos],
            "rating": [float(rec["rating"]) for rec in recos],
        })
        # One row per movie, so the merge cannot multiply rows and no
        # de-duplication (which used to drop every repeated movie) is needed.
        movie_ids = transformed.select("MID_index", "MID").distinct().toPandas()
        movie_ids["MID_index"] = movie_ids["MID_index"].astype(int)
        Finalresult = pd.merge(recoDF, movie_ids, on='MID_index')[['MID', 'rating']]

    print("*******************************************************************************************")
    print(Finalresult)
    return Finalresult
def als_model(training, model_path="./../data/als_model11", overwrite=False):
    """
    Fit an ALS model on ``training``, persist it, and return it.

    Parameters
    ----------
    training : DataFrame with 'user_id', 'item_id' and 'rating' columns
    model_path : where the fitted model is saved (defaults to the path that
        was previously hard-coded)
    overwrite : when True, replace an existing model at ``model_path``;
        when False (default) the save is attempted without overwriting, as
        before.

    Returns
    -------
    The fitted ALS model.
    """
    als = ALS(maxIter=5,
              regParam=0.01,
              userCol="user_id",
              itemCol="item_id",
              ratingCol="rating",
              coldStartStrategy="drop")
    model = als.fit(training)
    if overwrite:
        model.write().overwrite().save(model_path)
    else:
        model.save(model_path)
    return model
'funding_id', 'backedAmount', )) ratings.filter("user_id is NULL").show() (training, test) = ratings.randomSplit([0.8, 0.2], seed=13) training.show() test.show() # Build the recommendation model using ALS on the training data # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics als = ALS(rank=50, maxIter=20, regParam=0.01, userCol="user_id", itemCol="funding_id", ratingCol="backedAmount", coldStartStrategy="drop", implicitPrefs=False) model = als.fit(training) # Evaluate the model by computing the RMSE on the test data predictions = model.transform(test) evaluator = RegressionEvaluator(metricName="rmse", labelCol="backedAmount", predictionCol="prediction") rmse = evaluator.evaluate(predictions) print("Root-mean-square error = " + str(rmse)) userRecs = model.recommendForAllUsers(10)
val_new = val.withColumn( 'rating', when(val.is_read == 0, float('nan')).otherwise(val.rating)) val_read = val_new.na.drop() val_unread = val.subtract(val_read) #num_iters = [5,10,15,20] iteration = 20 reg_params = [0.001, 0.01, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0] ranks = [8, 10, 12, 14, 16, 18, 20] #built unread model first als = ALS(maxIter=5, regParam=0.0, rank=10, userCol='user_id', itemCol='book_id', ratingCol='rating', coldStartStrategy="drop", nonnegative=True) model_unread = als.fit(train_unread) predictions_unread = model_unread.transform(val_unread) def tune_ALS_map(train_read, val_read, val_true_list, iteration, regParams, current_rank): """ grid search function to select the best model based on RMSE of validation data Parameters ---------- train_data: spark DF with columns ['userId', 'movieId', 'rating']
spark.conf.set("spark.sql.execution.arrow.enabled", "true") # read data client = boto3.client('s3') obj = client.get_object(Bucket='yelpdatacf', Key='yelp_public_cf.csv') df = pd.read_csv(obj['Body']) df.review_stars = (df.review_stars - df.review_stars.mean()) ratings = spark.createDataFrame(df) # use the model that has min RMSE num_iter, param = 100, 0.2 als = ALS( maxIter=num_iter, regParam=param, userCol="user_int_id", #implicitPrefs=True, itemCol="bus_id", ratingCol="review_stars", coldStartStrategy="drop") model = als.fit(ratings) user_feature = model.userFactors business_feature = model.itemFactors k = 10 kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features") model = kmeans.fit(user_feature) transformed = model.transform(user_feature).select('id', 'prediction') rows = transformed.collect() df = spark.createDataFrame(rows)
# convert to dataframe userIdDf = spark.createDataFrame(userIdRdd) \ .withColumnRenamed('_1', 'user_id') \ .withColumnRenamed('_2', 'user_id_indexed') businessIdDf = spark.createDataFrame(businessIdRdd) \ .withColumnRenamed('_1', 'business_id') \ .withColumnRenamed('_2', 'business_id_indexed') # join user id zipped with index and business id with index training = training.join(userIdDf, ['user_id'], 'left').join(businessIdDf, ['business_id'], 'left') als = ALS( maxIter=5, rank=70, # ORIGINAL # rank=3, regParam=0.01, # regParam=0.1, userCol='user_id_indexed', itemCol='business_id_indexed', ratingCol='user-business-interaction', coldStartStrategy='drop') als.setSeed(seed) model = als.fit(training) # Evaluate the model by computing the RMSE on the test data predictions = model.transform(test) print("this is what the regular als input and output looks like") test.show() # I should make my cross product have the same columns predictions.show() print("this is what the regular als input and output looks like") # sys.exit() predictions = predictions.join(user_mean, ['user_id'], 'left')
train = train.withColumn("book_id", train["book_id"].cast(IntegerType())) test = test.withColumn("rating", test["rating"].cast(FloatType())) test = test.withColumn("user_id", test["user_id"].cast(IntegerType())) test = test.withColumn("book_id", test["book_id"].cast(IntegerType())) ## evaluate baseline model # best param for baseline model iteration = 20 reg = 0.1 current_rank = 20 als = ALS(maxIter=iteration, regParam=reg, rank=current_rank, userCol='user_id', itemCol='book_id', ratingCol='rating', coldStartStrategy="drop", nonnegative=True) # train ALS model model = als.fit(train) # evaluate the model by computing the RMSE on the validation data predictions = model.transform(test) window = Window.partitionBy(predictions['user_id']).orderBy( predictions['prediction'].desc()) test_pred_order = predictions.select( '*', rank().over(window).alias('rank')).filter(col('rank') <= 500) evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
if __name__ == '__main__':
    spark = SparkSession.builder.appName("MovieRecommendation").getOrCreate()
    movie_name = loadMovieNames('./ml-100k/u.item')
    text = spark.read.text('./ml-100k/u.data').rdd  # Remember to add .rdd
    data = text.map(parseInput)
    data_frame = spark.createDataFrame(
        data).cache()  # Remember to cache to avoid reload

    # Fit ALS on the full ratings DataFrame
    als = ALS(maxIter=5,
              regParam=0.01,
              userCol="userID",
              itemCol="movieID",
              ratingCol="rating")
    model = als.fit(data_frame)

    # Ratings of user 0 (not used again in the visible part of this block)
    user_rating = data_frame.filter("userID = 0")

    # Movies with more than 100 ratings
    rating_count = data_frame.groupBy("movieID").count().filter("count > 100")
    # Create new column: pair every popular movie with userID 0
    popular_movie = rating_count.select("movieID").withColumn('userID', lit(0))

    # Use data to predict recommendation using ALS model
    recommendations = model.transform(popular_movie)
    # 20 rows with the highest predicted rating for user 0
    topRecommendation = recommendations.sort(
        recommendations.prediction.desc()).take(20)
# Script excerpt: index items, attach titles, split the data and tune the ALS
# regularisation parameter with a train/validation split.
# item_indexer, indexed, ALS and RegressionEvaluator are not defined in this
# excerpt.
indexed = item_indexer.fit(indexed).transform(indexed)

# Join metadata so the movie title can be looked up next to each row.
meta = spark.table("amazon_meta").select("product_id", "title")
indexed = indexed.join(meta, "product_id")

# Split data into train (60%) and test (40%) sets; seed fixed at 0.
(training, test) = indexed.select(
    "user_id_index", "product_id_index", "score", "reviewed_at", "title"
).randomSplit([0.6, 0.4], seed=0)

# Train and evaluate with ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

# coldStartStrategy="drop": TrainValidationSplit holds out a random part of
# `training`, so the held-out part can contain users/items the fitted model
# never saw. With the default strategy those rows get NaN predictions, the
# RMSE becomes NaN, and the comparison between regParam values is meaningless.
als = ALS(
    maxIter=5,
    userCol="user_id_index",
    itemCol="product_id_index",
    ratingCol="score",
    coldStartStrategy="drop",
)
param_grid = ParamGridBuilder().addGrid(als.regParam, [0.01, 0.1, 1.0]).build()
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="score",
    predictionCol="prediction",
)
tvs = TrainValidationSplit(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
)
model = tvs.fit(training)
# Script excerpt, collapsed onto a single line and cut off right after the
# header of the final `for` loop, so the code below is left exactly as found.
# Visible steps, in order:
#   1. create the "ALSExample" session and declare the u.data schema
#      (userID, movieID, rating, timestamp);
#   2. read the tab-separated ratings file with that schema;
#   3. fit ALS (maxIter=5, regParam=0.01) on userID / movieID / rating;
#   4. build a one-row DataFrame holding the user id taken from sys.argv[1];
#   5. collect that user's top 10 recommendations and start iterating over
#      them (loop body not visible here).
spark = SparkSession.builder.appName("ALSExample").getOrCreate() moviesSchema = StructType([ \ StructField("userID", IntegerType(), True), \ StructField("movieID", IntegerType(), True), \ StructField("rating", IntegerType(), True), \ StructField("timestamp", LongType(), True)]) names = loadMovieNames() ratings = spark.read.option("sep", "\t").schema(moviesSchema) \ .csv("file:///SparkCourse/ml-100k/u.data") print("Training recommendation model...") als = ALS().setMaxIter(5).setRegParam(0.01).setUserCol("userID").setItemCol("movieID") \ .setRatingCol("rating") model = als.fit(ratings) # Manually construct a dataframe of the user ID's we want recs for userID = int(sys.argv[1]) userSchema = StructType([StructField("userID", IntegerType(), True)]) users = spark.createDataFrame([[ userID, ]], userSchema) recommendations = model.recommendForUserSubset(users, 10).collect() print("Top 10 recommendations for user ID " + str(userID)) for userRecs in recommendations:
# Script excerpt: log the ALS hyper-parameters to mlflow, fit on train_data,
# log the test RMSE and score the full data set.
# mlflow, train_data, test_data and data are not defined in this excerpt.
print('seting ml pipeline')

# Hyper-parameters for this run.
rank = 8
maxIter = 10
regParam = 0.03
alpha = 1
# Kept as a string: it is only logged. The estimator below receives a
# literal True, not this variable.
implicitPrefs = 'True'

mlflow.log_param("rank", rank)
mlflow.log_param("maxIter", maxIter)
mlflow.log_param("regParam", regParam)
mlflow.log_param("alpha", alpha)
mlflow.log_param("implicitPrefs", implicitPrefs)

als = ALS(
    maxIter=maxIter,
    rank=rank,
    regParam=regParam,
    alpha=alpha,
    implicitPrefs=True,
    userCol="user_id",
    itemCol="product_id",
    ratingCol="count",
    coldStartStrategy="drop",
)

# RMSE between the "count" column and the model's "prediction" column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="count",
    predictionCol="prediction",
)

print('training model...')
tunedModel = als.fit(train_data)

print('testing model ....')
predictions = tunedModel.transform(test_data)
test_metric = evaluator.evaluate(predictions)
mlflow.log_metric('test_' + evaluator.getMetricName(), test_metric)

print('get all predictions....')
# NOTE(review): bestModel is not assigned anywhere in this excerpt (the model
# fitted above is tunedModel) -- confirm it is defined earlier in the file.
all_predictions = bestModel.transform(data)
# Script excerpt: join reviews with their user/business records, fit ALS on a
# 90% split and write scores for a sample of the training rows.
# review_df and process_review_data are not defined in this excerpt.
rev_usr_bus = spark.read.options(inferSchema=True).json(
    "/home/tanmay/IdeaProjects/BigData_Project/Output/rev_usr_bus"
)

# Match each review to its rev_usr_bus row. The review-side id is renamed to
# r_review_id first so the join condition can tell the two columns apart.
review_ids = review_df.select("review_id", "text").withColumnRenamed(
    "review_id", "r_review_id"
)
user_reviews = review_ids.join(
    rev_usr_bus, col("review_id") == col("r_review_id")
).select("review_id", "business_id", "user_id", "r_stars")

processed = process_review_data(user_reviews)
user_reviews_train, user_reviews_test = processed.randomSplit(
    [0.90, 0.1], seed=100
)

# Fit ALS on the training split. coldStartStrategy="drop" is set so that
# evaluation metrics do not come out as NaN.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="user_id_ind",
    itemCol="business_id_ind",
    ratingCol="r_stars",
    coldStartStrategy="drop",
)
model = als.fit(user_reviews_train)

# Score a sample of the training rows (split weights 0.11 / 0.89) to see how
# the model behaves on data it was fitted on.
sample_train, z = user_reviews_train.randomSplit([0.11, 0.89], seed=100)
predictions = model.transform(sample_train)

# score = prediction - 3
scored = predictions.withColumn("score", col("prediction") - lit(3))
final = scored.select("review_id", "business_id", "user_id", "r_stars", "score")
final.write.json(
    "/home/tanmay/IdeaProjects/BigData_Project/Output/als_similarities_train",
    mode='overwrite',
)
def main(spark, train_data, val_data, downsample=True, extension=None):
    '''Grid-search an implicit-feedback ALS model and print validation metrics.

    For every (rank, regParam, alpha) combination an ALS model is fitted on
    the training data, the top 500 tracks are recommended per user, and MAP
    and precision at 500 are computed against the validation data. Each
    result is printed, followed by the combination with the highest MAP.
    Nothing is returned.

    Parameters
    ----------
    spark : SparkSession object
    train_data : string, path to the training parquet file to load
    val_data : string, path to the validation parquet file to load
    downsample : bool. When True the training data is randomly sampled
        without replacement (fraction 0.00001, seed 0) before any other step
    extension : None, "log" or "drop".
        "log"  - train on log(1 + count) instead of the raw count
        "drop" - keep only training rows whose count is above the
                 approximate 0.1 quantile of count
        Any other value leaves the training data unchanged.
    '''

    ### read in the files
    train = spark.read.parquet(train_data)
    val = spark.read.parquet(val_data)

    ### optional down-sampling of the training data (validation is untouched)
    if downsample:
        train = train.sample(False, 0.00001, seed=0)

    ### optional extensions; the rating column is chosen once, here
    ratingCol = "count"
    if extension == "log":
        # log(1 + count) rather than log(count): a count of 1 would otherwise
        # become a rating of 0, and implicit-feedback ALS treats a rating of 0
        # as "no preference".
        train = train.withColumn("log_count", log(train["count"] + 1))
        ratingCol = "log_count"
    elif extension == "drop":
        # NOTE(review): with a relative error of 0.25 the returned value may
        # lie far from the true 0.1 quantile -- confirm this is intended.
        lower_bound = train.approxQuantile("count", [0.1], 0.25)
        # Skip the filter if no quantile came back (e.g. no rows to inspect).
        if lower_bound:
            train = train.filter(train["count"] > int(lower_bound[0]))

    ### index user_id / track_id; rows with labels unseen during fit are skipped
    user_indexer = StringIndexer(inputCol="user_id",
                                 outputCol="user_id_indexed",
                                 handleInvalid='skip')
    track_indexer = StringIndexer(inputCol="track_id",
                                  outputCol="track_id_indexed",
                                  handleInvalid='skip')
    pipeline = Pipeline(stages=[user_indexer, track_indexer])
    indexing_model = pipeline.fit(train)  # indices are learned from train only

    ### transform the datasets and register them as temp views
    train = indexing_model.transform(train)
    train.createOrReplaceTempView("train")
    val = indexing_model.transform(val)
    val.createOrReplaceTempView("val")

    # one row per validation user holding the list of that user's tracks
    val_groupby = spark.sql(
        "select user_id_indexed, collect_list(track_id_indexed) "
        "track_id_indexed_collections from val group by user_id_indexed")
    val_groupby.createOrReplaceTempView("val_groupby")

    ### hyper-parameter grid
    # .tolist() yields plain Python ints/floats, so Spark's parameter type
    # conversion does not depend on the numpy scalar dtype.
    rank_grid = np.arange(4, 10, 2).tolist()
    regParam_grid = np.linspace(0.01, 0.2, 3).tolist()
    alpha_grid = np.linspace(0.5, 2, 3).tolist()
    paramGrid = list(itertools.product(rank_grid, regParam_grid, alpha_grid))

    MAP_lst = []  # MAP for each config, in paramGrid order
    precision_at_500_lst = []  # precision at 500 for each config

    for rank_, regParam_, alpha_ in paramGrid:
        # cold start strategy 'drop' avoids NaN evaluation metrics
        als = ALS(rank=rank_,
                  regParam=regParam_,
                  alpha=alpha_,
                  implicitPrefs=True,
                  userCol="user_id_indexed",
                  itemCol="track_id_indexed",
                  ratingCol=ratingCol,
                  coldStartStrategy="drop")
        model = als.fit(train)

        # top 500 recommendations per user; keep only the recommended track
        # indices, as a list column named pred_list
        userRecs = model.recommendForAllUsers(500)
        userRecs = userRecs.select(
            userRecs.user_id_indexed,
            userRecs.recommendations.track_id_indexed.alias("pred_list"))
        userRecs.createOrReplaceTempView("userRecs")

        # pair predictions with validation tracks; the inner join keeps only
        # users present on both sides
        combined_df = spark.sql(
            '''select val_groupby.user_id_indexed user_id_indexed,
                      userRecs.pred_list pred_list,
                      val_groupby.track_id_indexed_collections track_id_indexed_collections
               from userRecs
               inner join val_groupby
               on val_groupby.user_id_indexed = userRecs.user_id_indexed''')

        # RankingMetrics takes an RDD of (predicted list, true list) pairs
        predLabelsTuple = combined_df.rdd.map(
            lambda r: (r.pred_list, r.track_id_indexed_collections))
        metrics = RankingMetrics(predLabelsTuple)
        MAP = metrics.meanAveragePrecision
        precision_at_500 = metrics.precisionAt(500)
        MAP_lst.append(MAP)
        precision_at_500_lst.append(precision_at_500)

        # print out validation evaluation result
        print("---------------------------------------")
        print("configs: \n")
        print("rank = " + str(rank_) + " , regParam = " + str(regParam_) +
              " , alpha = " + str(alpha_))
        print("\n")
        print("MAP = " + str(MAP))
        print("Precision at 500 = " + str(precision_at_500))

    # the config with the highest MAP wins (first one on ties)
    best_index = MAP_lst.index(max(MAP_lst))
    rank_opt, regParam_opt, alpha_opt = paramGrid[best_index]
    print("---------------------------------------")
    print("optimal configs: \n")
    print("rank = " + str(rank_opt) + " , regParam = " + str(regParam_opt) +
          " , alpha = " + str(alpha_opt))
    print("\n")
    print("MAP = " + str(MAP_lst[best_index]))
    print("Precision at 500 =" + str(precision_at_500_lst[best_index]))