Example #1
import itertools as it
import random

from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.mllib.evaluation import RankingMetrics
from pyspark.sql import functions as F
from pyspark.sql.functions import expr


def main(spark, log_comp=False, drop_low=False, drop_thr=0):
    '''Train and evaluate ALS models over a small hyperparameter grid.

    Parameters
    ----------
    spark : SparkSession object

    log_comp : bool, whether to log-compress the interaction counts

    drop_low : bool, whether to drop interactions with counts at or below drop_thr

    drop_thr : int, count threshold used when drop_low is True
    '''
    ## Load in datasets
    train_path = 'hdfs:/user/bm106/pub/project/cf_train.parquet'
    val_path = 'hdfs:/user/bm106/pub/project/cf_validation.parquet'
    test_path = 'hdfs:/user/bm106/pub/project/cf_test.parquet'

    train = spark.read.parquet(train_path)
    val = spark.read.parquet(val_path)
    test = spark.read.parquet(test_path)

    ## Downsample the data
    # Pick out user list in training set
    user_train = set(row['user_id']
                     for row in train.select('user_id').distinct().collect())
    # Pick out user list in validation set
    user_val = set(row['user_id']
                   for row in val.select('user_id').distinct().collect())
    # Get the previous 1M users
    user_prev = list(user_train - user_val)
    # Random sampling to get 20%
    k = int(0.2 * len(user_prev))
    user_prev_filtered = random.sample(user_prev, k)
    train = train.where(train.user_id.isin(user_prev_filtered +
                                           list(user_val)))

    ## Create StringIndexer
    indexer_user = StringIndexer(inputCol="user_id",
                                 outputCol="user_id_indexed",
                                 handleInvalid='skip')
    indexer_user_model = indexer_user.fit(train)
    indexer_track = StringIndexer(inputCol="track_id",
                                  outputCol="track_id_indexed",
                                  handleInvalid='skip')
    indexer_track_model = indexer_track.fit(train)

    train = indexer_user_model.transform(train)
    train = indexer_track_model.transform(train)

    val = indexer_user_model.transform(val)
    val = indexer_track_model.transform(val)

    test = indexer_user_model.transform(test)
    test = indexer_track_model.transform(test)

    ## ALS model
    rank_ = [5, 10, 20]
    regParam_ = [0.1, 1, 10]
    alpha_ = [1, 5, 10]
    param_grid = it.product(rank_, regParam_, alpha_)

    ## Pick out users from validation set
    user_id = val.select('user_id_indexed').distinct()
    true_label = val.select('user_id_indexed', 'track_id_indexed')\
                    .groupBy('user_id_indexed')\
                    .agg(expr('collect_list(track_id_indexed) as true_item'))

    ## Log-Compression
    ## count -> log(1+count)
    if log_comp:
        train = train.select('*', F.log1p('count').alias('count_log1p'))
        val = val.select('*', F.log1p('count').alias('count_log1p'))
        rateCol = "count_log1p"
    else:
        rateCol = "count"

    ## Drop interactions whose counts are at or below the specified threshold
    if drop_low:
        train = train.filter(train['count'] > drop_thr)
        val = val.filter(val['count'] > drop_thr)

    for i in param_grid:
        print('Start Training for {}'.format(i))
        als = ALS(rank=i[0], maxIter=10, regParam=i[1],
                  userCol="user_id_indexed", itemCol="track_id_indexed",
                  ratingCol=rateCol, implicitPrefs=True,
                  alpha=i[2], nonnegative=True, coldStartStrategy="drop")
        model = als.fit(train)
        print('Finish Training for {}'.format(i))

        # Make top 500 recommendations for users in the validation set
        res = model.recommendForUserSubset(user_id, 500)
        pred_label = res.select('user_id_indexed',
                                'recommendations.track_id_indexed')

        pred_true_rdd = pred_label.join(F.broadcast(true_label), 'user_id_indexed', 'inner') \
                    .rdd \
                    .map(lambda row: (row[1], row[2]))

        print('Start Evaluating for {}'.format(i))
        metrics = RankingMetrics(pred_true_rdd)
        map_ = metrics.meanAveragePrecision
        ndcg = metrics.ndcgAt(500)
        mpa = metrics.precisionAt(500)
        print(i, 'map score: ', map_, 'ndcg score: ', ndcg,
              'precision at 500: ', mpa)

Example #2
# In[ ]:

# partition dataframe
(training, test) = sp_df_1000.randomSplit([0.8, 0.2])

# ### Model # 1
# Build the recommendation model using ALS on the training data

# In[ ]:

# build ALS recommendation model
als = ALS(
    maxIter=5,
    regParam=0.01,
    rank=10,  # number of latent factors
    alpha=30,
    implicitPrefs=True,  # ratings are implicit feedback
    userCol="patent_firstnamed_assignee_id",
    itemCol="patent_number",
    ratingCol="rating",
    coldStartStrategy="nan")  # "nan" retains NaN predictions for unseen users/items

# In[ ]:

# fit ALS model to the training set
model = als.fit(training)

# #### Model #1 - Evaluation - Compare to naive baseline
# Compare the model's evaluation result with a naive baseline that always outputs the average rating (for explicit feedback; you could also try a baseline that outputs the average rating per movie).
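#
# A minimal sketch of such a baseline (an assumption, reusing the
# `training`/`test` split above and the "rating" column): predict the global
# average rating for every row and score it with an RMSE evaluator.

# In[ ]:

from pyspark.sql import functions as F
from pyspark.ml.evaluation import RegressionEvaluator

global_avg = training.agg(F.avg("rating")).first()[0]
baseline_predictions = test.withColumn("prediction", F.lit(global_avg))
baseline_rmse = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction").evaluate(baseline_predictions)
print("Naive baseline RMSE = " + str(baseline_rmse))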

# #### Model #1 - Optimize model
    final_indexed_save = os.path.join('dataset', 'review_vegas_als.parquet')
    reviewDF = ss.read.parquet(final_indexed_save)

    logger.error('Number of reviews for Las Vegas is {}'.format(
        reviewDF.count()))
    #Split data into training and testing sets
    training_set, testing_set = reviewDF.randomSplit([0.97, 0.03])
    logger.error('Size of Training set is {}'.format(training_set.count()))
    logger.error('Size of Testing set is {}'.format(testing_set.count()))
    logger.error('{} seconds has elapsed'.format(time.time() - start_time))
    #build ALS learning model
    als = ALS(rank=8,
              maxIter=20,
              regParam=0.25,
              userCol="user_id_int",
              itemCol="business_id_int",
              ratingCol="stars_long",
              coldStartStrategy="drop")
    model = als.fit(training_set)
    logger.error('Model fitting done')
    logger.error('{} seconds has elapsed'.format(time.time() - start_time))
    #save model to file
    model_save = os.path.join('dataset', 'als_model_vegas.parquet')
    #model.write().mode('overwrite').parquet(model_save)
    #Evaluate the model using RMSE on the test set
    predictions = model.transform(testing_set)
    logger.error('Model transformation done')
    logger.error('{} seconds has elapsed'.format(time.time() - start_time))
    logger.error('Calculating RMSE value')
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="stars_long",
                                    predictionCol="prediction")
def tune_ALS_map(train_read, val_read, val_true_list, iteration, regParams,
                 ranks):
    """
    grid search function to select the best model based on RMSE of
    validation data
    Parameters
    ----------
    train_data: spark DF with columns ['userId', 'movieId', 'rating']
    
    validation_data: spark DF with columns ['userId', 'movieId', 'rating']
    
    maxIter: int, max number of learning iterations
    
    regParams: list of float, one dimension of hyper-param tuning grid
    
    ranks: list of float, one dimension of hyper-param tuning grid
    
    Return
    ------
    The best fitted ALS model with lowest RMSE score on validation data
    """
    # initial
    min_error = float('inf')
    best_iter1 = -1
    best_rank1 = -1
    best_regularization1 = 0
    best_model_rmse = None
    max_map = 0.0
    best_iter2 = -1
    best_rank2 = -1
    best_regularization2 = 0
    best_model_map = None
    for current_rank in ranks:
        for reg in regParams:
            # get ALS model
            #als = ALS().setMaxIter(iteration).setRank(rank).setRegParam(reg)
            als = ALS(maxIter=iteration,
                      regParam=reg,
                      rank=current_rank,
                      userCol='user_id',
                      itemCol='book_id',
                      ratingCol='rating',
                      coldStartStrategy="drop",
                      nonnegative=True)
            # train ALS model
            train_read = train_read.checkpoint()  # checkpoint returns a new DataFrame
            model_read = als.fit(train_read)
            # evaluate the model by computing the RMSE on the validation read data
            predictions_read = model_read.transform(val_read)
            # combine predictions on read data with the precomputed global
            # predictions on unread data (predictions_unread)
            predictions_all = predictions_read.union(predictions_unread)
            # select the top 500 books for each user to evaluate
            window = Window.partitionBy(predictions_all['user_id']).orderBy(
                predictions_all['prediction'].desc())
            val_pred_order = predictions_all.select(
                '*',
                rank().over(window).alias('rank')).filter(col('rank') <= 500)
            evaluator = RegressionEvaluator(metricName="rmse",
                                            labelCol="rating",
                                            predictionCol="prediction")
            rmse = evaluator.evaluate(val_pred_order)

            if rmse < min_error:
                min_error = rmse
                best_rank1 = current_rank
                best_regularization1 = reg
                best_iter1 = iteration
                best_model_rmse = model_read

            # evaluate the model by computing the MAP on the validation data
            val_pred_list = val_pred_order.select(
                'user_id', 'book_id').groupBy('user_id').agg(
                    expr('collect_list(book_id) as books'))
            val_RDD = val_pred_list.join(
                val_true_list, 'user_id').rdd.map(lambda row: (row[1], row[2]))
            val_RDD.checkpoint()
            rankingMetrics = RankingMetrics(val_RDD)
            current_map = rankingMetrics.meanAveragePrecision

            if current_map > max_map:
                max_map = current_map
                best_rank2 = current_rank
                best_regularization2 = reg
                best_iter2 = iteration
                best_model_map = model_read

            print('{} latent factors and regularization = {} with maxIter {}: '
                  'validation RMSE is {}, '
                  'validation MAP is {}'.format(current_rank, reg, iteration,
                                                rmse, current_map))
            with open('train01_read_eval.csv', 'ab') as f:
                np.savetxt(f, [
                    np.array([iteration, current_rank, reg, rmse, current_map])
                ],
                           delimiter=",")

    print('\nThe best model selected by RMSE has {} latent factors and '
          'regularization = {} '
          'with maxIter = {}'.format(best_rank1, best_regularization1,
                                     best_iter1))
    print('\nThe best model selected by MAP has {} latent factors and '
          'regularization = {} '
          'with maxIter = {}'.format(best_rank2, best_regularization2,
                                     best_iter2))

    return best_model_rmse, best_model_map
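

# Example usage (hypothetical values; train_read, val_read, val_true_list and
# the global predictions_unread must already be built, as elsewhere in this
# script):
# best_rmse_model, best_map_model = tune_ALS_map(
#     train_read, val_read, val_true_list,
#     iteration=20, regParams=[0.01, 0.1], ranks=[10, 20])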
Example #5
stringIndexer = StringIndexer(inputCols=["item", "user"],
                              outputCols=["itemIndex", "userIndex"])
model = stringIndexer.fit(df)
df_indexed = model.transform(df)
df_indexed.show(n=5)


# In[5]:


#split the data into training and testing set
(training, test) = df_indexed.randomSplit([0.8, 0.2])
#training the model
#define the model parameters
als = ALS(maxIter=5, 
          implicitPrefs=False,
          userCol="userIndex", 
          itemCol="itemIndex", 
          ratingCol="rating",
          coldStartStrategy="drop")
#train the model
model = als.fit(training)


# In[6]:


# predict using the testing dataset
predictions = model.transform(test)
predictions.show()


# In[8]:
Example #6
+-------+------------------+------------------+------------------+------------------+-------------------+
|summary|          movie_id|       Customer_id|            Rating|      Release_Year|        Movie_Title|
+-------+------------------+------------------+------------------+------------------+-------------------+
|  count|           4810288|           4810288|           4810288|           4810288|            4810288|
|   mean|2308.3037520830353|1321990.7265673075| 3.599072030614383|1994.4039521126385|           Infinity|
| stddev| 1303.490951113763| 764565.5373457455|1.0861053257733961|12.602776332955534|                NaN|
|    min|                 1|                 6|                 1|              1915|'N Sync: 'N the Mix|
|    max|              4499|           2649429|                 5|              2005|           s-Cry-ed|
+-------+------------------+------------------+------------------+------------------+-------------------+
"""

# In[11]:

## Building the Recommendation Model (ALS) using Collaborative Filtering
# use coldStartStrategy="drop" to avoid NaN values in the RMSE
Recommendation = ALS(userCol="Customer_id",
                     itemCol="movie_id",
                     ratingCol="Rating",
                     coldStartStrategy="drop")
Recommendation_model = Recommendation.fit(training)

# In[12]:

##Evaluating the model
predictions = Recommendation_model.transform(test)
predictions.select('Rating', 'prediction').show()

evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="Rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("")
print("Root-mean-square error (RMSE) = " + str(rmse))
df = spark.read.format('csv').\
                options(header='false',
                        delimiter='\t',
                        inferschema='true').\
                load("u.data")
# Renaming the column headers
df2 = df.withColumnRenamed("_c0", "userid") \
        .withColumnRenamed("_c1", "itemid") \
        .withColumnRenamed("_c2", "rating") \
        .withColumnRenamed("_c3", "timestamp")
#Displaying the source file
df2.show()

"""Step 2: Build a recommendation model using Alternating Least Squares"""

# split training and testing
(training, test) = df2.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data
als = ALS(userCol="userid", itemCol="itemid", ratingCol="rating", nonnegative=True)
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# Model results in RMSE = nan due to cold start problem

"""RMSE for first model is nan due to cold start problem

Step 4 : Resolving Cold start problem and improving performance by cross validation
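
# A minimal sketch of that step (an assumption, reusing the df2 split and the
# `evaluator` above): coldStartStrategy="drop" excludes unseen users/items
# from the RMSE, and CrossValidator searches a small hyperparameter grid.
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

als_cv = ALS(userCol="userid", itemCol="itemid", ratingCol="rating",
             nonnegative=True, coldStartStrategy="drop")
grid = ParamGridBuilder() \
    .addGrid(als_cv.rank, [10, 50]) \
    .addGrid(als_cv.regParam, [0.01, 0.1]) \
    .build()
cv = CrossValidator(estimator=als_cv, estimatorParamMaps=grid,
                    evaluator=evaluator, numFolds=3)
cv_model = cv.fit(training)
print("CV RMSE = " + str(evaluator.evaluate(cv_model.transform(test))))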
Example #8
    items, item_map = id_map(df, 'itemID')

    def mapper(row):
        return Row(userID=user_map[row['userID']],
                   itemID=item_map[row['itemID']],
                   rating=row['rating_cat'])

    df_mapped = df.rdd.map(mapper).toDF()
    training, validation, test = df_mapped.randomSplit([0.6, 0.2, 0.2],
                                                       seed=12345)

    numTraining = training.count()
    numValidation = validation.count()
    numTest = test.count()

    als = ALS(rank=50, userCol="userID", itemCol="itemID", ratingCol="rating")
    model = als.fit(training)
    valPredictions = model.transform(validation)
    # cast to double: the evaluator requires numeric label/prediction columns
    val_predictions = valPredictions\
                  .withColumn("rating", valPredictions.rating.cast("double"))\
                  .withColumn("prediction", valPredictions.prediction.cast("double"))
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                  labelCol="rating")
    validation_score = evaluator.evaluate(val_predictions)

    testPredictions = model.transform(test)
    test_predictions = testPredictions\
                       .withColumn("rating", testPredictions.rating.cast("double"))\
                       .withColumn("prediction", testPredictions.prediction.cast("double"))
    test_score = evaluator.evaluate(test_predictions)

    print "Training: %d, validation: %d, test: %d" % (numTraining,
Example #9
    def create_spark_model_vectors_df(
            self, df: DataFrame) -> _UserResourceFeatureVectorMapping:
        tenant_col = self.tenant_col
        indexed_user_col = self.indexed_user_col
        user_vec_col = self.user_vec_col
        indexed_res_col = self.indexed_res_col
        res_vec_col = self.res_vec_col
        max_iter = self.max_iter
        distinct_tenants = df.select(tenant_col).distinct().cache()
        num_tenants = distinct_tenants.count()
        separate_tenants = self.separate_tenants
        num_blocks = self.num_blocks if self.num_blocks is not None else (
            num_tenants if not separate_tenants else 10)

        als = ALS(rank=self.rank_param,
                  maxIter=max_iter,
                  regParam=self.reg_param,
                  numUserBlocks=num_blocks,
                  numItemBlocks=num_blocks,
                  implicitPrefs=self.apply_implicit_cf,
                  userCol=self.indexed_user_col,
                  itemCol=self.indexed_res_col,
                  ratingCol=self.scaled_likelihood_col,
                  nonnegative=True,
                  coldStartStrategy='drop')

        alpha = self.alpha_param

        if alpha is not None:
            als.setAlpha(alpha)

        if separate_tenants:
            tenants = [
                row[tenant_col]
                for row in distinct_tenants.orderBy(tenant_col).collect()
            ]

            user_mapping_df: Optional[DataFrame] = None
            res_mapping_df: Optional[DataFrame] = None

            for curr_tenant in tenants:
                curr_df = df.filter(f.col(tenant_col) == curr_tenant).cache()
                curr_user_mapping_df, curr_res_mapping_df = self._train_cf(
                    als, curr_df)

                user_mapping_df = user_mapping_df.union(
                    curr_user_mapping_df
                ) if user_mapping_df is not None else curr_user_mapping_df

                res_mapping_df = res_mapping_df.union(
                    curr_res_mapping_df
                ) if res_mapping_df is not None else curr_res_mapping_df
        else:
            user_mapping_df, res_mapping_df = self._train_cf(als, df)

        assert user_mapping_df is not None and res_mapping_df is not None

        return _UserResourceFeatureVectorMapping(tenant_col, indexed_user_col,
                                                 user_vec_col, indexed_res_col,
                                                 res_vec_col, None, None, None,
                                                 user_mapping_df,
                                                 res_mapping_df)
# MAGIC Using the ML Pipeline's [CrossValidator](http://spark.apache.org/docs/1.6.2/api/python/pyspark.ml.html#pyspark.ml.tuning.CrossValidator) with ALS is thus problematic, because cross validation involves dividing the training data into a set of folds (e.g., three sets) and then using those folds for testing and evaluating the parameters during the parameter grid search process. It is likely that some of the folds will contain users that are not in the other folds, and, as a result, ALS produces NaN values for those new users. When the CrossValidator uses the Evaluator (RMSE) to compute an error metric, the RMSE algorithm will return NaN. This will make *all* of the parameters in the parameter grid appear to be equally good (or bad).
# MAGIC
# MAGIC You can read the discussion on [Spark JIRA 14489](https://issues.apache.org/jira/browse/SPARK-14489) about this issue. There are proposed workarounds of having ALS provide default values or having RMSE drop NaN values. Both introduce potential issues. We have chosen to have RMSE drop NaN values. While this does not solve the underlying issue of ALS not predicting a value for a new user, it does provide some evaluation value. We manually implement the parameter grid search process using a for loop (below) and remove the NaN values before using RMSE.
# MAGIC
# MAGIC For a production application, you would want to consider the tradeoffs in how to handle new users.
# MAGIC
# MAGIC **Note**: This cell will likely take a couple of minutes to run.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
# This step is broken in ML Pipelines: https://issues.apache.org/jira/browse/SPARK-14489
from pyspark.ml.recommendation import ALS

# Let's initialize our ALS learner
als = ALS()

# Now we set the parameters for the method
als.setMaxIter(5)\
   .setSeed(seed)\
   .setRegParam(0.1)\
   .setItemCol("movieId")\
   .setUserCol("userId")\
   .setRatingCol("rating")

# Now let's compute an evaluation metric for our test dataset
from pyspark.ml.evaluation import RegressionEvaluator

# Create an RMSE evaluator using the label and predicted columns
reg_eval = RegressionEvaluator(predictionCol="prediction",
                               labelCol="rating",
                               metricName="rmse")
                         "countScaled").rdd.map(extract).toDF(
                             ['StreetName', 'ViolationCode', 'count'])
dfTest = dfTest.select("StreetName", "ViolationCode",
                       "countScaled").rdd.map(extract).toDF(
                           ['StreetName', 'ViolationCode', 'count'])

#
indexModel = indexer.setHandleInvalid("skip").fit(dfTrain)
dfTrain = indexModel.transform(dfTrain)
dfTest = indexModel.transform(dfTest)

#Building the recommendation model using ALS on the training data
als = ALS(rank=200,
          maxIter=2,
          regParam=0.01,
          userCol="StreetCode",
          itemCol="ViolationCode",
          ratingCol="count",
          coldStartStrategy="drop",
          implicitPrefs=True)
recommModel = als.fit(dfTrain)

#Evaluate the model by computing the RMSE on the test data
predictions = recommModel.transform(dfTest)
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="count",
                                predictionCol="prediction")
RMSE = evaluator.evaluate(predictions)
print("RMSE of Test:", RMSE)  #Observed value: 0.44057324770617523

#Let's now predict the top 10 possible violation codes for each street using the model built
streetWithPossibleViolations = recommModel.recommendForAllUsers(10)
Example #12
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import col, expr
from pyspark.mllib.evaluation import RankingMetrics
from pyspark.ml.evaluation import RegressionEvaluator

spark = SparkSession.builder.appName('Recommendation_system').getOrCreate()


df_training = spark.read.parquet('hdfs:/user/tb2517/pub/goodreads/training_sample_10p.parquet')
df_validation = spark.read.parquet('hdfs:/user/tb2517/pub/goodreads/validation_sample_10p.parquet')
df_test = spark.read.parquet('hdfs:/user/tb2517/pub/goodreads/testing_sample_10p.parquet')



als = ALS(userCol="user_id", itemCol="book_id", ratingCol="rating",
          coldStartStrategy="drop", nonnegative=True)

param_grid = ParamGridBuilder() \
    .addGrid(als.rank, [15, 25, 35]) \
    .addGrid(als.maxIter, [5, 8, 10]) \
    .addGrid(als.regParam, [0.08, 0.09, 0.10]) \
    .build()
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=3)
model=cv.fit(df_training)

best_model = model.bestModel

print("Tuned Hyperparameters:-------------")
print("Rank: ", best_model._java_obj.parent().getRank())
print("MaxIter: ", best_model._java_obj.parent().getMaxIter())
print("RegParam: ", best_model._java_obj.parent().getRegParam())

print("Recommendations: ------------------------------")
Example #13
# Read in data
df = pd.read_csv('downloads/ml-20m/ratings.csv',sep = ',',usecols = ['userId','movieId','rating'])
dev = df.sample(n=8000)

# Enable Arrow-based columnar data transfers
spark = SparkSession.builder.appName('pandasToSparkDF').getOrCreate()
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

# Create a Spark DataFrame from a pandas DataFrame using Arrow
ratings = spark.createDataFrame(dev)
(training, test) = ratings.randomSplit([0.8,0.2])



# ALS
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(training)

# Grid search
paramGrid = ParamGridBuilder()\
    .addGrid(als.rank, [4,8,12]) \
    .addGrid(als.regParam, [0.1,1,10])\
    .addGrid(als.maxIter, [5,10,15])\
    .addGrid(als.alpha, [1,2,3])\
    .build()
    
# Tune hyperparameters; the evaluator scores each candidate by RMSE
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
tvs = TrainValidationSplit(estimator=als,
                           estimatorParamMaps=paramGrid,
                           evaluator=evaluator,
                           trainRatio=0.8)
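
# Fit the split and score the best model on the held-out test set
# (a sketch; assumes the `training`/`test` split created above):
tvs_model = tvs.fit(training)
best_als = tvs_model.bestModel
print("Test RMSE = " + str(evaluator.evaluate(best_als.transform(test))))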
Example #14
#    for rank in ranks:
#        for alpha in alphas:
#            model = ALS(implicitPrefs=True, userCol="userId", itemCol="artistId", ratingCol="count", rank=rank, seed=seed, maxIter=iterations,regParam=lambda_,alpha=alpha).fit(training)
#            print('for rank {0} at alpha: {1} and lambda: {2},'.format(rank, alpha, lambda_))
#            print('meanAUC =', areaUnderCurve(test, bTopItemIDs, model.transform), 'for ALS-PREDICTION')

# ### Now using the best parameters obtained by grid search to calculate meanAUC for all artists

# In[12]:

# Model 1- rank= 40 ; alpha= 20; lambda= 0.01
model1 = ALS(implicitPrefs=True,
             userCol="userId",
             itemCol="artistId",
             ratingCol="count",
             rank=40,
             seed=seed,
             maxIter=iterations,
             regParam=0.01,
             alpha=20).fit(training)
print('meanAUC =', areaUnderCurve(test, bAllItemIDs, model1.transform),
      'for ALS-PREDICTION for Model 1')

# Model 2- rank= 40; alpha= 20; lambda= 0.1
model2 = ALS(implicitPrefs=True,
             userCol="userId",
             itemCol="artistId",
             ratingCol="count",
             rank=40,
             seed=seed,
             maxIter=iterations,
             regParam=0.1,
             alpha=20).fit(training)
Example #15
  ratingsRDD = lines.map(parse_rating)
  print(ratingsRDD.count())
  lines = spark.read.text('gender.dat').rdd
  users = dict(lines.map(parse_user).collect())

  ratings = spark.createDataFrame(ratingsRDD)
  (training, test) = ratings.randomSplit([0.8, 0.2])

  num_training = training.count()
  num_validation = test.count()

  print('Training: %d' % num_training)
  print('Validation: %d' % num_validation)

  # setup ALS
  rank = 8
  num_iterations = 8
  lambda_ = 0.1

  als = ALS(maxIter=num_iterations, regParam=lambda_, userCol="userID",
            itemCol="profileID", ratingCol="rating")
  model = als.fit(training)

  # Evaluate the model by computing the RMSE on the test data
  predictions = model.transform(test)
  evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                  predictionCol="prediction")
  rmse = evaluator.evaluate(predictions)
  print("Root-mean-square error = " + str(rmse))

  spark.stop()
Example #16
from pyspark.ml.recommendation import ALS

# split the data into training and test datasets
train, test = indexed.randomSplit([0.75, 0.25])

# count number of records in train set
train.count()

# count number of records in test set
test.count()

# Training the recommender model using the train dataset
rec = ALS(maxIter=10,
          regParam=0.01,
          userCol='userId',
          itemCol='title_new',
          ratingCol='rating',
          nonnegative=True,
          coldStartStrategy="drop")

# fit the model on train set
rec_model = rec.fit(train)

# making predictions on test set
predicted_ratings = rec_model.transform(test)

# columns in predicted ratings dataframe
predicted_ratings.printSchema()

# predicted vs actual ratings for test set
predicted_ratings.orderBy(rand()).show(10)
Example #17
books_file = os.path.join(os.getcwd(), 'csv', 'books.csv')
books_file_converted = os.path.join(os.getcwd(), 'csv', 'books_converted.csv')
reader = list(csv.reader(open(books_file, "r"), delimiter=','))
writer = csv.writer(open(books_file_converted, 'w'), delimiter=';')
writer.writerows(row for row in reader)

# Load the books dataset
books_df = spark.read.load(books_file_converted, format="csv", sep=";", inferSchema="true", header="true").na.drop()
# Just take book_id and title columns
books_df.createOrReplaceTempView("books")
books_df_selected = spark.sql("SELECT `book_id`, `title` \
                                FROM books")

# Check number of data
print ("There are " + str(ratings_df.count()) + " ratings in this dataset")
print ("There are " + str(books_df_selected.count()) + " books in this dataset")

### ALS Algorithm
# Split ratings data become training set and test set
(training, test) = ratings_df.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="book_id", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))
Example #18
def getALSReco(UID):
    spark = SparkSession.builder.appName('Recommendation_system').getOrCreate()

    df = spark.read.csv("C:\\Users\HP ITFAC\\Desktop\\FYP\\datasets\\MovieManytoMany_dataframe.csv")
    df = df.withColumnRenamed("_c0", "UID") \
           .withColumnRenamed("_c1", "MID") \
           .withColumnRenamed("_c2", "score")  # the remaining _c3 column is unused below
    # df.show(100,truncate=True)

    nd = df.select(df['UID'], df['MID'], df['score'])

    nd.show()
    # data frame is ok now
    # transform the dataset to int values
    indexer = [StringIndexer(inputCol=column, outputCol=column + "_index") for column in
               list(set(nd.columns) - set(['score']))]
    pipeline = Pipeline(stages=indexer)
    transformed = pipeline.fit(nd).transform(nd)

    changedTypedf = transformed.withColumn("score", transformed["score"].cast(DoubleType()))
    transformed = changedTypedf
    transformed.printSchema()
    transformed.show()
    trans = transformed.toPandas()

    # split training and test datasets
    (training, test) = transformed.randomSplit([0.8, 0.2])

    # fit dataset to als
    als = ALS(maxIter=5, regParam=0.09, rank=25, userCol="UID_index", itemCol="MID_index", ratingCol="score",
              coldStartStrategy="drop", nonnegative=True)
    model = als.fit(training)

    # evaluate model
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="score", predictionCol="prediction")
    predictions = model.transform(test)
    rmse = evaluator.evaluate(predictions)
    print("RMSE=" + str(rmse))
    predictions.show()

    #user_recs = model.recommendForAllUsers(50).show(10)

    dataset = model.recommendForAllUsers(50).toPandas()
    dataset1 = predictions.toPandas()


    dataset1.query('UID=='+str(UID)).head()
    result = pd.merge( dataset, dataset1, left_on='UID_index', right_on='UID_index')
    result.query('UID=='+str(UID)).head()
    recos = result['recommendations'].iloc[0]
    reco = str(recos)
    MID_indexList = []
    ratingList = []
    aa = reco.replace('[', '')
    moviestr3 = aa.replace("]", "")
    moviestr0 = moviestr3.replace("(", "")
    moviestr2 = moviestr0.replace("rating=", "")
    moviestr1 = moviestr2.replace("RowMID_index=", "")
    moviestr = moviestr1.split('),')
    # print(moviestr)
    for i in moviestr:
        # print(i)
        rates = i.replace(")", "")
        score = rates.split(',')
        MID_indexList.append(score[0])
        ratingList.append(score[1])


    data = {'MID_index': [], "rating": []}
    recoDF = pd.DataFrame(data)
    recoDF["MID_index"] = MID_indexList
    recoDF["rating"] = ratingList

    recoDF["MID_index"] = pd.to_numeric(recoDF["MID_index"])
    recoDF["rating"] = pd.to_numeric(recoDF["rating"])

    tran = trans[['MID_index', 'MID']]
    Finalresult = pd.merge(recoDF, tran, left_on='MID_index', right_on='MID_index')
    Finalresult = Finalresult[['MID','rating']]
    Finalresult.drop_duplicates(keep=False,inplace=True)

    print("*******************************************************************************************")
    print(Finalresult)
    return Finalresult
def als_model(training):
    als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="item_id", ratingCol="rating",
              coldStartStrategy="drop")
    model = als.fit(training)
    model.save("./../data/als_model11")
    return model
Example #20
    'funding_id',
    'backedAmount',
))

ratings.filter("user_id is NULL").show()

(training, test) = ratings.randomSplit([0.8, 0.2], seed=13)
training.show()
test.show()

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(rank=50,
          maxIter=20,
          regParam=0.01,
          userCol="user_id",
          itemCol="funding_id",
          ratingCol="backedAmount",
          coldStartStrategy="drop",
          implicitPrefs=False)
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="backedAmount",
                                predictionCol="prediction")

rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

userRecs = model.recommendForAllUsers(10)
val_new = val.withColumn(
    'rating',
    when(val.is_read == 0, float('nan')).otherwise(val.rating))
val_read = val_new.na.drop()
val_unread = val.subtract(val_read)

#num_iters = [5,10,15,20]
iteration = 20
reg_params = [0.001, 0.01, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0]
ranks = [8, 10, 12, 14, 16, 18, 20]

# build the unread model first
als = ALS(maxIter=5,
          regParam=0.0,
          rank=10,
          userCol='user_id',
          itemCol='book_id',
          ratingCol='rating',
          coldStartStrategy="drop",
          nonnegative=True)
model_unread = als.fit(train_unread)
predictions_unread = model_unread.transform(val_unread)


def tune_ALS_map(train_read, val_read, val_true_list, iteration, regParams,
                 ranks):
    """
    grid search function to select the best model based on RMSE of
    validation data
    Parameters
    ----------
    train_data: spark DF with columns ['userId', 'movieId', 'rating']
Example #22
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

# read data
client = boto3.client('s3')
obj = client.get_object(Bucket='yelpdatacf', Key='yelp_public_cf.csv')
df = pd.read_csv(obj['Body'])

# mean-center the ratings
df.review_stars = df.review_stars - df.review_stars.mean()
ratings = spark.createDataFrame(df)

# use the model that has min RMSE
num_iter, param = 100, 0.2
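
# A sketch of the kind of sweep that could have produced these values
# (hypothetical grid and split; column names match the ALS call below):
# import itertools
# from pyspark.ml.evaluation import RegressionEvaluator
# train_df, val_df = ratings.randomSplit([0.8, 0.2], seed=42)
# rmse_eval = RegressionEvaluator(metricName="rmse", labelCol="review_stars",
#                                 predictionCol="prediction")
# for it_, reg_ in itertools.product([10, 50, 100], [0.05, 0.1, 0.2]):
#     candidate = ALS(maxIter=it_, regParam=reg_, userCol="user_int_id",
#                     itemCol="bus_id", ratingCol="review_stars",
#                     coldStartStrategy="drop").fit(train_df)
#     print(it_, reg_, rmse_eval.evaluate(candidate.transform(val_df)))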
als = ALS(
    maxIter=num_iter,
    regParam=param,
    userCol="user_int_id",
    itemCol="bus_id",
    ratingCol="review_stars",
    coldStartStrategy="drop")  # implicitPrefs=True could be set for implicit feedback
model = als.fit(ratings)

user_feature = model.userFactors
business_feature = model.itemFactors

k = 10
kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
kmeans_model = kmeans.fit(user_feature)
transformed = kmeans_model.transform(user_feature).select('id', 'prediction')
rows = transformed.collect()
df = spark.createDataFrame(rows)
Example #23
# convert to dataframe
userIdDf = spark.createDataFrame(userIdRdd) \
    .withColumnRenamed('_1', 'user_id') \
    .withColumnRenamed('_2', 'user_id_indexed')
businessIdDf = spark.createDataFrame(businessIdRdd) \
    .withColumnRenamed('_1', 'business_id') \
    .withColumnRenamed('_2', 'business_id_indexed')
# join user id zipped with index and business id with index
training = training.join(userIdDf, ['user_id'],
                         'left').join(businessIdDf, ['business_id'], 'left')
als = ALS(
    maxIter=5,
    rank=70,  # ORIGINAL
    # rank=3,
    regParam=0.01,
    # regParam=0.1,
    userCol='user_id_indexed',
    itemCol='business_id_indexed',
    ratingCol='user-business-interaction',
    coldStartStrategy='drop')
als.setSeed(seed)
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
print("this is what the regular als input and output looks like")
test.show()  # I should make my cross product have the same columns
predictions.show()
print("this is what the regular als input and output looks like")
# sys.exit()
predictions = predictions.join(user_mean, ['user_id'], 'left')
train = train.withColumn("book_id", train["book_id"].cast(IntegerType()))

test = test.withColumn("rating", test["rating"].cast(FloatType()))
test = test.withColumn("user_id", test["user_id"].cast(IntegerType()))
test = test.withColumn("book_id", test["book_id"].cast(IntegerType()))

## evaluate baseline model
# best param for baseline model
iteration = 20
reg = 0.1
current_rank = 20

als = ALS(maxIter=iteration,
          regParam=reg,
          rank=current_rank,
          userCol='user_id',
          itemCol='book_id',
          ratingCol='rating',
          coldStartStrategy="drop",
          nonnegative=True)
# train ALS model
model = als.fit(train)
# evaluate the model by computing the RMSE on the validation data
predictions = model.transform(test)
window = Window.partitionBy(predictions['user_id']).orderBy(
    predictions['prediction'].desc())
test_pred_order = predictions.select(
    '*',
    rank().over(window).alias('rank')).filter(col('rank') <= 500)

evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="rating",
                                predictionCol="prediction")

if __name__ == '__main__':

    spark = SparkSession.builder.appName("MovieRecommendation").getOrCreate()

    movie_name = loadMovieNames('./ml-100k/u.item')
    text = spark.read.text('./ml-100k/u.data').rdd  # Remember to add .rdd
    data = text.map(parseInput)

    data_frame = spark.createDataFrame(
        data).cache()  # Remember to cache to avoid reload

    als = ALS(maxIter=5,
              regParam=0.01,
              userCol="userID",
              itemCol="movieID",
              ratingCol="rating")
    model = als.fit(data_frame)

    user_rating = data_frame.filter("userID = 0")

    rating_count = data_frame.groupBy("movieID").count().filter("count > 100")
    # Create new column
    popular_movie = rating_count.select("movieID").withColumn('userID', lit(0))

    # Use data to predict recommendation using ALS model
    recommendations = model.transform(popular_movie)
    topRecommendation = recommendations.sort(
        recommendations.prediction.desc()).take(20)
Example #26
indexed = item_indexer.fit(indexed).transform(indexed)

# Join metadata for looking movie title
meta = spark.table("amazon_meta").select("product_id", "title")
indexed = indexed.join(meta, "product_id")

# Split data into train and test data set
(training, test) = indexed. \
  select("user_id_index", "product_id_index", "score", "reviewed_at", "title"). \
  randomSplit([0.6, 0.4], seed=0)

# Train and evaluate with ALS

from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
als = ALS(maxIter=5,
          userCol="user_id_index",
          itemCol="product_id_index",
          ratingCol="score")

param_grid = ParamGridBuilder().addGrid(als.regParam, [0.01, 0.1, 1.0]).build()

evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="score",
                                predictionCol="prediction")

tvs = TrainValidationSplit(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
)

model = tvs.fit(training)
spark = SparkSession.builder.appName("ALSExample").getOrCreate()

moviesSchema = StructType([ \
                     StructField("userID", IntegerType(), True), \
                     StructField("movieID", IntegerType(), True), \
                     StructField("rating", IntegerType(), True), \
                     StructField("timestamp", LongType(), True)])

names = loadMovieNames()

ratings = spark.read.option("sep", "\t").schema(moviesSchema) \
    .csv("file:///SparkCourse/ml-100k/u.data")

print("Training recommendation model...")

als = ALS().setMaxIter(5).setRegParam(0.01).setUserCol("userID").setItemCol("movieID") \
    .setRatingCol("rating")

model = als.fit(ratings)

# Manually construct a dataframe of the user ID's we want recs for
userID = int(sys.argv[1])
userSchema = StructType([StructField("userID", IntegerType(), True)])
users = spark.createDataFrame([[
    userID,
]], userSchema)

recommendations = model.recommendForUserSubset(users, 10).collect()

print("Top 10 recommendations for user ID " + str(userID))

for userRecs in recommendations:
Example #28
	print('setting ml pipeline')
	rank=8
	maxIter=10
	regParam=0.03
	alpha = 1
	implicitPrefs = True
	mlflow.log_param("rank", rank)
	mlflow.log_param("maxIter", maxIter)
	mlflow.log_param("regParam", regParam)
	mlflow.log_param("alpha", alpha)
	mlflow.log_param("implicitPrefs", implicitPrefs)

	als = ALS(
		maxIter=maxIter, 
		rank=rank, 
		regParam=regParam,
		alpha = alpha, 
		implicitPrefs=implicitPrefs,
		userCol="user_id", itemCol="product_id", ratingCol="count",
		coldStartStrategy="drop")
	# Define evaluator as RMSE
	evaluator = RegressionEvaluator(metricName="rmse",labelCol="count",predictionCol="prediction")

	print('training model...')
	tunedModel = als.fit(train_data)
	print('testing model ....')
	predictions = tunedModel.transform(test_data)
	test_metric = evaluator.evaluate(predictions)
	mlflow.log_metric('test_' + evaluator.getMetricName(), test_metric)

	print('get all predictions....')
	all_predictions = tunedModel.transform(data)
Example #29
    rev_usr_bus = spark.read.options(inferSchema=True).json(
        "/home/tanmay/IdeaProjects/BigData_Project/Output/rev_usr_bus")

    # Index business and user IDs and align with (review) rating.
    user_reviews = review_df.select("review_id", "text").withColumnRenamed("review_id", "r_review_id") \
        .join(rev_usr_bus, col("review_id") == col("r_review_id")) \
        .select("review_id", "business_id", "user_id", "r_stars")
    processed = process_review_data(user_reviews)
    user_reviews_train, user_reviews_test = processed.randomSplit([0.90, 0.1],
                                                                  seed=100)

    # Build the recommendation model using ALS on the training data
    # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
    als = ALS(maxIter=5,
              regParam=0.01,
              userCol="user_id_ind",
              itemCol="business_id_ind",
              ratingCol="r_stars",
              coldStartStrategy="drop")
    model = als.fit(user_reviews_train)

    # Divide the training set to get a sample of how the model works on a sample of the training data.
    sample_train, z = user_reviews_train.randomSplit([0.11, .89], seed=100)
    predictions = model.transform(sample_train)
    final = predictions.withColumn("score",
                                   col("prediction") - lit(3)).select(
                                       "review_id", "business_id", "user_id",
                                       "r_stars", "score")
    final.write.json(
        "/home/tanmay/IdeaProjects/BigData_Project/Output/als_similarities_train",
        mode='overwrite')
Example #30
import itertools

import numpy as np

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.mllib.evaluation import RankingMetrics
from pyspark.sql.functions import log


def main(spark, train_data, val_data, downsample=True, extension=None):
    '''Main routine for supervised training
    Parameters
    ----------
    spark : SparkSession object
    train_data : string, path to the training parquet file to load
    val_data : string, path to the validation parquet file to load
    downsample : True or False, to indicate if we should downsample the training data
    extension : None, "log" (log-compress counts) or "drop" (drop low counts)
    '''

    ### read in the files
    train = spark.read.parquet(train_data)
    val = spark.read.parquet(val_data)

    ### if down-sample: down-sample train data to a random 0.001%
    if downsample:
        train = train.sample(False, 0.00001, seed=0)
        #val = val.sample(False, 0.00001, seed = 0)

    if extension is not None:
        if extension == "log":  # log-compression
            train = train.withColumn("log_count",
                                     log("count"))  # apply log-compression

        elif extension == "drop":  # drop low counts
            lower_bound = train.approxQuantile("count", [
                0.1
            ], 0.25)  # treat the 0.1 quantile of count data as the lower bound
            train = train.filter(
                train["count"] > int(lower_bound[0])
            )  # filter out count data rows lower than the lower bound

    ### transform dataframe columns: user_id, track_id from string to float and put them in the pipeline
    user_indexer = StringIndexer(inputCol="user_id",
                                 outputCol="user_id_indexed",
                                 handleInvalid='skip')
    track_indexer = StringIndexer(inputCol="track_id",
                                  outputCol="track_id_indexed",
                                  handleInvalid='skip')
    pipeline = Pipeline(stages=[user_indexer, track_indexer])
    indexing_model = pipeline.fit(train)  #learn (return: pipeline model)

    ### transform the datasets and create the view
    train = indexing_model.transform(
        train)  # return a dataframe with new columns
    train.createOrReplaceTempView("train")
    val = indexing_model.transform(val)  # return a dataframe with new columns
    val.createOrReplaceTempView("val")

    # group by user_id, aggregate track_id_indexed for train and val
    val_groupby = spark.sql(
        "select user_id_indexed, collect_list(track_id_indexed) track_id_indexed_collections from val group by user_id_indexed"
    )
    val_groupby.createOrReplaceTempView("val_groupby")

    # Build the recommendation model using ALS on the training data
    rank = np.arange(4, 10, 2)
    regParam = np.linspace(0.01, 0.2, 3)
    alpha = np.linspace(0.5, 2, 3)
    paramGrid = list(itertools.product(rank, regParam, alpha))
    MAP_lst = []  # store MAP results
    precision_at_500_lst = []  # store precision at 500 results

    for combo in paramGrid:
        rank_, regParam_, alpha_ = combo

        # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
        if extension == "log":
            ratingCol = "log_count"
        else:
            ratingCol = "count"
        als = ALS(rank=rank_,
                  regParam=regParam_,
                  alpha=alpha_,
                  implicitPrefs=True,
                  userCol="user_id_indexed",
                  itemCol="track_id_indexed",
                  ratingCol=ratingCol,
                  coldStartStrategy="drop")

        # Save the model
        model = als.fit(train)  # fit the pipeline onto training data

        # get top 500 recommendations
        userRecs = model.recommendForAllUsers(
            500
        )  # return: dataframe (columns: user_id_indexed, recommendations)
        # [('user_id_indexed', 'int'), ('recommendations', 'array<struct<track_id_indexed:int,rating:float>>')]
        userRecs = userRecs.select(
            userRecs.user_id_indexed,
            userRecs.recommendations.track_id_indexed.alias(
                "pred_list"))  # with track_id_indexed only, no track_id
        userRecs.createOrReplaceTempView("userRecs")  # create temporary view

        combined_df = spark.sql(
            '''select val_groupby.user_id_indexed user_id_indexed, userRecs.pred_list pred_list, 
        val_groupby.track_id_indexed_collections track_id_indexed_collections from userRecs inner join val_groupby on val_groupby.user_id_indexed = userRecs.user_id_indexed'''
        )  # combine dfs wrg to user_id_indexed

        # use ranking metrics for evaluations
        predLabelsTuple = combined_df.rdd.map(
            lambda r:
            (r.pred_list, r.track_id_indexed_collections))  # result: tuple
        metrics = RankingMetrics(predLabelsTuple)
        MAP = metrics.meanAveragePrecision
        precision_at_500 = metrics.precisionAt(500)
        MAP_lst.append(MAP)  # store MAP for each config
        precision_at_500_lst.append(
            precision_at_500)  # store precision at 500 for each config
        # print out validation evaluation result
        print("---------------------------------------")
        print("configs: \n")
        print("rank = " + str(rank_) + " , regParam = " + str(regParam_) +
              " , alpha = " + str(alpha_))
        print("\n")
        print("MAP = " + str(MAP))
        print("Precision at 500 = " + str(precision_at_500))

    best_index = MAP_lst.index(np.max(MAP_lst))
    rank_opt, regParam_opt, alpha_opt = paramGrid[best_index]
    print("---------------------------------------")
    print("optimal configs: \n")
    print("rank = " + str(rank_opt) + " , regParam = " + str(regParam_opt) +
          " , alpha = " + str(alpha_opt))
    print("\n")
    print("MAP = " + str(np.max(MAP_lst)))
    print("Precision at 500 =" + str(precision_at_500_lst[min_index]))