Example #1
def benchmark_spark(ratings, factors, iterations=5):
    conf = (SparkConf()
            .setAppName("implicit_benchmark")
            .setMaster('local[*]')
            .set('spark.driver.memory', '16G')
            )
    context = SparkContext(conf=conf)
    spark = SparkSession(context)

    times = {}
    try:
        ratings = convert_sparse_to_dataframe(spark, context, ratings)

        for rank in factors:
            als = ALS(rank=rank, maxIter=iterations,
                      alpha=1, implicitPrefs=True,
                      userCol="row", itemCol="col", ratingCol="data")
            start = time.time()
            als.fit(ratings)
            elapsed = time.time() - start
            times[rank] = elapsed / iterations
            print("spark. factors=%i took %.3f" % (rank, elapsed/iterations))
    finally:
        spark.stop()

    return times
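
# convert_sparse_to_dataframe is not shown above; a minimal sketch, assuming
# `ratings` is a scipy.sparse matrix whose COO coordinates map onto the
# "row"/"col"/"data" columns the ALS call expects:
def convert_sparse_to_dataframe(spark, context, matrix):
    coo = matrix.tocoo()  # coordinate format exposes row/col/data arrays
    triples = list(zip(coo.row.tolist(), coo.col.tolist(),
                       [float(v) for v in coo.data.tolist()]))
    return spark.createDataFrame(triples, ["row", "col", "data"]).cache()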
Example #2

spark = SparkSession.builder.appName('Recommendation_system').getOrCreate()

df = pd.read_csv("query_results.csv")

clusterNames = df["clusterName"].unique().tolist()
clusterToIdMapper = dict(zip(clusterNames, range(len(clusterNames))))
df["clusterName"] = df["clusterName"].apply(lambda x: clusterToIdMapper[x])

propertyValues = df["propertyValue"].unique().tolist()
propertyValueToIdMapper = dict(zip(propertyValues, range(len(propertyValues))))
df["propertyValue"] = df["propertyValue"].apply(lambda x: propertyValueToIdMapper[x])

print("len(clusterToIdMapper)", len(clusterToIdMapper))
print("len(propertyValueToIdMapper)", len(propertyValueToIdMapper))

als = ALS(maxIter=5,
          regParam=0.1,
          userCol="propertyValue",
          itemCol="clusterName",
          ratingCol="dcount_targetId",
          coldStartStrategy="drop")


sparkDF = spark.createDataFrame(df)

sparkDF.show(10)
model = als.fit(sparkDF)
model.itemFactors.show(10, truncate=False)
Example #3
    # Load up our movie ID -> name dictionary
    movieNames = loadMovieNames()

    # Get the raw data
    lines = spark.read.text("hdfs:///user/maria_dev/ml-100k/u.data.1").rdd

    # Convert it to a RDD of Row objects with (userID, movieID, rating)
    ratingsRDD = lines.map(parseInput)

    # Convert to a DataFrame and cache it
    ratings = spark.createDataFrame(ratingsRDD).cache()

    # Create an ALS collaborative filtering model from the complete dataset
    als = ALS(maxIter=5,
              regParam=0.01,
              userCol="userID",
              itemCol="movieID",
              ratingCol="rating")
    model = als.fit(ratings)

    # Print out ratings from user 0
    print("\nRatings for user ID 0:")
    userRatings = ratings.filter("userID = 0")
    for rating in userRatings.collect():
        print(movieNames[rating['movieID']], rating['rating'])

    print("\nTop 20 recommendations:")
    # Find movies rated more than 100 times
    ratingCounts = ratings.groupBy("movieID").count().filter("count > 100")

    # Construct a "test" dataframe for user 0 with every movie rated more than 100 times
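
    # (Plausible continuation sketch; the snippet is truncated here. Assumes
    # `from pyspark.sql.functions import lit`; names below are illustrative.)
    popularMovies = ratingCounts.select("movieID").withColumn("userID", lit(0))
    recommendations = model.transform(popularMovies)
    topRecommendations = recommendations.sort(recommendations.prediction.desc()).take(20)
    for recommendation in topRecommendations:
        print(movieNames[recommendation["movieID"]], recommendation["prediction"])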
Example #4
#create gridsearch to find optimal hyperparameters

try_rank = [30, 35, 40]
try_alpha = [2, 5, 12]
try_reg = [2, 3, 3.5]
auc_res = []

for rank in try_rank:
    for alpha in try_alpha:
        for reg in try_reg:
            #fit model with params for this iteration
            loop_model = ALS(implicitPrefs=True,
                             userCol="userId",
                             itemCol="artistId",
                             ratingCol="song_count",
                             rank=rank,
                             alpha=alpha,
                             regParam=reg).fit(training)
            #evaluate AUC
            loop_auc = areaUnderCurve(
                test, bTopItemIDs, loop_model.transform
            )  #AUC for test data w/pred from iteration's model
            #add tuple of hyperparams and AUC to initialized results list
            auc_res_content = (rank, alpha, reg, loop_auc)
            print(auc_res_content)
            auc_res.append(auc_res_content)

final_rank, final_alpha, final_reg, _ = max(auc_res, key=lambda item: item[3])
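
# Refit a final model with the winning hyperparameters (a sketch reusing the
# column names and `training` DataFrame from the grid search above).
final_model = ALS(implicitPrefs=True,
                  userCol="userId",
                  itemCol="artistId",
                  ratingCol="song_count",
                  rank=final_rank,
                  alpha=final_alpha,
                  regParam=final_reg).fit(training)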
Example #5
    open(input_dir + 'business_avg.json'))
#%%
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
train = sc.textFile(input_dir + 'train_review.json').map(
    json.loads).map(lambda x: (x['user_id'], x['business_id'], x['stars']))
userInt = sc.broadcast(train.keys().distinct().zipWithIndex().collectAsMap())
bizInt = sc.broadcast(
    train.map(lambda x: x[1]).distinct().zipWithIndex().collectAsMap())
train = train.map(
    lambda x: (userInt.value.get(x[0]), bizInt.value.get(x[1]), x[2])).toDF(
        ['user_id', 'business_id', 'stars'])
# Model 1
als_model = ALS(maxIter=20,
                regParam=0.4,
                userCol='user_id',
                itemCol='business_id',
                ratingCol='stars',
                coldStartStrategy="nan")
als_model = als_model.fit(train)

del train

#%%
weekdays = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
    'Sunday'
]


def modifyHours(x):
    if x is not None:
Example #6
 def test_storage_levels(self):
     df = self.spark.createDataFrame(
         [(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 1, 1.0), (2, 2, 5.0)],
         ["user", "item", "rating"])
     als = ALS().setMaxIter(1).setRank(1)
     # test default params
     als.fit(df)
     self.assertEqual(als.getIntermediateStorageLevel(), "MEMORY_AND_DISK")
     self.assertEqual(als._java_obj.getIntermediateStorageLevel(), "MEMORY_AND_DISK")
     self.assertEqual(als.getFinalStorageLevel(), "MEMORY_AND_DISK")
     self.assertEqual(als._java_obj.getFinalStorageLevel(), "MEMORY_AND_DISK")
     # test non-default params
     als.setIntermediateStorageLevel("MEMORY_ONLY_2")
     als.setFinalStorageLevel("DISK_ONLY")
     als.fit(df)
     self.assertEqual(als.getIntermediateStorageLevel(), "MEMORY_ONLY_2")
     self.assertEqual(als._java_obj.getIntermediateStorageLevel(), "MEMORY_ONLY_2")
     self.assertEqual(als.getFinalStorageLevel(), "DISK_ONLY")
     self.assertEqual(als._java_obj.getFinalStorageLevel(), "DISK_ONLY")
Example #7
  lines = spark.read.text('ratings.dat').rdd
  ratingsRDD = lines.map(parse_rating)
  lines = spark.read.text('gender.dat').rdd
  users = dict(lines.map(parse_user).collect())

  ratings = spark.createDataFrame(ratingsRDD)
  (training, test) = ratings.randomSplit([0.8, 0.2])

  num_training = training.count()
  num_validation = test.count()

  print('Training: %d' % num_training)
  print('Validation: %d' % num_validation)

  # setup ALS
  rank = 8
  num_iterations = 8
  lambda_ = 0.1

  als = ALS(maxIter=num_iterations, regParam=lambda_, userCol="userID", itemCol="profileID", ratingCol="rating")
  model = als.fit(training)

  # Evaluate the model by computing the RMSE on the test data
  predictions = model.transform(test)
  evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                  predictionCol="prediction")
  rmse = evaluator.evaluate(predictions)
  print("Root-mean-square error = " + str(rmse))

  spark.stop()
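
# Hypothetical versions of the parse_rating/parse_user helpers the snippet
# assumes (the comma-separated field layout is a guess, not the original
# format; requires `from pyspark.sql import Row`).
def parse_rating(line):
    fields = line.value.split(",")
    return Row(userID=int(fields[0]), profileID=int(fields[1]),
               rating=float(fields[2]))

def parse_user(line):
    fields = line.value.split(",")
    return (int(fields[0]), fields[1])  # (userID, gender)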
Example #8
def train_model(training_df, rank):
    iterations = 10
    als = ALS(rank=rank, maxIter=iterations, implicitPrefs=True)
    return als.fit(training_df)
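
# Usage sketch: train_model leaves userCol/itemCol/ratingCol at their
# defaults, so the input DataFrame must use ALS's default column names.
df = spark.createDataFrame(
    [(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0)],
    ["user", "item", "rating"])  # implicit-feedback strengths go in "rating"
model = train_model(df, rank=10)
model.transform(df).show()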
Example #9
def main(spark, df_train, df_val, model_file):

    # import train data
    train = spark.read.parquet(df_train)
    print("imported train data")
    
    # import validation data
    val = spark.read.parquet(df_val)
    print("imported validation data")
    
    # index users and tracks
    indexer1 = StringIndexer(inputCol = "user_id", outputCol = "user_index", handleInvalid = "skip") #skip null values

    # grid search
    bestModel = None
    bestValidationMAP = -1
    best_rank, best_regparam, best_alpha = None, None, None
    list_regParam = [0.05]
    list_rank = [10,20,50] 
    list_alpha = [1,15]

    # drop count threshold
    #drop_count = [1,2,3]
    
    # select records with count > 1
    train = train.filter(train["count"] > 1)
    val = val.filter(val["count"] > 1)
    print("kept records with count > 1")
    
    # Build the recommendation model using ALS on the train data
    for reg, rank, alpha in itertools.product(list_regParam, list_rank, list_alpha):

        als = ALS(seed = 1, rank = rank, regParam = reg, alpha = alpha, userCol = "user_index", itemCol = "track_index", ratingCol = "count", implicitPrefs = True)
    
        # create pipeline
        pipeline = Pipeline(stages=[indexer1,als])
        model = pipeline.fit(train)
        print("trained model with reg = %s, rank = %s, alpha = %s" %(reg, rank, alpha))
        
        # predict on validation data and indexed users
        val_indexed = model.transform(val)
        val_indexed = val_indexed.select([c for c in val_indexed.columns if c in ["user_index", "count", "track_index"]])
        print("indexed users")

        # make labels
        val_indexed.createOrReplaceTempView('val_indexed')
        Labels = spark.sql('SELECT user_index, collect_list(track_index) AS label FROM val_indexed GROUP BY user_index')
        Labels.createOrReplaceTempView('Labels')
        print("created ground truth labels")

        # generate top 500 track recommendations for each user in validation set
        user_subset = val_indexed.select("user_index").distinct()
        userRecs = model.stages[-1].recommendForUserSubset(user_subset,500)
        userRecs.createOrReplaceTempView("userRecs")
        print("made user recommendations")

        # explode recommendations in long format
        Recs = (userRecs.select("user_index", explode("recommendations").alias("pred")).select("user_index", "pred.*"))
        Recs.createOrReplaceTempView("Recs")

        # make predictions
        Preds = spark.sql('SELECT user_index, collect_list(track_index) AS prediction FROM Recs GROUP BY user_index')
        Preds.createOrReplaceTempView("Preds")

        # make label pairs
        Preds_labels = spark.sql('SELECT Preds.prediction AS prediction, Labels.label as label FROM Preds INNER JOIN Labels ON Preds.user_index = Labels.user_index')
        print("inner join preds & labels")

        # calculate MAP
        MAPrecommendationsAndTruth = Preds_labels.select("prediction", "label")
        metrics = RankingMetrics(MAPrecommendationsAndTruth.rdd)
        MAP = metrics.meanAveragePrecision
        print("MAP = %s" % MAP)

        # get best model
        if MAP > bestValidationMAP:
            bestModel = model
            bestValidationMAP = MAP
            best_rank, best_regparam, best_alpha = rank, reg, alpha

    # save best model and params
    pip_model = bestModel
    pip_model.write().overwrite().save(model_file)
    print("Best model saved with reg = %s, rank = %s, alpha = %s, MAP = %s" %(best_regparam, best_rank, best_alpha, bestValidationMAP))
Example #10
    # $example on$
    lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt").rdd
    parts = lines.map(lambda row: row.value.split("::"))
    ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]),
                                         movieId=int(p[1]),
                                         rating=float(p[2]),
                                         timestamp=int(p[3])))
    ratings = spark.createDataFrame(ratingsRDD)
    (training, test) = ratings.randomSplit([0.8, 0.2])

    # Build the recommendation model using ALS on the training data
    # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
    als = ALS(maxIter=5,
              regParam=0.01,
              userCol="userId",
              itemCol="movieId",
              ratingCol="rating",
              coldStartStrategy="drop")
    model = als.fit(training)

    # Evaluate the model by computing the RMSE on the test data
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("Root-mean-square error = " + str(rmse))

    # Generate top 10 movie recommendations for each user
    userRecs = model.recommendForAllUsers(10)
    # Generate top 10 user recommendations for each movie
    movieRecs = model.recommendForAllItems(10)
Example #11
trainingData.cache()
validationData.cache()
reg_eval = RegressionEvaluator(predictionCol="prediction", labelCol="plays", metricName="rmse")
regParams = [0.25]
ranks = [16]
tolerance = 0.03
errors = [[0] * len(ranks) for _ in regParams]  # list-comp avoids aliased rows
models = [[0] * len(ranks) for _ in regParams]
err = 0
min_error = float('inf')
best_rank = -1
i=0
for regParam in regParams:
	j=0
	for rank in ranks:
		als = ALS(maxIter=5, regParam=regParam,rank= rank,alpha=80, seed=8427,userCol="new_user_id", 
			itemCol="new_song_id", ratingCol="plays",implicitPrefs=True)
		model = als.fit(trainingData)
		# Evaluate the model by computing the RMSE on the test data
		predictions = model.transform(validationData)
		# Remove NaN values from prediction (due to SPARK-14489)
		predicted_plays_df = predictions.filter(predictions.prediction != float('nan'))
		#evaluator = RegressionEvaluator(metricName="rmse", labelCol="plays",predictionCol="prediction")
		#rmse = evaluator.evaluate(predictions)
		#print("For regParam: " + str(regParam) + ", rank: " +str(rank) + ", alpha: " + str(alpha) + ", Root-mean-square error = " + str(rmse))
		# Run the previously created RMSE evaluator, reg_eval, on the predicted_ratings_df DataFrame
		error = reg_eval.evaluate(predicted_plays_df)
		errors[i][j] = error
		models[i][j] = model
		print("For rank " + str(rank) + ", regularization parameter " + str(regParam) + "the RMSE is " + str(error))
		if error < min_error:
			min_error = error
Example #12
def train_als(params, data):
    symbol = ALS(**params)
    with Timer() as t:
        model = symbol.fit(data)
    return model, t
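
# A minimal sketch of the Timer context manager train_als assumes: it records
# the wall-clock time spent inside the `with` block.
import time

class Timer:
    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, *exc_info):
        self.interval = time.time() - self.start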
Example #13
class RecommendationEngine:
    """A anime recommendation engine
    """
    def __train_model(self):
        """Train the ALS model with the current dataset
        """

        logger.info("Training the ALS model...")
        self.als = ALS(maxIter=5,
                       regParam=0.01,
                       userCol="user_id",
                       itemCol="anime_id",
                       ratingCol="rating",
                       coldStartStrategy="drop")
        self.model = self.als.fit(self.ratings)

        logger.info("ALS model built!")

    def add_ratings(self, user_id, anime_id, ratings):
        """Add additional anime ratings in the format (user_id, anime_id, rating)
        """
        # Convert the new rating to a single-row DataFrame
        new_ratings = self.spark.createDataFrame(
            [(user_id, anime_id, ratings)], ["user_id", "anime_id", "rating"])
        # Add new ratings to the existing ones
        self.ratings = self.ratings.union(new_ratings)
        # Re-train the ALS model with the new ratings
        self.__train_model()
        new_ratings = new_ratings.toPandas()
        new_ratings = new_ratings.to_json()
        return new_ratings

    def get_ratings_for_anime_ids(self, user_id, anime_id):
        """Given a user_id and a list of anime_ids, predict ratings for them 
        """

        dataframe = self.spark.createDataFrame([(user_id, anime_id)],
                                               ["user_id", "anime_id"])
        predictions = self.model.transform(dataframe)
        ratings = predictions.toPandas()
        ratings = ratings.to_json()

        return ratings

    def get_top_ratings(self, user_id, animes_count):
        """Recommends up to animes_count top unrated animes to user_id
        """
        users = self.ratings.select(self.als.getUserCol()).distinct()
        users = users.filter(users.user_id == user_id)
        top_ratings = self.model.recommendForUserSubset(users, animes_count)

        self.json_top = top_ratings.toPandas()
        self.json_top = self.json_top.to_json()
        return self.json_top

    def get_anime_top_ratings(self, anime_id, users_count):
        """Recommends up to animes_count top unrated animes to user_id
        """
        animes = self.ratings.select(self.als.getItemCol()).distinct()
        animes = animes.filter(animes.anime_id == anime_id)
        anime_top = self.model.recommendForItemSubset(animes, users_count)

        self.json_top = anime_top.toPandas()
        self.json_top = self.json_top.to_json()
        return self.json_top

    def __init__(self, spark, dataset_path):
        """Init the recommendation engine given a Spark context and a dataset path
        """

        logger.info("Starting up the Recommendation Engine: ")

        self.spark = spark

        # Load ratings data for later use
        logger.info("Loading Ratings data...")
        ratings_file_path = os.path.join(dataset_path, 'rating.csv')
        self.ratings = spark.read.csv(ratings_file_path,
                                      header=True,
                                      inferSchema=True)
        # Load data Anime
        # logger.info("Loading Anime data...")
        # ratings_file_path = os.path.join(dataset_path, 'anime.csv')
        # self.animes = spark.read.csv(ratings_file_path, header=True, inferSchema=True)

        self.__train_model()
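
# Usage sketch (assumes an existing SparkSession and a dataset directory
# containing rating.csv with user_id/anime_id/rating columns; the path is
# illustrative).
engine = RecommendationEngine(spark, "/path/to/anime-dataset")
print(engine.get_top_ratings(user_id=1, animes_count=5))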
Example #14
    "/Users/grey/Documents/Big Data/project/files/ratings_small.csv")
linesRdd = lines.mapPartitions(lambda x: csv.reader(x))
ratingheader = linesRdd.first()
linesRdd = linesRdd.filter(lambda x: x != ratingheader)
# parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = linesRdd.map(
    lambda p: Row(userId=int(p[0]), movieId=int(p[1]), rating=float(p[2])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2], int(sys.argv[1]))

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5,
          regParam=0.01,
          userCol="userId",
          itemCol="movieId",
          ratingCol="rating",
          coldStartStrategy="drop",
          rank=70)
als.setSeed(int(sys.argv[1]))
# Fits a model to the input dataset with optional parameters.
# Returns:	fitted model(s)
model = als.fit(training)

# # Evaluate the model by computing the RMSE on the test data
# # transform()    Transforms the input dataset with optional parameters.
# predictions = model.transform(test)
# # Evaluator for Regression, which expects two input columns: prediction and label.
# evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
#                                 predictionCol="prediction")
# # evaluate()  Returns: metric
Example #15
        
    def _evaluate(self, dataset):       
        error=self.rmse(dataset,self.predictionCol,self.targetCol)
        print ("Error: {}".format(error))
        return error
    
    def isLargerBetter(self):
        return False
    
    @staticmethod
    def rmse(dataset, predictionCol, targetCol):
        # DataFrames need .rdd before .map in Spark 2+
        return sqrt(dataset.dropna().rdd.map(lambda x: (x[targetCol] - x[predictionCol]) ** 2).reduce(add) / float(dataset.count()))


    
lr1 = ALS()
grid1 = ParamGridBuilder().addGrid(lr1.regParam, [1.0,0.5,2.0]).build()
evaluator1 = MiEvaluador(predictionCol=lr1.getPredictionCol(),targetCol=lr1.getRatingCol())
cv1 = CrossValidator(estimator=lr1, estimatorParamMaps=grid1, evaluator=evaluator1, numFolds=2)
cvModel1 = cv1.fit(dfRatings)
a=cvModel1.transform(dfRatings)
error_cross_validation=MiEvaluador.rmse(a,lr1.getPredictionCol(),lr1.getRatingCol())
print('Validation error: {}'.format(error_cross_validation))

error_models=[]
for reg_param in (1.0,0.5,2.0):
    lr = ALS(regParam=reg_param)
    model = lr.fit(dfRatings)
    error=MiEvaluador.rmse(model.transform(dfRatings),lr.getPredictionCol(),lr.getRatingCol())
    error_models.append(error)
    print ('reg_param: {}, rmse: {}'.format(reg_param,error))
Example #16
# the ML libraries require integers, so we need to create keys for the users & videos temporarily
user_ids = ratings.select("userid").distinct().rdd.zipWithUniqueId()
# (tuple-parameter lambdas are Python-2-only syntax)
user_map = user_ids.map(lambda pair: Row(userid=pair[0].userid, userid_int=pair[1])).toDF().cache()

# same as above - this is a UUID/int mapping
video_ids = ratings.select("videoid").distinct().rdd.zipWithUniqueId().cache()
video_map = video_ids.map(lambda pair: Row(videoid=pair[0].videoid, videoid_int=pair[1])).toDF().cache()

print "Recommending based on {0} users and {1} videos.".format(user_map.count(), video_map.count())

training_data = ratings.join(user_map, ratings.userid == user_map.userid).\
                    join(video_map, ratings.videoid == video_map.videoid).\
                    select(user_map.userid, user_map.userid_int, video_map.videoid, video_map.videoid_int, "rating")

# Create ALS transformer and train with the ratings from our C* table
als = ALS(rank=10, maxIter=10).setUserCol("userid_int").setItemCol("videoid_int").setRatingCol("rating")
model = als.fit(training_data)

users = user_map.collect()
user_map.unpersist()
count = 0
length = len(users)
for user in users:
    videos_and_user = video_map.withColumn("userid", lit(user.userid)).\
                            withColumn("userid_int", lit(user.userid_int))

    model.transform(videos_and_user).\
        sort("prediction", ascending=False).limit(30).\
        select("videoid", "userid", col("prediction").alias("rating")).\
        write.format("org.apache.spark.sql.cassandra").\
        options(keyspace="killrvideo", table="video_recommendations_by_video").\
Example #17
def als_model(userid, df):

    als_df_pd = session.execute('SELECT *  FROM movie_rating')
    #als_df_pd = pd.read_csv("ratings_small.csv")
    movie_list_df = df.select('id', 'title')
    movie_list_df = movie_list_df.withColumn('userId', lit(userid))

    for col in als_df_pd.columns:
        if als_df_pd[col].dtypes == 'object':
            als_df_pd[col] = als_df_pd[col].astype('str')
    ratings = sqlContext.createDataFrame(als_df_pd)

    #ratings.printSchema()
    #ratings.show()
    #print((ratings.count(), len(ratings.columns)))

    mv_notwatched_df = ratings.filter(ratings.userId == userid)\
        .select('movieId')\
        .join(movie_list_df, ratings.movieId == movie_list_df.id, 'right_outer')\
        .drop("movieId")\
        .withColumnRenamed("id", "movieId")

    #ratings.groupBy("userID").count().show()

    usercount = ratings.agg(
        countDistinct(ratings.userId).alias("Users_Count")).head()[0]

    #print('The number of distinct values of  Users is: ', str(usercount))

    (training, test) = ratings.randomSplit([0.8, 0.2])

    # # Build the recommendation model using ALS on the training data
    # # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics

    als = ALS(maxIter=5,
              regParam=0.01,
              userCol="userId",
              itemCol="movieId",
              ratingCol="rating",
              coldStartStrategy="drop")
    model = als.fit(training)

    # # Evaluate the model by computing the RMSE on the test data
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("ALS- Model Root-mean-square error before Tuning= " + str(rmse))
    # # Generate top 10 movie recommendations for each user

    print("Top 10 movies recommended for each user")
    userRecs = model.recommendForAllUsers(10)
    userRecs.show(10)

    # Generate top 10 user recommendations for each movie
    print("Top 10 movies recommended for each movie")
    movieRecs = model.recommendForAllItems(10)
    movieRecs.show(10)

    # Tune the model

    pipeline = Pipeline(stages=[als])

    paramGrid = ParamGridBuilder() \
        .addGrid(als.regParam, [0.1, 0.01]) \
        .build()

    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=2)  # use 3+ folds in practice

    # Run cross-validation, and choose the best set of parameters.
    cvModel_fitted = crossval.fit(training)

    #print("best model")
    bestModel = cvModel_fitted.bestModel

    print("ALS model - Root-mean-square error after Tuning= " + str(rmse))
    predictions = cvModel_fitted.transform(test)

    print("Best  Prediction Model")
    predictions.show(10)
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)

    print("Root-mean-square error after cross validation = " + str(rmse))

    param_dict = cvModel_fitted.bestModel.stages[0].extractParamMap()

    print("List of Movies not watched by the User")
    mv_notwatched_df.show(10)

    print("Top 10 movies Recommended")
    top_10 = cvModel_fitted.transform(mv_notwatched_df).orderBy(
        desc('prediction')).limit(10)
    top_10.show(10)
    return top_10
Example #18
    spark = SparkSession\
        .builder\
        .appName("ALSExample")\
        .getOrCreate()

    # $example on$
    lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt").rdd
    parts = lines.map(lambda row: row.value.split("::"))
    ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                         rating=float(p[2]), timestamp=int(p[3])))
    ratings = spark.createDataFrame(ratingsRDD)
    (training, test) = ratings.randomSplit([0.8, 0.2])

    # Build the recommendation model using ALS on the training data
    # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
    als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
              coldStartStrategy="drop")
    model = als.fit(training)

    # Evaluate the model by computing the RMSE on the test data
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("Root-mean-square error = " + str(rmse))

    # Generate top 10 movie recommendations for each user
    userRecs = model.recommendForAllUsers(10)
    # Generate top 10 user recommendations for each movie
    movieRecs = model.recommendForAllItems(10)

    # Generate top 10 movie recommendations for a specified set of users
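    # Continuation sketch, mirroring the official Spark example this snippet
    # follows: recommend for a small subset of users.
    users = ratings.select(als.getUserCol()).distinct().limit(3)
    userSubsetRecs = model.recommendForUserSubset(users, 10)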
Example #19
def run_train(spark, train, test_input, param_dict):
    models = {}

    # als_param = param_dict["als"]
    als_nn_param = param_dict["als_nn"]
    # als_ibcf_param = param_dict["als_ibcf"]
    als_nn_ibcf_param = param_dict["als_nn_ibcf"]
    # als_ibcf_mean_param = param_dict["als_ibcf_mean"]
    # als_nn_ibcf_mean_param = param_dict["als_nn_ibcf_mean"]

    user_col = "MASV1"
    item_col = "F_MAMH"
    item_index_col = "F_MAMH_index"
    grade_col = "TKET"
    prediction_col = "prediction"
    #IBCF prediction model
    # print("train count: {}".format(train.count()))
    # print("test_input count: {}".format(test_input.count()))

    ibcf_estimator = IBCFEstimator(spark, user_col, item_col, item_index_col,
                                   grade_col, prediction_col)
    ibcf_model = ibcf_estimator.fit(train)

    nbcf_estimator = NBCFEstimator(spark, user_col, item_col, grade_col,
                                   prediction_col)
    nbcf_model = nbcf_estimator.fit(
        train.unionAll(test_input).drop(item_index_col))

    # user_col = "MASV1"
    # item_col = "F_MAMH"
    # item_index_col = "F_MAMH_index"
    # grade_col = "TKET"
    # prediction_col = "prediction"
    #
    # #IBCF prediction model
    # ibcf_model = IBCF(spark, user_col, item_col, item_index_col, grade_col, prediction_col)
    # train_part_df = ibcf_model.remove_unknown_item(train, test_input)
    # validate_part_df = ibcf_model.remove_unknown_item(train, test_output)
    # item_similarity_df = ibcf_model.fit(train.drop(item_col))
    #
    # for rank in ibcf_ranks:
    #     result_df = ibcf_model.predict(validate_part_df, item_similarity_df, train_part_df, rank)
    #     result_df.show()
    #     error_ibcf = evaluate(result_df,evaluators)
    #     error_list_ibcf[rank] = error_ibcf
    #
    # #NBCF prediction model
    # nbcf_model = NBCF(spark, user_col, item_col, item_index_col, grade_col, prediction_col)
    # train_df = train.unionAll(test_input)
    # user_similarity = nbcf_model.fit(train_df.drop(item_col))
    # for rank in ibcf_ranks:
    #     result_df = nbcf_model.predict(test_output, user_similarity, train_df, rank)
    #     result_df.show()
    #     error_nbcf = evaluate(result_df,evaluators)
    #     error_list_nbcf[rank] = error_nbcf

    als_input = train.unionAll(test_input)
    #
    # als non negative false
    # als = ALS(rank=als_param["rank"], maxIter=15, regParam=0.01, userCol="MASV1",
    #               itemCol="F_MAMH_index", ratingCol="TKET", coldStartStrategy="drop", nonnegative=False)
    #
    # als_model = als.fit(als_input)

    als_nn = ALS(rank=als_nn_param["rank"],
                 maxIter=15,
                 regParam=0.01,
                 userCol="MASV1",
                 itemCol="F_MAMH_index",
                 ratingCol="TKET",
                 coldStartStrategy="drop",
                 nonnegative=True)
    als_nn_model = als_nn.fit(als_input)

    # combine mf_ibcf_model

    # # als_ibcf
    # als_ibcf_als = ALS(rank=als_ibcf_param["als_rank"], maxIter=15, regParam=0.01, userCol="MASV1",
    #                  itemCol="F_MAMH_index", ratingCol="TKET", coldStartStrategy="drop", nonnegative=False)
    # als_ibcf_als_model = als_ibcf_als.fit(als_input)
    # als_ibcf_model = IBCFWithItemFactor(spark, als_ibcf_als_model.itemFactors, IBCFWithItemFactor.create_item_index(als_input,"F_MAMH", "F_MAMH_index")) \
    #             .setUserCol("MASV1") \
    #             .setItemCol("F_MAMH") \
    #             .setValueCol("TKET") \
    #             .setRank(als_ibcf_param["ibcf_rank"])
    #
    # # als_ibcf_mean
    # als_ibcf_mean_als = ALS(rank=als_ibcf_mean_param["als_rank"], maxIter=15, regParam=0.01, userCol="MASV1",
    #                  itemCol="F_MAMH_index", ratingCol="TKET", coldStartStrategy="drop", nonnegative=False)
    # als_ibcf_mean_als_model = als_ibcf_mean_als.fit(als_input)
    # als_ibcf_mean_ibcf_model = IBCFWithItemFactor(spark, als_ibcf_mean_als_model.itemFactors) \
    #             .setUserCol("MASV1") \
    #             .setItemCol("F_MAMH_index") \
    #             .setValueCol("TKET") \
    #             .setRank(als_ibcf_mean_param["ibcf_rank"])
    #
    # als_ibcf_mean_model = ALSIBCFMeanModel(spark, als_ibcf_mean_ibcf_model, als_ibcf_mean_als_model)\
    #             .setUserCol("MASV1") \
    #             .setItemCol("F_MAMH_index") \
    #             .setValueCol("TKET")
    #
    # als_nn_ibcf
    als_nn_ibcf_als = ALS(rank=als_nn_ibcf_param["als_rank"],
                          maxIter=15,
                          regParam=0.01,
                          userCol="MASV1",
                          itemCol="F_MAMH_index",
                          ratingCol="TKET",
                          coldStartStrategy="drop",
                          nonnegative=True)
    als_nn_ibcf_als_model = als_nn_ibcf_als.fit(als_input)
    als_nn_ibcf_model = IBCFWithItemFactor(spark, als_nn_ibcf_als_model.itemFactors, IBCFWithItemFactor.create_item_index(als_input,"F_MAMH", "F_MAMH_index")) \
                .setUserCol("MASV1") \
                .setItemCol("F_MAMH") \
                .setValueCol("TKET") \
                .setRank(als_nn_ibcf_param["ibcf_rank"])
    #
    # # als_nn_ibcf_mean
    # als_ibcf_nn_mean_als = ALS(rank=als_nn_ibcf_mean_param["als_rank"], maxIter=15, regParam=0.01, userCol="MASV1",
    #                  itemCol="F_MAMH_index", ratingCol="TKET", coldStartStrategy="drop", nonnegative=False)
    # als_ibcf_nn_mean_als_model = als_ibcf_nn_mean_als.fit(als_input)
    # als_nn_ibcf_mean_ibcf_model = IBCFWithItemFactor(spark, als_ibcf_nn_mean_als_model.itemFactors) \
    #             .setUserCol("MASV1") \
    #             .setItemCol("F_MAMH_index") \
    #             .setValueCol("TKET") \
    #             .setRank(als_nn_ibcf_mean_param["ibcf_rank"])
    #
    # als_nn_ibcf_mean_model = ALSIBCFMeanModel(spark, als_nn_ibcf_mean_ibcf_model, als_ibcf_nn_mean_als_model)\
    #             .setUserCol("MASV1") \
    #             .setItemCol("F_MAMH_index") \
    #             .setValueCol("TKET")

    baseline_model = MeanTransformer(spark)\
        .setUserCol("MASV1")\
        .setItemCol("F_MAMH_index")\
        .setValueCol("TKET")\
        .setOutputCol("prediction")

    models["ibcf"] = ibcf_model
    models["ubcf"] = nbcf_model
    # models["als"] = als_model
    models["als_nn"] = als_nn_model
    # models["als_ibcf"] = als_ibcf_model
    models["als_nn_ibcf"] = als_nn_ibcf_model
    # models["als_ibcf_mean"] = als_ibcf_mean_model
    # models["als_nn_ibcf_mean"] = als_nn_ibcf_mean_model
    models["baseline"] = baseline_model

    return models
Example #20
import os
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv("/Users/luokui/laji/ml-100k/u.data",
                 sep="\t",
                 header=None,
                 names=names).head(10000)

spark = SparkSession.builder.master("spark://luokuideMacBook-Pro.local:7077"
                                    ).appName("test.als").getOrCreate()
data = spark.createDataFrame(df)

(trainsss, testing) = data.randomSplit([0.8, 0.2])
(training, valid) = trainsss.randomSplit([0.8, 0.2])

del df

als = ALS(maxIter=10,
          regParam=0.01,
          userCol="user_id",
          itemCol="item_id",
          ratingCol="rating",
          coldStartStrategy="drop",
          numBlocks=6)
model = als.fit(training)
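
# Continuation sketch: the RegressionEvaluator import above is otherwise
# unused, so presumably the held-out `valid` split was meant to be scored.
predictions = model.transform(valid)
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="rating",
                                predictionCol="prediction")
print("validation RMSE =", evaluator.evaluate(predictions))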
Example #21
def get_best_param(spark, train, test_input, test_output, rank_list,
                   ibcf_ranks):
    evaluator_rmse = RegressionEvaluator(metricName="rmse",
                                         labelCol="TKET",
                                         predictionCol="prediction")
    evaluator_mse = RegressionEvaluator(metricName="mse",
                                        labelCol="TKET",
                                        predictionCol="prediction")
    evaluator_mae = RegressionEvaluator(metricName="mae",
                                        labelCol="TKET",
                                        predictionCol="prediction")
    evaluators = [evaluator_rmse, evaluator_mse, evaluator_mae]
    error_list_als = {}
    error_list_als_nn = {}
    error_list_als_ibcf = {}
    error_list_als_nn_ibcf = {}
    error_list_combine = {}
    error_list_combine_nn = {}
    error_list_ibcf = {}
    error_list_nbcf = {}
    error_models = {}
    best_models = {}
    # test_input.show()
    # test_output.show()
    baseline_model = MeanTransformer(spark)\
        .setUserCol("MASV1")\
        .setItemCol("F_MAMH_index")\
        .setValueCol("TKET")\
        .setOutputCol("prediction")
    # predi = baseline_model.transform(test_input, test_output)
    # predi.show()
    user_col = "MASV1"
    item_col = "F_MAMH"
    item_index_col = "F_MAMH_index"
    grade_col = "TKET"
    prediction_col = "prediction"
    #
    #IBCF prediction model
    ibcf_estimator = IBCFEstimator(spark, user_col, item_col, item_index_col,
                                   grade_col, prediction_col)
    train_part_df = ibcf_estimator.remove_unknown_item(train, test_input)
    validate_part_df = ibcf_estimator.remove_unknown_item(train, test_output)
    ibcf_model = ibcf_estimator.fit(train)

    for rank in ibcf_ranks:
        result_df = ibcf_model.transform(train_part_df.drop("F_MAMH_index"),
                                         validate_part_df.drop("F_MAMH_index"),
                                         rank)
        # result_df.show()
        error_ibcf = evaluate(result_df, evaluators)
        error_list_ibcf[rank] = error_ibcf
        best_models = put_best_model(
            best_models, "ibcf",
            Model_Error_Wrapper("ibcf_{}".format(rank), ibcf_model,
                                error_ibcf[0], {"rank": rank}))
    # #
    #NBCF prediction model
    nbcf_model = NBCFEstimator(spark, user_col, item_col, grade_col,
                               prediction_col)
    train_df = train.unionAll(test_input)
    nbcf_model = nbcf_model.fit(train_df.drop("F_MAMH_index"))
    for rank in ibcf_ranks:
        result_df = nbcf_model.transform(test_output.drop("F_MAMH_index").drop("TKET"), rank)\
            .join(test_output.drop("F_MAMH_index"), [user_col, item_col])
        error_nbcf = evaluate(result_df, evaluators)
        error_list_nbcf[rank] = error_nbcf
        best_models = put_best_model(
            best_models, "ubcf",
            Model_Error_Wrapper("ubcf_{}".format(rank), nbcf_model,
                                error_nbcf[0], {"rank": rank}))

    for i in range(len(rank_list)):
        als_input = train.unionAll(test_input)

        # # als non negative false
        # als = ALS(rank=rank_list[i], maxIter=15, regParam=0.01, userCol="MASV1",
        #           itemCol="F_MAMH_index", ratingCol="TKET", coldStartStrategy="drop", nonnegative=False)
        #
        # als_model = als.fit(als_input)
        # predict_als = als_model.transform(test_output)

        # als non negative true
        als_nn = ALS(rank=rank_list[i],
                     maxIter=15,
                     regParam=0.01,
                     userCol="MASV1",
                     itemCol="F_MAMH_index",
                     ratingCol="TKET",
                     coldStartStrategy="drop",
                     nonnegative=True)
        als_nn_model = als_nn.fit(als_input)
        predict_als_nn = als_nn_model.transform(test_output)
        # predict_als_nn.coalesce(1).write.option("header", "true").option("charset", "UTF-8").csv("output_test_" + str(i) + ".csv")
        # error_als = evaluate(predict_als, evaluators)
        error_als_nn = evaluate(predict_als_nn, evaluators)

        # error_list_als[rank_list[i]] = error_als
        error_list_als_nn[rank_list[i]] = error_als_nn

        # best_models = put_best_model(best_models, "als",
        #                              Model_Error_Wrapper("als_{}".format(rank_list[i]), als_model, error_als[0], {"rank": rank_list[i]}))
        best_models = put_best_model(
            best_models, "als_nn",
            Model_Error_Wrapper("als_nn_{}".format(rank_list[i]), als_nn_model,
                                error_als_nn[0], {"rank": rank_list[i]}))

        # combine mf_ibcf_model

        for ibcf_rank in ibcf_ranks:
            # als_ibcf
            # als_ibcf_model = IBCFWithItemFactor(spark, als_model.itemFactors, IBCFWithItemFactor.create_item_index(als_input,"F_MAMH", "F_MAMH_index")) \
            #     .setUserCol("MASV1") \
            #     .setItemCol("F_MAMH") \
            #     .setValueCol("TKET") \
            #     .setRank(ibcf_rank)
            # predict_als_ibcf = als_ibcf_model.transform(test_input, test_output.drop("TKET"))
            # predict_als_ibcf_with_gt = predict_als_ibcf.join(test_output, ["MASV1", "F_MAMH"])
            # # predict_als_ibcf_with_gt.show()
            # error_als_ibcf = evaluate(predict_als_ibcf_with_gt, evaluators)
            # error_list_als_ibcf["{}_{}".format(rank_list[i], ibcf_rank)] = error_als_ibcf
            # best_models = put_best_model(best_models, "als_ibcf",
            #                              Model_Error_Wrapper("als_ibcf_{}_{}".format(rank_list[i], ibcf_rank),
            #                                                  als_ibcf_model, error_als_ibcf[0], {"als_rank": rank_list[i],
            #                                                                                      "ibcf_rank": ibcf_rank}))
            #
            #     # als_ibcf_mean
            #     als_ibcf_mean_model = ALSIBCFMeanModel(spark, als_ibcf_model, als_model).setUserCol("MASV1") \
            #         .setItemCol("F_MAMH_index") \
            #         .setValueCol("TKET")
            #     combine = als_ibcf_mean_model.transform(test_input, test_output.drop("TKET")).join(test_output, ["MASV1",
            #                                                                                                      "F_MAMH_index"])
            #     # combine.show()
            #
            #     # combine with als
            #     # combine = predict_als_ibcf.withColumnRenamed("prediction", "prediction_ibcf") \
            #     #     .join(predict_als.withColumnRenamed("prediction", "prediction_als"), ["MASV1", "F_MAMH_index"]) \
            #     #     .withColumn("prediction", (col("prediction_ibcf") + col("prediction_als")) / 2)
            #
            #     error_combine = evaluate(combine, evaluators)
            #     error_list_combine["{}_{}".format(rank_list[i], ibcf_rank)] = error_combine
            #     best_models = put_best_model(best_models, "als_ibcf_mean",
            #                                  Model_Error_Wrapper("als_ibcf_mean_{}_{}".format(rank_list[i], ibcf_rank),
            #                                                      als_ibcf_mean_model, error_combine[0],{"als_rank": rank_list[i],
            #                                                                                          "ibcf_rank": ibcf_rank}))
            #
            # als_nn_ibcf
            als_nn_ibcf_model = IBCFWithItemFactor(spark, als_nn_model.itemFactors, IBCFWithItemFactor.create_item_index(als_input,"F_MAMH", "F_MAMH_index")) \
                .setUserCol("MASV1") \
                .setItemCol("F_MAMH") \
                .setValueCol("TKET") \
                .setRank(ibcf_rank)
            predict_als_nn_ibcf = als_nn_ibcf_model.transform(
                test_input, test_output.drop("TKET"))
            predict_als_nn_ibcf_with_gt = predict_als_nn_ibcf.join(
                test_output, ["MASV1", "F_MAMH"])

            error_als_nn_ibcf = evaluate(predict_als_nn_ibcf_with_gt,
                                         evaluators)
            error_list_als_nn_ibcf["{}_{}".format(
                rank_list[i], ibcf_rank)] = error_als_nn_ibcf
            best_models = put_best_model(
                best_models, "als_nn_ibcf",
                Model_Error_Wrapper(
                    "als_nn_ibcf_{}_{}".format(rank_list[i], ibcf_rank),
                    als_nn_ibcf_model, error_als_nn_ibcf[0], {
                        "als_rank": rank_list[i],
                        "ibcf_rank": ibcf_rank
                    }))
        #
        #     # als_nn_ibcf_mean
        #     als_nn_ibcf_mean_model = ALSIBCFMeanModel(spark, als_nn_ibcf_model, als_nn_model).setUserCol("MASV1") \
        #         .setItemCol("F_MAMH_index") \
        #         .setValueCol("TKET")
        #     combine_nn = als_nn_ibcf_mean_model.transform(test_input, test_output.drop("TKET")).join(test_output,
        #                                                                                              ["MASV1",
        #                                                                                               "F_MAMH_index"])
        #     # combine_nn.show()
        #
        #     # combine with als_nn
        #     # combine_nn = predict_als_nn_ibcf.withColumnRenamed("prediction", "prediction_ibcf") \
        #     #     .join(predict_als_nn.withColumnRenamed("prediction", "prediction_als"), ["MASV1", "F_MAMH_index"]) \
        #     #     .withColumn("prediction", (col("prediction_ibcf") + col("prediction_als")) / 2)
        #
        #     error_combine_nn = evaluate(combine_nn, evaluators)
        #     error_list_combine_nn["{}_{}".format(rank_list[i], ibcf_rank)] = error_combine_nn
        #     best_models = put_best_model(best_models, "als_nn_ibcf_mean",
        #                                  Model_Error_Wrapper("als_nn_ibcf_mean_{}_{}".format(rank_list[i], ibcf_rank),
        #                                                      als_nn_ibcf_mean_model, error_combine_nn[0],{"als_rank": rank_list[i],
        #                                                                                          "ibcf_rank": ibcf_rank}))

    # error_models["als"] = error_list_als
    error_models["als_nn"] = error_list_als_nn
    # error_models["als_ibcf"] = error_list_als_ibcf
    error_models["als_nn_ibcf"] = error_list_als_nn_ibcf
    # error_models["als_ibcf_mean"] = error_list_combine
    # error_models["als_nn_ibcf_mean"] = error_list_combine_nn

    best_models["baseline"] = Model_Error_Wrapper("baseline", baseline_model,
                                                  0, {})
    error_models["ibcf"] = error_list_ibcf
    error_models["ubcf"] = error_list_nbcf
    return error_models, best_models
Example #22
def main(spark, train_file, test_file, rank, reg, alpha):
    '''Main routine for supervised training
    Parameters
    ----------
    spark : SparkSession object
    train_file, test_file : strings, paths to the parquet files to load
    rank, reg, alpha : ALS hyperparameters
    '''

    # Load the dataframe
    train = spark.read.parquet(train_file)
    test = spark.read.parquet(test_file)

    # Give the dataframe a temporary view so we can run SQL queries
    train.createOrReplaceTempView('train')
    test.createOrReplaceTempView('test')

    # Build model for input parameters
    rank = int(rank)  # ALS rank must be an integer
    reg = float(reg)
    alpha = float(alpha)

    als = ALS(implicitPrefs=True, userCol="user_idx", itemCol="item_idx", ratingCol="count")\
        .setParams(rank=rank, regParam=reg, alpha=alpha)
    model = als.fit(train)

    print("model fitted")

    # Create prediction and truth lists
    k = 500

    recommendations = model.recommendForUserSubset(test, k)
    perUserRecom = recommendations.selectExpr(
        "user_idx", "recommendations.item_idx as prediction")
    label_list = test.orderBy(F.col("user_idx"),
                              F.col("count").desc()).groupby("user_idx").agg(
                                  F.expr("collect_list(item_idx) as label"))
    perUserItem = label_list.select("user_idx", "label")

    print("predition and label")

    predictionAndLabel = perUserItem.join(
        perUserRecom,
        "user_idx").rdd.map(lambda row: (row.prediction, row.label))

    print("inner join")

    # Use Ranking Metrics for evaluation
    metrics = RankingMetrics(predictionAndLabel)
    mean_precision = metrics.meanAveragePrecision

    print(
        "At rank={0}, regParam={1}, alpha = {2}, mean average precision is {3}"
        .format(rank, reg, alpha, mean_precision))

    # Use only for final indexed_test.parquet
    k_precision = metrics.precisionAt(k)
    print(
        "At rank={0}, regParam={1}, alpha = {2}, precision at top 500 items is {3}"
        .format(rank, reg, alpha, k_precision))

    pass
Example #23
# build a single table holding the data needed for training
full_data = scores_data.join(users_data,
                             "username").join(anime_data, "anime_id")
recommend_data = full_data.select("user_id", "anime_id", "my_score")

# inferSchema read some numeric fields as strings, so they have to be cast
recommend_data = recommend_data.withColumn(
    "anime_id", recommend_data["anime_id"].cast(IntegerType()))
recommend_data = recommend_data.withColumn(
    "my_score", recommend_data["my_score"].cast(FloatType()))

# the training itself
training, test = recommend_data.randomSplit([0.8, 0.2], seed=42)
als = ALS(maxIter=5,
          regParam=0.01,
          userCol="user_id",
          itemCol="anime_id",
          ratingCol="my_score",
          coldStartStrategy="drop")
model = als.fit(training)

# prediction error
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="my_score",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# for each user, predict the 3 anime they should like best, along with the predicted score
userRecs = model.recommendForAllUsers(3)
userRecs = userRecs.withColumn("predicted_score",
Example #24
class MovieRecommendation:
    def createDf(self):

        moviesCustomSchema = StructType([
            StructField('movieID', IntegerType(), True),
            StructField('title', StringType(), True),
            StructField('genre', StringType(), True)
        ])

        ratingsCustomSchema = StructType([
            StructField('userId', IntegerType(), True),
            StructField('movieId', IntegerType(), True),
            StructField('rating', DoubleType(), True)
        ])
        self.movies_df = self.sqlContext.read.format("jdbc").option(
            "url", "jdbc:mysql://127.0.0.1:3306/music").option(
                "driver",
                "com.mysql.jdbc.Driver").option("dbtable", "movies").option(
                    "user", "root").option("password", "root").load()
        self.movies_df = self.movies_df.withColumnRenamed("movieId", "ID")
        self.movies_df = self.movies_df.cache()
        self.ratings_df = self.sqlContext.read.format("jdbc").option(
            "url", "jdbc:mysql://127.0.0.1:3306/music").option(
                "driver",
                "com.mysql.jdbc.Driver").option("dbtable", "ratings").option(
                    "user", "root").option("password", "root").load()
        self.ratings_df = self.ratings_df.drop('timestamp')
        self.ratings_df = self.ratings_df.cache()

    def topRatedMovies(self):

        # movies_df = movies_df.drop('genres')
        self.movie_names_with_avg_ratings_df = self.ratings_df.groupBy(
            'movieId').agg({
                'rating': 'avg',
                'userId': 'count'
            }).withColumnRenamed('avg(rating)', 'average').withColumnRenamed(
                'count(userId)', 'count')

        self.moviesRatingsJoined_df = \
            self.movies_df.join(self.movie_names_with_avg_ratings_df,
                           self.movies_df.ID
                           == self.movie_names_with_avg_ratings_df.movieId,
                           'inner')

        self.moviesRatingsJoined_df = \
            self.moviesRatingsJoined_df.sort(self.moviesRatingsJoined_df.average.desc()).drop('ID'
                                                                                    )

        self.moviesWithHighestRatingWithCountMoreThan500 = \
            self.moviesRatingsJoined_df.filter('count >= 500')

    def splitDataset(self):

        (split_60_df, split_a_20_df, split_b_20_df) = \
            self.ratings_df.randomSplit([0.6, 0.2, 0.2], 123)

        self.training_df = split_60_df.cache()
        self.validation_df = split_a_20_df.cache()
        self.test_df = split_b_20_df.cache()

    def alternatingLeastSquare(self):

        self.als = ALS(maxIter=5,
                       regParam=0.01,
                       userCol='userId',
                       itemCol='movieId',
                       ratingCol='rating')
        model = self.als.fit(self.training_df)

        # Create an RMSE evaluator using the label and predicted columns

        self.reg_eval = RegressionEvaluator(predictionCol='prediction',
                                            labelCol='rating',
                                            metricName='rmse')

        self.tolerance = 0.03
        self.ranks = [4, 8, 12]
        self.errors = []
        self.models = []
        self.min_error = float('inf')
        self.best_rank = -1
        for rank in self.ranks:

            # Set the rank here:

            self.als.setRank(rank)

            # Create the model with these parameters.

            model = self.als.fit(self.training_df)

            # Run the model to create a prediction. Predict against the validation_df.

            self.predict_df = model.transform(self.validation_df)

            # Remove NaN values from prediction (due to SPARK-14489)

            self.predicted_ratings_df = \
                self.predict_df.filter(self.predict_df.prediction != float('nan'))

            # Run the previously created RMSE evaluator, reg_eval, on the predicted_ratings_df DataFrame

            error = self.reg_eval.evaluate(self.predicted_ratings_df)
            self.errors.append(error)
            self.models.append(model)
            if error < self.min_error:
                self.min_error = error
                self.best_rank = rank
                self.my_model = model

        self.als.setRank(self.best_rank)

    def testModel(self):
        self.predict_df = self.my_model.transform(self.test_df)

        # Remove NaN values from prediction (due to SPARK-14489)

        predicted_test_df = self.predict_df.filter(
            self.predict_df.prediction != float('nan'))

        # Run the previously created RMSE evaluator, reg_eval, on the predicted_test_df DataFrame

        self.reg_eval = RegressionEvaluator(predictionCol='prediction',
                                            labelCol='rating',
                                            metricName='rmse')
        test_RMSE = self.reg_eval.evaluate(predicted_test_df)

        return test_RMSE

    def get_top_ratings(self, user_id, movies_count):

        ratings_df_for_user = self.ratings_df.filter('userId=' + str(user_id) +
                                                     '')
        ratings_df_for_user.show(5)
        list_of_movies_row = ratings_df_for_user.select('movieId').collect()
        my_rated_movie_ids = [i.movieId for i in list_of_movies_row]
        not_rated_df = self.movies_df.filter(
            ~self.movies_df['ID'].isin(my_rated_movie_ids))

        # Rename the "ID" column to be "movieId", and add a column with my_user_id as "userId".
        print(my_rated_movie_ids)

        my_unrated_movies_df = not_rated_df.selectExpr(
            'ID as movieId').withColumn('userId', F.lit(user_id))

        # Use my_rating_model to predict ratings for the movies that I did not manually rate.
        my_unrated_movies_df.show(5)
        raw_predicted_ratings_df = \
            self.my_model.transform(my_unrated_movies_df)

        predicted_ratings_df = \
            raw_predicted_ratings_df.filter(raw_predicted_ratings_df['prediction'
                                            ] != float('nan')).withColumnRenamed("movieId", "ID")
        predicted_ratings_df.show(5)
        # Join your predicted_ratings_df DataFrame with the movie_names_with_avg_ratings_df DataFrame to obtain the ratings counts for each movie

        predicted_with_counts_df = \
            predicted_ratings_df.join(self.moviesRatingsJoined_df,
                                      self.moviesRatingsJoined_df['movieId']
                                      == predicted_ratings_df['ID']).drop('ID')
        predicted_with_counts_df = predicted_with_counts_df.sort(
            predicted_with_counts_df.prediction.desc()).filter(
                'count >= 50').filter('prediction >= 3.0')
        list_of_predictions = [row.asDict() for row in
                               predicted_with_counts_df.collect()]
        return list_of_predictions

    def add_ratings(self, ratings):
        print(ratings)
        # my_ratings_df = self.sqlContext.createDataFrame(ratings,['userId', 'movieId', 'rating'])
        self.ratings_df = self.sqlContext.read.format("jdbc").option(
            "url", "jdbc:mysql://127.0.0.1:3306/music").option(
                "driver",
                "com.mysql.jdbc.Driver").option("dbtable", "ratings").option(
                    "user", "root").option("password", "root").load()
        # self.ratings_df = self.ratings_df.unionAll(my_ratings_df)
        self.topRatedMovies()
        self.als = ALS(maxIter=5,
                       regParam=0.01,
                       userCol='userId',
                       itemCol='movieId',
                       ratingCol='rating')

        self.als.setPredictionCol('prediction').setMaxIter(5).setSeed(
            123).setRegParam(0.1).setUserCol('userId').setItemCol(
                'movieId').setRatingCol('rating').setRank(self.best_rank)

        # Create the model with these parameters.

        self.my_model = self.als.fit(self.ratings_df)

        return ratings

    def __init__(self, sc, sqlcontext):
        self.sqlContext = SQLContext(sc)
        self.createDf()
        self.topRatedMovies()
        self.splitDataset()
        self.alternatingLeastSquare()
        test_RMSE = self.testModel()
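
# A hedged bootstrap sketch for the engine above (the class name is not shown
# in this excerpt; MovieRecommender is a stand-in, and the second constructor
# argument is unused by __init__):
#
#   sc = SparkContext(conf=SparkConf().setAppName('movie_recommender'))
#   engine = MovieRecommender(sc, None)
#   print(engine.get_top_ratings(user_id=1, movies_count=10))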
Example #25
#!/usr/bin/env python

from pyspark.ml.recommendation import ALS

als = ALS(maxIter=10,
          regParam=0.01,
          userCol="user_id",
          itemCol="book_id",
          ratingCol="rating",
          coldStartStrategy="drop",
          implicitPrefs=False,
          seed=42)

model = als.fit(train)


def evaluation(model, val, metric):

    from pyspark.mllib.evaluation import RegressionMetrics
    import pyspark.sql.functions as f
    from pyspark.sql.functions import *

    #all users in val
    user_val = val.select('user_id').distinct()
    #recommend top 500 books for each user in val
    val_rec = model.recommendForUserSubset(user_val, 500)
    #print(val_rec.first())
    #DataFrame[user_id: int, recommendations: array<struct<book_id:int,rating:float>>]

    #####Reshape the dataframe######
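    # Assumed continuation (not from the original): explode each user's
    # top-500 array into flat (user_id, book_id, predicted rating) rows.
    flat_rec = (val_rec
                .select('user_id', f.explode('recommendations').alias('rec'))
                .select('user_id',
                        f.col('rec.book_id').alias('book_id'),
                        f.col('rec.rating').alias('pred_rating')))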
Example #26
import numpy as np
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Window
from pyspark.sql.functions import col, rank, when


def tune_ALS_NLP(spark, train_data, validation_data, val_true_list, maxIter,
                 regParams, ranks, review_val_predictions):
    # initial
    min_error = float('inf')
    best_iter1 = -1
    best_rank1 = -1
    best_regularization1 = 0
    best_model_rmse = None
    max_map = 0.0
    best_iter2 = -1
    best_rank2 = -1
    best_regularization2 = 0
    best_model_map = None

    for iteration in maxIter:
        for current_rank in ranks:
            for reg in regParams:
                als=ALS(maxIter=iteration,regParam=reg,rank=current_rank, \
                        userCol='user_id',itemCol='book_id',ratingCol='rating', \
                        coldStartStrategy="drop",nonnegative=True)
                als_model = als.fit(train_data)
                predictions = als_model.transform(validation_data)
                
                review_predictions = review_val_predictions.withColumnRenamed('prediction','review_prediction')
                als_predictions = predictions.withColumnRenamed('prediction','als_prediction')
                total_predictions = als_predictions.join(review_predictions,['user_id','book_id','rating'],'outer')
                total_predictions = total_predictions.withColumn('total_prediction', \
                                                                 when(total_predictions['review_prediction'].isNotNull(), \
                                                                      total_predictions['review_prediction']) \
                                                                 .otherwise(total_predictions['als_prediction']))
                              
                window = Window.partitionBy(total_predictions['user_id']).orderBy(total_predictions['total_prediction'].desc())
                top_predictions = total_predictions.select('*', rank().over(window).alias('row_num')).filter(col('row_num') <= 500)

                # rmse
                evaluator=RegressionEvaluator(metricName='rmse', labelCol='rating',predictionCol='total_prediction')
                rmse = evaluator.evaluate(top_predictions)
                if rmse < min_error:
                    min_error = rmse
                    best_rank1 = current_rank
                    best_regularization1 = reg
                    best_iter1 = iteration
                    best_model_rmse = als_model

                # MAP
                current_map = MAP.getMAP(top_predictions, val_true_list)
                if current_map > max_map:
                    max_map = current_map
                    best_rank2 = current_rank
                    best_regularization2 = reg
                    best_iter2 = iteration
                    best_model_map = als_model

                print('{} latent factors and regularization = {} with maxIter {}: '
                      'validation RMSE is {}, validation MAP is {}'.format(
                          current_rank, reg, iteration, rmse, current_map))
              
                with open('train05_review_eval.csv', 'ab') as f:
                    np.savetxt(f, [np.array([iteration, current_rank, reg, rmse, current_map])],delimiter=",")

    print('\nThe best model selected by RMSE has {} latent factors, '
          'regularization = {} and maxIter = {}: RMSE = {}'.format(
              best_rank1, best_regularization1, best_iter1, min_error))
    print('\nThe best model selected by MAP has {} latent factors, '
          'regularization = {} and maxIter = {}: MAP = {}'.format(
              best_rank2, best_regularization2, best_iter2, max_map))

    return best_model_rmse, best_model_map
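
# MAP.getMAP above comes from a project module that is not shown. A rough
# stand-in using pyspark.mllib's RankingMetrics might look like this; it
# assumes val_true_list is a DataFrame with 'user_id' and a 'true_books'
# list column (assumed names, not from the original).
from pyspark.mllib.evaluation import RankingMetrics
import pyspark.sql.functions as F


def get_map(top_predictions, val_true_list):
    # Collapse each user's ranked rows into an ordered list of book_ids.
    # (collect_list after orderBy is a common idiom, though Spark does not
    # strictly guarantee the ordering.)
    pred_lists = (top_predictions
                  .orderBy('user_id', 'row_num')
                  .groupBy('user_id')
                  .agg(F.collect_list('book_id').alias('pred_books')))
    pairs = (pred_lists
             .join(val_true_list, 'user_id')
             .select('pred_books', 'true_books')
             .rdd.map(tuple))
    return RankingMetrics(pairs).meanAveragePrecision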
class RecommendationEngine:
    """A product recommendation engine
    """
    def __train_all_model(self):
        """Train the ALS models for all three datasets
        """
        # All three models share the same hyperparameters, so reuse
        # __train_model instead of repeating the ALS construction.
        for m in range(3):
            logger.info("Training the ALS model %d", m + 1)
            self.__train_model(m)
            logger.info("ALS model %d built!", m + 1)

    def __train_model(self, model):
        """Train the ALS model with the current dataset
        """

        logger.info("Training the ALS model...")
        self.als = ALS(maxIter=5,
                       regParam=0.01,
                       userCol="UserId",
                       itemCol="ProductId",
                       ratingCol="Rating",
                       coldStartStrategy="drop")
        if model == 0:
            self.model1 = self.als.fit(self.df0)
        elif model == 1:
            self.model2 = self.als.fit(self.df1)
        elif model == 2:
            self.model3 = self.als.fit(self.df2)
        logger.info("ALS model built!")

    def get_top_ratings(self, model, user_id, products_count):
        # The three branches differed only in which dataframe and model they
        # used, so select them by index instead of duplicating the pipeline.
        dfs = {0: self.df0, 1: self.df1, 2: self.df2}
        models = {0: self.model1, 1: self.model2, 2: self.model3}
        if model not in dfs:
            return None

        users = dfs[model].select(self.als.getUserCol())
        users = users.filter(users.UserId == user_id)
        userSubsetRecs = models[model].recommendForUserSubset(
            users, products_count)
        userSubsetRecs = userSubsetRecs.withColumn(
            "recommendations", explode("recommendations"))
        # Keep only the user id and the recommended product ids.
        userSubsetRecs = userSubsetRecs.select(
            func.col('UserId'),
            func.col('recommendations')['ProductId'].alias('ProductId'))
        return userSubsetRecs.toPandas().to_json()

    def get_top_product_recommend(self, model, product_id, user_count):
        dfs = {0: self.df0, 1: self.df1, 2: self.df2}
        models = {0: self.model1, 1: self.model2, 2: self.model3}
        if model not in dfs:
            return None

        products = dfs[model].select(self.als.getItemCol())
        products = products.filter(products.ProductId == product_id)
        productSubsetRecs = models[model].recommendForItemSubset(
            products, user_count)
        productSubsetRecs = productSubsetRecs.withColumn(
            "recommendations", explode("recommendations"))
        # Keep only the product id and the recommended user ids.
        productSubsetRecs = productSubsetRecs.select(
            func.col('ProductId'),
            func.col('recommendations')['UserId'].alias('UserId'))
        return productSubsetRecs.toPandas().to_json()

    def get_ratings_for_product_ids(self, model, user_id, product_id):
        models = {0: self.model1, 1: self.model2, 2: self.model3}
        if model not in models:
            return None
        request = self.spark_session.createDataFrame(
            [(user_id, product_id)], ["UserId", "ProductId"])
        return models[model].transform(request).collect()

    def __init__(self, spark_session, dataset_path):
        """Init the recommendation engine given a Spark context and a dataset path
        """
        logger.info("Starting up the Recommendation Engine: ")
        self.spark_session = spark_session

        # Load Amazon data for later use; each model has its own ratings file.
        logger.info("Loading Amazon data...")
        for i, file_name in enumerate(['model-1.txt', 'model-2.txt',
                                       'model-3.txt']):
            dataset_file_path = os.path.join(dataset_path, file_name)
            if os.path.isfile(dataset_file_path):
                df = spark_session.read.csv(dataset_file_path,
                                            header=None,
                                            inferSchema=True)
                df = df.selectExpr("_c0 as UserId", "_c1 as ProductId",
                                   "_c2 as Rating")
                setattr(self, 'df%d' % i, df)

        # Train the model
        self.__train_all_model()
Example #28
File: predict.py  Project: azataiot/Oraz
Test.assertEquals(training_df.filter((ratings_df.userId == 1) & (ratings_df.movieId == 1196) & (ratings_df.rating == 4.5)).count(), 1)

Test.assertEquals(validation_df.filter((ratings_df.userId == 1) & (ratings_df.movieId == 296) & (ratings_df.rating == 4.0)).count(), 1)
Test.assertEquals(validation_df.filter((ratings_df.userId == 1) & (ratings_df.movieId == 32) & (ratings_df.rating == 3.5)).count(), 1)
Test.assertEquals(validation_df.filter((ratings_df.userId == 1) & (ratings_df.movieId == 6888) & (ratings_df.rating == 3.0)).count(), 1)

Test.assertEquals(test_df.filter((ratings_df.userId == 1) & (ratings_df.movieId == 4993) & (ratings_df.rating == 5.0)).count(), 1)
Test.assertEquals(test_df.filter((ratings_df.userId == 1) & (ratings_df.movieId == 4128) & (ratings_df.rating == 4.0)).count(), 1)
Test.assertEquals(test_df.filter((ratings_df.userId == 1) & (ratings_df.movieId == 4915) & (ratings_df.rating == 3.0)).count(), 1)

# This step is broken in ML Pipelines: https://issues.apache.org/jira/browse/SPARK-14489
from pyspark.ml.recommendation import ALS

# Let's initialize our ALS learner
als = ALS()

# Now we set the parameters for the method
als.setMaxIter(5)\
   .setSeed(seed)\
   .setRegParam(0.1)\
   .setUserCol("userId").setItemCol("movieId").setRatingCol("rating")

# Now let's compute an evaluation metric for our test dataset
from pyspark.ml.evaluation import RegressionEvaluator

# Create an RMSE evaluator using the label and predicted columns
reg_eval = RegressionEvaluator(predictionCol="prediction", labelCol="rating", metricName="rmse")

tolerance = 0.03
ranks = [4, 8, 12]
import math

spark = SparkSession.builder.appName("hw1").getOrCreate()

all_lines = spark.read.text("train.dat").rdd
divs = all_lines.map(lambda row: row.value.split("\t"))
row_rdd = divs.map(lambda a: Row(userId=int(a[0]),
                                 movieId=int(a[1]),
                                 rating=float(a[2]),
                                 timestamp=int(a[3])))
df = spark.createDataFrame(row_rdd)
df.show(10)

als = ALS(maxIter=8,
          regParam=0.085,
          userCol="userId",
          itemCol="movieId",
          ratingCol="rating",
          coldStartStrategy="nan")
model = als.fit(df)

all_lines_test = spark.read.text("test.dat").rdd
divs_test = all_lines_test.map(lambda row: row.value.split("\t"))
row_rdd_test = divs_test.map(
    lambda a: Row(userId=int(a[0]), movieId=int(a[1])))
df_test = spark.createDataFrame(row_rdd_test)
df_test.show(10)

res = df_test.withColumn("col_id", monotonically_increasing_id())
res.show(10)
predictions = model.transform(res)
predictions.show(20)
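
# Assumed continuation (not in the original): restore the input row order via
# col_id and dump one predicted rating per test row; the output path is a
# placeholder.
(predictions
 .orderBy('col_id')
 .select('prediction')
 .toPandas()
 .to_csv('predictions.dat', index=False, header=False))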
Example #30
def train_als(params, data):
    # Timer is assumed to be an external context-manager stopwatch helper;
    # it is not defined in this snippet.
    symbol = ALS(**params)
    with Timer() as t:
        model = symbol.fit(data)
    return model, t
# See some statistics about the train, validation and test data
print('Statistics for Training Data: ')
train.describe().show()
print('Statistics for Validation Data: ')
val.describe().show()
print('Statistics for Test Data: ')
test.describe().show()

# After doing the hyperparameter tuning, ideal values for rank and regParam are: rank = 50 and regParam = 0.09
r = 50
l = 0.09
als = ALS(rank=r,
          regParam=l,
          userCol='user_id',
          itemCol='book_id',
          ratingCol='rating',
          coldStartStrategy='drop',
          nonnegative=True)

# Train the model
model = als.fit(train)

# RMSE evaluation (regression metric)
evaluator = RegressionEvaluator(metricName='rmse',
                                labelCol='rating',
                                predictionCol='prediction')

# Prediction of rating for validation set
predictions = model.transform(val)
predictions = predictions.withColumn("prediction",
# MAGIC Using the ML Pipeline's [CrossValidator](http://spark.apache.org/docs/1.6.2/api/python/pyspark.ml.html#pyspark.ml.tuning.CrossValidator) with ALS is thus problematic, because cross validation involves dividing the training data into a set of folds (e.g., three sets) and then using those folds for testing and evaluating the parameters during the parameter grid search process. It is likely that some of the folds will contain users that are not in the other folds, and, as a result, ALS produces NaN values for those new users. When the CrossValidator uses the Evaluator (RMSE) to compute an error metric, the RMSE algorithm will return NaN. This will make *all* of the parameters in the parameter grid appear to be equally good (or bad).
# MAGIC 
# MAGIC You can read the discussion on [Spark JIRA 14489](https://issues.apache.org/jira/browse/SPARK-14489) about this issue. There are proposed workarounds of having ALS provide default values or having RMSE drop NaN values. Both introduce potential issues. We have chosen to have RMSE drop NaN values. While this does not solve the underlying issue of ALS not predicting a value for a new user, it does provide some evaluation value. We manually implement the parameter grid search process using a for loop (below) and remove the NaN values before using RMSE.
# MAGIC 
# MAGIC For a production application, you would want to consider the tradeoffs in how to handle new users.
# MAGIC 
# MAGIC **Note**: This cell will likely take a couple of minutes to run.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
# This step is broken in ML Pipelines: https://issues.apache.org/jira/browse/SPARK-14489
from pyspark.ml.recommendation import ALS

# Let's initialize our ALS learner
als = ALS()

# Now we set the parameters for the method
als.setMaxIter(5).setSeed(seed).setRegParam(0.1).setUserCol("userId").setItemCol("movieId").setRatingCol("rating")

# Now let's compute an evaluation metric for our test dataset
from pyspark.ml.evaluation import RegressionEvaluator

# Create an RMSE evaluator using the label and predicted columns
reg_eval = RegressionEvaluator(predictionCol="prediction", labelCol="rating", metricName="rmse")

tolerance = 0.03
ranks = [4, 8, 12]
errors = [0, 0, 0]
models = [0, 0, 0]
err = 0
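
# The cell above stops before the loop it sets up. A sketch of the manual
# grid search the notebook text describes (training_df and validation_df are
# assumed names), dropping NaN predictions before scoring:
from pyspark.sql.functions import col, isnan

min_error = float('inf')
best_rank = -1
for i, rank in enumerate(ranks):
    als.setRank(rank)
    model = als.fit(training_df)
    predicted_df = model.transform(validation_df)
    # Drop NaN predictions (unseen users) before computing RMSE.
    predicted_df = predicted_df.filter(~isnan(col('prediction')))
    errors[i] = reg_eval.evaluate(predicted_df)
    models[i] = model
    if errors[i] < min_error:
        min_error = errors[i]
        best_rank = rank
print('The best model was trained with rank %s' % best_rank)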
Example #33
import time

from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.sql import functions as F
from pyspark.sql.functions import expr


def main(spark, train_data_file, test_data_file, model_file):

    time_a = time.time()
    start = time_a

    # Use Validation and Test user_id to filter Train data, to get the 110k mandatory users
    # Stored here hdfs:/user/dz584/cf_train_sample.parquet
    """
    training_data = spark.read.parquet('hdfs:/user/bm106/pub/project/cf_train.parquet')
    validation_data = spark.read.parquet('hdfs:/user/bm106/pub/project/cf_validation.parquet')
    testing_data = spark.read.parquet('hdfs:/user/bm106/pub/project/cf_test.parquet')

    validandtest_userid = validation_data.union(testing_data).select('user_id').distinct()
    validandtest_userid.createOrReplaceTempView('validandtest_userid')

    training_data.createOrReplaceTempView('training_data')
    training_data = spark.sql("SELECT * FROM training_data WHERE user_id IN (SELECT user_id FROM validandtest_userid GROUP BY user_id)")
    training_data.write.parquet("cf_train_sample.parquet")
    """

    training_data = spark.read.parquet(train_data_file)
    indexer_id = StringIndexer(inputCol="user_id",
                               outputCol="userindex").setHandleInvalid("skip")
    indexer_id_model = indexer_id.fit(training_data)
    indexer_item = StringIndexer(
        inputCol="track_id", outputCol="itemindex").setHandleInvalid("skip")
    indexer_item_model = indexer_item.fit(training_data)

    training_data = indexer_id_model.transform(training_data)
    training_data = indexer_item_model.transform(training_data)

    testing_data = spark.read.parquet(test_data_file)
    testing_data = indexer_id_model.transform(testing_data)
    testing_data = indexer_item_model.transform(testing_data)

    training_data = training_data.select('userindex', 'itemindex', 'count')
    testing_data = testing_data.select('userindex', 'itemindex', 'count')

    # Add Log Compression
    training_data.createOrReplaceTempView('training_data')
    training_data = spark.sql(
        "SELECT *, count+1 as plus_count FROM training_data")
    training_data = training_data.withColumn("log_count", F.log("plus_count"))

    print('Finished Indexing!')
    time_b = time.time()
    print(time_b - time_a)
    time_a = time_b

    result_dict = {}
    rank_list = [500, 600, 700]  #[10,20,30,50]
    reg_param_list = [0.7]  #[0.1,0.5]
    alpha_list = [1]  #[1,1.5]

    for rank in rank_list:
        for reg_param in reg_param_list:
            for alpha in alpha_list:

                current_key = (rank, reg_param, alpha)
                als = ALS(maxIter=5,
                          userCol="userindex",
                          itemCol="itemindex",
                          ratingCol="log_count",
                          rank=rank,
                          regParam=reg_param,
                          alpha=alpha)
                model = als.fit(training_data)

                print('Finished Modeling with Param:', current_key)
                time_b = time.time()
                print(time_b - time_a)
                time_a = time_b

                prediction = model.recommendForAllUsers(500).select(
                    'userindex', 'recommendations.itemindex')
                print('Finished Prediction DF!')

                testing_df = testing_data.groupBy('userindex').agg(
                    expr('collect_list(itemindex) as item_list'))
                print('Finished Label DF!')

                predictionAndLabels = prediction.join(testing_df, 'userindex')
                predandlabel_name = 'logplus_rk' + str(rank) + 'reg' + str(
                    reg_param) + 'a' + str(alpha)
                predandlabel_name = predandlabel_name.replace(".",
                                                              "") + '.parquet'
                predictionAndLabels.write.parquet(predandlabel_name)

                print('Joined Prediction and Labels!')
                time_b = time.time()
                print(time_b - time_a)
                time_a = time_b

    #             pred_df = predictionAndLabels.select(['itemindex','item_list']).rdd.map(list)

    #             metrics = RankingMetrics(pred_df)

    #             print('Ranking Metrics Calculated!')
    #             time_b = time.time()
    #             print(time_b - time_a)
    #             time_a = time_b

    #             eva = metrics.meanAveragePrecision
    #             result_dict[current_key] = eva

    #             print(current_key,"parameter combination has been trained! MAP= ", eva)
    #             time_b = time.time()
    #             print(time_b - time_a)
    #             time_a = time_b

    # best_model_param = max(result_dict, key=result_dict.get)
    # als = ALS(maxIter=5, userCol="userindex", itemCol="itemindex", ratingCol="count", rank=best_model_param[0], regParam=best_model_param[1], alpha=best_model_param[2])
    # als.fit(training_data).write().overwrite().save(model_file)

    print('Process Finished!')
    print(time.time() - start)
Example #34
def generate_predictions(training_df, prediction_df, rank, model=None):
    iterations = 10
    als = ALS(rank=rank, maxIter=iterations, implicitPrefs=True)
    if model is None:
        model = als.fit(training_df)
    return model.transform(prediction_df).dropna()
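
# Hypothetical usage (training_df and candidates_df are assumed names, with
# the default "user"/"item"/"rating" columns that ALS expects):
preds = generate_predictions(training_df, candidates_df, rank=10)
preds.show(5)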
Example #35
ratings_with_user_and_item_Idx = ratings_with_userIdx.join(item_index, on=['itemId'], how='left')

## persisting this dataframe is the key:

# https://medium.com/@meltem.tutar/pyspark-under-the-hood-randomsplit-and-sample-inconsistencies-examined-7c6ec62644bc

ratings_with_user_and_item_Idx.persist()

(training, test) = ratings_with_user_and_item_Idx.randomSplit([0.99, 0.01])

# Build the recommendation model using ALS on the training data

# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics

als = ALS(maxIter=10, regParam=0.2, userCol="userIdx", itemCol="itemIdx", ratingCol="rating", rank=16,
          coldStartStrategy="drop")

model = als.fit(training)

end_time = time.time()

print("Time elapsed %f" % (end_time - start_time))

# Evaluate the model by computing the RMSE on the test data

predictions = model.transform(test)

# evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
#                                predictionCol="prediction")

# rmse = evaluator.evaluate(predictions)
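
# Since coldStartStrategy='drop' already removes NaN rows, the commented-out
# evaluation above can simply be run; for completeness (assumed continuation):
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("RMSE on the held-out 1%% split = %f" % rmse)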
Example #37
class Predictor(object):
    def __init__(self,
                 spark,
                 user_col_name,
                 item_col_name,
                 rating_col_name,
                 rank=15,
                 maxIter=15,
                 regParam=0.01):
        self.user_col_name = user_col_name
        self.item_col_name = item_col_name
        self.item_col_name_index = "INDEX_" + item_col_name
        self.rating_col_name = rating_col_name
        self.als = ALS(rank=rank,
                       maxIter=maxIter,
                       regParam=regParam,
                       userCol=user_col_name,
                       itemCol=self.item_col_name_index,
                       ratingCol=rating_col_name,
                       coldStartStrategy="drop",
                       nonnegative=True)
        self.item_indexer = StringIndexer().setInputCol(
            self.item_col_name).setOutputCol(self.item_col_name_index)
        self.item_index_df = None
        self.indexer_model = None
        self.model = None
        self.item_similarity = None
        self.spark = spark

    # fit all the course index
    def fit_item_index(self, item_df):
        self.indexer_model = self.item_indexer.fit(item_df)
        self.item_index_df = self.indexer_model.transform(
            item_df.select(self.item_col_name).distinct())

    # fit training data (call this after fit_item_index)
    def fit(self, training_df):
        encoded_df = self.indexer_model.transform(training_df)
        # encoded_df = encoded_df.withColumn(self.user_col_name, encoded_df[self.user_col_name].cast(IntegerType()))
        # encoded_df = encoded_df.withColumn(self.rating_col_name, encoded_df[self.rating_col_name].cast(DoubleType()))
        normalize_rating_udf = udf(lambda p: 0.0
                                   if p > 10 else p, DoubleType())
        encoded_df = encoded_df.withColumn(
            self.rating_col_name,
            normalize_rating_udf(encoded_df[self.rating_col_name]))
        self.model = self.als.fit(encoded_df)
        item_factor = self.model.itemFactors
        item_factor.createOrReplaceTempView("ItemFactor")

        # helper to compute the cosine similarity between two feature vectors
        def cosine_similarity(item1, item2):
            norm_product = np.linalg.norm(item1) * np.linalg.norm(item2)
            if norm_product == 0:
                return 0.0
            return float(np.dot(item1, item2) / norm_product)

        cosine_similarity_udf = udf(cosine_similarity, DoubleType())
        item_similarity = self.spark.sql(
            "SELECT I1.id as id1, I2.id as id2, I1.features as features1, I2.features as features2  FROM ItemFactor I1, ItemFactor I2 WHERE I1.id != I2.id"
        )
        self.item_similarity = item_similarity.withColumn(
            "similarity",
            cosine_similarity_udf(item_similarity["features1"],
                                  item_similarity["features2"]))

    # self.item_similarity.show()
    # can drop 2 feature column and tempView
    # item_similarity = item_similarity.drop("features1")
    # item_similarity = item_similarity.drop("features2")
    # self.spark.catalog.dropTempView("ItemFactor")

    # input_df will have 1 student id and all course that the student already studied
    # first we will index all the course the student already studied and normalize all score
    # then map similarity data to the already studied course
    # then check if predict_course_df is None or not, if it None, then predict all the remaining course,
    # if not transform the predict_course_df to get the index of predict course
    # then begin predict function (use first 5 relevant course to that course that the student already studied)
    def predict_using_cosine_similarity(self,
                                        input_df,
                                        predict_course_df=None):
        # preprocessed input data
        # print("begin predict using cosine similarity")
        encoded_df = self.indexer_model.transform(input_df)
        normalize_rating_udf = udf(lambda p: 0.0
                                   if p > 10 else p, DoubleType())
        encoded_df = encoded_df.withColumn(
            self.rating_col_name,
            normalize_rating_udf(encoded_df[self.rating_col_name]))

        # get predict course df (remaining course)
        if predict_course_df is None:
            predict_course_df_predict = encoded_df.join(self.item_index_df,
                                                        encoded_df[self.item_col_name_index] != self.item_index_df[
                                                            self.item_col_name_index]) \
                .select(self.user_col_name, self.item_col_name_index)
        else:
            predict_course_df = self.indexer_model.transform(predict_course_df)
            predict_course_df_predict = predict_course_df.drop(
                self.rating_col_name)

        # get all value that can participate in evaluate final score
        # Attach similarity scores to every course the student already took.
        similarity_score_df = encoded_df.join(
            self.item_similarity,
            encoded_df[self.item_col_name_index] == self.item_similarity['id1']) \
            .select(self.user_col_name, self.rating_col_name, 'id1', 'id2', 'similarity')
        # .withColumnRenamed(self.user_col_name, "user_name_similarity")
        # (an extra join condition on id2 could be added if duplicate (id1, id2)
        #  pairs were not allowed)

        # def predict(student, course, similarity_score_df):
        #     # get first 5 course the student already attended which are the most relevant to the current course
        #     relevant_df = similarity_score_df.filter(similarity_score_df[self.user_col_name] == student and
        #                                              similarity_score_df['id2'] == course) \
        #         .orderBy('similarity', ascending=False) \
        #         .head(5)
        #     relevant_df = relevant_df.withColumn('score', relevant_df[self.rating_col_name] * relevant_df['similarity'])
        #     return relevant_df.select(spark_func.avg(relevant_df['score']).alias('avg')).collect()[0][
        #         'avg']  # need to check again if avg is enough
        def predict(list_score, list_similarity):
            # Similarity-weighted average of the most similar courses' scores.
            sum_simi = sum(list_similarity)
            if sum_simi == 0:
                return 0.0
            return sum(score * sim for score, sim
                       in zip(list_score, list_similarity)) / sum_simi

        predict_udf = udf(predict, DoubleType())
        window = Window.partitionBy([
            spark_func.col(self.user_col_name),
            spark_func.col(self.item_col_name_index)
        ]).orderBy(spark_func.col('similarity').desc())

        predict_course_df_predict = predict_course_df_predict.join(
            similarity_score_df.withColumnRenamed("id2", self.item_col_name_index),
            [self.item_col_name_index, self.user_col_name]) \
            .select("*", spark_func.rank().over(window).alias("rank")) \
            .filter(spark_func.col("rank") <= 7).groupby(self.user_col_name, self.item_col_name_index) \
            .agg(spark_func.collect_list(self.rating_col_name).alias("list_score"),
                 spark_func.collect_list("similarity").alias("list_similarity"))
        predict_course_df_predict = predict_course_df_predict.withColumn(
            "prediction",
            predict_udf(spark_func.col("list_score"),
                        spark_func.col("list_similarity")))

        if predict_course_df is not None and self.rating_col_name in predict_course_df.columns:
            predict_course_df_predict = predict_course_df_predict.join(
                predict_course_df,
                [self.user_col_name, self.item_col_name_index])

        return predict_course_df_predict

    def transform(self, df):
        encoded_df = self.indexer_model.transform(df)
        normalize_rating_udf = udf(lambda p: 0.0
                                   if p > 10 else p, DoubleType())
        encoded_df = encoded_df.withColumn(
            self.rating_col_name,
            normalize_rating_udf(encoded_df[self.rating_col_name]))
        return self.model.transform(encoded_df)
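
# A hedged sketch of how this Predictor might be wired up (dataframe and
# column names are assumptions, not from the original):
predictor = Predictor(spark, 'student_id', 'course_id', 'score')
predictor.fit_item_index(all_courses_df)  # index every known course id
predictor.fit(training_df)                # train ALS and build the similarity table
preds = predictor.predict_using_cosine_similarity(student_history_df)
preds.show(5)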
Example #38
 def test_storage_levels(self):
     df = self.spark.createDataFrame(
         [(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 1, 1.0), (2, 2, 5.0)],
         ["user", "item", "rating"])
     als = ALS().setMaxIter(1).setRank(1)
     # test default params
     als.fit(df)
     self.assertEqual(als.getIntermediateStorageLevel(), "MEMORY_AND_DISK")
     self.assertEqual(als._java_obj.getIntermediateStorageLevel(), "MEMORY_AND_DISK")
     self.assertEqual(als.getFinalStorageLevel(), "MEMORY_AND_DISK")
     self.assertEqual(als._java_obj.getFinalStorageLevel(), "MEMORY_AND_DISK")
     # test non-default params
     als.setIntermediateStorageLevel("MEMORY_ONLY_2")
     als.setFinalStorageLevel("DISK_ONLY")
     als.fit(df)
     self.assertEqual(als.getIntermediateStorageLevel(), "MEMORY_ONLY_2")
     self.assertEqual(als._java_obj.getIntermediateStorageLevel(), "MEMORY_ONLY_2")
     self.assertEqual(als.getFinalStorageLevel(), "DISK_ONLY")
     self.assertEqual(als._java_obj.getFinalStorageLevel(), "DISK_ONLY")
Example #39
def train_als(data, input_user, input_video, input_rating):
    """Train a als model
    Args:
        data: Data used for training
        input_user: User column
        input_video: Video column
        input_rating: Rating column
    Returns:
        best_model: Trained als model
        model1: StringIndexer of user
        model2: StringIndexer of video
    """
    print(proc_date)  # proc_date is assumed to be defined elsewhere in the job
    # Define StringIndexer
    user_indexer = StringIndexer(inputCol=input_user,
                                 outputCol=input_user + "_index")
    model1 = user_indexer.fit(data)
    index1_data = model1.transform(data)
    video_indexer = StringIndexer(inputCol=input_video,
                                  outputCol=input_video + "_index")
    model2 = video_indexer.fit(index1_data)
    index2_data = model2.transform(index1_data)

    newdata = index2_data.select(
        col(input_user + "_index").cast(IntegerType()),
        col(input_video + "_index").cast(IntegerType()), input_rating)

    # Split data
    train_data, test_data = split_data(newdata)

    # ALS model
    als = ALS(userCol=input_user + "_index",
              itemCol=input_video + "_index",
              ratingCol=input_rating,
              coldStartStrategy="nan",
              implicitPrefs=False)

    # Crossvalidator
    paramGrid = ParamGridBuilder() \
        .addGrid(als.maxIter, [5, 10]) \
        .addGrid(als.regParam, [0.01, 0.1]) \
        .addGrid(als.rank, [10, 20]) \
        .build()

    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol=input_rating,
                                    predictionCol="prediction")
    crossval = CrossValidator(estimator=als,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=3)

    model = crossval.fit(train_data)
    best_model = model.bestModel
    # Compute rmse
    predictions = best_model.transform(test_data).na.drop()
    rmse = evaluator.evaluate(predictions)
    print("RMSE: ", rmse)
    print("MAXIter: ", best_model._java_obj.parent().getMaxIter())
    print("RegParam: ", best_model._java_obj.parent().getRegParam())
    print("Rank: ", best_model._java_obj.parent().getRank())

    return best_model, model1, model2
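
# Hypothetical call (ratings_df and its column names are assumptions):
best_model, user_indexer_model, video_indexer_model = train_als(
    ratings_df, 'user_id', 'video_id', 'rating')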
Example #40
model_activation.add(keras.layers.Dense(units=1))
model_activation.add(keras.layers.Dropout(rate=0.5))
model_activation.compile(loss='mean_squared_error',
                        optimizer='sgd',
                        metrics=['Precision'])
model_activation.fit(X_train, y_train,
                     epochs=10, batch_size=1, verbose = 0)


# RECOMMENDERS
'''Spark ALS Collaborative Filtering'''
spark = SparkSession.builder.getOrCreate()
als_model = ALS(
            itemCol='',    # fill in the item column name
            userCol='',    # fill in the user column name
            ratingCol='',  # fill in the rating column name
            nonnegative=True,
            maxIter=20,
            regParam=0.05,
            rank=20)
# fit (pandas_df is assumed to hold the user/item/rating columns named above)
sdf = spark.createDataFrame(pandas_df)
recommender = als_model.fit(sdf)
# predict
prediction = recommender.transform(sdf)  # .toPandas().prediction
# evaluate
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
als_rmse = evaluator.evaluate(prediction)
#https://jaceklaskowski.gitbooks.io/mastering-apache-spark/spark-mllib/spark-mllib-RegressionEvaluator.html

Example #41
import findspark
findspark.init(r"D:\Spark")

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Ilk Ornek').getOrCreate()
lines = spark.read.csv('ratings.csv', inferSchema=True, header=True)
lines.show()
lines.describe().show()
training, test = lines.randomSplit([0.7, 0.3])
als = ALS(maxIter=5,
          regParam=0.01,
          userCol='userId',
          itemCol='movieId',
          ratingCol='rating')
model = als.fit(training)
predictions = model.transform(test)
predictions.show()
single_user = test.filter(test['userId'] == 12).select(['movieId', 'userId'])
single_user.show()
rec = model.transform(single_user)
rec.orderBy('prediction', ascending=False).show()
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))