示例#1
0
def benchmark_spark(ratings, factors, iterations=5):
    """Time Spark's implicit-feedback ALS for each requested factor count.

    A local Spark session (``local[*]`` master, 16G of driver memory) is
    started for the benchmark and always stopped again before returning.

    Args:
        ratings: Ratings handed to ``convert_sparse_to_dataframe`` to obtain
            a Spark DataFrame with ``row``, ``col`` and ``data`` columns.
        factors: Iterable of ALS ranks to benchmark.
        iterations: ``maxIter`` for every fit; the measured wall time is
            divided by it.

    Returns:
        dict mapping each rank to the average seconds per iteration.
    """
    spark_conf = SparkConf()
    spark_conf.setAppName("implicit_benchmark")
    spark_conf.setMaster('local[*]')
    spark_conf.set('spark.driver.memory', '16G')

    context = SparkContext(conf=spark_conf)
    spark = SparkSession(context)

    times = {}
    try:
        ratings = convert_sparse_to_dataframe(spark, context, ratings)

        for rank in factors:
            estimator = ALS(rank=rank,
                            maxIter=iterations,
                            alpha=1,
                            implicitPrefs=True,
                            userCol="row",
                            itemCol="col",
                            ratingCol="data")
            began = time.time()
            estimator.fit(ratings)
            per_iteration = (time.time() - began) / iterations

            times[rank] = per_iteration
            print("spark. factors=%i took %.3f" % (rank, per_iteration))
    finally:
        # Release the session even when conversion or fitting fails.
        spark.stop()

    return times
示例#2
0
 def test_storage_levels(self):
     """Check that ALS storage-level params agree on the Python and JVM sides.

     The levels are verified after a fit with the defaults and again after
     setting non-default levels and refitting.
     """
     rows = [(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0),
             (1, 2, 4.0), (2, 1, 1.0), (2, 2, 5.0)]
     df = self.spark.createDataFrame(rows, ["user", "item", "rating"])
     als = ALS().setMaxIter(1).setRank(1)

     def check_levels(intermediate, final):
         # Same four assertions for both scenarios: Python wrapper first,
         # then the underlying Java object.
         self.assertEqual(als.getIntermediateStorageLevel(), intermediate)
         self.assertEqual(als._java_obj.getIntermediateStorageLevel(), intermediate)
         self.assertEqual(als.getFinalStorageLevel(), final)
         self.assertEqual(als._java_obj.getFinalStorageLevel(), final)

     # Default params
     als.fit(df)
     check_levels("MEMORY_AND_DISK", "MEMORY_AND_DISK")

     # Non-default params
     als.setIntermediateStorageLevel("MEMORY_ONLY_2")
     als.setFinalStorageLevel("DISK_ONLY")
     als.fit(df)
     check_levels("MEMORY_ONLY_2", "DISK_ONLY")
示例#3
0
#os.chdir("/Users/kponnambalam/Dropbox/V2Maestros/Courses/Spark n X - Do Big Data Analytics and ML/Python")
#os.curdir

# Load the ratings file; every line is parsed below as "user,item,rating".
ratingsData = SpContext.textFile("UserItemData.txt")
# Bare expression: collects all rows to the driver but does not store them
# (presumably only for display in an interactive session).
ratingsData.collect()

# Split each comma-separated line and cast it to
# (int user, int item, float rating).
ratingVector=ratingsData.map(lambda l: l.split(','))\
        .map(lambda l:(int(l[0]), int(l[1]), float(l[2])))

# Build a SQL DataFrame with named columns from the parsed tuples.
ratingsDf=SpSession.createDataFrame(ratingVector, \
            ["user","item","rating"])

# Build the model with ALS (rank 10, 5 iterations). No column names are
# passed, so the estimator's default column parameters are used.
from pyspark.ml.recommendation import ALS
als = ALS(rank=10, maxIter=5)
model = als.fit(ratingsDf)

# Collect the learned user factors ordered by id; the result is not stored.
model.userFactors.orderBy("id").collect()

# Create a test data set of (user, item) pairs to predict ratings for.
testDf = SpSession.createDataFrame(   \
        [(1001, 9003),(1001,9004),(1001,9005)], \
        ["user","item"])

# Predict and collect the result rows to the driver.
predictions = (model.transform(testDf).collect())
predictions
示例#4
0
# ## Recommender system

# In[10]:

from pyspark.ml.recommendation import ALS

# ALS estimator over the indexed reviewer / product columns, with 'label'
# as the rating column.
# The seed is written as a plain integer literal: the Python 2-only ``L``
# suffix is a SyntaxError on Python 3, and Python 2 promotes large integers
# to long automatically, so the value is unchanged.
als = ALS(maxIter=15,
          regParam=0.1,
          userCol='reviewerIndex',
          itemCol='asinIndex',
          ratingCol='label',
          rank=24,
          seed=1800009193)

# ## Evaluating the model

# In[14]:

# Fit on the training reviews (train_reviews is defined elsewhere).
recommender_system = als.fit(train_reviews)

# In[15]:

predictions = recommender_system.transform(test)

# In[16]:

# Rows whose prediction is NaN are filtered out before scoring so they do
# not turn the metric into NaN.
evaluation = evaluator.evaluate(
    predictions.filter(col('prediction') != float('nan')))

print('The RMSE of the recommender system is {0}'.format(evaluation))
示例#5
0
# Read the ratings file with the predefined schema (no header row; the
# delimiter option is passed exactly as in the reader chain below).
ratings = (
    spark.read.format("csv")
    .options(header='false')
    .option("delimiter", "\\t")
    .schema(schema)
    .load("resources/sample_movies_users.data")
)
print(ratings.describe().toPandas().transpose())

# 80/20 random split into training and test sets.
training, test = ratings.randomSplit([0.8, 0.2])

# ALS estimator; coldStartStrategy="drop" is requested for predictions.
als = ALS(
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    rank=10,
    maxIter=10,
    regParam=0.1,
    coldStartStrategy="drop",
)
alsModel = als.fit(training)

# Score the held-out split with RMSE.
predictions = alsModel.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# Predict three hand-picked movies for user 0; the rating column is filled
# with the placeholder -1.
testDF = spark.createDataFrame(
    [(0, 50, -1), (0, 172, -1), (0, 133, -1)],
    ["userId", "movieId", "rating"],
)
predictionDF = alsModel.transform(testDF)
predictionDF.show(5)

tvs = TrainValidationSplit(
    estimator=pipeline,
示例#6
0
    def post(self):
        """Train ALS on the existing plus newly posted ratings and return recommendations.

        The request body is read as JSON, turned into a pandas DataFrame and
        then a Spark DataFrame. It is unioned with ``small_ratings_DF``, an
        ALS model is fitted on the union, and predictions are produced for
        every movie whose id is not in the posted ratings. Predictions are
        restricted to movies with ``TotalReviews > 30``, NaN predictions are
        dropped, and at most 20 rows are kept.

        Side effects: rebinds several module-level DataFrames and appends
        the unioned ratings to ``./GenDataCollection/ratings_upd.csv``.

        Returns:
            The JSON string produced by ``DataFrame.to_json()`` for the
            ``userId``, ``title`` and ``prediction`` columns.
        """
        # Module-level state declared as global. NOTE(review): als_m and
        # model_tr are declared but not used in this method, and
        # new_user_unrated_movies_DF is declared twice.
        global als_m
        global new_user_unrated_movies_DF
        global small_ratings_DF
        global small_ratings_DF_upd
        global model_tr
        global new_user_unrated_movies_DF
        global new_user_recommendations_DF

        # Load the request body (force=True is passed to get_json).
        content = request.get_json(force=True)
        df = pd.DataFrame.from_dict(content)
        sp_df = sqlContext.createDataFrame(df)

        # Load the stored best hyper-parameters.
        with open('./GenDataCollection/best_params.json') as f:
            import_param = json.load(f)

        # Create an unfitted ALS estimator with the stored regParam and rank.

        als = ALS(maxIter=3,
                  regParam=import_param['regParam'],
                  rank=import_param['rank'],
                  userCol="userId",
                  itemCol="movieId",
                  ratingCol="rating",
                  coldStartStrategy="drop")

        # The user id is taken from the first posted row.
        # (presumably every posted row belongs to the same user; confirm)
        new_user_ID = sp_df.first().userId
        # Combine the existing ratings with the newly posted ones.
        small_ratings_DF_upd = small_ratings_DF.union(sp_df)

        # Collect the movie ids contained in the posted ratings.
        sp_df_rated = sp_df.select('movieId').rdd.map(list).map(lambda x: x[0])
        sp_df_rated_list = sp_df_rated.collect()

        # All movies minus the ones just rated by the user.
        new_user_unrated_movies_DF = small_movies_data_DF.filter(
            ~small_movies_data_DF.movieId.isin(sp_df_rated_list))
        # (older RDD-based variants, kept for reference)
        #.map(lambda x: (new_user_ID, x[0]))
        #.map(lambda x: (198, x[0])))

        # Sanity check (disabled)
        #small_movies_data_DF.count()

        # Preprocessing: reduce to (userId, movieId) rows for this user.
        new_user_unrated_movies_DF = new_user_unrated_movies_DF.drop('title')
        new_user_unrated_movies_DF = new_user_unrated_movies_DF.withColumn(
            'userId', lit(new_user_ID))
        new_user_unrated_movies_DF = new_user_unrated_movies_DF.select(
            'userId', 'movieId')  # re-arrange columns

        # Train the model on all previous ratings plus the new ones.
        model = als.fit(small_ratings_DF_upd)

        # Predict ratings for the movies the user has not rated.
        new_user_recommendations_DF = model.transform(
            new_user_unrated_movies_DF)

        # Attach the columns of rati_count (TotalReviews is read from it
        # below) by joining on movieId.
        new_user_recommendations_DF = new_user_recommendations_DF.join(
            rati_count,
            new_user_recommendations_DF.movieId == rati_count.movieId).drop(
                rati_count.movieId)

        # Order by the highest predicted rating.
        new_user_recommendations_DF = new_user_recommendations_DF.orderBy(
            new_user_recommendations_DF.prediction.desc())

        # Keep only movies with more than 30 reviews.
        new_user_recommendations_DF = new_user_recommendations_DF.filter(
            new_user_recommendations_DF.TotalReviews >
            30)  # (per the original author: returns around 10%)

        resp = new_user_recommendations_DF.na.drop(subset=["prediction"])
        resp = resp.limit(20)

        # Join the movie table back in to recover the title column.
        resp = resp.join(small_movies_data_DF,
                         resp.movieId == small_movies_data_DF.movieId).drop(
                             small_movies_data_DF.movieId)
        resp = resp.select('userId', 'title', 'prediction')
        resp = resp.orderBy(resp.prediction.desc())

        # Convert Spark DataFrame -> pandas DataFrame -> JSON string.
        resp_pd = resp.toPandas()
        resp_json = resp_pd.to_json()

        # Append the unioned ratings to the CSV output.
        # NOTE(review): the whole union (old + new ratings) is appended on
        # every call, so previously stored rows look like they get written
        # again each time; confirm this is intended.
        small_ratings_DF_upd.repartition(1).write.csv(
            path="./GenDataCollection/ratings_upd.csv",
            mode="append",
            header=True)
        # Disabled parquet variant:
        #small_ratings_DF_upd.write.parquet("ratings_upd.parquet", mode='append')

        return resp_json
示例#7
0
# Manual 5-fold cross-validation: split the data into five equally weighted
# parts (seed 111). Fold k trains on four parts and tests on the remaining one.
splits = df.randomSplit([1.0,1.0,1.0,1.0,1.0], 111)
(training1, test1) = (splits[0].union(splits[1]).union(splits[2]).union(splits[3]), splits[4])
(training2, test2) = (splits[0].union(splits[1]).union(splits[2]).union(splits[4]), splits[3])
(training3, test3) = (splits[0].union(splits[1]).union(splits[4]).union(splits[3]), splits[2])
(training4, test4) = (splits[0].union(splits[4]).union(splits[2]).union(splits[3]), splits[1])
(training5, test5) = (splits[4].union(splits[1]).union(splits[2]).union(splits[3]), splits[0])

# ALS V1

# One estimator is reused for every fold; coldStartStrategy="drop" is
# requested for the predictions.
als = ALS(maxIter=10, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")

# Two evaluators over the same label/prediction columns: RMSE and MAE.
rmse = RegressionEvaluator(metricName="rmse", labelCol="rating",predictionCol="prediction")
mae = RegressionEvaluator(metricName="mae", labelCol="rating",predictionCol="prediction")

# Fit one model per fold on its training partition.
model1 = als.fit(training1)
model2 = als.fit(training2)
model3 = als.fit(training3)
model4 = als.fit(training4)
model5 = als.fit(training5)

# Predict each fold's held-out partition with the matching model.
predictions1 = model1.transform(test1)
predictions2 = model2.transform(test2)
predictions3 = model3.transform(test3)
predictions4 = model4.transform(test4)
predictions5 = model5.transform(test5)

# Per-fold metrics (the remaining folds are evaluated past this excerpt).
rmse1 = rmse.evaluate(predictions1)
mae1 = mae.evaluate(predictions1)
print("RMSE = " + str(rmse1) + " MAE = " +  str(mae1))
rmse2 = rmse.evaluate(predictions2)
def get_als_model(df,
                  rank,
                  regParam=1,
                  split=None,
                  model='ALS',
                  evaluator='Regression',
                  use_cache=True):
    """Train and evaluate an ALS recommender, or reload cached results.

    Args:
        df: Ratings table supporting column assignment, with ``user_id``,
            ``business_id`` and ``rating`` columns; it is converted with
            ``pandas_to_spark``. NOTE: both id columns are label-encoded in
            place, so the caller's frame is modified.
        rank: ALS rank.
        regParam: ALS regularisation parameter.
        split: Train/test weights for ``randomSplit``; ``None`` means
            ``[0.8, 0.2]``.
        model: ``'ALS'`` builds the default estimator; any other value is
            used as the estimator itself (its ``fit`` is called).
        evaluator: ``'Regression'`` builds an RMSE ``RegressionEvaluator``;
            any other value is used as the evaluator itself.
        use_cache: When true and the cache file exists, results are loaded
            from it instead of being recomputed.

    Returns:
        Tuple ``(predictions, model, rmse_train, rmse_test, coverage_train,
        coverage_test, running_time, train, test)`` where ``model`` is the
        fitted model and ``running_time`` the fit duration in seconds.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if split is None:
        split = [0.8, 0.2]

    cache_path = os.path.join(CACHE_PATH, 'get_als_model.msgpack')
    if use_cache and os.path.exists(cache_path):
        print(f'Loading from {cache_path}')
        (predictions, fitted, rmse_train, rmse_test, coverage_train,
         coverage_test, running_time, train,
         test) = pd.read_msgpack(cache_path)
        print(f'Loaded from {cache_path}')
        return (predictions, fitted, rmse_train, rmse_test, coverage_train,
                coverage_test, running_time, train, test)

    # Replace the raw user and business ids with label-encoded values.
    for id_column in ('user_id', 'business_id'):
        encoder = LabelEncoder()
        encoder.fit(df[id_column])
        df[id_column] = encoder.transform(df[id_column])
        print(len(df[id_column]))

    df = pandas_to_spark(df)

    train, test = df.randomSplit(split, seed=1)

    total_unique_businessids_train = train.select(
        'business_id').distinct().toPandas().values
    total_unique_businessids_test = test.select(
        'business_id').distinct().toPandas().values

    # Separate names for estimator/metric keep the parameters from being
    # rebound to objects of a different kind.
    estimator = model
    if model == 'ALS':
        estimator = ALS(maxIter=5,
                        regParam=regParam,
                        rank=rank,
                        userCol="user_id",
                        itemCol="business_id",
                        ratingCol="rating",
                        coldStartStrategy="drop",
                        nonnegative=True)

    metric = evaluator
    if evaluator == 'Regression':
        metric = RegressionEvaluator(metricName="rmse",
                                     labelCol="rating",
                                     predictionCol="prediction")

    start = time()
    fitted = estimator.fit(train)
    running_time = time() - start

    # The test predictions are built once and reused for the metric.
    predictions = fitted.transform(test)
    rmse_test = metric.evaluate(predictions)
    rmse_train = metric.evaluate(fitted.transform(train))

    # Coverage: share of each split's distinct business ids that also
    # appear in the ids returned by calculate_coverage for the model.
    pred_unique_businessids = calculate_coverage(fitted)
    subset_pred_train = [
        i for i in pred_unique_businessids
        if i in total_unique_businessids_train
    ]
    subset_pred_test = [
        i for i in pred_unique_businessids
        if i in total_unique_businessids_test
    ]
    coverage_train = len(subset_pred_train) / len(
        total_unique_businessids_train)
    coverage_test = len(subset_pred_test) / len(
        total_unique_businessids_test)

    # NOTE(review): the results are not actually written to cache_path here
    # (the msgpack dump was disabled in the original); only this message is
    # emitted.
    print(f'Dumping to {cache_path}')

    return (predictions, fitted, rmse_train, rmse_test, coverage_train,
            coverage_test, running_time, train, test)
示例#9
0
    def fit(self, tensor, timer=False):
        """Factorize ``tensor`` by fitting one Spark ALS model per dimension.

        For every name in ``self.dimensions_col`` the tensor is unfolded
        along that axis, the non-zero entries are turned into
        (row, col, rating) triples, and ALS is fitted on them. The user
        factors of each fit are stored as a dense array in
        ``self.features[name]``; rows whose index has no factors are filled
        with zeros. For the "tucker" model the core tensor ``self.W`` is
        then computed by multiplying the tensor with the pseudo-inverse of
        each factor matrix.

        Args:
            tensor: Object providing ``shape``, ``unfold(axis)``, ``vals``
                and ``ttm(matrix, mode=axis)``.
            timer: When true, print the fit time of each mode and the
                longest one.

        Returns:
            Always ``False``.
        """
        # TODO: check that the indices of every dimension start at 0
        self.tensor = tensor

        # Decide once, case-insensitively, whether this is a Tucker model, so
        # the per-mode rank lookup and the core-tensor step cannot disagree.
        is_tucker = self.model.lower() == "tucker"

        self.dims = {
            name: self.tensor.shape[axis]
            for axis, name in enumerate(self.dimensions_col)
        }

        #==============================================================================
        # Recover the (row, col, rating) triples of each unfolded matrix
        #==============================================================================
        datas = {}
        for axis, dim in enumerate(self.dimensions_col):
            unfolded = csr_matrix(self.tensor.unfold(axis))
            # CSR keeps no explicit row indices: repeat each row number by
            # the number of stored entries in that row.
            entries_per_row = np.diff(unfolded.indptr)
            rows = np.repeat(np.arange(len(entries_per_row)), entries_per_row)
            datas[dim] = pd.DataFrame({
                'row': rows,
                'col': list(unfolded.indices),
                'rating': list(unfolded.data),
            })

        #==============================================================================
        # Factorization
        #==============================================================================
        self.features = dict.fromkeys(self.dimensions_col)
        times = []

        for mode in self.dimensions_col:
            print("\t Start " + mode + " learning")

            local_dataset = sqlContext.createDataFrame(datas[mode])

            # Build the recommendation model using Alternating Least Squares
            if timer:
                t0 = time.time()

            if is_tucker:
                rank = self.get(mode, 'ranks')
            else:
                rank = self.get('rank', None)
            local_als = ALS(rank=rank,
                            maxIter=self.get('maxIter'),
                            regParam=self.get('lbda'),
                            alpha=self.get('alpha'),
                            implicitPrefs=self.implicitPrefs,
                            userCol='row',
                            itemCol='col',
                            ratingCol='rating',
                            seed=self.seed)
            fitted = local_als.fit(local_dataset)
            if timer:
                delta = time.time() - t0
                print('\t \t time :', delta, "seconds")
                times.append(delta)

            # Collect ids and factor values in a single action so each id
            # stays attached to its own factor row; two separate collects of
            # an unordered DataFrame are not guaranteed to line up.
            user_factors = fitted.userFactors
            factor_columns = [
                user_factors.features[k].alias('factor' + str(k))
                for k in range(rank)
            ]
            latent = user_factors.select('id', *factor_columns).toPandas()
            latent = latent.set_index('id')

            # Indices without any learned factors get an all-zero row.
            missing = set(range(self.dims[mode])) - set(latent.index)
            for unknown in missing:
                latent.loc[unknown] = 0
            self.features[mode] = np.array(latent.sort_index())

        if timer:
            print('\t \t longest mode time :', np.max(times), "seconds")

        if is_tucker:
            print("\t Get core tensor")
            # With implicit preferences every stored value is replaced by 1
            # before the core tensor is computed.
            if self.implicitPrefs:
                self.tensor.vals = np.repeat(1, len(self.tensor.vals))

            self.W = deepcopy(self.tensor)
            for axis, mode in enumerate(self.dimensions_col):
                self.W = self.W.ttm(np.linalg.pinv(self.features[mode]),
                                    mode=axis)
        return False
    
    @staticmethod
    def rmse(dataset,predictionCol,targetCol):
        """Return the root-mean-square error between two columns of ``dataset``.

        Rows containing nulls are dropped first, and both the sum of squared
        errors and the row count are taken from the remaining rows.

        Args:
            dataset: Spark DataFrame holding both columns.
            predictionCol: Name of the prediction column.
            targetCol: Name of the target (label) column.

        Returns:
            The RMSE as a float.
        """
        valid_rows = dataset.dropna()
        # Go through the underlying RDD: DataFrame has no ``map`` method on
        # Spark 2 and later.
        squared_error_sum = valid_rows.rdd.map(
            lambda row: (row[targetCol] - row[predictionCol]) ** 2).reduce(add)
        # Divide by the number of rows that contributed to the sum; the full
        # dataset count would also include the rows dropped above.
        return sqrt(squared_error_sum / float(valid_rows.count()))


    
# Cross-validate ALS over three regularisation values using the custom
# evaluator, then score the resulting model on the full data set.
lr1 = ALS()
grid1 = (ParamGridBuilder()
         .addGrid(lr1.regParam, [1.0, 0.5, 2.0])
         .build())
evaluator1 = MiEvaluador(predictionCol=lr1.getPredictionCol(),
                         targetCol=lr1.getRatingCol())
cv1 = CrossValidator(estimator=lr1,
                     estimatorParamMaps=grid1,
                     evaluator=evaluator1,
                     numFolds=2)
cvModel1 = cv1.fit(dfRatings)
a = cvModel1.transform(dfRatings)
error_cross_validation = MiEvaluador.rmse(a,
                                          lr1.getPredictionCol(),
                                          lr1.getRatingCol())
print('ERROR de validacion: {}'.format(error_cross_validation))

# Fit one model per regularisation value by hand and record its RMSE on
# the same data.
error_models = []
for reg_param in (1.0, 0.5, 2.0):
    lr = ALS(regParam=reg_param)
    model = lr.fit(dfRatings)
    error = MiEvaluador.rmse(model.transform(dfRatings),
                             lr.getPredictionCol(),
                             lr.getRatingCol())
    error_models.append(error)
    print('reg_param: {}, rmse: {}'.format(reg_param, error))

import numpy as np
# The smallest hand-computed error must match the cross-validated one.
if not np.isclose(error_models[np.argmin(error_models)],
                  error_cross_validation):
    raise RuntimeError("Deberia coincidir con el modulo donde reg_param = 0.5")
print("***\nFunciona correctamente pyspark\n****")
示例#11
0
    def train(self):
        """Fit ALS per media type and store per-user recommendations in the database.

        Does nothing when ``check_if_necessary()`` returns ``False``. For
        each class in ``self.__media__`` (GAME, SERIE and MOVIE content
        types are skipped) an ALS model is fitted on the
        (user_id, item id, rating) rows from ``get_meta``. Up to
        ``self.max_nb_elem`` recommendations are requested for the users
        returned by ``User.get()``. For every user, this engine's rows are
        deleted from the recommendation table and the new recommendations
        are inserted, leaving out items already recorded for that user by
        other engines.
        """
        if self.check_if_necessary() is False:
            return

        for media in self.__media__:
            st_time = datetime.utcnow()
            m = media(logger=self.logger)

            if m.content_type in [
                    ContentType.GAME,  # no ratings
                    ContentType.SERIE,  # too many ratings
                    ContentType.MOVIE  # too many ratings
            ]:
                continue

            sqlContext = SQLContext(sc)

            df = m.get_meta(cols=['user_id', m.id, 'rating'])

            # Convert Pandas DF to PySpark DF
            sparkDF = sqlContext.createDataFrame(df)

            als = ALS(userCol="user_id",
                      itemCol=m.id,
                      ratingCol="rating",
                      coldStartStrategy="drop")
            model = als.fit(sparkDF)

            user_df = User.get()

            # Nothing to recommend when there are no users
            if user_df.shape[0] == 0:
                continue

            # Recommendations restricted to the users in user_df
            modelGest = model.recommendForUserSubset(
                sqlContext.createDataFrame(user_df), self.max_nb_elem)

            # Total number of rows prepared for insertion (used in the log line)
            len_values = 0

            for user in modelGest.collect():
                # Do not recommend content already recommended by another engine
                # NOTE(review): this statement and the DELETE below are built
                # with string formatting; presumably every interpolated value
                # is internal (ids, table and class names). Confirm before
                # reusing with external input.
                already_recommended_media = []
                with db as session:
                    result = session.execute(
                        'SELECT %s FROM "%s" WHERE user_id = \'%s\' AND engine <> \'%s\''
                        % (m.id, m.tablename_recommended, user.user_id,
                           self.__class__.__name__))
                    already_recommended_media = [
                        dict(row)[m.id] for row in result
                    ]

                # One dict per recommendation, used as bind parameters for
                # the INSERT below
                values = []
                for rating in user.recommendations:
                    id = int(rating[m.id])
                    if id in already_recommended_media:
                        continue
                    values.append({
                        "user_id": int(user.user_id),
                        m.id: id,
                        # divide by 5 to get a score between 0 and 1
                        "score": float(rating.rating / 5),
                        "engine": self.__class__.__name__,
                        "engine_priority": self.__engine_priority__,
                    })

                len_values += len(values)

                with db as session:
                    # Reset list of recommended `media` for this engine
                    session.execute(
                        text(
                            'DELETE FROM "%s" WHERE user_id = %s AND engine = \'%s\' AND content_type = \'%s\''
                            % (m.tablename_recommended, user.user_id,
                               self.__class__.__name__, str(
                                   m.content_type).upper())))

                    if len(values) > 0:
                        # Named markers matching the keys of each values dict
                        markers = ':user_id, :%s, :score, :engine, :engine_priority' % m.id
                        ins = 'INSERT INTO {tablename} VALUES ({markers}) ON CONFLICT ON CONSTRAINT recommended_content_pkey DO NOTHING'
                        ins = ins.format(tablename=m.tablename_recommended,
                                         markers=markers)
                        session.execute(ins, values)

            self.logger.info(
                "%s recommendation from collaborative filtering performed in %s (%s lines)"
                % (m.content_type, datetime.utcnow() - st_time, len_values))
            self.store_date(m.content_type)
# Rank search: fit ALS once per candidate rank and keep the rank with the
# lowest validation RMSE.
# NOTE(review): tolerance and best_iteration are assigned but not used in
# this excerpt; errors, ranks, iterations and regularization_parameter are
# defined outside it.
tolerance = 0.02
min_error = float('inf')
best_rank = -1
best_iteration = -1

# 60/20/20 split into training, validation and test sets (seed 42).
training_df, validation_df, test_df = ratings_df.randomSplit([.6, .2, .2],
                                                             seed=42)

for rank in ranks:
    als = ALS(maxIter=iterations,
              regParam=regularization_parameter,
              rank=rank,
              userCol="userId",
              itemCol="movieId",
              ratingCol="rating")
    model = als.fit(training_df)
    predictions = model.transform(validation_df)
    # Filter on prediction != NaN before evaluating (intended to drop NaN
    # predictions).
    new_predictions = predictions.filter(col('prediction') != np.nan)
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(new_predictions)
    errors.append(rmse)

    print('For rank %s the RMSE is %s' % (rank, rmse))
    # Remember the rank with the lowest validation error seen so far.
    if rmse < min_error:
        min_error = rmse
        best_rank = rank
print('The best model was trained with rank %s' % best_rank)

final_als = ALS(maxIter=iterations,
    movieName = loadMovieNames()

    lines = spark.read.text(
        'hdfs:///user/maria_dev/ml-100k/u.data'
    ).rdd  # spark.read.text() returns a data frame, use .rdd to get rdd object
    ratingsRDD = lines.map(parseInput)
    ratings = spark.createDataFrame(ratingsRDD).cache(
    )  # call .cache() so that Spark won't recreate this DataFrame more than once

    als = ALS(maxIter=5,
              regParam=0.01,
              userCol='userID',
              itemCol='movieID',
              ratingCol='rating')
    model = als.fit(ratings)  # use the ratings DataFrame to fit the ALS model

    # Print out ratings from user 6:
    print('\nRatings for userID 6:')
    userRatings = ratings.filter('userID=6')
    for rating in userRatings.collect():
        print(movieName[rating['movieID']], rating['rating'])

    print('\nTop 20 recommendations:')
    ratingCounts = ratings.groupBy('movieID').count().filter('count>100')
    popularMovies = ratingCounts.select('movieID').withColumn('userID', lit(6))

    recommendations = model.transform(popularMovies)

    topRecommendations = recommendations.sort(
        recommendations.prediction.desc()).take(20)
# Create an RMSE evaluator using the label and predicted columns
reg_eval = RegressionEvaluator(predictionCol="prediction", labelCol="rating", metricName="rmse")

# errors and models hold one slot per candidate rank; err is the index of
# the slot being filled.
tolerance = 0.03
ranks = [4, 8, 12]
errors = [0, 0, 0]
models = [0, 0, 0]
err = 0
min_error = float('inf')
best_rank = -1
for rank in ranks:
  # Set the rank here:
  als.setRank(rank)
  # Create the model with these parameters.
  model = als.fit(training_df)
  # Run the model to create a prediction. Predict against the validation_df.
  predict_df = model.transform(validation_df)

  # Remove NaN values from prediction (due to SPARK-14489)
  predicted_ratings_df = predict_df.filter(predict_df.prediction != float('nan'))

  # Run the previously created RMSE evaluator, reg_eval, on the predicted_ratings_df DataFrame
  error = reg_eval.evaluate(predicted_ratings_df)
  errors[err] = error
  models[err] = model
  # Function-call form: prints the same text on Python 2 and is valid
  # syntax on Python 3, where the print statement is a SyntaxError.
  print('For rank %s the RMSE is %s' % (rank, error))
  if error < min_error:
    min_error = error
    # best_rank stores the position in ranks/models, not the rank value.
    best_rank = err
  err += 1
示例#15
0
def train_model(training_df, rank):
    """Fit an implicit-feedback ALS model of the given rank (10 iterations).

    Args:
        training_df: DataFrame passed to ``ALS.fit``.
        rank: ALS rank.

    Returns:
        The fitted model.
    """
    estimator = ALS(rank=rank, maxIter=10, implicitPrefs=True)
    fitted = estimator.fit(training_df)
    return fitted
示例#16
0
def generate_predictions(training_df, prediction_df, rank, model=None):
    """Predict for ``prediction_df`` and drop rows containing nulls or NaNs.

    Args:
        training_df: DataFrame used to fit a new model; only read when
            ``model`` is ``None``.
        prediction_df: DataFrame to run the model on.
        rank: ALS rank for a newly trained model.
        model: Already fitted model to reuse. When ``None``, an
            implicit-feedback ALS model (10 iterations) is trained first.

    Returns:
        The transformed ``prediction_df`` after ``dropna()``.
    """
    # Identity check: ``== None`` would go through the object's __eq__.
    if model is None:
        # The estimator is only built when a model actually has to be trained.
        iterations = 10
        als = ALS(rank=rank, maxIter=iterations, implicitPrefs=True)
        model = als.fit(training_df)
    return model.transform(prediction_df).dropna()
def train_als(params, data):
    """Fit ALS built from ``params`` on ``data`` inside a ``Timer`` context.

    Args:
        params: Keyword arguments forwarded to the ``ALS`` constructor.
        data: Training data passed to ``fit``.

    Returns:
        Tuple ``(fitted_model, timer)`` where ``timer`` is the ``Timer``
        context object used around the fit.
    """
    estimator = ALS(**params)
    with Timer() as timer:
        fitted = estimator.fit(data)
    return fitted, timer
示例#18
0
  # Parse the ratings file into an RDD of rating records.
  lines = spark.read.text('ratings.dat').rdd
  ratingsRDD = lines.map(parse_rating)
  # Build a dict from the pairs produced by parse_user over the gender file.
  lines = spark.read.text('gender.dat').rdd
  users = dict(lines.map(parse_user).collect())

  ratings = spark.createDataFrame(ratingsRDD)
  (training, test) = ratings.randomSplit([0.8, 0.2])

  num_training = training.count()
  num_validation = test.count()

  print('Training: %d' % num_training)
  print('Validation: %d' % num_validation)

  # setup ALS
  rank = 8
  num_iterations = 8
  lambda_ = 0.1

  # maxIter previously referenced the misspelled name ``num_interations``
  # (NameError at runtime). The rank configured above was never handed to
  # the estimator; it is passed explicitly now.
  als = ALS(rank=rank, maxIter=num_iterations, regParam=lambda_,
            userCol="userID", itemCol="profileID", ratingCol="rating")
  model = als.fit(training)

  # Evaluate the model by computing the RMSE on the test data
  predictions = model.transform(test)
  evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                  predictionCol="prediction")
  rmse = evaluator.evaluate(predictions)
  print("Root-mean-square error = " + str(rmse))

  spark.stop()
    def build_recommendation_model(self):
        """Index ids, grid-search ALS hyper-parameters and fit the final model.

        ``user_id`` and ``item_id`` in ``self.df`` are mapped to integer
        columns with ``StringIndexer``, and the data is split 60/20/20 into
        training, validation and test sets. Every (rank, regParam)
        combination is fitted on the training set and scored on the
        validation set; the pair with the lowest Mean Percentile Ranking
        wins. A final model with that pair is fitted on the training set,
        scored on the test set and left in ``self.model``.

        Every progress message is sent to both ``logging.info`` and
        ``print_with_time``.
        """

        def announce(message):
            # Mirror one message to the log and to the timestamped output.
            logging.info(message)
            print_with_time(message)

        def make_als(rank, reg):
            # Same estimator settings for the grid search and the final fit.
            return ALS(rank=rank,
                       regParam=reg,
                       nonnegative=True,
                       implicitPrefs=True,
                       userCol="user_id_no",
                       itemCol="item_id_no",
                       checkpointInterval=-1,
                       coldStartStrategy="drop",
                       ratingCol="rating")

        announce("getting distinct users")
        users = self.df.select(["user_id"]).distinct()

        announce("getting distinct items")
        items = self.df.select(["item_id"]).distinct()

        announce("mapping user_id to number")
        user_indexer = StringIndexer(inputCol="user_id",
                                     outputCol="user_id_no")
        indexed_users = user_indexer.fit(users).transform(users)
        self.user_indexed = indexed_users.select(
            indexed_users.user_id.cast("string"),
            indexed_users.user_id_no.cast("int"))

        announce("mapping item_id to number")
        item_indexer = StringIndexer(inputCol="item_id",
                                     outputCol="item_id_no")
        indexed_items = item_indexer.fit(items).transform(items)
        self.item_indexed = indexed_items.select(
            indexed_items.item_id.cast("string"),
            indexed_items.item_id_no.cast("int"))

        announce("joining df with user_indexed rdd")
        self.df = self.df.join(self.user_indexed, ["user_id"], 'inner')

        announce("joining df with item_indexed rdd")
        joined = self.df.join(self.item_indexed, ["item_id"], 'inner')
        self.df = joined.select(["item_id_no", "user_id_no", "rating"])

        # ---- train / validation / test split ----
        announce("splitting dataset into training and testing")
        training, validation, test = self.df.randomSplit([0.6, 0.2, 0.2])

        # ---- grid search over rank and regularisation ----
        ranks = [25, 50, 100]
        regParam = [0.1, 0.01, 0.001]
        candidates = ((rank, reg) for rank in ranks for reg in regParam)

        lowest_mpr = float('inf')
        best_rank = -1
        best_reg = -1
        for iteration_no, (rank, reg) in enumerate(candidates):
            logging.info(iteration_no)
            print_with_time(str(iteration_no))
            announce("rank=%s, reg=%s " % (rank, reg))

            self.model = make_als(rank, reg).fit(training)

            announce("transforming the validation set")
            predictions = self.model.transform(validation)

            announce("getting rmse on validation set")
            evaluator = RegressionEvaluator(metricName="rmse",
                                            labelCol="rating",
                                            predictionCol="prediction")
            rmse = evaluator.evaluate(predictions)
            announce("Root-mean-square error = " + str(rmse))

            announce("getting MPR on validation set")
            ev = RankBasedEvaluator2("user_id_no", "rating", "prediction")
            mpr = ev.evaluate(sqlContext, predictions)
            announce("Mean Percentile Ranking = " + str(mpr))

            # Lower MPR wins; ties keep the earlier combination.
            if mpr < lowest_mpr:
                lowest_mpr = mpr
                best_rank = rank
                best_reg = reg

        announce('The best model was trained with rank %s and reg %s' %
                 (best_rank, best_reg))

        # ---- final fit with the winning parameters ----
        announce("starting model training")
        self.model = make_als(best_rank, best_reg).fit(training)

        announce("transforming the test set")
        predictions = self.model.transform(test)

        announce("getting rmse on test set")
        evaluator = RegressionEvaluator(metricName="rmse",
                                        labelCol="rating",
                                        predictionCol="prediction")
        rmse = evaluator.evaluate(predictions)
        announce("Root-mean-square error = " + str(rmse))

        announce("getting MPR on test set")
        ev = RankBasedEvaluator2("user_id_no", "rating", "prediction")
        mpr = ev.evaluate(sqlContext, predictions)
        announce("Mean Percentile Ranking = " + str(mpr))
    for column in list(set(df.columns))
]

# Apply the indexer stages (built before this excerpt) to the raw data.
pipeline = Pipeline(stages=indexers)
ratings = pipeline.fit(df).transform(df)

# 60/20/20 split with a fixed seed.
train, validation, test = ratings.randomSplit([0.6, 0.2, 0.2], seed=427471138)

als_model = ALS(userCol='user_index',
                itemCol='hotel id',
                ratingCol='ratings',
                nonnegative=True,
                regParam=0.1,
                rank=10)

recommender = als_model.fit(train)

# Build a single row DataFrame
# The column names are taken from the estimator so the frame matches the
# user/item columns the fitted model looks up in transform(); the previous
# hard-coded ('user', 'movie') names did not match them.
data = [(1, 100)]
columns = (als_model.getUserCol(), als_model.getItemCol())
one_row_spark_df = spark.createDataFrame(data, columns)

# Latent factor rows for the same user id and item id as the single row.
user_factor_df = recommender.userFactors.filter('id = 1')
item_factor_df = recommender.itemFactors.filter('id = 100')

user_factors = user_factor_df.collect()[0]['features']
item_factors = item_factor_df.collect()[0]['features']

# Get the recommender's prediction
recommender.transform(one_row_spark_df).show()
示例#21
0
def train_ALS(train, test, evaluator, num_iters, reg_params, ranks, alphas):
    """
    Grid-search implicit-feedback ALS models and keep the one with the lowest
    error on hold-out data.

    Inspired by
    https://github.com/KevinLiao159/MyDataSciencePortfolio/blob/master
            /movie_recommender/movie_recommendation_using_ALS.ipynb

    Parameters
    ----------
    train : pyspark dataframe with training data
    test : pyspark dataframe with test data
    evaluator : object whose ``evaluate(predictions)`` returns an error value
        where lower is better
    num_iters: list of iterations to test
    reg_params: list of regularization parameters to test
    ranks: list of # of latent factors to test
    alphas: list of alphas to test

    Returns
    -------
    best_model : fitted model with the lowest error, or None when the grid
        is empty
    params_errs : list of ``[num_iter, reg, rank, alpha, error]`` rows, one
        per combination, in grid order
    """
    min_error = float('inf')
    best_model = None
    params_errs = []

    # Every combination of the four hyper-parameter lists.
    for num_iter, reg, rank, alpha in itertools.product(
            num_iters, reg_params, ranks, alphas):

        als = ALS(
            maxIter=num_iter,
            rank=rank,
            userCol='account_id',
            itemCol='comic_id',
            ratingCol='bought',
            implicitPrefs=True,
            regParam=reg,
            alpha=alpha,
            coldStartStrategy='drop',  # Just for CV
            seed=41916)

        model = als.fit(train)

        # Generate predictions on Test; release the cached frame once scored
        # so persisted data does not pile up across the grid.
        predictions = model.transform(test)
        predictions.persist()
        try:
            error = evaluator.evaluate(predictions)
        finally:
            predictions.unpersist()

        print('{} iterations, '.format(num_iter) +
              '{} latent factors, regularization='.format(rank) +
              '{}, and alpha @ {} : '.format(reg, alpha) +
              'validation error is {:.4f}'.format(error))

        # Track the running minimum; without updating min_error every model
        # would compare against infinity and the last one would always win.
        if error < min_error:
            min_error = error
            best_model = model

        params_errs.append([num_iter, reg, rank, alpha, error])

    return best_model, params_errs
    lines = spark.read.text("hdfs:///user/maria_dev/ml-100k/u.data").rdd

    # Convert it to a RDD of Row objects with (userID, movieID, rating)
    ratingsRDD = lines.map(parseInput)

    # Convert to a DataFrame and cache it
    #   this need to be used more than once
    ratings = spark.createDataFrame(ratingsRDD).cache()

    # Create an ALS collaborative filtering model from the complete data set
    als = ALS(maxIter=5,
              regParam=0.01,
              userCol="userID",
              itemCol="movieID",
              ratingCol="rating")
    model = als.fit(ratings)  #train

    # fabricate a user 0 in u.data, who likes science fiction but not like historical drama
    # recommend movies to this user 0( actually predict user 0's rating on each movie that he has never seen)

    # Print out ratings from user 0:
    print("\nRatings for user ID 0:")
    userRatings = ratings.filter("userID = 0")
    for rating in userRatings.collect():
        print movieNames[rating['movieID']], rating['rating']

    print("\nTop 20 recommendations:")

    # only predict user 0's rating on movies with more than 100 ratings, so hvae a reasonabale amount of data

    # Find movies rated more than 100 times
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS

# ALTERNATIVE
# from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
# from pyspark.mllib.recommendation import ALS, Rating

# Local Spark session for this toy example.
spark = SparkSession.builder.master("local").appName("SQL").getOrCreate()

# The "\033[36m...\033[0m" wrappers are ANSI escape codes that colour the
# progress messages in a terminal.
print("\033[36mInitial data\033[0m")
columns = ["user", "item", "rating"]
data = [(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 1, 1.0),
        (2, 2, 5.0)]
df = spark.createDataFrame(data, columns)
df.show()

print("\033[36mTraining model...\033[0m")
# ALS() is built with no arguments; the frame's user/item/rating column names
# are presumably the library defaults -- confirm against the ALS docs.
als = ALS()
model = als.fit(df)

# Persist the fitted model, replacing any model already stored at this path.
output_model_path = "data/peliculas0_trained_model"
print("\033[36mSaving model to '{}'...\033[0m".format(output_model_path))
model.write().overwrite().save(output_model_path)

print("\033[36mTesting some user/item pairs...:\033[0m")
# User 3 and item 0 for users 1 and 2 do not appear in the training data
# above, so these rows probe unseen user/item combinations.
test = spark.createDataFrame([(0, 2), (1, 0), (2, 0), (3, 0)],
                             ["user", "item"])
model.transform(test).show()
示例#24
0
# Preview the first 10 rows of the test split.
test.show(10)
print("Train data loaded")

############################################################################################################
######################### Question 2.A.2 ALS with lab settings #############################################
############################################################################################################

# Fixed seed so the ALS fit is reproducible.
myseed = 200206518

# Only the user/item columns, seed and cold-start strategy are set here;
# everything else (including the rating column) is left at the ALS defaults.
als_50 = ALS(userCol="userId",
             itemCol="movieId",
             seed=myseed,
             coldStartStrategy="drop")

# Training the model
model_50 = als_50.fit(train)

# Predictions
predictions_50 = model_50.transform(test)

print("Evaluation for 50/50 split")
## Question 2.A.3 for time-split 50%
# RMSE of the predictions against the true "rating" column.
evaluator_rmse = RegressionEvaluator(metricName="rmse",
                                     labelCol="rating",
                                     predictionCol="prediction")
rmse_50 = evaluator_rmse.evaluate(predictions_50)
print("Root-mean-square error = " + str(rmse_50))

# MSE evaluator; it is only constructed here and not yet applied in this span.
evaluator_mse = RegressionEvaluator(metricName="mse",
                                    labelCol="rating",
                                    predictionCol="prediction")
示例#25
0
from pyspark.ml.recommendation import ALS
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
import pandas as pd
# Spark entry points: a SparkContext plus the SQLContext wrapper used to build
# DataFrames.
sc = SparkContext()
sql_sc = SQLContext(sc)

# Load the ratings with pandas, then hand the frame to Spark.
pd_df_ratings = pd.read_csv('./ratings_small.csv')
pyspark_df_ratings = sql_sc.createDataFrame(pd_df_ratings)
# NOTE(review): the column is dropped by the name 'Timestamp'; confirm this
# matches the CSV header's casing.
pyspark_df_ratings = pyspark_df_ratings.drop('Timestamp')
#print(pyspark_df_ratings.show(5, truncate=False))

# Create the ALS model
als = ALS(rank=3, maxIter = 10, regParam=0.1, userCol= 'userId', itemCol='movieId', ratingCol='rating')
model = als.fit(pyspark_df_ratings)
# Top-N recommendation for userId=100: recommendations are computed for all
# users and then filtered down to that one user.
recommendations = model.recommendForAllUsers(5)
print(recommendations.where(recommendations.userId == 100).collect())
示例#26
0
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

# RegressionEvaluator is used below but is not among this snippet's imports.
from pyspark.ml.evaluation import RegressionEvaluator

# Build an integer (user, item, rating) table from the retail features:
# invoice number acts as the user, stock code as the item and quantity as the
# rating. Rows whose ids fail the INT cast (NULL) are discarded.
ratings_df = (spark.read.table("retail_features").selectExpr(
    "CAST(invoice_num AS INT) as user_id",
    "CAST(stock_code AS INT) as item_id",
    "CAST(quantity AS INT) as rating").where(
        "user_id is NOT NULL AND item_id is NOT NULL"))

#ratings_df.display()
(train_df, test_df) = ratings_df.randomSplit([0.7, 0.3])
als = ALS(maxIter=3,
          regParam=0.03,
          userCol="user_id",
          itemCol="item_id",
          ratingCol="rating",
          coldStartStrategy="drop")
als_model = als.fit(train_df)

# Score the held-out split with the fitted model. The original referenced an
# undefined name `model` here; the fitted model is `als_model`.
predictions = als_model.transform(test_df)
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# Top-5 items per user and top-5 users per item.
user_recs = als_model.recommendForAllUsers(5)
user_recs.display()

item_recs = als_model.recommendForAllItems(5)
item_recs.display()
示例#27
0
文件: engine.py 项目: wivans/BigData
class RecommendationEngine:
    """A movie recommendation engine

    Loads ``ratings.csv`` and ``items.csv`` from a dataset directory, trains
    an ALS model on the ratings at construction time, and answers top-N
    queries as JSON strings.
    """
    def __init__(self, spark_session, dataset_path):
        """Init the recommendation engine given a Spark session and a dataset path

        Reads both CSV files with header and schema inference, drops rows
        containing nulls, and trains the model immediately.
        """
        logger.info("Starting up the Recommendation Engine: ")
        self.spark_session = spark_session
        # Load ratings data for later use
        logger.info("Loading Ratings data...")
        ratings_file_path = os.path.join(dataset_path, 'ratings.csv')
        self.ratingsdf = spark_session.read.csv(ratings_file_path,
                                                header=True,
                                                inferSchema=True).na.drop()
        self.ratingsdf = self.ratingsdf.drop("timestamp")
        # Load movies data for later use
        logger.info("Loading Movies data...")
        movies_file_path = os.path.join(dataset_path, 'items.csv')
        self.moviesdf = spark_session.read.csv(movies_file_path,
                                               header=True,
                                               inferSchema=True).na.drop()
        #self.moviesdf = self.moviesdf.drop("genres",)
        # Train the model
        self.__train_model()

    def __train_model(self):
        """Train the ALS model with the current dataset
        """
        logger.info("Training the ALS model...")
        self.als = ALS(maxIter=5,
                       regParam=0.01,
                       userCol="userId",
                       itemCol="movieId",
                       ratingCol="rating",
                       coldStartStrategy="drop")
        self.model = self.als.fit(self.ratingsdf)
        logger.info("ALS model built!")

    def _recommendations_to_json(self, recs, key_col, nested_col):
        """Flatten a recommendation frame, attach movie details, return JSON

        ``recs`` holds ``key_col`` plus an array column ``recommendations``.
        Each array element becomes its own row keeping only ``key_col`` and
        the element's ``nested_col`` field (the predicted rating is not part
        of the output). The result is inner-joined with the movies table on
        ``movieId`` and serialised through pandas ``to_json``.
        """
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(
            func.col(key_col),
            func.col('recommendations')[nested_col].alias(nested_col))
        recs = recs.join(self.moviesdf, "movieId", 'inner')
        return recs.toPandas().to_json()

    def get_top_ratings(self, user_id, movies_count):
        """Recommends up to movies_count top movies to user_id

        Returns a JSON string. NOTE(review): nothing in this method filters
        out movies the user has already rated -- confirm whether the model
        call does.
        """
        users = self.ratingsdf.select(self.als.getUserCol())
        users = users.filter(users.userId == user_id)
        userSubsetRecs = self.model.recommendForUserSubset(users, movies_count)
        return self._recommendations_to_json(userSubsetRecs,
                                             'userId', 'movieId')

    def get_top_movie_recommend(self, movie_id, user_count):
        """Recommends up to user_count top users for movie_id

        Returns a JSON string with one row per recommended user, joined with
        the details of the movie.
        """
        movies = self.ratingsdf.select(self.als.getItemCol())
        movies = movies.filter(movies.movieId == movie_id)
        movieSubsetRecs = self.model.recommendForItemSubset(movies, user_count)
        return self._recommendations_to_json(movieSubsetRecs,
                                             'movieId', 'userId')
示例#28
0
    predicted_ratings_df = predict_df.filter(predict_df.prediction != float('nan'))

    # Run the previously created RMSE evaluator, reg_eval, on the predicted_ratings_df DataFrame
    error = reg_eval.evaluate(predicted_ratings_df)
    errors[err] = error
    models[err] = model
    print('For rank %s the RMSE is %s' % (rank, error))
    if error < min_error:
        min_error = error
        best_rank = err
    err += 1

"""

# Fit a single model at rank 12 and score it on the validation split.
als.setRank(12)
my_model = als.fit(training_df)
predict_df = my_model.transform(validation_df)
predicted_ratings_df = predict_df.filter(predict_df.prediction != float('nan'))
error = reg_eval.evaluate(predicted_ratings_df)
#print('The best model was trained with rank %s' % ranks[best_rank])
#my_model = models[best_rank]

# Run the best model with test dataset
predict_test_df = my_model.transform(test_df)

# Remove NaN values from prediction (due to SPARK-14489)
# The filter must be applied to the *test* predictions; the original filtered
# the validation frame `predict_df`, so test_RMSE was not a test-set metric.
predicted_test_df = predict_test_df.filter(predict_test_df.prediction != float('nan'))

# Run the previously created RMSE evaluator, reg_eval, on the predicted_test_df DataFrame
test_RMSE = reg_eval.evaluate(predicted_test_df)
             .config('spark.executor.memoryOverhead', memory) \
             .config("spark.sql.broadcastTimeout", "36000") \
             .config("spark.storage.memoryFraction","0") \
             .config("spark.memory.offHeap.enabled","true") \
             .config("spark.memory.offHeap.size",memory).getOrCreate()


# NOTE(review): train and val are both read from sys.argv[1], so the
# "validation" frame is the same file as the training frame -- confirm whether
# val was meant to come from a second argument.
train = spark.read.parquet(sys.argv[1])
val = spark.read.parquet(sys.argv[1])


results = []
# Grid over rank x regParam; `seed` is defined outside this span.
for rank in [2, 5, 10, 20]:
    for reg in [0.05, 0.1, 0.5, 1]:
        als = ALS(rank=rank, maxIter=10, regParam=reg, seed=seed)
        # toDF renames the columns positionally to user/item/rating.
        model = als.fit(train.toDF('user', 'item', 'rating'))

        predictions_val = model.transform(val.toDF('user', 'item', 'rating'))
        evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
        rmse = evaluator.evaluate(predictions_val)

        print('Rank = ', rank, ' regParam = ', reg)
        print("Root-mean-square error = " + str(rmse))


        # The view is registered from the un-renamed `val`, so the SQL below
        # sees the parquet file's own column names (it selects `book`, not the
        # `item` name used for the model). NOTE(review): these two statements
        # do not depend on rank/reg and are re-run on every iteration.
        val.createOrReplaceTempView('val')
        val_true = spark.sql('select user, book from val where rating > 2 sort by rating desc')
        labels = val_true.groupby('user').agg(collect_list('book'))

        # Top-500 recommendations per labelled user, flattened back into one
        # list of item ids per user.
        val_recommendations = model.recommendForUserSubset(labels.select('user'), 500)
        preds = val_recommendations.withColumn('recommendations', explode('recommendations')).select('user', 'recommendations.item').groupBy('user').agg(collect_list('item'))
def _mean_average_precision(als_model, val_grouped, top_k):
    """Return MAP of the model's top_k recommendations against val_grouped.

    ``val_grouped`` has one row per ``user_label`` with the list of relevant
    ``track_label`` values; only users present on both sides are scored.
    """
    predictions = als_model.recommendForAllUsers(top_k)
    # Keep only the recommended item id (first field of each struct).
    prediction_df = predictions.rdd.map(
        lambda r: (r.user_label, [rec[0] for rec in r.recommendations])).toDF()
    prediction_df = prediction_df.selectExpr('_1 as user_label',
                                             '_2 as recommendations')

    # Join table
    val_pred = val_grouped.join(prediction_df, 'user_label', 'inner')
    rdd = val_pred.select('recommendations', 'track_label').rdd
    return RankingMetrics(rdd).meanAveragePrecision


def main(spark, train_file, val_file, model_file):
    """Train implicit-feedback ALS models, tune them by MAP, save the best.

    Parameters
    ----------
    spark : Spark session used to read the parquet inputs
    train_file : path of the training parquet file
    val_file : path of the validation parquet file
    model_file : path the best model is saved to

    Both inputs must provide the columns ``user_label``, ``track_label`` and
    ``count``. A baseline model is scored first on its top-10 list, then a
    small grid over rank / alpha / regParam is scored on top-100 lists.
    """
    train_df = spark.read.parquet(train_file)
    val_df = spark.read.parquet(val_file)
    train_df = train_df.select('user_label', 'track_label', 'count')
    val_df = val_df.select('user_label', 'track_label', 'count')
    val_grouped = val_df.groupBy('user_label').agg(
        F.collect_list(F.col('track_label')).alias('track_label'))

    # ALS for implicit feedback
    als = ALS(maxIter=5, regParam=0.01, implicitPrefs=True,
              userCol='user_label', itemCol='track_label', ratingCol='count')

    als_model = als.fit(train_df)
    print('Before tuning, MAP = %s'
          % _mean_average_precision(als_model, val_grouped, 10))

    # hyperparameter tuning
    ranks = [10, 20]
    reg_params = [0.001]
    alphas = [0.10, 0.20, 0.40]
    best_rank = None
    best_reg_param = None
    best_alpha = None
    best_model = None
    best_map = 0

    for rank_i, alpha_i, reg_param_i in itertools.product(
            ranks, alphas, reg_params):

        print('Running on rank:', rank_i)
        print('Running on alpha:', alpha_i)
        print('Running on reg:', reg_param_i)

        als = ALS(maxIter=5,
                  regParam=reg_param_i,
                  implicitPrefs=True,
                  alpha=alpha_i,
                  rank=rank_i,
                  userCol='user_label',
                  itemCol='track_label',
                  ratingCol='count')

        als_model = als.fit(train_df)
        map_ = _mean_average_precision(als_model, val_grouped, 100)

        print('MAP:', map_)

        # Accept the first candidate unconditionally: with a strict `>`
        # against the initial 0, a grid whose MAPs are all 0 would leave
        # best_model as None and the save below would fail.
        if best_model is None or map_ > best_map:
            best_rank = rank_i
            best_reg_param = reg_param_i
            best_alpha = alpha_i
            best_model = als_model
            best_map = map_

    print('Best rank:', best_rank)
    print('Best regParam:', best_reg_param)
    print('Best alpha:', best_alpha)
    print('Best map:', best_map)

    # save the best model
    best_model.save(model_file)
# -----------------------------------------------------------------------------
# Modeling
# -----------------------------------------------------------------------------

# Imports

from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer

from pyspark.mllib.evaluation import RankingMetrics

# Modeling

# Fit ALS on the encoded user/song ids with the play count as the rating,
# then score the test split.
als = ALS(maxIter=5, regParam=0.01, userCol="user_id_encoded", itemCol="song_id_encoded", ratingCol="plays")
als_model = als.fit(training)
predictions = als_model.transform(test)

# Sort by user, then song, then descending prediction (`col` is not imported
# in the lines shown; it must come from elsewhere in the file) and cache the
# sorted frame before displaying it.
predictions = predictions.orderBy(col("user_id"), col("song_id"), col("prediction").desc())
predictions.cache()

# Show the first 50 rows without truncating column values.
predictions.show(50, False)

# +------------------+----------------------------------------+-----+---------------+---------------+-----------+
# |song_id           |user_id                                 |plays|user_id_encoded|song_id_encoded|prediction |
# +------------------+----------------------------------------+-----+---------------+---------------+-----------+
# |SORDKNX12A8C13A45F|00000b722001882066dff9d2da8a775658053ea0|1    |856763.0       |50622.0        |0.63414586 |
# |SOBFEDK12A8C13BB25|00001638d6189236866af9bbf309ae6c2347ffdc|1    |859779.0       |17821.0        |-1.0087988 |
# |SOLOYFG12A8C133391|00001638d6189236866af9bbf309ae6c2347ffdc|1    |859779.0       |19812.0        |-0.74704367|
# |SOOEPEG12A6D4FC7CA|00001638d6189236866af9bbf309ae6c2347ffdc|1    |859779.0       |4703.0         |-0.5360813 |
# |SOWOTHK12A67AD818B|00001638d6189236866af9bbf309ae6c2347ffdc|24   |859779.0       |192657.0       |0.38297927 |
# One estimator is reused for the whole grid; only rank / maxIter / regParam
# are changed between fits.
als = ALS(userCol='user_id',
          itemCol='book_id',
          ratingCol='rating',
          coldStartStrategy='drop')

# The evaluator's configuration never changes, so build it once.
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="rating",
                                predictionCol="prediction")

for rank in RANKS:
    for maxIter in MAX_ITERS:
        for regParam in REG_PARAMS:

            # The grid values are coerced to int before being handed to ALS.
            rank = int(rank)
            maxIter = int(maxIter)

            print("Running for " + str((rank, maxIter, regParam)))
            als.setParams(rank=rank, maxIter=maxIter, regParam=regParam)

            model = als.fit(interactions_train)
            # model.save(os.path.join(MODELS_DIRECTORY, 'als'))

            predictions = model.transform(interactions_val)
            rmse = evaluator.evaluate(predictions)
            print("Root-mean-square error = " + str(rmse))

            # Append one CSV row per combination; the context manager closes
            # the file even if the write raises.
            with open("validation_errors.csv", "a+") as f:
                f.write(
                    str(rank) + "," + str(maxIter) + "," + str(regParam) +
                    "," + str(rmse) + "\n")
            print("Finshed running for " + str((rank, maxIter, regParam)))
# Map every distinct userid to a unique integer id. The lambdas index the
# (row, id) pair instead of using tuple parameters, and print is called as a
# function, so this runs on both Python 2 and Python 3.
user_ids = ratings.select("userid").distinct().rdd.zipWithUniqueId()
user_map = user_ids.map(
    lambda pair: Row(userid=pair[0].userid, userid_int=pair[1])).toDF().cache()

# same as above - this is a UUID/int mapping
video_ids = ratings.select("videoid").distinct().rdd.zipWithUniqueId().cache()
video_map = video_ids.map(
    lambda pair: Row(videoid=pair[0].videoid, videoid_int=pair[1])).toDF().cache()

print("Recommending based on {0} users and {1} videos.".format(user_map.count(), video_map.count()))

# Attach the integer ids to every rating row.
training_data = ratings.join(user_map, ratings.userid == user_map.userid).\
                    join(video_map, ratings.videoid == video_map.videoid).\
                    select(user_map.userid, user_map.userid_int, video_map.videoid, video_map.videoid_int, "rating")

# Create ALS transformer and train with the ratings from our C* table
als = ALS(rank=10, maxIter=10).setUserCol("userid_int").setItemCol("videoid_int").setRatingCol("rating")
model = als.fit(training_data)

# Pull the user list to the driver before releasing the cached mapping.
users = user_map.collect()
user_map.unpersist()
# NOTE(review): count and length are not used in the lines shown.
count = 0
length = len(users)
for user in users:
    # Pair this user with every video so the model scores all of them.
    videos_and_user = video_map.withColumn("userid", lit(user.userid)).\
                            withColumn("userid_int", lit(user.userid_int))

    # Keep the 30 highest predictions and append them to Cassandra.
    # NOTE(review): rows are produced per user but written to a table named
    # "video_recommendations_by_video" -- confirm the target table.
    model.transform(videos_and_user).\
        sort("prediction", ascending=False).limit(30).\
        select("videoid", "userid", col("prediction").alias("rating")).\
        write.format("org.apache.spark.sql.cassandra").\
        options(keyspace="killrvideo", table="video_recommendations_by_video").\
        save(mode="append")
def main(input_1, input_2, input_3):
    """Build, tune and exercise an ALS recommender from three JSON inputs.

    Parameters
    ----------
    input_1 : path of the business JSON data
    input_2 : path of the user JSON data
    input_3 : path of the review JSON data

    The string ids are replaced by integer indices, the review stars become
    the rating, an ALS model is cross-validated and then refit with fixed
    parameters, saved to 'als_model', reloaded, and finally used to show
    recommendations for one hard-coded user. Relies on the module-level
    `spark` session and the `getCollabRecom` helper defined elsewhere.
    """

    business_df = spark.read.json(input_1)
    user_df = spark.read.json(input_2)
    review_df = spark.read.json(input_3)

    # Spark ALS implementation requires the rating matrix to have the following data types
    user_df_schema = StructType([
        StructField("user_id", StringType(), True),
        StructField("userId", IntegerType(), True)
    ])

    # Pair every user_id string with its zipWithIndex position.
    user_id = user_df.select('user_id')
    user_newid_df = spark.createDataFrame(
        user_id.rdd.map(lambda x: x[0]).zipWithIndex(), user_df_schema)

    # add the new userId column the user dataframe
    # NOTE(review): user_new_df is not used again in this function.
    user_new_df = user_df.join(user_newid_df, 'user_id',
                               'inner').select('userId', 'user_id', 'name')

    # Same string-id -> integer-index mapping for businesses.
    bus_df_schema = StructType([
        StructField("business_id", StringType(), True),
        StructField("businessId", IntegerType(), True)
    ])
    bus_id = business_df.select('business_id')
    business_newid_df = spark.createDataFrame(
        bus_id.rdd.map(lambda x: x[0]).zipWithIndex(), bus_df_schema)
    business_new_df = business_df.join(business_newid_df, 'business_id',
                                       'inner').select('businessId',
                                                       'business_id', 'name',
                                                       'categories',
                                                       'latitude', 'longitude')

    # map new userId and businessId in the review dataframe
    review_df = review_df.select('user_id', 'business_id', 'stars')
    review_userId_df = review_df.join(user_newid_df, "user_id",
                                      'inner').select('business_id', 'userId',
                                                      'user_id', 'stars')
    # map the businessId
    review_userId_df = review_userId_df.join(business_newid_df, "business_id",
                                             'inner').select(
                                                 'user_id', 'business_id',
                                                 'stars', 'userId',
                                                 'businessId')

    # create the rating dataframe required by the ALS model
    rating_df = review_userId_df.select(
        'userId', 'businessId',
        review_userId_df.stars.cast('float').alias('rating'))
    rating_df.cache()
    #print(' Rating matrx no. of rows :', rating_df.count())

    (train, test) = rating_df.randomSplit([0.8, 0.2], seed=123)
    # Cross Validation
    # 3 ranks x 3 maxIter values, 5 folds, selected by RMSE.
    als = ALS(userCol="userId",
              itemCol="businessId",
              ratingCol="rating",
              coldStartStrategy="drop")
    param_grid = ParamGridBuilder().addGrid(als.rank, [10, 15, 20]).addGrid(
        als.maxIter, [10, 15, 20]).build()
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating")
    cv = CrossValidator(estimator=als,
                        estimatorParamMaps=param_grid,
                        evaluator=evaluator,
                        numFolds=5,
                        seed=123)
    cv_als_model = cv.fit(train)

    # Evaluate the best cross-validated model on the held-out test split
    als_predictions = cv_als_model.bestModel.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(als_predictions)
    print("Root-mean-square error" + str(rmse))
    #rmse = 1.559099

    #best_model = cv_als_model.bestModel
    #best_rank is 20
    #best_model.rank

    #best_maxIter is 20
    #(best_model._java_obj.parent().getMaxIter())
    # drop columns for Nan values (ColdStrategy parameter) and tune ALS model
    # Refit with the rank/maxIter recorded above plus a hand-set regParam.
    als = ALS(rank=20,
              maxIter=20,
              regParam=0.3,
              userCol="userId",
              itemCol="businessId",
              ratingCol="rating",
              coldStartStrategy="drop",
              seed=123)
    alsb_model = als.fit(train)
    alsb_predictions = alsb_model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(alsb_predictions)
    # save the ALS model
    alsb_model.write().overwrite().save('als_model')
    print("alsb_model Root-mean-square error = " + str(rmse))
    # rmse is 1.45023

    # Reload the model that was just written to 'als_model'.
    alsn_model = ALSModel.load('als_model')
    # generate top 10 business recommendations for each user
    userRecoms = alsn_model.recommendForAllUsers(10)

    # Re-attach the original string user_id to every recommendation row.
    all_userRecoms = userRecoms.join(user_newid_df, 'userId',
                                     'inner').select('userId',
                                                     'recommendations',
                                                     'user_id')
    all_userRecoms.cache()

    # test and show recommendations
    u_id = 'ZWD8UH1T7QXQr0Eq-mcWYg'

    # Flatten this user's recommendations (column index 1) into one row per
    # recommended business.
    userFlatRec = spark.createDataFrame(
        all_userRecoms.filter(
            col('user_id') == u_id).rdd.flatMap(lambda p: p[1]))
    # businessId|            rating|
    #+----------+------------------+
    #|    171476|5.4555559158325195|
    #|     25624|5.3495965003967285|
    #|     14049| 5.271500110626221|

    # show the recommended restaurants details
    # NOTE(review): collab_df is built but not used afterwards; the frame
    # that is shown comes from getCollabRecom below.
    collab_df = business_new_df.join(userFlatRec, 'businessId',
                                     'inner').drop('businessId')

    result = getCollabRecom(u_id, all_userRecoms, business_new_df)
    result.show()
示例#35
0
def run_train_and_validation(spark, train, test_input, test_output, rank_list):
    """Fit ALS variants for every rank and score them and their IBCF hybrids.

    For each rank in ``rank_list`` two ALS models are fitted on
    ``train`` + ``test_input`` (unconstrained ``als`` and non-negative
    ``als_nn``) and scored on ``test_output``. For each fitted model and each
    neighbourhood size in a fixed list, an ``IBCFWithItemFactor`` model built
    from the ALS item factors and an ``ALSIBCFMeanModel`` combining both are
    scored as well.

    Parameters
    ----------
    spark : Spark session handed to the IBCF model constructors
    train : training ratings
    test_input : ratings of the test users that may be used as input
    test_output : ratings of the test users that must be predicted
    rank_list : iterable of ALS ranks to try

    Returns
    -------
    error_models : dict mapping a model family ("als", "als_nn", "als_ibcf",
        "als_nn_ibcf", "als_ibcf_mean", "als_nn_ibcf_mean") to a dict of
        evaluate() results, keyed by rank or by "<rank>_<ibcf_rank>"
    best_models : whatever put_best_model accumulates per family

    The column names MASV1 (user), F_MAMH_index (item) and TKET (grade) are
    fixed. The standalone IBCF / NBCF baselines that used to be sketched here
    as commented-out code are not part of this function.
    """
    evaluators = [
        RegressionEvaluator(metricName=metric, labelCol="TKET",
                            predictionCol="prediction")
        for metric in ("rmse", "mse", "mae")
    ]
    join_keys = ["MASV1", "F_MAMH_index"]
    ibcf_ranks = [3, 5, 7, 9, 11, 13, 15, 17, 20, 23, 25]
    # (family prefix, ALS nonnegative flag)
    als_variants = (("als", False), ("als_nn", True))

    error_models = {
        "als": {},
        "als_nn": {},
        "als_ibcf": {},
        "als_nn_ibcf": {},
        "als_ibcf_mean": {},
        "als_nn_ibcf_mean": {},
    }
    best_models = {}

    # Neither frame depends on the rank, so build them once.
    als_input = train.unionAll(test_input)
    unlabeled_output = test_output.drop("TKET")

    for rank in rank_list:
        # Fit both ALS variants first, then score them, as the steps are
        # ordered fit -> transform -> evaluate for the pair.
        fitted = []
        for prefix, nonnegative in als_variants:
            als = ALS(rank=rank, maxIter=15, regParam=0.01, userCol="MASV1",
                      itemCol="F_MAMH_index", ratingCol="TKET",
                      coldStartStrategy="drop", nonnegative=nonnegative)
            model = als.fit(als_input)
            fitted.append((prefix, model, model.transform(test_output)))

        scored = [(prefix, model, evaluate(predictions, evaluators))
                  for prefix, model, predictions in fitted]

        for prefix, model, errors in scored:
            error_models[prefix][rank] = errors
            # errors[0] is the result of the first evaluator (rmse).
            best_models = put_best_model(
                best_models, prefix,
                Model_Error_Wrapper("{}_{}".format(prefix, rank),
                                    model, errors[0]))

        # Hybrids: item-based CF on the ALS item factors, and its mean with
        # the ALS model itself.
        for ibcf_rank in ibcf_ranks:
            combo_key = "{}_{}".format(rank, ibcf_rank)

            for prefix, model, _ in scored:
                ibcf_family = prefix + "_ibcf"
                ibcf_model = IBCFWithItemFactor(spark, model.itemFactors) \
                    .setUserCol("MASV1") \
                    .setItemCol("F_MAMH_index") \
                    .setValueCol("TKET") \
                    .setRank(ibcf_rank)
                # Predictions carry no ground truth; join it back for scoring.
                ibcf_scored = ibcf_model.transform(
                    test_input, unlabeled_output).join(test_output, join_keys)
                ibcf_errors = evaluate(ibcf_scored, evaluators)
                error_models[ibcf_family][combo_key] = ibcf_errors
                best_models = put_best_model(
                    best_models, ibcf_family,
                    Model_Error_Wrapper(
                        "{}_{}".format(ibcf_family, combo_key),
                        ibcf_model, ibcf_errors[0]))

                mean_family = prefix + "_ibcf_mean"
                mean_model = ALSIBCFMeanModel(spark, ibcf_model, model)
                mean_scored = mean_model.transform(
                    test_input, unlabeled_output).join(test_output, join_keys)
                mean_errors = evaluate(mean_scored, evaluators)
                error_models[mean_family][combo_key] = mean_errors
                best_models = put_best_model(
                    best_models, mean_family,
                    Model_Error_Wrapper(
                        "{}_{}".format(mean_family, combo_key),
                        mean_model, mean_errors[0]))

    return error_models, best_models
示例#36
0
# Read train and test data to Pandas dataframes
# (train_input / test_input are paths defined outside this span)
train_raw = pd.read_csv(train_input)
test_raw = pd.read_csv(test_input)

# Delete unused columns
test_raw = test_raw.drop(['date'],axis=1)
train_raw = train_raw.drop(['train_id','date'],axis=1)

# Create Spark Dataframes
train = spark.createDataFrame(train_raw)
test = spark.createDataFrame(test_raw)

# Create ALS predictor, fit the model and generate the predictions
als = ALS(userCol="user_id", itemCol="business_id", ratingCol="rating")
model = als.fit(train)
predictions = model.transform(test)

# Store result in a pandas dataframe
# (a single partition, ordered by test_id, pulled to the driver)
predict_df = predictions.select('test_id','prediction').coalesce(1).orderBy('test_id').toPandas()

### Scaling the model to range 1-5
# `normalize` is defined elsewhere; presumably it reads max_value/min_value
# as globals -- verify against its definition.
max_value = predict_df.prediction.max()
min_value = predict_df.prediction.min()
predict_df['rating'] = predict_df.prediction.apply(normalize)

# Keep only the id and the rescaled rating for the submission.
predict_df = predict_df[['test_id','rating']]

# Save the predictions in a file to submit to Kaggle
predict_df.sort_values('test_id').to_csv(output,index=False)
示例#37
0
def main(spark, data_file_train, data_file_val):
    """Grid-search ALS over rank/regParam, score by MAP, and save the best model.

    For every (rank, regParam) pair an ALS model (maxIter=10) is fitted on
    the training data, the top 500 recommendations are generated for the
    users present in the validation data, and the mean average precision
    against the validation books is computed with ``RankingMetrics``.

    The best-scoring model is written to ``recom/best_model``; its item and
    user factors are additionally dumped as text to ``recom/iF`` and
    ``recom/uF``.

    Args:
        spark: Spark session used to read the parquet inputs.
        data_file_train: path of the training parquet file.
        data_file_val: path of the validation parquet file.
    """
    start = time.time()

    # reading training and validation files
    df_train = spark.read.parquet(data_file_train)
    df_val = spark.read.parquet(data_file_val)

    window_user_ordered = Window.partitionBy('user_id').orderBy('rating')

    # Ground truth per user. Because the window is ordered, collect_list
    # produces a growing list per row; F.max then presumably keeps the
    # complete (longest) list for each user — TODO confirm array ordering.
    actual_df_val = df_val.withColumn(
        'actual_books',
        F.collect_list('book_id').over(window_user_ordered)).groupBy(
            'user_id').agg(F.max('actual_books').alias('actual_books'))

    print("Datasets loaded | Time taken: {}".format(time.time() - start))

    ranks = [10, 15, 25, 50, 100]
    regParam = [1, 0.1, 0.01, 0.001]
    max_score = 0.0
    best_model = None
    best_rank = None
    best_reg = None

    for r in ranks:
        for reg in regParam:

            start = time.time()

            als = ALS(maxIter=10,
                      regParam=reg,
                      userCol="user_id",
                      itemCol="book_id",
                      ratingCol="rating",
                      rank=r)
            model = als.fit(df_train)

            print(
                "Done with model fitting | Time taken: {}".format(time.time() -
                                                                  start))
            start = time.time()

            # predictions = model.transform(df_val)
            # evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
            # rmse = evaluator.evaluate(predictions)

            # print("RMSE: {}".format(rmse))

            recommendations = model.recommendForUserSubset(df_val, 500)
            # Take the ranked book ids straight from the recommendations
            # array. Exploding and re-collecting them with collect_list does
            # not guarantee the original ranking order, and MAP depends on
            # that order.
            userPredictions = recommendations.select(
                'user_id',
                F.col('recommendations.book_id').alias('pred_books'))
            predAndLabels = userPredictions.join(actual_df_val,
                                                 on='user_id').select(
                                                     'pred_books',
                                                     'actual_books')
            metrics = RankingMetrics(predAndLabels.rdd)
            score = metrics.meanAveragePrecision
            print('Regularization: {} | Rank: {} | MAP: {}'.format(
                reg, r, score))
            print('Time taken: {}'.format(time.time() - start))

            # Always accept the first model so that a run in which every
            # MAP is 0.0 still has something to save.
            if best_model is None or score > max_score:
                max_score = score
                best_model = model
                best_rank = r
                best_reg = reg

    best_model.itemFactors.rdd.saveAsTextFile('recom/iF')
    best_model.userFactors.rdd.saveAsTextFile('recom/uF')
    best_model.save("recom/best_model")
    print('Best Regularization: {} | Best Rank: {} | Best MAP: {}'.format(
        best_reg, best_rank, max_score))
def main(spark,
         data_file,
         validation_file,
         test_file,
         model_file,
         tuning=False):
    """Train implicit-feedback ALS models and save the one with the lowest RMSE.

    The training data is restricted to users that occur in the validation
    or test data (and to non-zero ratings), sub-sampled to 60%, and indexed
    with a ``StringIndexer`` pipeline that is loaded from disk when
    available and otherwise fitted and saved. One ALS model is fitted per
    (regParam, alpha, rank) combination and evaluated by RMSE on the
    validation data; the best fitted model is written to ``model_file``.

    Args:
        spark: Spark session used for reading parquet files and running SQL.
        data_file: path of the training parquet file.
        validation_file: path of the validation parquet file.
        test_file: path of the test parquet file; only its ``user_id``
            values are used, to filter the training rows.
        model_file: output path for the best fitted model (overwritten).
        tuning: when true, search the hyper-parameter grid; otherwise fit a
            single configuration (regParam=0.1, alpha=1, rank=100).
    """
    # NOTE(review): this path is relative ('./home/...'); possibly
    # '/home/...' was intended — confirm before changing.
    indexer_path = './home/hj1325/final-project-final-project/model_indexer'

    # load data and register temp views for the SQL filter below
    # train data
    train_df = spark.read.parquet(data_file)
    train_df.createOrReplaceTempView('train_df')
    # validation data
    validation_df = spark.read.parquet(validation_file)
    validation_df.createOrReplaceTempView('validation_df')
    # test data
    test_df = spark.read.parquet(test_file)
    test_df.createOrReplaceTempView('test_df')

    # keep only rows whose user appears in the validation or test data.
    # DISTINCT applies to the whole (user_id, book_id, rating) row, not just
    # to user_id, despite the parentheses.
    train_df = spark.sql(
        "SELECT DISTINCT(user_id), book_id, rating FROM train_df "
        "WHERE user_id IN ((SELECT user_id FROM validation_df) UNION (SELECT user_id FROM test_df)) AND rating!=0"
    )

    # sub sample 60% of data; the remaining 40% is discarded
    train_df, _ = train_df.randomSplit([0.6, 0.4], seed=20)

    print('data has been preprocessed. ')

    # load saved Model Indexer. If it cannot be loaded, create and save one
    print('load Model Indexer')
    try:
        model_indexer = PipelineModel.load(indexer_path)
    except Exception:
        # create indexer
        print('create Model Indexer')
        user_indexer = StringIndexer(
            inputCol='user_id',
            outputCol='user_label').setHandleInvalid('skip')
        book_indexer = StringIndexer(
            inputCol='book_id',
            outputCol='book_label').setHandleInvalid('skip')
        training_pipeline = Pipeline(stages=[user_indexer, book_indexer])

        model_indexer = training_pipeline.fit(train_df)
        model_indexer.write().overwrite().save(indexer_path)
        print('Model indexer has been created.')

    # use indexer to transform dataframe for training and validation
    train_df = model_indexer.transform(train_df)
    validation_df = model_indexer.transform(validation_df)
    print('Training and Validation dataframe have been transformed.')

    # set tuning to true to search the hyper-parameter grid; by default a
    # single configuration is used to save running time:
    # regularization parameter = 0.1, alpha = 1, rank = 100
    if tuning:
        RegParam = [0.1, 1, 10, 100]
        Alpha = [0.1, 1]
        Rank = [10, 100]
    else:
        RegParam = [0.1]
        Alpha = [1]
        Rank = [100]

    count = 0
    total = len(RegParam) * len(Alpha) * len(Rank)

    # Track only the best result so that the other fitted models can be
    # released instead of all being kept alive until the end.
    lowest_rmse = None
    best_model = None
    best_als_model = None

    for reg_param in RegParam:
        for alpha in Alpha:
            for rank in Rank:
                print('currently using model with regParam =' +
                      str(reg_param) + ', Alpha =' + str(alpha) +
                      ', Rank =' + str(rank))

                # use train_df to fit ALS model
                als_train = ALS(maxIter=10,
                                regParam=reg_param,
                                alpha=alpha,
                                rank=rank,
                                userCol='user_label',
                                itemCol='book_label',
                                ratingCol='rating',
                                coldStartStrategy='drop',
                                implicitPrefs=True)

                als_model = als_train.fit(train_df)

                # evaluate model
                predict = als_model.transform(validation_df)
                evaluator = RegressionEvaluator(metricName='rmse',
                                                labelCol='rating',
                                                predictionCol='prediction')
                rmse = evaluator.evaluate(predict)

                if best_model is None or rmse < lowest_rmse:
                    lowest_rmse = rmse
                    best_model = als_model
                    best_als_model = als_train

                count += 1
                print(str(count) + 'of the total ' + str(total) + ' finished.')

                print(' RMSE value= ' + str(rmse) + ' RegParam= ' +
                      str(reg_param) + ' Alpha= ' + str(alpha) + ' rank= ' +
                      str(rank))

                # NOTE: a ranking-based evaluation (top-500 recommendations
                # per validation user scored with RankingMetrics, i.e. mean
                # average precision / precision at 500) was previously
                # prototyped here but is disabled; selection is by RMSE only.

    # store model with the best root mean square error statistic
    best_model.write().overwrite().save(model_file)

    # best_als_model is the ALS estimator that produced best_model; its
    # getters are methods and their results must be converted to str before
    # concatenation.
    print('Best model with the root mean square error of ' + str(lowest_rmse) +
          ' and the regParam of ' + str(best_als_model.getRegParam()) +
          ' Alpha of ' + str(best_als_model.getAlpha()) + ' Rank of ' +
          str(best_als_model.getRank()))
示例#39
0
def train_als(params, data):
    """Build an ALS estimator from ``params`` and fit it on ``data``.

    ``params`` is expanded as keyword arguments to ``ALS``. The ``fit``
    call runs inside a ``Timer`` context.

    Returns:
        A ``(model, timer)`` pair: the fitted model and the object bound by
        the ``with Timer() as ...`` statement.
    """
    estimator = ALS(**params)
    with Timer() as timer:
        fitted = estimator.fit(data)
    return fitted, timer