def predict_ratings(self, df, model_type):
    # pick the restaurant-specific model when requested, otherwise the general one
    # (renamed the parameter so it no longer shadows the builtin `type`)
    if model_type == 'Restaurants':
        model = ALSModel.load(modelRest_input_dir)
    else:
        model = ALSModel.load(modelAll_input_dir)

    predictions = model.transform(df)
    return predictions
Example #2
def parameter_tuning(file_path, percent, ranks, regParams, maxIter=10, ks=[10, 200, 500]):
	'''
	Tune parameters.
	'''

	print('Load train parquet...')
	df_train = spark.read.parquet(file_path + 'interactions_train_' + str(percent) + '.parquet')
	print('Load val parquet...')
	df_val = spark.read.parquet(file_path + 'interactions_val_' + str(percent) + '.parquet')
	print('Load test parquet...')
	df_test = spark.read.parquet(file_path + 'interactions_test_' + str(percent) + '.parquet')
	
	tuning_dict = {}
	for rank in ranks:
		for regParam in regParams:
			print('Tune parameters: rank={} and reg={}...'.format(rank, regParam))
			model_name = 'rank_' + str(rank) + '_regParam_' + str(regParam) + '_downsample_' + str(percent)
			print('Train...')
			try:
				model = ALSModel.load(model_name)
			except Exception:
				model = train(df=df_train, name=model_name, rank=rank, maxIter=maxIter, regParam=regParam)
			print('Evaluate...')
			eval_results = evaluation(df_test, model, ks)
			tuning_dict[model_name] = eval_results
	
	metrics_name = 'metrics_test_' + str(round(time.time())) + '_downsample_' + str(percent) + '.pkl'
	with open(metrics_name, 'wb') as f:
		pickle.dump(tuning_dict, f)
	
	return tuning_dict
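parameter_tuning relies on train() and evaluation() helpers that are not shown in this example. A minimal sketch of what they plausibly look like, assuming explicit ratings in columns 'user_id', 'book_id', 'rating' (all column names and the precision@k metric are assumptions, not the project's actual code):

from pyspark.ml.recommendation import ALS
from pyspark.mllib.evaluation import RankingMetrics
import pyspark.sql.functions as F

def train(df, name, rank, maxIter, regParam):
    # fit an ALS model and persist it so the next tuning run can reload it
    als = ALS(rank=rank, maxIter=maxIter, regParam=regParam,
              userCol='user_id', itemCol='book_id', ratingCol='rating',
              coldStartStrategy='drop')
    model = als.fit(df)
    model.save(name)
    return model

def evaluation(df, model, ks):
    # precision@k for each cutoff k, against each user's held-out items
    users = df.select('user_id').distinct()
    recs = model.recommendForUserSubset(users, max(ks))
    preds = recs.select('user_id', F.col('recommendations.book_id').alias('pred'))
    truth = df.groupBy('user_id').agg(F.collect_list('book_id').alias('truth'))
    pairs = preds.join(truth, 'user_id').rdd.map(lambda r: (r.pred, r.truth))
    metrics = RankingMetrics(pairs)
    return {k: metrics.precisionAt(k) for k in ks}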
Example #3
def get_val_metrics_outdated(spark, model_file, train_file, val_file, output_log_filepath):
    '''
    Gets val metrics for given model, training, and validation data and saves to a log file
    '''
    train = spark.read.parquet(train_file)
    train.createOrReplaceTempView('train')
    
    val = spark.read.parquet(val_file)
    val.createOrReplaceTempView('val')
    
    # use StringIndexer and Pipeline to index on user_id and book_id;
    # handleInvalid='skip' drops validation rows whose user or book was not seen in training
    user_idxer = StringIndexer(inputCol='user_id', outputCol='user', handleInvalid='skip')
    item_idxer = StringIndexer(inputCol='book_id', outputCol='item', handleInvalid='skip')
    
    
    # index data
    pipeline = Pipeline(stages = [user_idxer, item_idxer])
    
    val = pipeline.fit(train).transform(val)
    
    alsmodel = ALSModel.load(model_file)
    
    preds = alsmodel.transform(val)
    recs = alsmodel.recommendForUserSubset(val, 400)
    top_items = recs.selectExpr('user as user', 'recommendations.item as top_items')
    
    good_preds = preds.where(preds.rating >= 3.0)
    recs_tall = recs.select(recs.user, explode(recs.recommendations))
    recs_taller = recs_tall.withColumn('item', recs_tall.col.item).withColumn('pred_rating', recs_tall.col.rating).select('user', 'item', 'pred_rating')
    
    joined = recs_taller.join(good_preds, how='inner', on=['item', 'user'])

    # NOTE: `joined` and `output_log_filepath` go unused; this outdated version
    # simply returns the raw frames for inspection
    return preds, recs, val
Example #4
def main(spark, genre_file, model_file, save_file):

    # Read fuzzy book-genre information and map each book to its highest-count genre
    print('Loading genre information and mapping genres ...')
    with open(genre_file, 'r') as f:
        genre_data = [json.loads(line) for line in f]
    book_genre_map = [(int(x['book_id']),
                       sorted(x['genres'].items(), key=lambda y: y[1])[-1][0])
                      for x in genre_data if x['genres']]
    map_df = spark.createDataFrame(book_genre_map, ['id', 'genre'])

    # Load model and get vector representation for books left
    print('Loading model and getting item representations ...')
    model = ALSModel.load(model_file)
    item_vecs = model.itemFactors

    # Remove items with vector representation of all 0's
    print('Removing items without representations ...')
    helper = F.udf(lambda x: all(v == 0 for v in x), BooleanType())
    item_vecs = item_vecs.withColumn('check', helper(item_vecs.features))
    item_vecs = item_vecs.filter(~item_vecs.check).select(['id', 'features'])

    map_df = map_df.join(item_vecs, 'id')
    print('There are {} items left.'.format(map_df.count()))

    # Save data
    print('Saving to csv ...')
    map_df.toPandas().to_csv(save_file)
Example #5
 def __init__(self):
     self.model = None
     try:
         self.model = ALSModel.load(modelPath)
     except Exception as e:
         print(e)
         self.train()
Example #6
def main(spark, model_file, data_file):
    '''
    Parameters
    ----------
    spark : SparkSession object
    model_file : string, path to the best model file
    data_file : string, path to the test parquet file to load
    '''

    # Loads test data
    data = spark.read.parquet(data_file).repartition(5000, "user_num_id")
    data.createOrReplaceTempView('data')

    # Loads trained ALS model
    model = ALSModel.load(model_file)

    users = data.select('user_num_id').distinct()
    truth = spark.sql(
        'SELECT user_num_id AS user_id, collect_list(track_num_id) AS label FROM data GROUP BY user_num_id'
    )

    # get recommendations
    userSubsetRecs = model.recommendForUserSubset(users, 500)
    recs = userSubsetRecs.select("recommendations.track_num_id", "user_num_id")
    # get input for ranking metrics
    pred = truth.join(recs, truth.user_id == recs.user_num_id,
                      how='left').select('track_num_id', 'label')
    predictionAndLabels = pred.rdd.map(lambda lp: (lp.track_num_id, lp.label))

    print('--------Start Computing ... ...--------')
    metrics = RankingMetrics(predictionAndLabels)
    meanAP = metrics.meanAveragePrecision
    print('Mean Average Precision on test set = {}'.format(meanAP))
Example #7
def evaluate_model(spark, model_file, train_file, val_file):
    # load in the data
    train = spark.read.parquet(train_file)
    train.createOrReplaceTempView('train')
    
    val = spark.read.parquet(val_file)
    val.createOrReplaceTempView('val')
    
    # use StringIndexer and Pipeline to index on user_id and book_id
    user_idxer = StringIndexer(inputCol='user_id', outputCol='user', handleInvalid='skip')
    item_idxer = StringIndexer(inputCol='book_id', outputCol='item', handleInvalid='skip')
    
    
    # index data
    pipeline = Pipeline(stages = [user_idxer, item_idxer])
    indexer = pipeline.fit(train)
    train = indexer.transform(train)
    val = indexer.transform(val)
    
    # load the model
    model = ALSModel.load(model_file)
    rmse, prec_at, mean_avg_prec, ndcg = get_val_metrics(model, val)
    
    # print('Rank = %d, lambda = %.2f' %(model.getRank(), model.getRegParam()))
    print('RMSE: %f, precision at 500: %f, MAP %f, ndcg at 500 %f' %(rmse, prec_at, mean_avg_prec, ndcg))
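evaluate_model calls get_val_metrics(), which is defined elsewhere in this project. A sketch of what it plausibly computes, assuming the indexed 'user'/'item'/'rating' columns produced above and k=500 to match the print statement (an assumption, not the project's actual helper):

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import RankingMetrics
import pyspark.sql.functions as F

def get_val_metrics(model, val, k=500):
    # RMSE on the held-out explicit ratings
    preds = model.transform(val).na.drop(subset=['prediction'])
    rmse = RegressionEvaluator(metricName='rmse', labelCol='rating',
                               predictionCol='prediction').evaluate(preds)

    # ranking metrics: top-k recommendations vs. items the user rated >= 3
    recs = model.recommendForUserSubset(val.select('user').distinct(), k)
    pred_items = recs.select('user', F.col('recommendations.item').alias('pred'))
    true_items = (val.filter(val.rating >= 3.0)
                     .groupBy('user').agg(F.collect_list('item').alias('truth')))
    pairs = pred_items.join(true_items, 'user').rdd.map(lambda r: (r.pred, r.truth))
    metrics = RankingMetrics(pairs)
    return rmse, metrics.precisionAt(k), metrics.meanAveragePrecision, metrics.ndcgAt(k)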
Example #8
    def _compute_rmse(model: ALSModel, data: DataFrame) -> float:
        """ computes the RMSE error for a given model .

        Args:
            model: the model instance

            data: a spark DataFrame on which to run the model and compare the
            predicted vs actual ratings.

        Returns:
            rmse : the root-mean-squared error value
        """
        predictions = model.transform(data)
        # remove all NaN values
        predictions = predictions.na.drop(subset=["prediction"])

        try:
            evaluator = RegressionEvaluator(metricName="rmse",
                                            labelCol=config.RATINGS_COL,
                                            predictionCol="prediction")
            rmse = evaluator.evaluate(predictions)
            return rmse
        except Exception as e:
            logger.warning(
                'Error in computing rmse. Error description: {}'.format(e))
            return math.nan
Example #9
def main(spark, test_file, index_file, model_file):
    # Load the dataframe
    test = spark.read.parquet(test_file)
    indexer = PipelineModel.load(index_file)
    #transform user and track ids
    test = indexer.transform(test)
    #select distinct users for recommendations
    #testUsers = test.select("userNew").distinct().alias("userCol")
    #establish "ground truth"
    groundTruth = test.groupby("userNew").agg(
        F.collect_list("trackNew").alias("truth"))
    print("created ground truth df")
    alsmodel = ALSModel.load(model_file)
    rec = alsmodel.recommendForAllUsers(500)
    print("created recs")
    predictions = rec.join(groundTruth, rec.userNew == groundTruth.userNew,
                           'inner')

    scoreAndLabels = predictions.select('recommendations.trackNew',
                                        'truth').rdd.map(tuple)
    metrics = RankingMetrics(scoreAndLabels)
    precision = metrics.precisionAt(500)
    map_out = metrics.meanAveragePrecision
    print(f"precision at 500: {precision}")
    print(f"map : {map_out}")
 def __init__(self):
     self.sc = pyspark.SparkContext()
     self.spark = SparkSession\
         .builder\
         .appName("ALSMoviePrediction") \
         .getOrCreate()
     self.model = ALSModel.load('ALSModel')
Example #11
def main(spark, sc, test_file, index_file, model_file, limit=0.01):

    # Load the dataframe
    test = spark.read.parquet(test_file)
    indexer = PipelineModel.load(index_file)
    #transform user and track ids
    test = indexer.transform(test)
    #select distinct users for recommendations, limit if needed
    testUsers = test.select("userNew").distinct().alias("userCol").sample(
        limit)
    #establish "ground truth"
    groundTruth = test.groupby("userNew").agg(
        F.collect_list("trackNew").alias("truth"))
    print("created ground truth df")
    alsmodel = ALSModel.load(model_file)

    #default version
    baseline(alsmodel, groundTruth, testUsers)
    annoy(alsmodel, groundTruth, testUsers, sc)

    trees = [10, 20, 30, 40, 50]
    ks = [-1, 10, 50, 100]

    for t in trees:
        for k in ks:
            annoy(alsmodel, groundTruth, testUsers, sc, n_trees=t, search_k=k)
    print("finished!")
Example #12
def main(spark, model_file, data_file, user_file, track_file):

    #load ALS model
    als_model = ALSModel.load(model_file)
    user_indexer = StringIndexer.load(user_file)
    track_indexer = StringIndexer.load(track_file)

    #read in test data as parquet
    df_test = spark.read.parquet(data_file)
    pipeline = Pipeline(stages=[user_indexer, track_indexer])
    mapping = pipeline.fit(df_test)
    df_test = mapping.transform(df_test)

    ########### PERFORM RANKING METRICS ###########

    #create user actual items dataframe
    actual_recs = df_test.groupBy('user_idx').agg(
        F.collect_list('track_idx').alias('track_idx'))

    #create user predicted items dataframe
    user_subset = df_test.select('user_idx').distinct()
    pred_recs = als_model.recommendForUserSubset(user_subset, 500)
    pred_recs = pred_recs.select(
        'user_idx',
        F.col('recommendations.track_idx').alias('track_idx'))

    #create user item RDD & join on users
    perUserItemsRDD = pred_recs\
                        .join(actual_recs, on='user_idx').rdd\
                        .map(lambda row: (row[1], row[2]))

    rankingMetrics = RankingMetrics(perUserItemsRDD)

    #print results to the console
    print("Ranking Metrics MAP: ", rankingMetrics.meanAveragePrecision)
def main(spark, sc, test_file, index_file, model_file, limit=0.01):
    # Load the dataframe
    test_df = spark.read.parquet(test_file)
    model_indexer = PipelineModel.load(index_file)
    # transform test_df using the fitted indexer pipeline
    test_df = model_indexer.transform(test_df)
    # select distinct user for recommendation, limit to save run time
    test_user = test_df.select('user_label').distinct().alias('userCol').sample(limit)
    # establish user_truth
    user_truth = test_df.groupby('user_label').agg(F.collect_list('book_label').alias('truth'))
    print('test data and user_truth have been preprocessed')
    # load als model
    als_model = ALSModel.load(model_file)

    # default settings
    baseline(als_model, user_truth, test_user)
    annoy(als_model, user_truth, test_user, sc)

    # hyper-parameter tuning:
    trees = [10, 15, 20]
    k_list = [-1, 5, 10]

    for i in trees:
        for j in k_list:
            annoy(als_model, user_truth, test_user, sc, n_trees=i, search_k=j)
    print('fast-search evaluation finished')
Example #14
def main(spark, model_file, test_file):
    '''
    Parameters
    ----------
    spark : SparkSession object
    model_file : string, path to the serialized model file to load
    test_file : string, path to the test parquet file to load
    '''

    # Load the parquet file
    test = spark.read.parquet(test_file)
    test = test.sort('user', ascending=False)
    test.createOrReplaceTempView('test_table')
    model = ALSModel.load(model_file)

    user_subset = test.select("user").distinct()
    user_subset = model.recommendForUserSubset(user_subset, 500)

    user_subset = user_subset.select("user",
                                     col("recommendations.item").alias("item"))
    user_subset = user_subset.sort('user', ascending=False)
    print("sort user")
    predictionAndLabels = user_subset.join(
        test, ["user"], "inner").rdd.map(lambda tup: (tup[1], tup[2]))
    print("joined predictions and counts")

    metrics = RankingMetrics(predictionAndLabels)
    print("made metrics")
    MAP = metrics.meanAveragePrecision
    precision = metrics.precisionAt(500)
    ndcg = metrics.ndcgAt(500)

    print('MAP: %f' % MAP)
    print('Precision: %f' % precision)
    print('NDCG: %f' % ndcg)
Example #15
File: als.py  Project: ZhiYinZhang/study
def recForAllItem(df: DataFrame, model: ALSModel):
    print("recommend for all item")
    # Generate top 10 user recommendations for each item
    itemRecs = model.recommendForAllItems(param.get("top", 10))

    # build the mapping from userCol_new back to the original userCol values
    df_dup = df.dropDuplicates([param["userCol_new"]])
    arrs = df_dup.select(param["userCol_new"], param["userCol"]).toJSON().collect()
    maps = {}
    for arr in arrs:
        arr = json.loads(arr)
        maps[arr[param["userCol_new"]]] = arr[param["userCol"]]

    def map_fun(row: Row):
        # list(Row)
        list_rows = row[1]
        result = []
        for r in list_rows:
            result.append(Row(maps[r[param["userCol_new"]]], r["rating"]))
        return Row(row[0], result)

    rdd0 = itemRecs.rdd.map(map_fun)
    # define the output schema
    schema = StructType([
        StructField(param["itemCol"], IntegerType(), True),
        StructField("recommendations", ArrayType(StructType([
            StructField(param["userCol"], LongType(), True),
            StructField(param["ratingCol"], DoubleType(), True)
        ]), True), True)
    ])
    spark = SparkSession.builder.appName("als").master("local[3]").getOrCreate()
    df_pd = spark.createDataFrame(rdd0, schema).toPandas()
    #     df_pd.to_json(os.path.join(param["result_dir"],"itemRecs.json"),index=False,orient="split")
    return df_pd
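Both recForAllItem and recForAllUser (Example #24 below) read a module-level param dict that is not shown. A plausible shape, for illustration only:

# hypothetical contents; the real project's keys and values may differ
param = {
    "top": 10,
    "userCol": "userId", "userCol_new": "userId_new",
    "itemCol": "itemId", "ratingCol": "rating",
    "result_dir": "./results",
}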
Example #16
def main(spark, model_file):

    ###
    train = spark.read.parquet('./train.parquet')
    #validation = spark.read.parquet('./validation.parquet')
    test = spark.read.parquet('./test.parquet')

    train_model = ALSModel.load(model_file)
    users = test.select('convert_user_id').distinct()
    user_recs = train_model.recommendForUserSubset(users, 500)
    prediction_df = user_recs.select('convert_user_id',
                                     'recommendations.convert_track_id')
    true_df = test.groupBy('convert_user_id').agg(
        expr('collect_list(convert_track_id) as true_items'))

    prediction_df.write.parquet('./recommendation_count.parquet')
    true_df.write.parquet('./true_count.parquet')

    prediction_rdd = prediction_df.join(true_df, 'convert_user_id') \
    .rdd \
    .map(lambda row: (row[1], row[2]))

    rankingMetrics = RankingMetrics(prediction_rdd)
    print(rankingMetrics.meanAveragePrecision)
    print(rankingMetrics.precisionAt(500))
Example #17
def main(spark, test_file, model_file):

    test = spark.read.parquet(test_file)
    test_df = test.select('user_label', 'track_label', 'count')
    test_grouped = test_df.groupBy('user_label').agg(
        F.collect_list(F.col('track_label')).alias('track_label'))

    #model = MatrixFactorizationModel.load(sc, model_file)
    model = ALSModel.load(model_file)

    # Get the predictions
    # Generate top 10 movie recommendations for each user
    predictions = model.recommendForAllUsers(10)

    prediction_df = predictions.rdd.map(
        lambda r: (r.user_label, [i[0] for i in r.recommendations])).toDF()
    prediction_df = prediction_df.selectExpr("_1 as user_label",
                                             "_2 as recommendations")

    # Join table
    test_pred = test_grouped.join(prediction_df, "user_label", "inner")

    # Instantiate ranking metrics to compare recommended and actual items
    rdd = test_pred.select('recommendations', 'track_label').rdd
    ranking_metrics = RankingMetrics(rdd)

    # MAP
    print("MAP = %s" % ranking_metrics.meanAveragePrecision)
Example #18
def main(spark, txt):
    model = ALSModel.load('hdfs:/user/jm7955/' + args.model)
    distinct_users = spark.read.parquet('hdfs:/user/jm7955/%s.parquet' %
                                        args.distinct)

    print("distinct_users")
    print('elapsed: %d seconds' % int(timer() - start))
    #distinct_users.show()
    labels = spark.read.parquet('hdfs:/user/jm7955/%s.parquet' % args.labels)
    print("labels")
    #labels.show()
    print('elapsed: %d seconds' % int(timer() - start))

    predictions = model.recommendForUserSubset(distinct_users, 500)\
        .select('user', F.col('recommendations.item').alias('item'))
    print("predictions")
    #predictions.show()
    print('elapsed: %d seconds' % int(timer() - start))
    predictionsAndLabels = predictions.join(
        labels, ["user"], "inner").rdd.map(lambda tup: (tup[1], tup[2]))
    print("predictionsAndLabels")
    print('elapsed: %d seconds' % int(timer() - start))

    metrics = RankingMetrics(predictionsAndLabels)
    print('elapsed: %d seconds' % int(timer() - start))

    with open(txt, 'w') as f:
        f.write('metrics.meanAveragePrecision: %s\n' %
                metrics.meanAveragePrecision)
        f.write('metrics.precisionAt(500) %s\n' % metrics.precisionAt(500))
        f.write('metrics.ndcgAt(500) %s\n' % metrics.ndcgAt(500))
Example #19
    def recommend(self, num, model1, df2, user=None):
        # NOTE: model1 and user are currently unused; the saved model is reloaded on every call
        model = ALSModel.load("nest_recom_trained_model")
        #model=trained_model.fit(data)

        userRecs = model.recommendForAllUsers(num)

        aa = userRecs.withColumn("recommendations",
                                 F.explode("recommendations"))

        aa = aa.select("new_userId", "recommendations.new_propertyId",
                       "recommendations.rating")

        #aa_joined = aa.join(df2,['new_userId','new_propertyId'],'inner')
        aa_joined = aa.join(df2, ['new_userId', 'new_propertyId'])

        #df = df1.join(df2, (df1.x1 == df2.x1) & (df1.x2 == df2.x2))

        final_df1 = (aa_joined.select("userId", "propertyId",
                                      "rating").withColumn(
                                          "Recommendations",
                                          F.struct(F.col("propertyId"),
                                                   F.col("rating")))).select(
                                                       "userId",
                                                       "Recommendations")

        #final_df2 = final_df1.filter(final_df1.new_userId==userId)

        ##final_json=final_df1.groupby("userId").agg(F.collect_list("Recommendations").alias("Recommendations"))
        ##final_json.coalesce(1).write.format('json').save('cc.json')

        return final_df1
Example #20
def main(spark, sc, test_file, model_file):
    test_data = spark.read.parquet(test_file)
    test_data.createOrReplaceTempView('test_data')

    test_users = spark.sql("select distinct user_id from test_data limit 800")
    #test_users = test_data.select("user_id").distinct().alias("user_id")
    groundTruth_test = test_data.groupby("user_id").agg(
        F.collect_list("book_id").alias("test_truth"))

    als_model = ALSModel.load(model_file)

    brute_force(als_model, groundTruth_test, test_users)

    trees = [10, 20, 40, 50]
    ks = [-1, 10, 50, 100]

    #annoy(alsmodel, groundTruth, testUsers, sc)

    for t in trees:
        for k in ks:
            annoy_model(als_model,
                        sc,
                        groundTruth_test,
                        test_users,
                        n_trees=t,
                        search_k=k)

    print("finished!")
Example #21
 def _load_model(self):
     try:
         # path=Path(self.model_folder+'/')
         self.best_model = ALSModel.load(self.model_folder)
         print('Model load succeed.')
     except Exception:
         print('Model load failed. You can re-train the model.')
         raise
Example #22
 def test(self, test):
     '''Evaluate the model by computing the RMSE on the test data.
     :param test: test dataset
     '''
     model = ALSModel.load(self.args.model_storage_path)
     evaluator = RegressionEvaluator(labelCol='stars', metricName='rmse')
     predictions = model.transform(test)
     rmse = evaluator.evaluate(predictions)
     print("Root-mean-square error = " + str(rmse))
Example #23
def main(spark):
    '''

    Parameters
    ----------
    spark : SparkSession object
    '''
    test_file = 'hdfs:/user/bm106/pub/project/cf_test.parquet'
    test = spark.read.parquet(test_file)
    test.createOrReplaceTempView('test')

    w = Window.partitionBy("user_id")

    def ratio_count(c, w):
        return (col(c) / count(c).over(w))


    test = test.select("user_id", "track_id", ratio_count("count", w).alias("count"))
    test.createOrReplaceTempView('test')
    print("Ratio scores done")

    train_sample = spark.read.parquet('hdfs:/user/dev241/extension4_ratio.parquet')
    train_sample.createOrReplaceTempView('train_sample')
    print("Training sample ext4 loaded")

    StringIndexer = PipelineModel.load('hdfs:/user/dev241/DieterStringIndexer')
    test_idx = StringIndexer.transform(test)
    train_idx = StringIndexer.transform(train_sample)

    #change to best
    rank = 78 
    alpha = 14.287069059772636
    reg = 0.41772043857578584

    model = ALSModel.load("Extension4_ratio")
    print('Model loaded')

    #test ranking metrics
    test_idx = test_idx.select('user_idx','track_idx','count')
    test_users = test_idx.select('user_idx').distinct()
    test_comb = test_idx.groupBy('user_idx').agg(F.collect_set('track_idx').alias('test_labels'))
    track_number = 500
    rec_test = spark.read.parquet('hdfs:/user/dev241/rec_test4.parquet')
    print('Rec test loaded.')
    join = test_comb.join(rec_test,test_comb.user_idx == rec_test.user_idx)
    print('Join done.')
    j4 = join.toDF('user_idx', 'test_labels','user_idx2','recommendations')
    j4.write.parquet("ext4join")
    print('j4 parquet written')
    predictionAndLabels = join.rdd.map(lambda r: ([track.track_idx for track in r.recommendations], r.test_labels))
    print('Map done.')
    metrics = RankingMetrics(predictionAndLabels)
    print('RM done.')
    mavgp = metrics.meanAveragePrecision
    print("Test mean Average Precision : ",mavgp)
    pass
Example #24
File: als.py  Project: ZhiYinZhang/study
def recForAllUser(df: DataFrame, model:ALSModel):
    print("recommend for all user")
    # Generate top 10 movie recommendations for each user
    userRecs: DataFrame = model.recommendForAllUsers(param.get("top", 10))
    userRecs_pd = userRecs.join(
        df.select(param["userCol"], param["userCol_new"]).dropDuplicates([param["userCol_new"]]), param["userCol_new"]) \
        .select(param["userCol"], "recommendations") \
        .toPandas()
    #     userRecs_pd.to_json(os.path.join(param["result_dir"],"userRecs.json"),index=False,orient="split")
    return userRecs_pd
Example #25
 def __import_model(self, mlInstance):
     """
     Reads an ML instance from the input path
     :mlInstance: Path to the saved model
     """
     self.model = ALSModel.load(mlInstance)
     predictions = self.model.transform(self.test)
     evaluator = RegressionEvaluator(metricName='rmse',
                                     labelCol='product_rating',
                                     predictionCol='prediction')
     self.rmse = evaluator.evaluate(predictions)
Example #26
def main(spark, model_file, train_data_file, test_data_file):

    time_a = time.time()
    start = time_a

    training_data = spark.read.parquet(train_data_file)
    indexer_id = StringIndexer(inputCol="user_id",
                               outputCol="userindex").setHandleInvalid("skip")
    indexer_id_model = indexer_id.fit(training_data)
    indexer_item = StringIndexer(
        inputCol="track_id", outputCol="itemindex").setHandleInvalid("skip")
    indexer_item_model = indexer_item.fit(training_data)

    testing_data = spark.read.parquet(test_data_file)
    testing_data = indexer_id_model.transform(testing_data)
    testing_data = indexer_item_model.transform(testing_data)

    testing_data = testing_data.select('userindex', 'itemindex', 'count')

    print('Finished Indexing!')
    time_b = time.time()
    print(time_b - time_a)
    time_a = time_b

    model = ALSModel.load(model_file)
    prediction = model.recommendForAllUsers(500).select(
        'userindex', 'recommendations.itemindex')
    print('Finished Prediction DF!')

    testing_df = testing_data.groupBy('userindex').agg(
        expr('collect_list(itemindex) as item_list'))
    print('Finished Label DF!')

    predictionAndLabels = prediction.join(testing_df, 'userindex')
    print('Joined Prediction and Labels!')
    time_b = time.time()
    print(time_b - time_a)
    time_a = time_b

    pred_df = predictionAndLabels.select(['itemindex',
                                          'item_list']).rdd.map(list)
    metrics = RankingMetrics(pred_df)

    print('Ranking Metrics Calculated!')
    time_b = time.time()
    print(time_b - time_a)
    time_a = time_b

    eva = metrics.meanAveragePrecision
    print("Model on Testing Data gives MAP= ", eva)

    print('Process Finished!')
    print(time.time() - start)
Example #27
def main():
    spark = SparkSession.builder.appName('test').getOrCreate()
    als_model = ALSModel.load('anshul_project/als_sampling')
   
    test_data = spark.read.parquet('anshul_project/test_index.parquet')

    als_predictions = als_model.transform(test_data)

    reg_evaluator = RegressionEvaluator(metricName="rmse", labelCol="count", predictionCol="prediction")
    rmse = reg_evaluator.evaluate(als_predictions)

    print("Test rmse " + str(rmse))
Example #28
def basic_rec_val(spark, dirname, rank, regParam, k, random_seed):

    val_set = spark.read.parquet(f'{dirname}/val.parquet')

    print(
        f'Validating on model with rank = {rank} and regParam = {regParam} trained using {dirname} data ...'
    )

    # load corresponding trained model
    model = ALSModel.load(f'{dirname}/{rank}_{regParam}_model')

    # computing RMSE on validation set
    predictions = model.transform(val_set)
    evaluator = RegressionEvaluator(metricName='rmse',
                                    labelCol='rating',
                                    predictionCol='prediction')
    rmse = evaluator.evaluate(predictions)

    print(f'rmse: {rmse}')

    print(f'Constructing top {k} books recommended to per user ...')
    val_users = val_set.select('user_id').distinct()

    start_time = time.time()

    perUserPredictedTopKItemsDF = model.recommendForUserSubset(val_users, k)

    myudf = udf(extract_item, ArrayType(IntegerType()))
    perUserPredictedTopKItemsDF = perUserPredictedTopKItemsDF.withColumn(
        'predictions',
        myudf(perUserPredictedTopKItemsDF['recommendations'])).drop(
            'recommendations')

    print('Constructing actual books per user ...')
    perUserActualItemsDF = val_set.filter(
        column('rating') >= 3.0).groupBy('user_id').agg(
            expr('collect_list(book_id) as book_ids'))

    print('Constructing Ranking Metrics ...')
    perUserItemsRDD = perUserPredictedTopKItemsDF.join(
        perUserActualItemsDF, 'user_id').rdd.map(lambda row: (row[1], row[2]))

    rankingMetrics = RankingMetrics(perUserItemsRDD)

    precisionAtK = rankingMetrics.precisionAt(k)
    mAP = rankingMetrics.meanAveragePrecision

    end_time = time.time()
    time_delta = str(datetime.timedelta(seconds=end_time - start_time))

    print(f'p@{k}: {precisionAtK}')
    print(f'mAP: {mAP}')
    print(f'run time: {time_delta}')
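basic_rec_val wraps an extract_item function (via udf) that is not shown; presumably it pulls the item ids out of the recommendations struct array. A sketch under that assumption, where the struct field name book_id is a guess:

def extract_item(recommendations):
    # keep only the item ids from [(book_id, rating), ...] recommendation structs
    return [row.book_id for row in recommendations]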
Example #29
def main(spark, test_file, train_file, model_path):

    # Read data from parquet
    print('Reading parquet file ...')
    test = spark.read.parquet(test_file)
    test.createOrReplaceTempView('test')
    train = spark.read.parquet(train_file)
    train.createOrReplaceTempView('train')

    # Load the best model from training
    print('Loading model ...')
    best_model = ALSModel.load(model_path)

    # get recommendations for users in test set
    print('Evaluating model on test set ...')
    test_users = test.select("user_id").distinct()
    rec_test = best_model.recommendForUserSubset(test_users, 700)
    pred_test_700 = rec_test.select(
        rec_test.user_id,
        rec_test.recommendations.book_id.alias('rec_book_id'))

    sub_train_test = spark.sql('SELECT user_id, book_id \
                                FROM train \
                                WHERE user_id IN (SELECT DISTINCT user_id FROM test)'
                               )

    df_train_book_test = sub_train_test.groupby('user_id').agg(
        F.collect_set('book_id').alias('train_book_id'))

    df_join_test = pred_test_700.join(df_train_book_test, 'user_id')
    diff = F.udf(book_diff, ArrayType(IntegerType()))
    df_join_pred_test = df_join_test.withColumn(
        'predictions',
        diff(df_join_test.rec_book_id, df_join_test.train_book_id))
    pred_test = df_join_pred_test.select(df_join_pred_test.user_id,
                                         df_join_pred_test.predictions)

    # get true preferences of users in validation set
    label_test = test.filter(test.rating >= 3).groupby("user_id").agg(
        F.collect_list("book_id"))
    predAndLabel_test = pred_test.join(
        label_test, 'user_id').rdd.map(lambda row: (row[1], row[2]))

    # Use Mean Average Precision as evaluation metric
    metrics_test = RankingMetrics(predAndLabel_test)
    MAP_test = metrics_test.meanAveragePrecision
    pak_100_test = metrics_test.precisionAt(100)
    pak_500_test = metrics_test.precisionAt(500)
    print('\n')
    print(
        'Ranking scores of the best model on test data: MAP = {}, Precision@100 = {}, Precision@500 = {}'
        .format(MAP_test, pak_100_test, pak_500_test))
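book_diff, wrapped in a UDF above, is defined elsewhere; it presumably removes books the user already saw in training from the recommendation list. A sketch under that assumption:

def book_diff(rec_book_id, train_book_id):
    # drop already-seen training books, preserving recommendation order
    seen = set(train_book_id)
    return [b for b in rec_book_id if b not in seen]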
Example #30
def whiskey_als(df):
    model = ALSModel.load("hdfs://master/ALSModel/")
    predict = model.recommendForItemSubset(df, 1)
    df_user = predict.select(
        predict.whiskeyId, 
        predict.recommendations[0].userId.alias("userId"),
    )
    
    df_whiskey = model.recommendForUserSubset(df_user, 5)
    result_df = df_user.join(df_whiskey, on=['userId'], how='left')
    result_df = result_df.join(df, on=['whiskeyId'], how='left')
    result_df = result_df.select("user_name", "whiskeyId", "recommendations")
    return result_df