Example #1
    def __init__(self, data):
        self.trainSet, self.testSet = train_test_split(data,
                                                       test_size=0.25,
                                                       random_state=1)

        LOOX = LeaveOneOut(n_splits=1, random_state=1)
        for xtrain, xtest in LOOX.split(data):
            self.LOOX_trainSet = xtrain
            self.LOOX_testSet = xtest
        self.LOOX_antitestSet = self.LOOX_trainSet.build_anti_testset()

        self.full_trainSet = data.build_full_trainset()
        self.full_antitestSet = self.full_trainSet.build_anti_testset()
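build_anti_testset() pairs every user with the items they have not rated, filling the rating slot with the trainset's global mean by default; those pairs are what a top-N recommender scores. A minimal standalone sketch of that workflow (any Surprise algorithm works; SVD is just an example):

from surprise import SVD, Dataset

data = Dataset.load_builtin('ml-100k')
full_trainset = data.build_full_trainset()
anti_testset = full_trainset.build_anti_testset()  # all unrated (user, item) pairs

algo = SVD()
algo.fit(full_trainset)
predictions = algo.test(anti_testset)  # estimated ratings for items each user has not seen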
Example #2
    def calculateRMSE(self, method=9, similarityMeasure=1, isUserBased="Yes"):
        conn = sqlite3.connect(DATABASE_NAME)
        df = pd.read_sql_query(
            "SELECT userID, glassID, relativeRating FROM ratings", conn)

        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(
            df[['userID', 'glassID', 'relativeRating']], reader)

        trainset, testset = train_test_split(data, test_size=.20)

        isUserBased = (isUserBased == "Yes")
        if similarityMeasure == 1:
            similarityMeasure = "cosine"
        elif similarityMeasure == 2:
            similarityMeasure = "pearson"
        else:
            similarityMeasure = "pearson_baseline"

        sim_options = {'name': similarityMeasure, 'user_based': isUserBased}

        if method == 1:
            algo = SVD()
        elif method == 2:
            algo = SlopeOne()
        elif method == 3:
            algo = NMF()
        elif method == 4:
            algo = NormalPredictor()
        elif method == 5:
            algo = KNNBaseline(sim_options=sim_options)
        elif method == 6:
            algo = KNNBasic(sim_options=sim_options)
        elif method == 7:
            algo = KNNWithMeans(sim_options=sim_options)
        elif method == 8:
            algo = KNNWithZScore(sim_options=sim_options)
        elif method == 9:
            algo = BaselineOnly()
        else:
            algo = CoClustering()

        algo.fit(trainset)
        predictions = algo.test(testset)

        conn.close()

        #cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)
        return round(accuracy.rmse(predictions, verbose=False), 4)
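A dictionary dispatch is a more compact alternative to the if/elif ladder above. A minimal sketch using the same Surprise classes (the make_algo helper name is ours, not part of the original code):

from surprise import (SVD, SlopeOne, NMF, NormalPredictor, KNNBaseline,
                      KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly,
                      CoClustering)

def make_algo(method, sim_options):
    # Factories, so every call returns a fresh, unfitted algorithm.
    factories = {
        1: lambda: SVD(),
        2: lambda: SlopeOne(),
        3: lambda: NMF(),
        4: lambda: NormalPredictor(),
        5: lambda: KNNBaseline(sim_options=sim_options),
        6: lambda: KNNBasic(sim_options=sim_options),
        7: lambda: KNNWithMeans(sim_options=sim_options),
        8: lambda: KNNWithZScore(sim_options=sim_options),
        9: lambda: BaselineOnly(),
    }
    # Anything else falls through to CoClustering, as in the ladder above.
    return factories.get(method, lambda: CoClustering())()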
Example #3
    def __init__(self, data):

        self.fullTrainSet = data.build_full_trainset()
        self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()
        self.trainSet, self.testSet = train_test_split(data, test_size=0.25, random_state=1)

        LOOCV = LeaveOneOut(n_splits=1, random_state=1)
        for train, test in LOOCV.split(data):
            self.LOOCVTrain = train
            self.LOOCVTest = test
        self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()

        sim_options = {'name': 'cosine', 'user_based': False}
        self.simsAlgo = KNNBaseline(sim_options=sim_options)
        self.simsAlgo.fit(self.fullTrainSet)
Example #4
    def __init__(self, data, popularityRankings):

        self.rankings = popularityRankings

        self.fullTrainSet = data.build_full_trainset()
        self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()

        self.trainSet, self.testSet = train_test_split(data,
                                                       test_size=.25,
                                                       random_state=1)

        #Compute similarity matrix between items so we can measure diversity
        sim_options = {'name': 'cosine', 'user_based': False}
        self.simsAlgo = KNNBaseline(sim_options=sim_options)
        self.simsAlgo.fit(self.fullTrainSet)
Example #5
def test_svd(data):
    reader = Reader(rating_scale=(1, 5))
    svd_data = Dataset.load_from_df(data, reader)
    trainset, testset = train_test_split(svd_data,
                                         test_size=.10,
                                         random_state=24)
    svd_model = SVD(n_factors=150,
                    n_epochs=20,
                    lr_all=0.008,
                    reg_all=0.1,
                    random_state=24)
    svd_model.fit(trainset)
    predictions = svd_model.test(testset)
    test_mse = accuracy.mse(predictions, verbose=False)
    return test_mse
Example #6
def binary_value(data, threshold):
    trainset, testset = train_test_split(data, test_size=.1)

    algo = KNNWithMeans(k=30)
    algo.fit(trainset)
    predictions = algo.test(testset)

    like0 = []  # actual labels
    like = []   # predicted labels
    for pred in predictions:
        like.append(1 if pred.est > threshold else 0)    # pred.est -> predicted rating
        like0.append(1 if pred.r_ui > threshold else 0)  # pred.r_ui -> actual rating
    return like0, like
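The two label lists returned by binary_value() plug directly into standard classification metrics. A short sketch, assuming scikit-learn is installed and data is a Surprise Dataset on a 1-5 scale:

from sklearn.metrics import precision_score, recall_score

actual, predicted = binary_value(data, threshold=3.5)  # ratings above 3.5 count as 'liked'
print('precision:', precision_score(actual, predicted))
print('recall:', recall_score(actual, predicted))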
Example #7
    def split_train_test(self,
                         surprise_dataset: Optional[Dataset] = None,
                         test_size: Optional[float] = None,
                         inplace: Optional[bool] = True):  # -> Optional[Tuple]
        if surprise_dataset is None:
            surprise_dataset = self.surprise_dataset
        if test_size is None:
            test_size = self.test_size

        trainset, testset = train_test_split(surprise_dataset, test_size=test_size)

        if inplace:
            self.trainset = trainset
            self.testset = testset
        else:
            return trainset, testset
Example #8
def chose_yahoo(file_path):
    # mae= []
    # rmse = []
    reader = Reader(line_format='timestamp user item rating', sep='\t')  # each line starts with a timestamp
    # Load the data, including the multi-criteria ratings (story, role, show, image, music) and the overall rating
    story = Dataset.load_from_file(file_path + 'story.txt', reader=reader)
    role = Dataset.load_from_file(file_path + 'role.txt', reader=reader)
    show = Dataset.load_from_file(file_path + 'show.txt', reader=reader)
    image = Dataset.load_from_file(file_path + 'image.txt', reader=reader)
    music = Dataset.load_from_file(file_path + 'music.txt', reader=reader)
    total = Dataset.load_from_file(file_path + 'total.txt', reader=reader)
    # print('Data loaded successfully!\n')
    # Split the data 2:8 (test:train); train_test_split defaults to test_size=0.2
    random_states = 180
    story_train, story_test = train_test_split(story, random_state = random_states)
    role_train, role_test = train_test_split(role, random_state = random_states)
    show_train, show_test = train_test_split(show, random_state = random_states)
    image_train, image_test = train_test_split(image, random_state = random_states)
    music_train, music_test = train_test_split(music, random_state = random_states)
    total_train, total_test = train_test_split(total, random_state = random_states)
    # print('Data split successfully!\n')
    # Item-based collaborative filtering; item similarity computed with Pearson correlation
    sim_options = {'name': 'pearson',  # Pearson similarity
                   'user_based': False}  # item-based collaborative filtering
    algo1 = KNNWithMeans(sim_options=sim_options)
    algo2 = KNNWithMeans(sim_options=sim_options)
    algo3 = KNNWithMeans(sim_options=sim_options)
    algo4 = KNNWithMeans(sim_options=sim_options)
    algo5 = KNNWithMeans(sim_options=sim_options)
    algo6 = KNNWithMeans(sim_options=sim_options)
    algo1.fit(story_train)
    algo2.fit(role_train)
    algo3.fit(show_train)
    algo4.fit(image_train)
    algo5.fit(music_train)
    algo6.fit(total_train)
    story_p = algo1.test(story_test)
    role_p = algo2.test(role_test)
    show_p = algo3.test(show_test)
    image_p =algo4.test(image_test)
    music_p = algo5.test(music_test)
    single_p = algo6.test(total_test)
    # rmse.append(accuracy.rmse(single_p))
    # Averaging approach
    # multi_p = avg(story_p, role_p, show_p, image_p, music_p, single_p)
    # Global regression
    P = combine(story_p, role_p, show_p, image_p, music_p, single_p)
    df = pd.read_csv(file_path + 'all.txt', sep = '\t', names = ['id', 'uid', 'mid', 'total', 'story', 'role', 'show', 'image', 'music'])
    k, b = totalRegModel(df)
    multi_p = totalReg(P, k, b, single_p)
    # Per-user regression
    
    mae = (accuracy.mae(single_p),accuracy.mae(multi_p))
    # rmse.append(accuracy.rmse(multi_p))
    return mae#, rmse
Example #9
def new_recommendations(df, new_ratings):
    df = df[['user_id', 'isbn', 'rating']]
    new_ratings = pd.DataFrame(new_ratings)[['user_id', 'isbn', 'rating']]
    new_df = pd.concat([df, new_ratings]).reset_index(drop=True)
    reader = Reader(rating_scale=(1,5))
    data = Dataset.load_from_df(new_df,reader)
    train, test = train_test_split(data, test_size=.2)
    model = SVD(n_epochs=17, lr_all=.015, reg_all=.125, n_factors=17)
    model.fit(train)
    preds = model.test(test)
    user_id = new_ratings.user_id[0]
    book_list = []
    for x in new_df.isbn.unique():
        book_list.append((x, model.predict(user_id,x)[3]))
    ranked_books = sorted(book_list, key=lambda x: x[1], reverse=True)
    return ranked_books
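A hypothetical call, assuming df holds the existing user_id/isbn/rating rows and the new user's ratings arrive as a list of dicts (the ids below are made up):

new_ratings = [
    {'user_id': 999999, 'isbn': '0439136369', 'rating': 5},
    {'user_id': 999999, 'isbn': '0345339681', 'rating': 4},
]
ranked_books = new_recommendations(df, new_ratings)
print(ranked_books[:10])  # ten (isbn, estimated rating) pairs, best first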
Example #10
    def __init__(self,data,withSim=False):
        self.trainSet, self.testSet = train_test_split(data, test_size=0.25, random_state=0)

        LOOX = LeaveOneOut(n_splits=1, random_state=1)
        for xtrain, xtest in LOOX.split(data):
            self.LOOX_trainSet = xtrain
            self.LOOX_testSet = xtest
        self.LOOX_antitestSet = self.LOOX_trainSet.build_anti_testset()

        self.full_trainSet = data.build_full_trainset()
        self.full_antitestSet = self.full_trainSet.build_anti_testset()
        if withSim:
            sim_options = {'name': 'cosine', 'user_based': False}
            self.simAlgo = KNNBaseline(sim_options=sim_options)
            self.simAlgo.fit(self.full_trainSet)
Example #11
def surprise_build_train_test(data_frame, reader, full=True):
    """Returns tuple of `surprise` full train set and test set.
    Args:
        data_frame: Pandas data-frame.
        reader: `surprise` `Reader` instance.
        full: Whether to build full trainset or split one
    """
    raw_data = Dataset.load_from_df(data_frame, reader=reader)

    if full:
        train_set = raw_data.build_full_trainset()
        test_set = train_set.build_testset()

        return train_set, test_set

    return train_test_split(raw_data, test_size=0.2)
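A short usage sketch with a toy DataFrame (column names are ours; load_from_df expects the columns in user, item, rating order):

import pandas as pd
from surprise import Reader

ratings = pd.DataFrame({'user': [1, 1, 2, 2, 3],
                        'item': [10, 20, 10, 30, 20],
                        'rating': [4, 3, 5, 2, 4]})
reader = Reader(rating_scale=(1, 5))

# full=True: train on everything, test on the same known ratings
train_set, test_set = surprise_build_train_test(ratings, reader, full=True)

# full=False: a conventional 80/20 random split
train_set, test_set = surprise_build_train_test(ratings, reader, full=False)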
Example #12
def train(data):

    reader = Reader(rating_scale=(0, 9))
    data = Dataset.load_from_df(data[['userID', 'ISBN', 'bookRating']], reader)
    print('Using ALS')
    bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
    algo = BaselineOnly(bsl_options=bsl_options)
    print(cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False))
    trainset, testset = train_test_split(data, test_size=0.25)
    algo = BaselineOnly(bsl_options=bsl_options)
    predictions = algo.fit(trainset).test(testset)
    print(accuracy.rmse(predictions))

    def get_Iu(uid):
        """ return the number of items rated by given user
        args:
          uid: the id of the user
        returns:
          the number of items rated by the user
        """
        try:
            return len(trainset.ur[trainset.to_inner_uid(uid)])
        except ValueError:  # user was not part of the trainset
            return 0

    def get_Ui(iid):
        """ return number of users that have rated given item
        args:
          iid: the raw id of the item
        returns:
          the number of users that have rated the item.
        """
        try:
            return len(trainset.ir[trainset.to_inner_iid(iid)])
        except ValueError:
            return 0

    df = pd.DataFrame(predictions,
                      columns=['uid', 'iid', 'rui', 'est', 'details'])
    df['Iu'] = df.uid.apply(get_Iu)
    df['Ui'] = df.iid.apply(get_Ui)
    df['err'] = abs(df.est - df.rui)

    best_predictions = df.sort_values(by='err')[:10]
    worst_predictions = df.sort_values(by='err')[-10:]
    print(best_predictions)
    print(worst_predictions)
Example #13
def predict():
    global top_n
    global user_id
    print("--predict start--------------------------------")

    # request
    data = pd.DataFrame(get_default_ratings())

    reader = Reader(rating_scale=(0, 5))
    data = Dataset.load_from_df(df=data, reader=reader)

    trainset_2, testset_2 = train_test_split(data, test_size=0.3)

    algo = SVDpp()
    predictions = algo.fit(trainset_2).test(testset_2)

    top_n = get_top_n(predictions, n=10)
Example #14
def main():
    row_num = 5000
    #reading the important ratings file to make it a pandas dataframe in order to be used by surprise
    ratings_data = pd.read_csv('datasets/song_dataset_ranking.txt', sep="\t", header=None, nrows = row_num)
    #define the document's columns
    ratings_data.columns = ['userId', 'songId', 'rating']
    #read the csv that holds the songs' data
    #create a hash where we will store the important info from all songs,
    #e.g. keysonisonioiaofnai: ['Smoke on the water', 'Deep purple']
    song_dict = {}
    with open('datasets/song_data.csv', 'rt') as song_data:
        c_reader = csv.reader(song_data, delimiter=',', quotechar='|')
        for row in c_reader:
            song_dict.update({row[0]: [row[1], row[3]]})
    #surprise reader, define the rating scale to use
    reader = Reader(rating_scale=(1,100))
    #transform info to a surprise dataset
    data = Dataset.load_from_df(ratings_data, reader)
    #split data into training and testSet
    training_set, testSet = train_test_split(data, test_size=.25)
    #define the algorithm to use: item-based KNN with cosine similarity
    #(passing name/user_based as bare kwargs is silently ignored; they belong in sim_options)
    knn = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
    #train the algorithm
    knn.fit(training_set)
    print("Done training")
    print("Test set length", len(testSet))
    print("testing")
    #make predictions
    predictions = knn.test(testSet)
    print("getting recommendations")
    #measure accuracy, Compute FCP (Fraction of Concordant Pairs).
    accuracy.fcp(predictions)
    #get top n predictions
    top_n = get_top_n(predictions,4)
    with open('predictions.txt', 'w') as file:
        for uid, user_ratings in top_n.items():
            file.write("prediction for " + str(uid) + ":\n")
            result_array = [find_song_info_in_data(iid, song_dict) for (iid, _) in user_ratings]
            for item in result_array:
                file.write("\t")
                file.write('-'.join(item))
                file.write("\n")
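Both snippets above call a get_top_n helper that is not shown here. The standard implementation from the Surprise FAQ fits both call sites; a sketch:

from collections import defaultdict

def get_top_n(predictions, n=10):
    '''Map each user id to their n highest-estimate (item id, rating) pairs.'''
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n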
Example #15
    def __init__(self, data, popularityRankings, movies):

        self.rankings = popularityRankings
        self.movies = movies
        print(len(movies), movies)
        #Build a full training set for evaluating overall properties
        self.fullTrainSet = data.build_full_trainset()
        self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()

        #Build a 75/25 train/test split for measuring accuracy
        self.trainSet, self.testSet = train_test_split(data,
                                                       test_size=.25,
                                                       random_state=1)

        #Compute similarity matrix between items so we can measure diversity
        sim_options = {'name': 'cosine', 'user_based': False}
        self.simsAlgo = KNNBaseline(sim_options=sim_options)
        self.simsAlgo.fit(self.fullTrainSet)
Example #16
    def perform_operation(self):
        self.LOG_HANDLE.info(
            "Running the collaborative filtering algorithms...")
        latest_ratings_file_name = self.get_latest_output_file_name(
            configurations.RATINGS_FILE_IN_REQUIRED_FORMAT_FILE_NAME,
            next=False)[1]
        latest_ratings_file_location = os.path.join(
            configurations.OUTPUT_FILES_DIRECTORY, latest_ratings_file_name)
        self.LOG_HANDLE.info("Running recommender models on the file here: " +
                             latest_ratings_file_location)
        print("Running all recommender models")

        # Params from here: http://surprise.readthedocs.io/en/stable/reader.html
        reader = Reader(sep=constants.COMMA_STR)

        # Params from here: http://surprise.readthedocs.io/en/stable/dataset.html
        ratings_dataset = Dataset.load_from_file(latest_ratings_file_location,
                                                 reader)

        # Divide the data set into the training and test sets
        trainset, testset = train_test_split(
            ratings_dataset, test_size=model_params.test_set_size)

        # Add different algorithms here - Removed SVD PP algorithm
        collaborative_algorithms = [
            normal_algo_wrapper(),
            knn_algo_wrapper(),
            svd_algo_wrapper()
        ]

        rmse_values = {}

        for collaborative_algorithm in collaborative_algorithms:
            print("Started Algorithm: " + collaborative_algorithm.algo_name)
            rmse_values[collaborative_algorithm.algo_name] = \
                collaborative_algorithm.evaluate_on_test(trainset, testset)
            collaborative_algorithm.perform_grid_search_with_cv(ratings_dataset)
            print("Completed Algorithm: " + collaborative_algorithm.algo_name)

        print("All recommender models have been run...")
        plt.scatter(rmse_values.keys(), rmse_values.values())
        plt.xlabel('Collaborative filtering algorithm')
        plt.ylabel('Root mean square error (RMSE) on test predictions')
        plt.show()
Example #17
def plot_all_ROC():
    rang = 5.0
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    trainset, testset = train_test_split(data, test_size=0.1)
    knn = KNNWithMeans(22, sim_options=sim_options)
    nmf = NMF(n_factors=18)
    svd = SVD(n_factors=8)
    fp = {}
    tp = {}
    area = np.array([])
    for model, key in zip([knn, nmf, svd], ['KNN','NNMF','SVD']):
        model.fit(trainset)
        pred = model.test(testset)
        np_true = np.array([])
        np_score = np.array([])
        for _, _, t, p, _ in pred:
            if t >= 3:
                t = 1
            else:
                t = 0
            np_true = np.append(np_true, t)
            np_score = np.append(np_score, p/rang)
        fpr, tpr, thresholds = metrics.roc_curve(np_true, np_score)
        print(fpr.shape, tpr.shape)
        roc_auc = metrics.auc(fpr, tpr)
        fp[key] = fpr
        tp[key] = tpr
        area = np.append(area, roc_auc)
    plt.figure()
    lw = 2
    for mod, f, t, roc_auc in zip(['k-NN', 'NNMF', 'MF'], fp, tp, area):
        fpr = fp[f]
        tpr = tp[t]
        plt.plot(fpr, tpr, lw=lw, label='%s (AUC = %0.2f)' % (mod, roc_auc))
    plt.plot([0, 1], [0, 1], lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend()
    plt.show()
Example #18
def train_test(data):
    """
	分割训练集和测试集

	如果不想运行完整的交叉验证过程,则可以使用 train_test_split()
	来给定大小的训练集和测试集采样.
	您将需要使用将在训练集上训练算法的方法,
	以及将返回从测试集得出的预测的方法:accuracy metric fit()test()
	:return:
	"""
    trainset, testset = train_test_split(data, test_size=0.25)
    algo = SVD()
    algo.fit(trainset)
    # predict
    predictions = algo.test(testset)
    # accuracy
    print(accuracy.rmse(predictions))
    return None
Example #19
def GetAccuracy():
    d = Data()
    data = d.loadData()

    # NOTE: training on the full trainset while testing on a split of the same
    # data means the test ratings were seen during training, so these scores are optimistic
    trainSet = data.build_full_trainset()

    _, testSet = train_test_split(data, test_size=.25, random_state=1)

    model = KNNBasic(sim_options=sim_options, verbose=False)
    model.fit(trainSet)
    predictions = model.test(testSet)


    mae = accuracy.mae(predictions, verbose=False)

    rmse = accuracy.rmse(predictions, verbose=False)

    return mae, rmse
Example #20
def SVDFun(data, userSet, movieSet, userID):
    # Evaluate performances of our algorithm on the dataset.
    #	itemList = [[0] * userNumber] * itemNumber
    #	userList = [[0] * itemNumber] * userNumber
    algo = SVD()
    # perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
    # trainset = data.build_full_trainset()
    trainset, testset = train_test_split(data, test_size=.25)
    # algo.fit(trainset)
    # predictions = algo.test(testset)
    algo = joblib.load('svdmodel.pkl')
    # meanRMSE = average(perf['RMSE'])
    # meanMAE = average(perf['MAE'])
    movielist = dict()
    for movie in movieSet:
        est = algo.predict(userID, movie).est
        movielist[movie] = est
    return movielist
Example #21
def ComputeCollaborativeFiltering_User_User(recipe_df, train_rating_df, pd, benchmark, knnmeans=False):
    print("\n###### Compute CollaborativeFiltering_User_User ######")
    df = pd.merge(recipe_df, train_rating_df, on='recipe_id', how='inner')
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'recipe_id', 'rating']], reader)
    trainSet, testSet = train_test_split(data, test_size=.2, random_state=0)

    # compute similarities between users (user-based collaborative filtering)
    sim_options = {'name': 'cosine', 'user_based': True}

    if knnmeans:
        algo = KNNWithMeans(sim_options=sim_options, verbose=False)
    else:
        algo = KNNBasic(sim_options=sim_options, verbose=False)
    algo.fit(trainSet)
    predictions = algo.test(testSet)

    Evaluators.RunAllEvals(predictions, benchmark)
Example #22
def main():
    book_df = pd.read_csv("../../data/processed/filtered_ratings.csv")
    # Reader object and rating scale specification
    book_df = book_df.drop('Unnamed: 0', axis=1)
    reader = Reader(rating_scale=(1, 5))
    # Load data
    data = Dataset.load_from_df(book_df[["user_id", "book_id", "rating"]],
                                reader)

    # Split data into train and test sets
    train_set, test_set = train_test_split(data, test_size=0.20)

    algorithm_list = [
        NormalPredictor(),
        BaselineOnly(),
        KNNWithZScore(k=10, sim_options=similarity_measure('pearson', 1)),
        KNNWithMeans(k=10, sim_options=similarity_measure('pearson', 1)),
        KNNBaseline(k=10, sim_options=similarity_measure('pearson', 1)),
        KNNBasic(k=10, sim_options=similarity_measure('pearson', 1)),
        SVDpp(),
        SVD(),
        NMF()
    ]

    # # Fit model for normal predictor and get rmse
    # basic_model_based(train_set, test_set, NormalPredictor())
    #
    # # Fit model for Baselineonly algorithm
    # basic_model_based(train_set, test_set, BaselineOnly())
    #
    # # Fit model for KNN algorithms
    # basic_model_based(train_set, test_set, KNNBasic(k=10, sim_options=similarity_measure('pearson', 1)))
    #
    # plot_for_rmse(train_set, test_set)
    # Crossvalidation results
    # res = crossvalidate(data)
    # print(res)
    results = {}
    for algo in algorithm_list:
        rmse, preci, recall, f1 = basic_model_based(train_set, test_set, algo)
        results[type(algo).__name__] = rmse
        print("Algorithm:", algo, rmse, preci, recall, f1)
        print(
            "**------------------------------------------------------------------------------------------**"
        )
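The similarity_measure helper used to configure the KNN variants above is not shown; judging by its arguments it builds a sim_options dict. A hypothetical sketch:

def similarity_measure(name, user_based):
    # name: 'cosine', 'msd', 'pearson', or 'pearson_baseline'
    # user_based: truthy for user-user, falsy for item-item similarities
    return {'name': name, 'user_based': bool(user_based)}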
Example #23
def algo_metrics(df):
    '''
    Return algo metrics for df: RMSE
    ---Parameters---
    df (Pandas DataFrame) RUS DataFrame
    ---Returns---
    RMSE metric
    '''
    reader = sp.Reader(line_format='user item rating', sep=',', skip_lines=1)
    data = sp.Dataset.load_from_df(df, reader=reader)
    trainset, testset = train_test_split(data, test_size=.2)

    # Fit out of the box SVD to trainset and predict on test set
    algo = sp.SVD()
    algo.fit(trainset)
    predictions = algo.test(testset)
    return sp.accuracy.rmse(predictions)
Example #24
def subswipe_data(df, n_train, n_test, n_users):
    np.random.seed(0)

    # Use only the first n users.
    users = [f"synthetic_{i}" for i in range(n_users)]

    # Create a sample with missing swipes. Number of swipes in the training set is n_train. Number in the test set is n_test.
    user_ids = {
        user: np.random.choice(df["id"], n_train + n_test, replace=False)
        for user in users
    }

    # Build the sparse swipe frame from a list of dicts (DataFrame.append was
    # removed in pandas 2.0).
    rows = [{
        "uid": user,
        "iid": train_id,
        "swipe": df[user].loc[train_id]
    } for user in users for train_id in user_ids[user]]
    sparse = pd.DataFrame(rows, columns=["uid", "iid", "swipe"])

    reader = surprise.Reader(rating_scale=(0, 1))
    data = surprise.Dataset.load_from_df(sparse, reader)

    train, test = train_test_split(data,
                                   test_size=n_test * n_users,
                                   random_state=0)

    # print(train, test)
    #
    # print('test', len(test), sorted(test))
    # iterator = train.all_ratings()
    # new_df = pd.DataFrame(columns=['uid', 'iid', 'rating'])
    # i = 0
    # for (uid, iid, rating) in iterator:
    #     new_df.loc[i] = [uid, iid, rating]
    #     i = i + 1
    # print('train', new_df)

    return train, test
Example #25
def get_rating_predictions(dataset_no_zeros, dataset_zeros, dataset):

    # import data into surprise dataset
    from surprise import Reader, Dataset
    reader = Reader(rating_scale=(0, 1))
    data_no_zeros = Dataset.load_from_df(
        dataset_no_zeros[['userId', 'artistId', 'rating']], reader)

    # split dataset into train and test
    from surprise.model_selection import train_test_split
    trainset, testset = train_test_split(data_no_zeros, test_size=0.2)
    # NOTE: rebuilding the full trainset below means testset overlaps the
    # training data, so the RMSE reported further down is optimistic
    trainset = data_no_zeros.build_full_trainset()

    # fit SVD model
    from surprise import SVD, accuracy
    algo = SVD()
    algo.fit(trainset)

    # Make predictions
    predictions = algo.test(testset)

    # Test model accuracy using root mean squared error
    accuracy.rmse(predictions)

    # Load the full data df into a surprise dataset object
    import pandas as pd
    data = Dataset.load_from_df(dataset[['userId', 'artistId', 'rating']],
                                reader)

    # cast surprise dataset object into surprise trainset object
    data = data.build_full_trainset()

    # Get predictions for NaN values
    uids = [data.to_raw_uid(uid) for uid in dataset_zeros['userId']]
    iids = [data.to_raw_iid(iid) for iid in dataset_zeros['artistId']]
    r_ui = [r_ui for r_ui in dataset_zeros['rating']]
    predictions = [(uids[x], iids[x],
                    algo.predict(uids[x], iids[x], r_ui[x], verbose=False)[3])
                   for x in range(len(dataset_zeros['userId']))]

    return predictions
Example #26
def main():
    # save path to training data csv
    # convert to panda Dataframe to bypass an error
    #file_path = os.path.expanduser('../data/train.csv')
    #df = pd.read_csv(path=file_path, sep = ';')

    # pickle_dict = pickle.load('../data/train_update.csv')
    # df = pd.DataFrame(ratings_dict)

    # load dataset into dataframe
    train = pd.read_csv('../data/train_update.csv', sep=';')
    test = pd.read_csv('../data/test_update.csv', sep=';')

    reader = Reader(rating_scale=(0, 10))

    train_set = Dataset.load_from_df(train[['User-ID', 'ISBN', 'Book-Rating']],
                                     reader=reader)
    test_set = Dataset.load_from_df(test[['User-ID', 'ISBN', 'Book-Rating']],
                                    reader=reader)
    # load data from file
    # data = Dataset.load_from_df(df[['User-ID', 'ISBN', 'Book-Rating']], reader=reader)

    # to use when train on full train set
    # trainset = train_set.build_full_trainset()
    # validationset = trainset.build_testset()

    # create classifier (using a basic k nearest neighbors approach)
    algo = KNNBasic()

    trainset, testset = train_test_split(train_set,
                                         test_size=.9,
                                         random_state=1234)
    algo.fit(trainset)

    #cross_validate(algo, trainset, verbose=True)
    predictions = algo.test(testset)

    # compute MAE and RMSE
    accuracy.mae(predictions)
    accuracy.rmse(predictions)
Example #27
def SVDFun(data, userSet, movieSet, userID):
    # Evaluate performances of our algorithm on the dataset.
    algo = SVD()
    # perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
    # trainset = data.build_full_trainset()
    trainset, testset = train_test_split(data, test_size=.25)
    algo.fit(trainset)  # without this, algo.predict below would run on an untrained SVD
    # predictions = algo.test(testset)

    #algo = joblib.load('/Users/esthertang/Desktop/movieRecommd/myMovie/static/svdmodel.pkl')

    #algo = joblib.load("/Users/huangzeqian/Documents/movieRecommd/myMovie/static/svdmodel.pkl")

    # meanRMSE = average(perf['RMSE'])
    # meanMAE = average(perf['MAE'])
    movielist = dict()
    for movie in movieSet:
        est = algo.predict(userID, movie).est
        movielist[movie] = est
    return movielist
Example #28
def PR(algo, data, num_fold):

    kf = KFold(n_splits=num_fold)

    # just once, without the k-fold technique
    trainset, testset = train_test_split(data, test_size=.2, shuffle=False)

    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions,
                                                k=5,
                                                threshold=3.5)

    # Precision and recall can then be averaged over all users
    print('     Precision')
    print(sum(prec for prec in precisions.values()) / len(precisions))
    print('     Recall')
    print(sum(rec for rec in recalls.values()) / len(recalls))

    return 0
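precision_recall_at_k() is not defined in this snippet. The Surprise FAQ's implementation, which returns per-user dicts keyed by uid (matching the averaging above), is a reasonable stand-in; a sketch:

from collections import defaultdict

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Per-user precision and recall at k, treating ratings >= threshold as relevant.'''
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions, recalls = {}, {}
    for uid, user_ratings in user_est_true.items():
        # Rank this user's test items by estimated rating.
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel else 0
    return precisions, recalls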
Example #29
File: app.py  Project: Mcdonoughd/DS3001
def rec():
    resp = {}
    if request.method == 'POST':
        req = request.form.to_dict()

        wine_names = reviews.index.values.tolist()
        newvector = pd.DataFrame(index=["givenValues"], columns=wine_names)
        for key in req.keys():
            newvector[key] = req[key]
        newvector.fillna(0, inplace=True)

        trainset, testset = train_test_split(reviews, test_size=0.25)

        algo.fit(trainset)
        predictions = algo.predict(newvector, )

        print(predictions)

    #     todo: make vector from input and run the recommender on the scores
    return render_template('recommend.html', resp=resp)
Example #30
    def evaluate(self, test_size=.25):
        from surprise.model_selection import cross_validate, train_test_split
        from surprise import accuracy

        recommendation_dataset = RecommendationsDataset()
        cross_validate(self.algorithm,
                       recommendation_dataset.dataset,
                       measures=['RMSE', 'MSE'],
                       cv=5,
                       verbose=True)

        train, test = train_test_split(recommendation_dataset.dataset,
                                       test_size=test_size)
        # train.ur
        # train.ir
        # test
        self.fit(train)
        test_predictions = self.test(test)
        # result
        print("MAE: ", accuracy.mae(test_predictions, verbose=0))
        print("RMSE: ", accuracy.rmse(test_predictions, verbose=0))
Example #31
def test_unknown_user_or_item(toy_data):
    """Ensure that all algorithms act gracefully when asked to predict a rating
    of an unknown user, an unknown item, and when both are unknown.
    """

    trainset = toy_data.build_full_trainset()

    klasses = (NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans,
               KNNBaseline, SVD, SVDpp, NMF, SlopeOne, CoClustering,
               KNNWithZScore)
    for klass in klasses:
        algo = klass()
        algo.fit(trainset)
        algo.predict('user0', 'unknown_item', None)
        algo.predict('unknown_user', 'item0', None)
        algo.predict('unknown_user', 'unknown_item', None)

    # unrelated, but test the fit().test() one-liner:
    trainset, testset = train_test_split(toy_data, test_size=2)
    for klass in klasses:
        algo = klass()
        algo.fit(trainset).test(testset)
        with pytest.warns(UserWarning):
            algo.train(trainset).test(testset)
Example #32
"""
This module describes how to use the train_test_split() function.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

# Load the movielens-100k dataset (download it if needed),
data = Dataset.load_builtin('ml-100k')

# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=.25)

# We'll use the famous SVD algorithm.
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)
Example #33
algos_name.append('NMF')
algos.append(NMF(n_factors=2, n_epochs=10, biased=True, reg_pu=0.06, reg_qi=0.06,
                 reg_bu=0.01, reg_bi=0.01, lr_bu=0.01, lr_bi=0.01, random_state=1))

algos_name.append('SVD')
algos.append(SVD(n_factors=5, n_epochs=20, lr_all=0.005, reg_all=0.02, random_state=1))

algos_name.append('SVDpp')
algos.append(SVDpp(n_factors=1, random_state=1))

#algos_name.append('KNN')
#algos.append(KNNBasic())

for name, algo in zip(algos_name, algos):
    print('===', name)
    trainset, testset = train_test_split(data, test_size=0.2, random_state=1)

    # train and test algorithm.
    predictions = algo.fit(trainset).test(testset)

    # Compute and print Mean Absolute Error
    accuracy.mae(predictions, verbose=True)

    # predict
    pred_test = []
    for u,b in zip(user_test, book_test):
        pred_test.append(algo.predict(u,b).est)
    pred_test = np.clip(np.array(pred_test), 1, 10)
Example #34
def test_train_test_split(toy_data):

    # test test_size to int and train_size to None (complement)
    trainset, testset = train_test_split(toy_data, test_size=2, train_size=None)
    assert len(testset) == 2
    assert trainset.n_ratings == 3

    # test test_size to float and train_size to None (complement)
    trainset, testset = train_test_split(toy_data, test_size=.2, train_size=None)
    assert len(testset) == 1
    assert trainset.n_ratings == 4

    # test test_size to int and train_size to int
    trainset, testset = train_test_split(toy_data, test_size=2, train_size=3)
    assert len(testset) == 2
    assert trainset.n_ratings == 3

    # test test_size to None (complement) and train_size to int
    trainset, testset = train_test_split(toy_data, test_size=None, train_size=2)
    assert len(testset) == 3
    assert trainset.n_ratings == 2

    # test test_size to None (complement) and train_size to float
    trainset, testset = train_test_split(toy_data, test_size=None, train_size=.2)
    assert len(testset) == 4
    assert trainset.n_ratings == 1

    # Test random_state parameter
    # If random_state is None, you get different split each time (conditioned
    # by rng of course)
    _, testset_a = train_test_split(toy_data, random_state=None)
    _, testset_b = train_test_split(toy_data, random_state=None)
    assert testset_a != testset_b

    # Repeated called to split when random_state is set lead to the same folds
    _, testset_a = train_test_split(toy_data, random_state=1)
    _, testset_b = train_test_split(toy_data, random_state=1)
    assert testset_a == testset_b

    # Test shuffle parameter, if False then splits are the same regardless of
    # random_state.
    _, testset_a = train_test_split(toy_data, random_state=None, shuffle=False)
    _, testset_b = train_test_split(toy_data, random_state=None, shuffle=False)
    assert testset_a == testset_b