예제 #1
0
def train_test_from_df(df, cols, rating_scale=(1, 5), train_size=None, test_size=None):
    """Load ``df[cols]`` as a Surprise dataset and split it into train/test.

    Args:
        df: DataFrame holding the ratings.
        cols: Columns of ``df`` handed to ``Dataset.load_from_df``.
        rating_scale: ``(min, max)`` rating bounds passed to ``Reader``.
        train_size: Forwarded to ``train_test_split`` when ``test_size`` is
            not given.
        test_size: Forwarded to ``train_test_split``. A value of ``0`` means
            "no test set".

    Returns:
        Whatever ``train_test_split`` returns, or, when ``test_size == 0``,
        only the full trainset built from all ratings.

    Raises:
        ValueError: If neither ``train_size`` nor ``test_size`` is given.
    """
    if train_size is None and test_size is None:
        raise ValueError('train size or test size required')
    # Honour the caller's rating_scale instead of a hard-coded (1, 5).
    reader = Reader(rating_scale=rating_scale)
    data = Dataset.load_from_df(df[cols], reader)
    # Test against None rather than truthiness: 0 is falsy, so a truthiness
    # check would make the "no test set" branch below unreachable.
    if test_size is not None:
        if test_size == 0:
            return data.build_full_trainset()
        return train_test_split(data, test_size=test_size)
    return train_test_split(data, train_size=train_size)
예제 #2
0
def svd_model(df):
    """
    Fit and evaluate an SVD recommender on the Scottsdale rows of ``df``.

    The rows whose ``city`` equals 'Scottsdale' are reduced to the columns
    user_id, business_id and average_stars and loaded as a Surprise dataset.
    One SVD model is fit on a split that holds out 25% of the ratings and is
    scored with RMSE; a second, fresh SVD is 5-fold cross-validated on the
    whole dataset.

    Returns: (data, acc, test_rmse) -- the Surprise dataset, the RMSE on the
    held-out split, and the 'test_rmse' entry of the cross-validation result.
    """
    from surprise.model_selection.split import train_test_split

    rating_columns = ['user_id', 'business_id', 'average_stars']
    scottsdale = df.loc[df.city == 'Scottsdale', rating_columns]
    data = Dataset.load_from_df(scottsdale, Reader())

    trainset, testset = train_test_split(data, test_size=0.25)

    model = SVD()
    model.fit(trainset)
    acc = accuracy.rmse(model.test(testset))

    cv_results = cross_validate(SVD(), data, cv=5)
    return data, acc, cv_results['test_rmse']
    def drawRoc(model, i, k):
        """Fit ``model`` on a split of ``binary[i]`` and plot its ROC curve.

        ``binary`` and ``thresholds`` are read from the enclosing scope and
        indexed with ``i``; ``k`` only appears in the printed messages and
        the plot title. The figure is created but neither shown nor saved
        here.
        """
        # Shared by both progress messages and the plot title.
        caption = ('ROC curve of NMF with optimal k = ' + str(k) +
                   ', threshold = ' + str(thresholds[i]))
        print('Start drawing ' + caption + '!')

        train_part, test_part = train_test_split(binary[i],
                                                 train_size=0.9,
                                                 test_size=0.1)
        model.fit(train_part)
        preds = model.test(test_part)
        # r_ui is the known rating, est is the model's estimate.
        actual = [p.r_ui for p in preds]
        scores = [p.est for p in preds]
        fpr, tpr, _ = roc_curve(actual, scores)
        roc_auc = auc(fpr, tpr)

        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', lw=2,
                 label='ROC curve (area = %0.2f)' % roc_auc)
        # Diagonal reference line.
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        axis_limits = [-0.05, 1.05]
        plt.xlim(axis_limits)
        plt.ylim(axis_limits)
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(caption)
        plt.legend(loc="lower right")
        print('Finish drawing ' + caption + '!')
예제 #4
0
def global_mean():
    """Print the RMSE obtained by predicting the trainset's global mean.

    Splits the module-level ``data`` with a fixed ``random_state`` so the
    result is reproducible, then scores the constant prediction against the
    third field of every testset entry.
    """
    trainset, testset = split.train_test_split(data, test_size=.17, random_state=1)
    # Third field of each testset entry, gathered into one tuple.
    true_ratings = list(zip(*testset))[2]
    # NOTE(review): tuple - scalar only works through NumPy's reflected
    # arithmetic; presumably trainset.global_mean is a NumPy float -- confirm.
    residuals = true_ratings - trainset.global_mean
    mean_squared_error = np.sum(residuals**2) / len(testset)
    print('RMSE with global mean: ', np.sqrt(mean_squared_error))
def fit_model(data):
    """Train an SVD model on ``data``, report its errors and pickle it.

    Args:
        data: Dataset accepted by ``train_test_split``; 25% of it is held
            out for evaluation.

    Side effects:
        Prints RMSE and MAE for the held-out part and writes the fitted
        model to '../Models/Collaborative_filtering2.model'.
    """
    train, test = train_test_split(data, test_size=0.25)
    svd = SVD(n_epochs=25, lr_all=0.01, reg_all=0.4)
    svd.fit(train)
    pred = svd.test(test)
    print('RMSE for test set: {}'.format(accuracy.rmse(pred)))
    print('MAE for test set: {}'.format(accuracy.mae(pred)))
    # save model
    path = '../Models/Collaborative_filtering2.model'
    # Context manager guarantees the file is flushed and closed, even if
    # pickling raises part-way through.
    with open(path, 'wb') as model_file:
        pickle.dump(svd, model_file)
    print("Model is saved to: {}".format(path))
    def primaryTest(self, predictor):
        """Evaluate ``predictor`` on ``self.dataTuning`` and print a sample.

        The predictor is fit on a split that holds out 20% of the data; its
        predictions on the held-out part are tabulated together with the
        absolute error per row. A verbose 5-fold cross-validation (RMSE and
        MAE) is then run on the full data before the first table rows are
        printed.
        """
        trainSet, testset = train_test_split(self.dataTuning, test_size=0.2)
        fitted = predictor.fit(trainSet)
        prediction = fitted.test(testset)

        column_names = [
            'user_id', 'item_id', 'base_event', 'predict_event', 'details'
        ]
        result = pd.DataFrame(prediction, columns=column_names)
        result = result.drop(columns=['details'])
        difference = result['base_event'] - result['predict_event']
        result['error'] = difference.abs()

        cross_validate(predictor,
                       self.dataTuning,
                       measures=['RMSE', 'MAE'],
                       cv=5,
                       verbose=True)
        print(result.head())
예제 #7
0
def train_model(df, make_cv=True, make_train_test_split=False, user_col="userId", item_col="imdbId", rating_col="rating"):
    """Train an SVD recommender on the ratings in ``df`` and return it.

    Args:
        df: DataFrame with at least the user, item and rating columns. It is
            not modified.
        make_cv: When true, run a verbose 5-fold cross-validation (RMSE, MAE)
            on a separate SVD instance.
        make_train_test_split: When true, fit on a split that holds out 25%
            of the ratings and print the RMSE on the held-out part;
            otherwise fit on the full dataset.
        user_col: Name of the user id column.
        item_col: Name of the item id column.
        rating_col: Name of the rating column.

    Returns:
        The fitted ``SVD`` instance.
    """
    reader = Reader(rating_scale=(0.5, 5))
    # load_from_df needs exactly three columns in the order user, item,
    # rating. astype() returns a new frame, so the ids are cast to strings
    # without touching the caller's DataFrame.
    ratings = df[[user_col, item_col, rating_col]].astype(
        {user_col: str, item_col: str})
    data = Dataset.load_from_df(ratings, reader)

    if make_train_test_split:
        trainset, testset = train_test_split(data, test_size=.25)
    else:
        trainset = data.build_full_trainset()

    algo = SVD()
    algo.fit(trainset)

    if make_train_test_split:
        # Predict ratings for the held-out part, then report RMSE.
        predictions = algo.test(testset)
        accuracy.rmse(predictions)

    if make_cv:
        # Cross-validate a fresh estimator: cross_validate fits the object it
        # is given on each fold, which would otherwise leave `algo` trained
        # on a CV fold instead of `trainset`.
        cross_validate(SVD(), data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    # Sample prediction, printed by verbose=True. Raw ids are strings.
    uid = str(1)
    iid = str(114709)
    algo.predict(uid, iid, verbose=True)

    print("Algo trained")
    return algo
예제 #8
0
    plt.ylabel('True Positive Rate', fontsize=15)

    plt.legend(loc="lower right")
    plt.savefig('plot/q15_knn_roc_' + str(threshold) + '.png')
    plt.clf()


if __name__ == "__main__":
    # Rating cut-offs used to binarise the true ratings.
    threshold = [2.5, 3, 3.5, 4]
    file_path = os.path.expanduser("ml-latest-small/ratings_new.csv")
    reader = Reader(sep=',')
    data = Dataset.load_from_file(file_path, reader=reader)

    sim_options = {'name': 'pearson', 'user_based': True}

    trainset, testset = train_test_split(data, test_size=0.1)

    # Neither the model nor its predictions depend on the threshold, so fit
    # and predict once instead of once per threshold.
    algo = KNNWithMeans(k=34, sim_options=sim_options)
    algo.fit(trainset)
    predictions = algo.test(testset)

    for th in threshold:
        # row[2] is the true rating and row[3] the estimated rating of a
        # Surprise prediction; only the true rating is binarised.
        y_true = [1 if row[2] >= th else 0 for row in predictions]
        y_estimate = [row[3] for row in predictions]
예제 #9
0
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import split
from surprise import SVD,SVDpp
import time

# Load the ratings data.
# Each line is parsed as comma-separated "user item rating timestamp";
# the first line of the file is skipped (presumably a CSV header).
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file(r'C:\Users\yy\Desktop\BI\L4\L4-2\L4-code\MovieLens\ratings.csv', reader=reader)
# Split with train_size=0.8; the same split is used for the timing run below.
train_s,test_s = split.train_test_split(data, train_size=0.8)

# Three candidate models: SVD with biased=False, SVD with default settings,
# and SVD++.
algo1 = SVD(biased = False)
algo2 = SVD()
algo3 = SVDpp()

# Section: SVD with biased=False (algo1) -- fit, score and time it.
"""SVD"""
print('SVD结果:')
time1=time.time()
algo1.fit(train_s)
pred = algo1.test(test_s)
accuracy.rmse(pred, verbose=True)
time2=time.time()
# Elapsed wall-clock time covers fitting, testing and the RMSE computation.
print('SVD用时: %.2fs' % (time2-time1))
# Raw ids are passed as strings.
uid = str(196)
iid = str(302)
algo1.predict(uid, iid, r_ui=4, verbose=True)   # print the predicted rating of uid for iid
print('-'*30)

# Section: SVD with default settings.
"""SVDbias"""
print('SVDbias结果:')
예제 #10
0
def NN_Model(df, n_factors=10, ep=5):
    """Build (or load) a dot-product embedding model for Scottsdale ratings.

    The rows of ``df`` whose ``city`` is 'Scottsdale' are kept, users and
    businesses are mapped to integer indices, and a Keras model that takes
    the dot product of a user embedding and a business embedding is compiled.
    If 'biz_model.h5' exists the model is loaded from it; otherwise the
    freshly built model is trained on a split that holds out 25% of the rows.
    The model is saved to 'NN_Embed_Model' in both cases.

    Args:
        df: DataFrame with the columns user_id, user_name, business_id,
            biz_name, average_stars and city.
        n_factors: Width of both embeddings.
        ep: Number of training epochs.

    Returns:
        Tuple ``(user_id_dict, biz_id_dict, user_df, biz_df, X, X_test,
        model, history)``. ``history`` is ``None`` when the model was loaded
        from disk instead of being trained.
    """
    from sklearn.model_selection import train_test_split

    # .copy() gives an independent frame, so the helper columns added below
    # do not write into a slice of `df`.
    user_rev_biz_scott = df[[
        'user_id', 'user_name', 'business_id', 'biz_name', 'average_stars'
    ]].loc[df.city == 'Scottsdale'].copy()

    # One row per (user_id, user_name) pair; the row position becomes the
    # user's integer index.
    user_df = user_rev_biz_scott.groupby(['user_id', 'user_name'
                                          ]).size().reset_index(name="Freq")
    user_df.drop('Freq', axis=1, inplace=True)

    user_id_list = list(user_df.user_id)
    user_id_dict = {y: x for (x, y) in enumerate(user_id_list)}
    user_rev_biz_scott['user_num'] = user_rev_biz_scott.user_id.map(
        user_id_dict)

    # Same indexing scheme for businesses.
    biz_df = user_rev_biz_scott.groupby(['business_id', 'biz_name'
                                         ]).size().reset_index(name="Freq")
    biz_df.drop('Freq', axis=1, inplace=True)

    biz_id_list = list(biz_df.business_id)
    biz_id_dict = {y: x for (x, y) in enumerate(biz_id_list)}
    user_rev_biz_scott['biz_num'] = user_rev_biz_scott.business_id.map(
        biz_id_dict)

    X = user_rev_biz_scott[[
        'user_num', 'user_name', 'biz_num', 'biz_name', 'average_stars'
    ]]
    y = user_rev_biz_scott.average_stars

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=42)

    n_users = user_rev_biz_scott.user_id.nunique()
    n_biz = user_rev_biz_scott.business_id.nunique()

    biz_input = Input(shape=[1], name='Biz_Input')
    biz_embedding = Embedding(n_biz, n_factors, name='Biz_Embed')(biz_input)
    biz_vac = Flatten(name='Flatten_Biz')(biz_embedding)

    user_input = Input(shape=[1], name='User_Input')
    user_embedding = Embedding(n_users, n_factors,
                               name='User_Embed')(user_input)
    user_vac = Flatten(name="Flatten_User")(user_embedding)

    prod = Dot(name='Dot_Product', axes=1)([biz_vac, user_vac])
    model = Model([user_input, biz_input], prod)
    model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])

    class TestCallback(Callback):
        """Print loss and accuracy on held-out data after every epoch."""

        def __init__(self, test_data):
            # Let the base class set up its own state before adding ours.
            super().__init__()
            self.test_data = test_data

        def on_epoch_end(self, epoch, logs=None):
            x, y = self.test_data
            loss, acc = self.model.evaluate(x, y, verbose=0)
            print('\nTesting loss: {}, acc: {}\n'.format(loss, acc))

    # Stays None when a saved model is loaded, so the return statement below
    # never hits an unbound name.
    history = None
    # NOTE(review): the file checked here ('biz_model.h5') differs from the
    # path written below ('NN_Embed_Model') -- confirm which one is intended.
    if os.path.exists('biz_model.h5'):
        model = load_model('biz_model.h5')
    else:
        test_inputs = [X_test.user_num, X_test.biz_num]
        history = model.fit([X_train.user_num, X_train.biz_num],
                            y_train,
                            epochs=ep,
                            verbose=False,
                            validation_data=(test_inputs, y_test),
                            callbacks=[TestCallback((test_inputs, y_test))])
    model.save('NN_Embed_Model')
    return user_id_dict, biz_id_dict, user_df, biz_df, X, X_test, model, history