def train_test_from_df(df, cols, rating_scale=(1, 5), train_size=None, test_size=None):
    """Build a Surprise dataset from ``df[cols]`` and split it.

    Args:
        df: DataFrame holding the ratings.
        cols: Column labels selected from ``df`` and handed to
            ``Dataset.load_from_df`` (user, item, rating order is expected
            by that loader).
        rating_scale: ``(min, max)`` rating bounds given to ``Reader``.
        train_size: Forwarded to ``train_test_split`` when ``test_size`` is
            not used.
        test_size: Forwarded to ``train_test_split``. ``0`` with no
            ``train_size`` means "no hold-out": the full trainset is returned.

    Returns:
        The ``(trainset, testset)`` pair produced by ``train_test_split``,
        or a single full trainset when ``test_size == 0`` and ``train_size``
        is ``None``.

    Raises:
        ValueError: If neither ``train_size`` nor ``test_size`` is given.
    """
    if train_size is None and test_size is None:
        raise ValueError('train size or test size required')

    # Honour the caller's rating scale instead of a hard-coded (1, 5).
    reader = Reader(rating_scale=rating_scale)
    data = Dataset.load_from_df(df[cols], reader)

    if test_size:
        return train_test_split(data, test_size=test_size)

    # test_size is 0 (falsy but not None) and there is no train_size to fall
    # back on: the caller asked for no hold-out at all.
    if test_size is not None and train_size is None:
        return data.build_full_trainset()

    return train_test_split(data, train_size=train_size)
def svd_model(df):
    """Fit and evaluate an SVD recommender on the Scottsdale rows of ``df``.

    The ``user_id``, ``business_id`` and ``average_stars`` columns of the
    rows whose ``city`` equals ``'Scottsdale'`` are loaded into a Surprise
    dataset. One SVD model is scored on a 25% hold-out, then a fresh SVD
    instance is cross-validated with 5 folds.

    Returns:
        Tuple of (Surprise dataset, hold-out RMSE, cross-validation
        ``test_rmse`` values).
    """
    from surprise.model_selection.split import train_test_split

    selected = df[['user_id', 'business_id', 'average_stars']]
    ratings = selected.loc[df.city == 'Scottsdale']
    dataset = Dataset.load_from_df(ratings, Reader())

    # Single hold-out evaluation.
    train_part, test_part = train_test_split(dataset, test_size=0.25)
    model = SVD()
    model.fit(train_part)
    holdout_rmse = accuracy.rmse(model.test(test_part))

    # Independent 5-fold cross-validation on the same dataset.
    cv_scores = cross_validate(SVD(), dataset, cv=5)
    return dataset, holdout_rmse, cv_scores['test_rmse']
def drawRoc(model, i, k):
    """Fit ``model`` on ``binary[i]`` and draw its ROC curve.

    Args:
        model: Estimator exposing ``fit(trainset)`` and ``test(testset)``.
        i: Index into the file-level ``binary`` datasets and ``thresholds``.
        k: Value reported as the "optimal k" in the messages and plot title.

    The data is split 90/10, the true ratings (``r_ui``) and estimates
    (``est``) of the test predictions are fed to ``roc_curve``, and the
    curve is drawn on a new matplotlib figure. Nothing is returned.
    """
    # Shared wording for the progress messages and the plot title.
    setting = 'NMF with optimal k = ' + str(k) + ', threshold = ' + str(thresholds[i])
    print('Start drawing ROC curve of ' + setting + '!')

    train_part, test_part = train_test_split(binary[i], train_size=0.9, test_size=0.1)
    model.fit(train_part)

    y_true, y_score = [], []
    for prediction in model.test(test_part):
        y_true.append(prediction.r_ui)
        y_score.append(prediction.est)

    fpr, tpr, _ = roc_curve(y_true, y_score)
    area = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2,
             label='ROC curve (area = %0.2f)' % area)
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve of ' + setting)
    plt.legend(loc="lower right")

    print('Finish drawing ROC curve of ' + setting + '!')
def global_mean():
    """Print the RMSE obtained by predicting the trainset's global mean.

    Splits the file-level ``data`` (17% test, ``random_state=1``), uses
    ``trainset.global_mean`` as the prediction for every test rating and
    prints the resulting root-mean-square error.
    """
    trainset, testset = split.train_test_split(data, test_size=.17, random_state=1)

    # Transpose the test triples; the third column holds the true ratings.
    columns = list(zip(*testset))
    true_ratings = columns[2]

    # NOTE(review): tuple - scalar only works if global_mean is a NumPy
    # scalar (its reflected operator broadcasts) - confirm.
    residuals = true_ratings - trainset.global_mean
    mean_squared_error = np.sum(residuals ** 2) / len(testset)
    rmse = np.sqrt(mean_squared_error)
    print('RMSE with global mean: ', rmse)
def fit_model(data):
    """Train an SVD model on ``data``, report hold-out errors and pickle it.

    Args:
        data: Dataset accepted by ``train_test_split``.

    The data is split 75/25, an ``SVD(n_epochs=25, lr_all=0.01,
    reg_all=0.4)`` model is fitted on the training part, RMSE and MAE on
    the test part are printed, and the fitted model is pickled to
    ``../Models/Collaborative_filtering2.model``. Nothing is returned.
    """
    train, test = train_test_split(data, test_size=0.25)
    svd = SVD(n_epochs=25, lr_all=0.01, reg_all=0.4)
    svd.fit(train)
    pred = svd.test(test)
    print('RMSE for test set: {}'.format(accuracy.rmse(pred)))
    print('MAE for test set: {}'.format(accuracy.mae(pred)))

    # Save the model; the context manager closes the file even if pickling
    # fails, instead of leaking the handle.
    path = '../Models/Collaborative_filtering2.model'
    with open(path, 'wb') as model_file:
        pickle.dump(svd, model_file)
    print("Model is saved to: {}".format(path))
def primaryTest(self, predictor):
    """Run a quick hold-out check and a 5-fold cross-validation of ``predictor``.

    Args:
        predictor: Algorithm whose ``fit`` returns an object exposing
            ``test`` and which ``cross_validate`` accepts.

    ``self.dataTuning`` is split 80/20; the test predictions are tabulated
    with an absolute-error column, the cross-validation report is printed
    (``verbose=True``), and finally the first rows of the table are printed.
    """
    train_part, test_part = train_test_split(self.dataTuning, test_size=0.2)
    fitted = predictor.fit(train_part)
    predictions = fitted.test(test_part)

    column_names = ['user_id', 'item_id', 'base_event', 'predict_event', 'details']
    result = pd.DataFrame(predictions, columns=column_names)
    # The details column is not needed for the error table.
    result = result.drop(columns=['details'])
    difference = result['base_event'] - result['predict_event']
    result['error'] = abs(difference)

    cross_validate(
        predictor,
        self.dataTuning,
        measures=['RMSE', 'MAE'],
        cv=5,
        verbose=True,
    )
    print(result.head())
def train_model(df, make_cv=True, make_train_test_split=False, user_col="userId", item_col="imdbId", rating_col="rating"):
    """Train a Surprise SVD model on the ratings in ``df``.

    Args:
        df: DataFrame containing the user, item and rating columns.
        make_cv: When true, print a 5-fold cross-validation report
            (RMSE and MAE).
        make_train_test_split: When true, train on 75% of the data and print
            the RMSE on the remaining 25%; otherwise train on the full
            trainset.
        user_col: Name of the user id column.
        item_col: Name of the item id column.
        rating_col: Name of the rating column.

    Returns:
        The fitted ``SVD`` instance.
    """
    reader = Reader(rating_scale=(0.5, 5))

    # load_from_df needs exactly three columns: user ids, item ids and
    # ratings, in this order. Ids are converted to strings on a copy so the
    # caller's dataframe is left untouched.
    ratings = df[[user_col, item_col, rating_col]].copy()
    ratings[user_col] = ratings[user_col].astype(str)
    ratings[item_col] = ratings[item_col].astype(str)
    data = Dataset.load_from_df(ratings, reader)

    if make_train_test_split:
        trainset, testset = train_test_split(data, test_size=.25)
    else:
        trainset = data.build_full_trainset()
        testset = None

    algo = SVD()
    algo.fit(trainset)

    if make_train_test_split:
        # Predict ratings for the hold-out set, then report RMSE.
        accuracy.rmse(algo.test(testset))

    if make_cv:
        # cross_validate fits the algorithm it receives on every fold.
        # Passing a fresh SVD keeps the returned model trained on `trainset`
        # rather than on whatever fold happened to run last.
        cross_validate(SVD(), data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    # Sample prediction; raw user and item ids are strings.
    sample_uid = str(1)
    sample_iid = str(114709)
    algo.predict(sample_uid, sample_iid, verbose=True)

    print("Algo trained")
    return algo
plt.ylabel('True Positive Rate', fontsize=15) plt.legend(loc="lower right") plt.savefig('plot/q15_knn_roc_' + str(threshold) + '.png') plt.clf() if __name__ == "__main__": threshold = [2.5, 3, 3.5, 4] file_path = os.path.expanduser("ml-latest-small/ratings_new.csv") reader = Reader(sep=',') data = Dataset.load_from_file(file_path, reader=reader) sim_options = {'name': 'pearson', 'user_based': True} trainset, testset = train_test_split(data, test_size=0.1) for th in threshold: algo = KNNWithMeans(k=34, sim_options=sim_options) algo.fit(trainset) predictions = algo.test(testset) y_true = [] y_estimate = [] for row in predictions: if row[2] >= th: y_true.append(1) else: y_true.append(0) y_estimate.append(row[3])
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import split
from surprise import SVD,SVDpp
import time

# Load the data.
# Lines are parsed as comma-separated "user item rating timestamp" fields;
# the first line of the file is skipped (skip_lines=1).
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file(r'C:\Users\yy\Desktop\BI\L4\L4-2\L4-code\MovieLens\ratings.csv', reader=reader)
# Hold-out split requested with train_size=0.8.
train_s,test_s = split.train_test_split(data, train_size=0.8)

# Three algorithm configurations: SVD without bias terms, default SVD, SVD++.
# NOTE(review): only algo1 is used in the part of the script visible here;
# algo2/algo3 are presumably used further down - confirm.
algo1 = SVD(biased = False)
algo2 = SVD()
algo3 = SVDpp()

# The bare string below is a section marker, not a docstring.
"""SVD"""
# Prints the "SVD results:" header.
print('SVD结果:')
# Time fitting, testing and RMSE reporting together.
time1=time.time()
algo1.fit(train_s)
pred = algo1.test(test_s)
accuracy.rmse(pred, verbose=True)
time2=time.time()
# Prints the elapsed time in seconds with two decimals.
print('SVD用时: %.2fs' % (time2-time1))
# Raw ids are passed as strings.
uid = str(196)
iid = str(302)
algo1.predict(uid, iid, r_ui=4, verbose=True) # print the predicted rating of uid for iid
print('-'*30)

# Section marker for the next configuration.
"""SVDbias"""
# Prints the "SVDbias results:" header.
print('SVDbias结果:')
def NN_Model(df, n_factors=10, ep=5):
    """Build (or load) a dot-product embedding model for Scottsdale ratings.

    Rows of ``df`` whose ``city`` is ``'Scottsdale'`` are kept, users and
    businesses are mapped to integer indices, and a Keras model that takes
    the dot product of a user embedding and a business embedding is
    compiled. If ``biz_model.h5`` exists it is loaded in place of the
    freshly built model; otherwise the model is trained and saved.

    Args:
        df: DataFrame with ``user_id``, ``user_name``, ``business_id``,
            ``biz_name``, ``average_stars`` and ``city`` columns.
        n_factors: Size of each embedding vector.
        ep: Number of training epochs.

    Returns:
        Tuple ``(user_id_dict, biz_id_dict, user_df, biz_df, X, X_test,
        model, history)``. ``history`` is ``None`` when the model was loaded
        from disk instead of being trained.
    """
    from sklearn.model_selection import train_test_split

    # .copy() so the index columns added below go onto an independent frame
    # rather than onto a filtered slice of df.
    user_rev_biz_scott = df[[
        'user_id', 'user_name', 'business_id', 'biz_name', 'average_stars'
    ]].loc[df.city == 'Scottsdale'].copy()

    # One row per (user_id, user_name) pair; the row position becomes the
    # integer index of that user.
    user_df = user_rev_biz_scott.groupby(
        ['user_id', 'user_name']).size().reset_index(name="Freq")
    user_df.drop('Freq', axis=1, inplace=True)
    user_id_list = list(user_df.user_id)
    user_id_dict = {y: x for (x, y) in enumerate(user_id_list)}
    user_rev_biz_scott['user_num'] = user_rev_biz_scott.user_id.map(
        user_id_dict)

    # Same indexing scheme for businesses.
    biz_df = user_rev_biz_scott.groupby(
        ['business_id', 'biz_name']).size().reset_index(name="Freq")
    biz_df.drop('Freq', axis=1, inplace=True)
    biz_id_list = list(biz_df.business_id)
    biz_id_dict = {y: x for (x, y) in enumerate(biz_id_list)}
    user_rev_biz_scott['biz_num'] = user_rev_biz_scott.business_id.map(
        biz_id_dict)

    X = user_rev_biz_scott[[
        'user_num', 'user_name', 'biz_num', 'biz_name', 'average_stars'
    ]]
    y = user_rev_biz_scott.average_stars
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42)

    # Indices come from the grouped frames, so an id that appears under
    # several names can receive an index >= nunique(); size the embeddings
    # to cover every index that was handed out.
    n_users = max(user_rev_biz_scott.user_id.nunique(), len(user_id_list))
    n_biz = max(user_rev_biz_scott.business_id.nunique(), len(biz_id_list))

    biz_input = Input(shape=[1], name='Biz_Input')
    biz_embedding = Embedding(n_biz, n_factors, name='Biz_Embed')(biz_input)
    biz_vac = Flatten(name='Flatten_Biz')(biz_embedding)

    user_input = Input(shape=[1], name='User_Input')
    user_embedding = Embedding(n_users, n_factors,
                               name='User_Embed')(user_input)
    user_vac = Flatten(name="Flatten_User")(user_embedding)

    prod = Dot(name='Dot_Product', axes=1)([biz_vac, user_vac])
    model = Model([user_input, biz_input], prod)
    model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])

    class TestCallback(Callback):
        """Print loss and accuracy on the given data after every epoch."""

        def __init__(self, test_data):
            super().__init__()
            self.test_data = test_data

        def on_epoch_end(self, epoch, logs=None):
            x, y = self.test_data
            loss, acc = self.model.evaluate(x, y, verbose=0)
            print('\nTesting loss: {}, acc: {}\n'.format(loss, acc))

    # Stays None when a saved model is loaded, so the return below no longer
    # fails on an unbound name.
    history = None
    # NOTE(review): the existence check looks for 'biz_model.h5' but training
    # saves to 'NN_Embed_Model', so this function never produces the file it
    # checks for - confirm which path is intended.
    if os.path.exists('biz_model.h5'):
        model = load_model('biz_model.h5')
    else:
        test_inputs = [X_test.user_num, X_test.biz_num]
        history = model.fit([X_train.user_num, X_train.biz_num],
                            y_train,
                            epochs=ep,
                            verbose=False,
                            validation_data=(test_inputs, y_test),
                            callbacks=[TestCallback((test_inputs, y_test))])
        model.save('NN_Embed_Model')

    return user_id_dict, biz_id_dict, user_df, biz_df, X, X_test, model, history