def eval_branches(tree_results):
    """Score tree-level predictions against their true labels.

    Args:
        tree_results: Mapping whose values are per-tree records holding
            'true_label', 'tree_prediction_from_avg_softmax' and
            'softmax_raw_avg'.

    Returns:
        dict with 'accuracy', 'macroF' (macro-averaged F1 over labels
        0, 1, 2) and 'rmse_softmax' (the ``rmse`` helper applied to the
        labels, the predictions and the per-tree maximum of
        'softmax_raw_avg').
    """
    # Resolve every record once, in key order, instead of once per metric.
    records = [tree_results[key] for key in list(tree_results.keys())]

    Y_true = []
    Y_pred = []
    for record in records:
        Y_true.append(record['true_label'])
    for record in records:
        Y_pred.append(record['tree_prediction_from_avg_softmax'])
    confidence_softmax = [
        np.max(record['softmax_raw_avg']) for record in records
    ]

    return {
        'accuracy': accuracy_score(Y_true, Y_pred),
        'macroF': f1_score(Y_true, Y_pred, average='macro',
                           labels=[0, 1, 2]),
        'rmse_softmax': rmse(Y_true, Y_pred, confidence_softmax),
    }
def fitting_wrapper(self, df, df_out):
    """Fit g1 for one group of observations and append a summary row.

    A ``FitMedlyn`` model is fitted to ``df["Cond"]``. When the minimiser
    reports success, one row is appended to ``df_out`` holding the fitted
    ``g1``, its standard error, the squared Pearson correlation and RMSE
    between ``df["Cond"]`` and the modelled values, the number of points,
    and the first-row metadata of ``df``.

    Args:
        df: DataFrame with the columns read below (Cond, VPD, Photo, CO2S,
            Scale, Species, Datacontrib, Location, latitude, longitude,
            PFT, Pathway, Type, Plantform, Leafspan, Tregion).
        df_out: DataFrame accumulating one row per successful fit.

    Returns:
        ``df_out`` with the new row appended, or ``df_out`` unchanged when
        the fit did not succeed.
    """
    F = FitMedlyn(fluxnet=False)
    params = F.setup_model_params()
    (result, success) = F.minimise_params(params, df, df["Cond"])
    if not success:
        return df_out

    g1 = result.params['g1'].value
    g1_se = result.params['g1'].stderr
    g0 = 0.0  # the intercept is held at zero when evaluating the model
    model = F.gs_model(df["VPD"], df["Photo"], df["CO2S"], g0, g1)
    rsq = (pearsonr(df["Cond"], model)[0])**2
    num_pts = len(df["Cond"])
    rmse_val = rmse(df["Cond"], model)

    # Metadata is taken from the first row of the group.
    row = pd.Series([
        df.Scale.iloc[0], g1, g1_se, rsq, rmse_val, num_pts,
        df.Species.iloc[0], df.Datacontrib.iloc[0], df.Location.iloc[0],
        df.latitude.iloc[0], df.longitude.iloc[0], df.PFT.iloc[0],
        df.Pathway.iloc[0], df.Type.iloc[0], df.Plantform.iloc[0],
        df.Leafspan.iloc[0], df.Tregion.iloc[0]
    ], index=self.out_cols)

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    # Concatenating a one-row frame is the supported equivalent;
    # infer_objects() restores per-column dtypes as append() used to do.
    row_frame = row.to_frame().T.infer_objects()
    df_out = pd.concat([df_out, row_frame], ignore_index=True)
    return df_out
def fitting_wrapper(self, df, df_out):
    """Fit g1 for one group of observations and append a summary row.

    A ``FitMedlyn`` model is fitted to ``df["Cond"]``. When the minimiser
    reports success, one row is appended to ``df_out`` holding the fitted
    ``g1``, its standard error, the squared Pearson correlation and RMSE
    between ``df["Cond"]`` and the modelled values, the number of points,
    and the first-row metadata of ``df``.

    Args:
        df: DataFrame with the columns read below (Cond, VPD, Photo, CO2S,
            Scale, Species, Datacontrib, Location, latitude, longitude,
            PFT, Pathway, Type, Plantform, Leafspan, Tregion).
        df_out: DataFrame accumulating one row per successful fit.

    Returns:
        ``df_out`` with the new row appended, or ``df_out`` unchanged when
        the fit did not succeed.
    """
    F = FitMedlyn(fluxnet=False)
    params = F.setup_model_params()
    (result, success) = F.minimise_params(params, df, df["Cond"])
    if not success:
        return df_out

    g1 = result.params['g1'].value
    g1_se = result.params['g1'].stderr
    g0 = 0.0  # the intercept is held at zero when evaluating the model
    model = F.gs_model(df["VPD"], df["Photo"], df["CO2S"], g0, g1)
    rsq = (pearsonr(df["Cond"], model)[0])**2
    num_pts = len(df["Cond"])
    rmse_val = rmse(df["Cond"], model)

    # Metadata is taken from the first row of the group.
    row = pd.Series([
        df.Scale.iloc[0], g1, g1_se, rsq, rmse_val, num_pts,
        df.Species.iloc[0], df.Datacontrib.iloc[0], df.Location.iloc[0],
        df.latitude.iloc[0], df.longitude.iloc[0], df.PFT.iloc[0],
        df.Pathway.iloc[0], df.Type.iloc[0], df.Plantform.iloc[0],
        df.Leafspan.iloc[0], df.Tregion.iloc[0]
    ], index=self.out_cols)

    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    # Concatenating a one-row frame is the supported equivalent;
    # infer_objects() restores per-column dtypes as append() used to do.
    row_frame = row.to_frame().T.infer_objects()
    df_out = pd.concat([df_out, row_frame], ignore_index=True)
    return df_out
def update_fit_stats(self, d, result, F, df):
    """Record the fitted g1 and goodness-of-fit statistics in ``d``.

    Args:
        d: Mapping updated in place with 'g1', 'g1_se', 'g0', 'rsq',
            'num_pts' and 'rmse_val'.
        result: Fit result exposing ``params['g1'].value`` and ``.stderr``.
        F: Object providing ``gs_model(vpd, gpp, co2, g0, g1)``.
        df: Data with 'VPD_f', 'GPP_f', 'CO2' and 'gs_est' columns.

    Returns:
        Tuple ``(d, model)`` where ``model`` is the output of
        ``F.gs_model`` for the fitted parameters.
    """
    fitted_g1 = result.params['g1']
    d['g1'] = fitted_g1.value
    d['g1_se'] = fitted_g1.stderr
    d['g0'] = 0.0

    model = F.gs_model(df["VPD_f"], df["GPP_f"], df["CO2"],
                       d['g0'], d['g1'])

    # Squared Pearson correlation between the estimate and the model.
    correlation = pearsonr(df["gs_est"], model)[0]
    d['rsq'] = correlation ** 2
    d['num_pts'] = len(df["gs_est"])
    d['rmse_val'] = rmse(df["gs_est"], model)
    return (d, model)
def nfoldCrossValidation(Xtrain, Ytrain, K, maxIter, nfold=5,
                         LambdaRange=None, MuRange=None,
                         GammaRange=None, BetaRange=None):
    """Grid-search Lambda/Mu/Gamma/Beta with n-fold cross validation.

    Every combination drawn from the four ranges is trained with ``MTL``
    on each of the ``nfold`` splits produced by ``nfoldsplit``; the mean
    of the per-fold ``rmse`` values is the score of that combination.

    Args:
        Xtrain, Ytrain: Training data handed to ``nfoldsplit``.
        K, maxIter: Passed through to ``MTL``.
        nfold: Number of folds.
        LambdaRange, MuRange, GammaRange, BetaRange: Iterables of
            candidate values. They must be supplied; the ``None`` defaults
            are not iterable.

    Returns:
        ``(Lambda_opt, Mu_opt, Gamma_opt, Beta_opt, parameters)``.
        ``parameters`` holds 'W', 'L', 'S' and 'Omega' as they stood after
        the last fold of the best-scoring combination. If no combination
        beats the initial infinite score, the four optima are empty lists
        and ``parameters`` is empty.
    """
    best_score = float("inf")
    Lambda_opt, Mu_opt, Gamma_opt, Beta_opt = [], [], [], []
    parameters = {}

    for Lambda in LambdaRange:
        for Mu in MuRange:
            for Gamma in GammaRange:
                for Beta in BetaRange:
                    print('Lambda={},Mu={},Gamma={},Beta={}'.format(Lambda,Mu,Gamma,Beta))
                    fold_errors = np.zeros((nfold, 1))
                    for fold in range(1, nfold + 1):
                        print('i={}'.format(fold))
                        # Train on everything except fold `fold`.
                        X, Xcv, Y, Ycv = nfoldsplit(Xtrain, Ytrain,
                                                    nfold, fold)
                        W, L, S, Omega = MTL(X, Y, K, Lambda, Mu, Gamma,
                                             Beta, maxIter)
                        # Held-out error, averaged over what rmse returns.
                        held_out = rmse(Xcv, Ycv, W)
                        fold_errors[fold - 1, 0] = np.mean(held_out)

                    score = np.mean(fold_errors)
                    if best_score > score:
                        best_score = score
                        Lambda_opt = Lambda
                        Mu_opt = Mu
                        Gamma_opt = Gamma
                        Beta_opt = Beta
                        parameters.update(W=W, L=L, S=S, Omega=Omega)

    return Lambda_opt, Mu_opt, Gamma_opt, Beta_opt, parameters
def run():
    """Compute and persist the test RMSE for folds 1 to 5.

    For each fold a shell pipeline takes the third space-delimited field
    of ``<source>/foldN/test``, pastes it beside
    ``<output>/foldN/test.predict`` and writes the result to ``tmp.txt``.
    ``rmse.rmse`` is then run on that file and the value is stored on the
    module-level ``entity`` and persisted.

    Raises:
        subprocess.CalledProcessError: if the shell pipeline exits with a
            non-zero status.
    """
    # Local import: the file's import block is not visible from here.
    import subprocess

    for i in range(1, 6):
        fold = 'fold' + str(i)
        command = ("cut " + source + "/" + fold + "/test"
                   + " -d' ' -f 3 | paste -d ' ' - "
                   + output + "/" + fold + "/test.predict > tmp.txt")
        # The original fired the command with os.popen() and never waited
        # for it or checked it, so tmp.txt could be read while stale or
        # incomplete. check_call blocks until the pipeline has finished.
        # NOTE(review): the command is assembled from the module-level
        # `source` and `output` strings; keep those trusted, since the
        # pipe and redirection require shell=True.
        subprocess.check_call(command, shell=True)
        r = rmse.rmse(r'tmp.txt')
        entity.setValue('fold_name', 'FOLD' + str(i))
        entity.setValue('subset_name', 'TEST')
        entity.setValue('rmse', str(r))
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(entity)
        entity.persist()
def run():
    """Compute and persist the test RMSE for folds 1 to 5.

    For each fold a shell pipeline takes the third space-delimited field
    of ``<source>/foldN/test``, pastes it beside
    ``<output>/foldN/test.predict`` and writes the result to ``tmp.txt``.
    ``rmse.rmse`` is then run on that file and the value is stored on the
    module-level ``entity`` and persisted.

    Raises:
        subprocess.CalledProcessError: if the shell pipeline exits with a
            non-zero status.
    """
    # Local import: the file's import block is not visible from here.
    import subprocess

    for i in range(1, 6):
        fold = 'fold' + str(i)
        command = ("cut " + source + "/" + fold + "/test"
                   + " -d' ' -f 3 | paste -d ' ' - "
                   + output + "/" + fold + "/test.predict > tmp.txt")
        # The original fired the command with os.popen() and never waited
        # for it or checked it, so tmp.txt could be read while stale or
        # incomplete. check_call blocks until the pipeline has finished.
        # NOTE(review): the command is assembled from the module-level
        # `source` and `output` strings; keep those trusted, since the
        # pipe and redirection require shell=True.
        subprocess.check_call(command, shell=True)
        r = rmse.rmse(r'tmp.txt')
        entity.setValue('fold_name', 'FOLD' + str(i))
        entity.setValue('subset_name', 'TEST')
        entity.setValue('rmse', str(r))
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(entity)
        entity.persist()
def eval_timeline(predictions):
    """Score timeline predictions against their true labels.

    Args:
        predictions: Mapping with 'true_label',
            'prediction_from_softmax_raw' and 'softmax_raw' entries;
            'softmax_raw' is indexed by position 0..len(true_label)-1.

    Returns:
        dict with 'accuracy', 'macroF' (macro-averaged F1 over labels
        0, 1, 2) and 'rmse' (the ``rmse`` helper applied to the labels,
        the predictions and the per-item maximum softmax value).
    """
    Y_true = predictions['true_label']
    Y_pred = predictions['prediction_from_softmax_raw']

    # Confidence of each prediction is its largest softmax entry.
    confidence = []
    for position in range(len(Y_true)):
        confidence.append(np.max(predictions['softmax_raw'][position]))

    return {
        'accuracy': accuracy_score(Y_true, Y_pred),
        'macroF': f1_score(Y_true, Y_pred, average='macro',
                           labels=[0, 1, 2]),
        'rmse': rmse(Y_true, Y_pred, confidence),
    }
basic_u[:, u] = numpy.dot(numpy.dot(R_matrix[u, :], basic_v.T), numpy.linalg.pinv(numpy.asmatrix(numpy.dot(basic_v, basic_v.T)))) for v in range(m): #for each row basic_v[:, v] = numpy.dot(numpy.dot(R_matrix[:, v].T, basic_u.T), numpy.linalg.pinv(numpy.asmatrix(numpy.dot(basic_u, basic_u.T)))) basic_R = numpy.dot(basic_u.T, basic_v) <<<<<<< HEAD pr.disable() s = StringIO.StringIO() sortby = "cumulative" ps = pstats.Stats(pr, stream = s).sort_stats(sortby) ps.print_stats() ps.dump_stats("output_stats.txt") #print s.getvalue() ======= err = rmse(R_matrix, basic_R) i += 1 >>>>>>> 3db25c535f07af3fc549e118689b68ad6a31d0d1 t1 = time.time() basic_time = t1 - t0 # ZERO MATRIX FACTORIZATION un,um = zero_u.shape t0 = time.time() for i in range(iterations): for u in range (n): # u = row zero_u[:, u] = numpy.dot(numpy.dot(R_matrix[u,R_matrix[u,:]!=0], zero_v[:,R_matrix[u,:]!=0].T), numpy.linalg.pinv(numpy.asmatrix(numpy.dot(zero_v[:,R_matrix[u,:]!=0], zero_v[:,R_matrix[u,:]!=0].T)))) for v in range(m): #for each row zero_v[:, v] = numpy.dot(numpy.dot(R_matrix[R_matrix[:,v]!=0, v].T, zero_u[:, R_matrix[0:um,v]!=0].T), numpy.linalg.pinv(numpy.asmatrix(numpy.dot(zero_u[:, R_matrix[0:um,v]!=0], zero_u[:, R_matrix[0:um,v]!=0].T))))
diff = ground_truth - predicted_value diff_square = np.dot(diff, diff) #rmse = np.sqrt(np.divide(diff_square, ground_truth.shape[0])) rmse = np.sqrt(diff_square/ground_truth.shape[0]) return rmse """ # In[ ]: #1-dimensional input variables using the training set #first feature for the test set test_fixed_acidity = red_test_data[:, 0].reshape(-1, 1) test_X_acidity = np.hstack((test_fixed_acidity, np.repeat(1, test_fixed_acidity.shape[0]).reshape(-1, 1))) predicted_score_acidity = np.dot(test_X_acidity, train_w_acidity.T) #predicted_score_acidity = predicted_value(train_fixed_acidity, test_fixed_acidity, red_test_score) rmse.rmse(predicted_score_acidity, red_test_score) #0.7860892754162216 # In[ ]: #full 11-dimensional input variables test_X = np.hstack((red_test_data, np.repeat(1, red_test_data.shape[0]).reshape(-1, 1))) predicted_score = np.dot(test_X, w_all.T) rmse.rmse(predicted_score, red_test_score) #0.644717277241364
def hierarchical(self, cluster_goal=25):
    """Merge ``self.elements`` into clusters and label every document.

    Clusters start as singletons and the pair returned by
    ``self.findCloseClusters`` is merged with ``self.combineClusters``
    until at most ``cluster_goal`` clusters remain. Every document is then
    assigned the most frequent file abbreviation of its cluster. A report
    is written to the file ``matrix`` in the current directory: one line
    per prediction, the ``rmse`` of the hit/miss vector, and the contents
    of ``self.global_matrix`` as a comma-separated table.

    Args:
        cluster_goal: Stop merging once this many clusters (or fewer)
            remain.

    Returns:
        List of ``(short_file, predicted_label)`` tuples, one per document.
    """
    clusters = [[element] for element in self.elements]
    while len(clusters) > cluster_goal:
        sys.stdout.write('\rCombining %d clusters' % len(clusters))
        (clust_A, clust_B) = self.findCloseClusters(clusters)
        clusters = self.combineClusters(clusters, clust_A, clust_B)
        gc.collect()

    predictions = []
    for cluster in clusters:
        # Count how often each file abbreviation occurs in this cluster.
        doc_freq = dict()
        for doc in cluster:
            file_abbrev = re.sub('_.*-', '', doc.get_short_file())
            doc_freq[file_abbrev] = doc_freq.get(file_abbrev, 0) + 1
        if not doc_freq:
            continue  # an empty cluster has nothing to label

        # The original used max(doc_freq), which returns the
        # lexicographically largest key regardless of its count. Pick the
        # most frequent label; ties go to the largest name so the result
        # is deterministic.
        plurality = max(doc_freq,
                        key=lambda abbrev: (doc_freq[abbrev], abbrev))
        for doc in cluster:
            predictions.append((doc.get_short_file(), plurality))

    author_rmse_calc = []
    # `with` guarantees the report is closed even if a write or the rmse
    # call raises.
    with open('matrix', 'w+') as sim_matrix:
        for (short_file, guess) in predictions:
            # NOTE(review): guess[:-2] is used as a regex pattern, not a
            # literal prefix; kept as in the original.
            if re.match(guess[:-2], short_file):
                author_rmse_calc.append(0)
                sim_matrix.write("**match**\n")
            else:
                author_rmse_calc.append(1)
                sim_matrix.write("guess %s => %s\n" % (short_file, guess))

        # A perfect run is all zeros, so rmse measures the miss rate.
        author_actual = [0] * len(author_rmse_calc)
        rmse_val = rmse(author_rmse_calc, author_actual)
        sim_matrix.write("\nrmse: '%f'\n\n" % rmse_val)

        authors = list(self.global_matrix.keys())
        for author in authors:
            sim_matrix.write(",%s" % author.strip())
        sim_matrix.write("\n")
        for auth_A in authors:
            sim_matrix.write("%s," % auth_A.strip())
            tmp_matrix = self.global_matrix[auth_A]
            for auth_B in authors:
                if auth_B in tmp_matrix:
                    sim_matrix.write("%.03f," % tmp_matrix[auth_B])
                else:
                    sim_matrix.write("--,")
            sim_matrix.write("\n")

    return predictions
def problem2(test_docs, train_docs, lower_bound=0.25, upper_bound=0.75,curr_classifier="NaiveBayes"):
    """Cross-validate a rating classifier on paragraph 4 of each review.

    Builds one ``data`` object per training document from element [3] of
    its paragraphs and ratings, splits them randomly into folds, and for
    each fold trains the selected NLTK classifier on the remaining folds
    and prints the RMSE of its predicted ratings. The average RMSE is
    printed at the end. Nothing is returned.

    Parameters:
        test_docs: not referenced anywhere in this function body.
        train_docs: documents providing get_pars(), get_ratings(),
            get_filename(), filename and author.
        lower_bound, upper_bound: fractions of the per-rating word list
            (sorted by ascending frequency) that delimit the slice passed
            to set_filtered_words().
        curr_classifier: "NaiveBayes", "Maxent", "DecisionTree" or
            "MaxBayes"; any other value trains NaiveBayes.

    NOTE(review): written for Python 2 (print statements, iteritems).
    """
    data_list = []
    two_count, three_count, four_count, five_count, all_count = 0,0,0,0,0
    # Build one data object per training document.
    for doc in train_docs:
        #pars = [par.lower() for par in doc.get_pars() if par is not None]
        #ratings = [rating for rating in doc.get_ratings() if rating is not None]
        par = doc.get_pars()[3]
        rating = doc.get_ratings()[3]
        if (par is not None and rating is not None):
            # Tally up the rating counts from the training data.
            # NOTE(review): the chain mixes elif and if; ratings are
            # compared as strings, so at most one branch fires.
            if(rating == '2'):
                two_count += 1
            elif(rating == '3'):
                three_count += 1
            if(rating == '4'):
                four_count += 1
            if(rating == '5'):
                five_count += 1
            data_list.append(data(par, rating, doc.filename, doc.author))
        else:
            print "Found bad review by -> " + doc.author + " (this comes from: par = None) in file '%s'" % doc.get_filename()
    all_count = two_count + three_count + four_count + five_count
    # Cross validation setup.
    # NOTE(review): `folds` is hard-coded to four lists while the draws
    # below use DEFAULT_NUM_FOLDS; presumably that constant is 4 - confirm.
    fold_size = math.floor(len(data_list) / DEFAULT_NUM_FOLDS)
    folds = [[], [], [], []]
    temp_data_list = copy.deepcopy(data_list)
    # Randomly deal the (deep-copied) data into folds; an item is only
    # removed from the pool when the drawn fold still has room.
    while len(temp_data_list) != 0:
        ndx = random.randrange(0, len(temp_data_list))
        fold = random.randrange(0, DEFAULT_NUM_FOLDS)
        if len(folds[fold]) <= fold_size + 1:
            folds[fold].append(temp_data_list[ndx])
            del(temp_data_list[ndx])
    rmses = []
    # Big loop: each fold takes one turn as the held-out test set.
    for fold_num in range(DEFAULT_NUM_FOLDS):
        test_data = folds[fold_num]
        train_data = []
        for ndx in range(len(folds)):
            if ndx != fold_num:
                train_data.extend(folds[ndx])
        # Begin Filtering
        bag_words_train = []
        for train_datum in train_data:
            bag_words_train.extend([(train_datum.get_bag_of_words(), train_datum.rating)])
        # NOTE(review): bag_words_test is built but never read afterwards.
        bag_words_test = []
        for test_datum in test_data:
            bag_words_test.extend([(test_datum.get_bag_of_words(), test_datum.rating)])
        # Pool all training words per rating.
        all_rating_dist = dict()
        for bag, rating in bag_words_train:
            if rating not in all_rating_dist:
                all_rating_dist[rating] = []
            all_rating_dist[rating].extend(bag)
        rating_freq_dist = dict()
        for rating, words in all_rating_dist.iteritems():
            rating_freq_dist[rating] = nltk.FreqDist(words)
        # Per rating, collect the words below lower_bound and at or above
        # upper_bound of the frequency-sorted list.
        rating_filters = dict()
        for rating in all_rating_dist.keys():
            word_freqs = [(word, freq) for (word, freq) in rating_freq_dist[rating].iteritems()]
            #Ascending sort
            sorted_wfreq = sorted(word_freqs, key=lambda x:x[1])
            start = int(lower_bound * len(sorted_wfreq))
            end = int(upper_bound * len(sorted_wfreq))
            rating_filters[rating] = sorted_wfreq[:start] + sorted_wfreq[end:]
        # Mutates the training data objects in place.
        for train_datum in train_data:
            train_datum.set_filtered_words(rating_filters[train_datum.rating])
        # Build the feature set: one presence mapping per rating.
        filtered_bag_of_words_train = dict()
        for train_datum in train_data:
            if train_datum.rating not in filtered_bag_of_words_train:
                filtered_bag_of_words_train[train_datum.rating] = []
            filtered_bag_of_words_train[train_datum.rating].extend(train_datum.get_bag_of_words())
        feature_set = []
        for rating in filtered_bag_of_words_train:
            feature_set.append((bag_of_words_to_presence(filtered_bag_of_words_train[rating]), rating))
        # Train with the classifier specified by the user; default is NaiveBayes.
        if(curr_classifier == "NaiveBayes"):
            classifier = nltk.classify.NaiveBayesClassifier.train(feature_set)
        elif(curr_classifier == "Maxent"):
            algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
            classifier = nltk.classify.MaxentClassifier.train(feature_set, algorithm,max_iter=0, trace=0)
        elif(curr_classifier == "DecisionTree"):
            classifier = nltk.classify.DecisionTreeClassifier.train(feature_set,binary=True)
        elif (curr_classifier == "MaxBayes"):
            # MaxBayes trains both models; their outputs are averaged below.
            classifier1 = nltk.classify.NaiveBayesClassifier.train(feature_set)
            algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
            classifier2 = nltk.classify.MaxentClassifier.train(feature_set, algorithm,max_iter=0, trace=0)
        else:
            classifier = nltk.classify.NaiveBayesClassifier.train(feature_set)
        if(curr_classifier == "MaxBayes"):
            guesses, new_guesses1, new_guesses2, actuals, guesses1_dist, guesses2_dist = [], [], [], [], [], []
            # Get a probability distribution from both NaiveBayes and MaxEnt
            # for every held-out item.
            for tdata in test_data:
                guesses1_dist.append(classifier1.prob_classify(bag_of_words_to_presence(tdata.get_bag_of_words())))
                guesses2_dist.append(classifier2.prob_classify(bag_of_words_to_presence(tdata.get_bag_of_words())))
                actuals.append(tdata.rating)
            # Prior probability of each rating, from the whole training set.
            # NOTE(review): divides by all_count, which is zero when no
            # usable review was found; three_prob is assigned twice.
            two_prob = float(two_count) / float(all_count)
            three_prob = three_count / float(all_count)
            three_prob = three_count / float(all_count)
            four_prob = four_count / float(all_count)
            five_prob = five_count / float(all_count)
            # NaiveBayes: average each class probability with its prior and
            # weight it by the rating value.
            for dist in guesses1_dist:
                new_guesses1.append( (float(dist.prob('2')+two_prob)/2*2) + (float(dist.prob('3')+three_prob)/2*3) + (float(dist.prob('4')+four_prob)/2*4) + (float(dist.prob('5')+five_prob)/2*5) )
            # MaxEnt: same computation.
            for dist in guesses2_dist:
                new_guesses2.append( (float(dist.prob('2')+two_prob)/2*2) + (float(dist.prob('3')+three_prob)/2*3) + (float(dist.prob('4')+four_prob)/2*4) + (float(dist.prob('5')+five_prob)/2*5) )
            # Average the NaiveBayes and MaxEnt estimates.
            for x in range(len(new_guesses1)):
                val = float(new_guesses1[x])*0.5 + float(new_guesses2[x])*0.5
                guesses.append(val)
            rmses.append(rmse(guesses, actuals))
            print "Validation run " + str(fold_num + 1)
            print "Need to output validation set filenames...."
            print " RMSE: " + str(rmses[fold_num])
        else:
            guesses, guesses_dist, actuals, new_guesses = [], [], [], []
            # DecisionTree yields a label directly; the other classifiers
            # yield a probability distribution.
            for tdata in test_data:
                if(curr_classifier == "DecisionTree"):
                    guesses.append(classifier.classify(bag_of_words_to_presence(tdata.get_bag_of_words())))
                else:
                    guesses_dist.append(classifier.prob_classify(bag_of_words_to_presence(tdata.get_bag_of_words())))
                actuals.append(tdata.rating)
            # Prior probability of each rating, from the whole training set.
            # NOTE(review): these priors are only used by the commented-out
            # formula below; same zero-division caveat as above.
            two_prob = float(two_count) / float(all_count)
            three_prob = three_count / float(all_count)
            three_prob = three_count / float(all_count)
            four_prob = four_count / float(all_count)
            five_prob = five_count / float(all_count)
            # Turn the classifier output into one predicted rating per item.
            if(curr_classifier == "DecisionTree"):
                new_guesses = guesses
            else:
                for dist in guesses_dist:
                    #new_guesses.append( (float(dist.prob('2')+two_prob)/2*2) + (float(dist.prob('3')+three_prob)/2*3) + (float(dist.prob('4')+four_prob)/2*4) + (float(dist.prob('5')+five_prob)/2*5) )
                    # Expected rating under the predicted distribution.
                    new_guesses.append( float(dist.prob('2')*2) + float(dist.prob('3')*3) + float(dist.prob('4')*4) + float(dist.prob('5')*5) )
            rmses.append(rmse(new_guesses, actuals))
            print "Validation run " + str(fold_num + 1)
            #print set(test_data)
            print " RMSE: " + str(rmses[fold_num])
    print "\nAVG RMSE: " + str(sum(rmses)/DEFAULT_NUM_FOLDS)
import scipy.sparse as sp
from scipy.sparse.linalg import svds
from rmse import rmse
import numpy as np
from work_with_data import train_data_matrix, test_data_matrix, n_users, n_items

# Truncated SVD of the training matrix, keeping k=10 singular values.
u, s, vt = svds(train_data_matrix, k=10)
s_diag_matrix = np.diag(s)

# Predict by multiplying the three factors back together.
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)

# Report the metric. The label used to read 'User-based CF MSE', but the
# value printed is rmse() of the SVD reconstruction.
print('SVD-based CF RMSE: ' + str(rmse(X_pred, test_data_matrix)))
# print()
# plt.plot(Y_pred_nat, Y_test, 'bo')
# plt.show()
#
# time0_gd = time.time()
# time1_gd = time.time() - time0_gd
# theta_gd = gradient_descent(X_train, Y_train)
# Y_pred_gd = np.dot(X_test, theta_gd)
# loss_gd = rmse(Y_pred_gd, Y_test)
# print('Theta, gradient descent:')
# print(theta_gd)
# print('Time to find solution with gradient descent: ', time1_gd)
# print('Gradient descent RMSE: ', loss_gd)
# print()
# plt.plot(Y_pred_gd, Y_test, 'bo')
# plt.show()

# Fit with the evolution strategy and time the fit itself.
time0_gen = time.time()
theta_gen = gen(X_train, Y_train)
# The elapsed time used to be taken before gen() ran, so it always
# reported roughly zero; stop the clock once the fit has finished.
time1_gen = time.time() - time0_gen

Y_pred_gen = np.dot(X_test, theta_gen)
# The name loss_gd is kept in case later code in the file reads it.
loss_gd = rmse(Y_pred_gen, Y_test)
print('Theta, evolution:')
print(theta_gen)
print('Time to find solution with evolution strategy: ', time1_gen)
print('Evolution strategy RMSE: ', loss_gd)
print()
# Predicted versus actual targets on the test set.
plt.plot(Y_pred_gen, Y_test, 'bo')
plt.show()
#GammaRange = [0.001,0.1,10]
#BetaRange = [0.001,0.1,10]
# =============================================================================
# Lambda,Mu,Gamma,Beta, Parameters = nfoldCrossValidation(Xtrain,Ytrain, K, maxIter,nfold = 5,
#                                                         LambdaRange=LambdaRange,
#                                                         MuRange=MuRange,
#                                                         GammaRange=GammaRange,
#                                                         BetaRange=BetaRange)
# #Lambda,Mu,Gamma,Beta = 1e-3, 1e-3, 0.1, 1e-3
# =============================================================================
# Hand-picked hyper-parameters (toy-3tasks-nonoverlap-DATA); the
# cross-validated search above is disabled.
Lambda = 0.1
Mu = 0.1
Gamma = 0.1
Beta = 50.

# Train on the full training set with those settings.
W, L, S, Omega = MTL(Xtrain, Ytrain, K, Lambda, Mu, Gamma, Beta, maxIter)

# Test error and its mean.
testrmse = rmse(Xtest, Ytest, W)
meanTestRMSE = np.mean(testrmse)
print('the mean test RMSE is ' + str(meanTestRMSE))

# Collect everything worth saving in one dictionary.
import scipy.io as sio
result = {
    'W_est': W,
    'L_est': L,
    'S_est': S,
    'Omega': Omega,
    'testrmse': testrmse,
    'Lambda': Lambda,
    'Mu': Mu,
    'Gamma': Gamma,
    'Beta': Beta,
}
def complete(self, data_list):
    """Cross-validate a lexicon-based rating predictor and print its RMSE.

    ``data_list`` is deep-copied and dealt randomly into
    ``DEFAULT_NUM_FOLDS`` folds. Every word of a datum that appears in the
    seed lists or in the ``buildSenti()`` lexicon contributes a score; the
    datum's prediction is the mean of those scores rounded to one decimal.
    Data with no scored word are left out. The ``rmse`` of each fold and
    the average over the folds are printed; nothing is returned.

    Args:
        data_list: Objects providing ``get_bag_of_words()`` and ``rating``.
    """
    # Seed words are constant, so build them once rather than per datum;
    # sets give constant-time membership tests.
    good_seed = frozenset([
        'excellent', 'amazing', 'best', 'delicious', 'tradition',
        'fastest', 'clean', 'favorite', 'taste', 'worth', 'nice',
        'friendly', 'positive', 'quality', 'great', 'prompt',
    ])
    bad_seed = frozenset([
        'horrible', 'terrible', 'metro', 'alright', 'cannot', 'mediocre',
        'bad', 'wrong', 'messing', 'long', 'took', 'unfortunately',
        'obvious', 'drops', 'incorrect',
    ])

    # One list per fold; the original hard-coded four lists, which only
    # matches the random draw below when DEFAULT_NUM_FOLDS is 4.
    fold_size = math.floor(len(data_list) / DEFAULT_NUM_FOLDS)
    folds = [[] for _ in range(DEFAULT_NUM_FOLDS)]
    temp_data_list = copy.deepcopy(data_list)

    # Deal the data out at random; an item only leaves the pool when the
    # drawn fold still has room. The draw order (item, then fold) is kept
    # so seeded runs produce the same split as before.
    while len(temp_data_list) != 0:
        ndx = random.randrange(0, len(temp_data_list))
        fold = random.randrange(0, DEFAULT_NUM_FOLDS)
        if len(folds[fold]) <= fold_size + 1:
            folds[fold].append(temp_data_list.pop(ndx))

    # Sentiment lexicon: word -> sequence whose items [0] and [1] are
    # compared below.
    sent = buildSenti()

    rmses = []
    for fold in folds:
        guesses = []
        actuals = []
        for datum in fold:
            bag_of_words = datum.get_bag_of_words()
            actual_rating = datum.rating

            senti_word = []
            for word in bag_of_words:
                if word in good_seed:
                    senti_word.append(6)
                elif word in bad_seed:
                    senti_word.append(-1)
                elif word in sent:
                    sentiment = sent[word]
                    # Scale whichever lexicon score is larger.
                    if sentiment[1] > sentiment[0]:
                        actual_sent = round((sentiment[1] * 0.9) + 4.6)
                    else:
                        actual_sent = round((sentiment[0] * 5) + 1.5)
                    senti_word.append(actual_sent)

            if senti_word:
                # float() forces true division; on Python 2 a list of
                # integer seed scores was truncated by integer division.
                mean_score = float(sum(senti_word)) / len(senti_word)
                prediction = round(mean_score, 1)
                guesses.append(prediction)
                actuals.append(actual_rating)

        temp_rm = rmse(guesses, actuals)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("For this fold: %f" % (temp_rm))
        rmses.append(temp_rm)

    print("Average RMSE: %f" % (sum(rmses) / len(rmses)))
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np
from rmse import rmse
from work_with_data import train_data_matrix, test_data_matrix, n_users, n_items


def predict(ratings, similarity, type='user'):
    """Predict a full rating matrix from a similarity matrix.

    Args:
        ratings: 2-D array of ratings, one row per user.
        similarity: Square array used as weights; user-by-user for
            ``type='user'``, item-by-item for ``type='item'``.
        type: ``'user'`` or ``'item'``.

    Returns:
        Array of predicted ratings with the same shape as ``ratings``.

    Raises:
        ValueError: if ``type`` is neither ``'user'`` nor ``'item'``
            (previously this surfaced as an UnboundLocalError).
    """
    if type == 'user':
        # Work on deviations from each user's mean rating, then add the
        # mean back after the weighted sum.
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(
            ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array(
            [np.abs(similarity).sum(axis=1)])
    else:
        raise ValueError(
            "type must be 'user' or 'item', got %r" % (type,))
    return pred


# Compute the cosine distance between users and between movies.
# NOTE(review): pairwise_distances(metric='cosine') returns distances
# (1 - cosine similarity), yet the result is used as similarity weights.
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))