class RecommenderSVDpp(Recommender):
    """SVD++ recommender that re-trains on the data set extended with one new user.

    The wrapped ``SVDpp`` model is re-fitted on every call to
    ``get_recommendation`` because the requesting user is only added to the
    data set at that point.
    """

    def __init__(self, recommendation_dataset: RecommendationDataSet):
        """Keep the data set wrapper and hand its ``movies`` to the base class."""
        super().__init__(recommendation_dataset.movies)
        self.algorithm = SVDpp()
        self.recommendation_dataset = recommendation_dataset

    def fit(self, dataset):
        """Fit the wrapped algorithm on ``dataset`` and return what ``fit`` returns."""
        return self.algorithm.fit(dataset)

    def test(self, test_set):
        """Run the wrapped algorithm on ``test_set`` and return its predictions."""
        return self.algorithm.test(test_set)

    def get_recommendation(self, watched, k=20):
        """Return up to ``k`` movies for a user described by ``watched``.

        Args:
            watched: mapping of raw movie id -> rating given by the new user.
            k: number of items requested from ``get_top_n``.

        Returns:
            Whatever ``self.movies.get_movie_by_movie_ids`` returns for the
            selected raw movie ids.
        """
        source = self.recommendation_dataset
        new_user_id, full_dataset = source.get_dataset_with_extended_user(watched)

        # The result is not used; the lookup is kept so the call still happens
        # exactly as before (presumably it fails if the user is missing -
        # TODO confirm against the trainset API).
        full_dataset.to_inner_uid(new_user_id)

        # The model has to be trained again so that it knows the new user.
        self.algorithm.fit(full_dataset)

        # Inner ids of everything the user has already seen.
        seen_inner_ids = {
            full_dataset.to_inner_iid(raw_id) for raw_id, _ in watched.items()
        }

        # Predict a rating for every item the user has not watched yet.
        predictions = []
        for inner_item_id in range(full_dataset.n_items):
            if inner_item_id in seen_inner_ids:
                continue
            raw_item_id = full_dataset.to_raw_iid(inner_item_id)
            predictions.append(self.algorithm.predict(new_user_id, raw_item_id))

        ranked = get_top_n(predictions, n=k, minimum_rating=1.0)[new_user_id]
        best_movie_ids = [entry[0] for entry in ranked]
        return self.movies.get_movie_by_movie_ids(best_movie_ids)
def SVDPP(PointFrame, RecommendNum=10, TypeNum=5):
    """Train one SVD++ model per user type and recommend funds to every user.

    Args:
        PointFrame: DataFrame with a ``Type`` column (0 .. TypeNum-1) plus the
            ``User`` / ``FundCode`` columns and the rating column selected by
            the ``'User':'******'`` label slice below.
        RecommendNum: number of funds recommended per user.
        TypeNum: number of user types to process. The original code accepted
            this parameter but always looped over 5 types; it is honoured now
            and the default keeps the previous behaviour.

    Returns:
        DataFrame with columns ``User`` and ``RecommendFundCode``, one row per
        (user, recommended fund) pair.
    """
    OutUserList = []
    OutFundList = []
    reader = Reader(rating_scale=(0, 2))

    # Score each user type separately.
    for Type in range(TypeNum):
        # ``DataFrame.ix`` was removed from pandas; ``.loc`` performs the same
        # boolean-mask selection.
        Frame = PointFrame.loc[PointFrame.Type == Type]
        Frame = Frame.loc[:, 'User':'******']
        if Frame.empty:
            # Nothing to train on for this type, hence nothing to recommend.
            continue

        UserList = Frame.User.unique()
        FundList = Frame.FundCode.unique()
        data = Dataset.load_from_df(Frame, reader=reader).build_full_trainset()

        # The fourth type (Type == 3) has always used a smaller model.
        if Type + 1 == 4:
            model = SVDpp(n_factors=5)
        else:
            model = SVDpp()
        model.fit(data)

        for User in UserList:
            UserPointList = [model.predict(User, Fund).est for Fund in FundList]
            # Indices of the highest predicted scores, best first.
            RecommendList = np.argsort(UserPointList)[::-1][0:RecommendNum]
            for FundIndex in RecommendList:
                OutUserList.append(User)
                OutFundList.append(FundList[FundIndex])

    OutFrame = pd.DataFrame({
        "User": OutUserList,
        "RecommendFundCode": OutFundList
    })
    return OutFrame
def svd(user_id, area):
    """Return the ids of the five best-scoring products of ``area`` for a user.

    The model is not trained here: it is read from the ``./dump`` file, which
    was produced once with ``dump.dump(file_name, algo=algo)`` for an
    ``SVDpp(n_factors=100, n_epochs=15)`` model. The original code still
    constructed two throw-away ``SVDpp`` objects before loading; they were
    never used and have been removed.

    Args:
        user_id: raw user id; converted to ``str`` before prediction.
        area: value matched against the ``area`` column of ``./area.csv``.

    Returns:
        List of at most five product ids (as ``int``), highest estimate first.
    """
    file_name = os.path.expanduser('./dump')
    # NOTE: surprise.dump.load unpickles the file - only load files you trust.
    _, algo = dump.load(file_name)

    # area.csv columns: { product id (training data), area, productID }
    Area = pd.read_csv('./area.csv')
    # Products located in the requested area.
    neww = Area[Area['area'] == area]['productID'].tolist()

    predictions = [
        algo.predict(str(user_id), str(productID)) for productID in neww
    ]
    predictions.sort(key=lambda pred: pred.est, reverse=True)

    # Slice first so only the ids that are actually returned get converted.
    return [int(pred.iid) for pred in predictions[:5]]
class SvdPP(RecommenderBase):
    """
    SVDpp algorithm. Actually working badly, just a draft
    """

    def __init__(self, URM):
        # NOTE(review): ``URM`` is not stored and the base-class initialiser is
        # not called; left unchanged because the base class is not visible here.
        print('train set built')

    def fit(self, urm, n_factors=20, n_epochs=20, lr_all=0.007, reg_all=0.02,
            init_mean=0, init_std_dev=0.1, verbose=True):
        """Build a Surprise trainset from ``urm`` and fit an SVD++ model on it.

        Every non-zero entry of ``urm`` becomes a (row, column, 1) rating.
        ``reg_all`` used to be accepted but silently dropped; it is forwarded
        to ``SVDpp`` now.
        """
        # create the training set
        rows, cols = urm.nonzero()
        ones = np.ones(len(rows), dtype=np.int32)
        triples = np.vstack((rows, cols, ones)).transpose()
        df = pd.DataFrame(triples)
        df.columns = ['userID', 'itemID', 'rating']
        reader = Reader()
        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
        self.trainset = data.build_full_trainset()

        # fit
        self.algo = SVDpp(n_factors=n_factors, n_epochs=n_epochs, lr_all=lr_all,
                          reg_all=reg_all, init_mean=init_mean,
                          init_std_dev=init_std_dev, verbose=verbose)
        self.algo.fit(self.trainset)

    def recommend(self, userid, N=10, urm=None, filter_already_liked=True,
                  with_scores=True, items_to_exclude=None):
        """Return ``[userid, rec_1, ..., rec_N]`` for one user.

        Args:
            userid: user id, also used as the row index into ``urm``.
            N: number of recommendations (previously ignored, always 10).
            urm: user-rating matrix; required when ``filter_already_liked``.
            filter_already_liked: zero the score of items present in the
                user's ``urm`` row.
            with_scores: emit ``(item, score)`` tuples instead of bare items.
            items_to_exclude: not implemented; more than one entry raises.

        Returns:
            A list starting with ``userid`` followed by the N highest-scoring
            items, in no particular order (``argpartition`` does not sort).

        Raises:
            NotImplementedError: more than one item in ``items_to_exclude``.
            ValueError: ``filter_already_liked`` is set but ``urm`` is None.
        """
        # NOTE(review): a single excluded item is silently ignored, as before.
        if items_to_exclude is not None and len(items_to_exclude) > 1:
            raise NotImplementedError('Items to exclude functionality is not implemented yet')

        # One estimate per track, built in a single pass.
        r = np.array([self.algo.predict(userid, i).est for i in range(d.N_TRACKS)])

        if filter_already_liked:
            # ``urm == None`` on a matrix is an element-wise comparison.
            if urm is None:
                raise ValueError('Please provide a URM in order to items already liked')
            r[urm.getrow(userid).nonzero()[1]] = 0

        l = [userid]
        top = min(N, len(r))
        if top <= 0:
            return l
        ind = np.argpartition(r, -top)[-top:]
        for i in ind:
            if with_scores:
                l.append((i, r[i]))
            else:
                l.append(i)
        return l
def computeSVDpp(data, test_np):
    """Fit SVD++ on the train set and attach its predictions to the test set.

    Model parameters:
        - number of factors: 6
        - regularization for all parameters: 0.025

    data : data frame representing the train set
    test_np : data frame for which predictions are requested

    return : the test frame produced by ``dataTrainSurprise`` with an extra
             column named 'svdpp_rating'
    """
    trainset, test = dataTrainSurprise(data, test_np)

    model = SVDpp(n_factors=6, reg_all=0.025)
    svdpp_algo = model.fit(trainset)

    def estimate(row):
        # Element 3 of a Surprise prediction is the estimated rating.
        return svdpp_algo.predict(row['user_id'], row['movie_id'])[3]

    id_columns = test[['user_id', 'movie_id']]
    test['svdpp_rating'] = id_columns.apply(estimate, axis=1)
    return test
def SVD_pp():
    """Cross-validate SVD++ on the module-level ``data`` and print timings.

    Runs a 3-fold split, prints the RMSE of every fold, prints one sample
    prediction and finally the seconds elapsed since the module-level
    ``time1``.
    """
    algo = SVDpp()

    # K-fold cross-validation iterator, k=3.
    folds = KFold(n_splits=3).split(data)
    for trainset, testset in folds:
        # Train, predict, then report the RMSE of this fold
        # (verbose=True prints it; the default would be False).
        algo.fit(trainset)
        accuracy.rmse(algo.test(testset), verbose=True)

    # Print the prediction of user ``uid`` for item ``iid``.
    uid, iid = str(196), str(302)
    algo.predict(uid, iid, r_ui=4, verbose=True)

    print(time.time() - time1)
class RecommenderSVDppSimilarUsers(Recommender):
    """SVD++ recommender that scores items through users similar to the caller.

    Instead of rebuilding the data set when a new user arrives, similar known
    users are looked up and their predicted ratings, weighted by similarity,
    decide which movies are returned.
    """

    def __init__(self, movies):
        super().__init__(movies)
        self.algorithm = SVDpp()

    def fit(self, dataset):
        """Fit the wrapped algorithm on ``dataset`` and return what ``fit`` returns."""
        return self.algorithm.fit(dataset)

    def test(self, test_set):
        """Run the wrapped algorithm on ``test_set`` and return its predictions."""
        return self.algorithm.test(test_set)

    def get_recommendation(self, watched, k=20, k_inner_item=10):
        """Return the ``k`` movies with the largest similarity-weighted score.

        Args:
            watched: mapping of raw movie id -> rating for the requesting user.
            k: number of movies to return.
            k_inner_item: forwarded as ``k`` to ``get_similar_user_ids``.
        """
        trainset = self.algorithm.trainset

        # Re-key the watched movies by inner item id.
        watched = {
            trainset.to_inner_iid(raw_id): rating
            for raw_id, rating in watched.items()
        }

        similar_users = self.get_similar_user_ids(watched, k=k_inner_item)

        # Sum similarity * estimate over all similar users, per unseen movie.
        candidates = defaultdict(float)
        unseen = (
            inner_id for inner_id in range(trainset.n_items)
            if inner_id not in watched
        )
        for inner_movie_id in unseen:
            movie_id = trainset.to_raw_iid(inner_movie_id)
            for inner_user_id, similarity in similar_users.items():
                raw_user_id = trainset.to_raw_uid(inner_user_id)
                estimate = self.algorithm.predict(raw_user_id, movie_id).est
                candidates[movie_id] += similarity * estimate

        best_movie_ids = heapq.nlargest(k, candidates, key=candidates.get)
        return self.movies.get_movie_by_movie_ids(best_movie_ids)
def svdpp(train, test, ids, Xtest, Xids):
    """
    Extension of SVD that takes the implicit ratings into account.

    Arguments:
        train, the trainset
        test, the testset
        ids, unknown ratings (``ids[0]`` users, ``ids[1]`` items)
        Xtest, list collecting test-set predictions for the final blending
        Xids, list collecting unknown-rating predictions for the final blending

    Returns:
        (rmse, Xtest, Xids, preds_test, preds_ids) where ``rmse`` is the
        test-set RMSE and both lists have had this model's predictions appended.
    """
    print('SVD++')
    algo = SVDpp(n_factors=100, n_epochs=10, lr_all=0.0015, reg_all=0.05,
                 random_state=15)
    algo.fit(train)

    # RMSE on the data the model was trained on.
    train_predictions = algo.test(train.build_testset())
    print(' Training RMSE: ', accuracy.rmse(train_predictions, verbose=False))

    # RMSE on the held-out test set.
    predictions = algo.test(test)
    rmse = accuracy.rmse(predictions, verbose=False)
    print(' Test RMSE: ', rmse)

    preds_test = np.zeros(len(predictions))
    for position, prediction in enumerate(predictions):
        preds_test[position] = prediction.est

    # Estimates for the unknown ratings.
    users, items = ids[0], ids[1]
    preds_ids = [
        algo.predict(str(users[i]), str(items[i])).est
        for i in range(len(users))
    ]

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
def SVDPPThreadFuc(Frame):
    """Fit SVD++ on one frame and list the ten best funds for each of its users.

    Args:
        Frame: DataFrame holding ``User`` and ``FundCode`` columns; the label
            slice ``'User':'******'`` selects what is handed to Surprise.

    Returns:
        DataFrame with columns ``User`` and ``RecommendFundCode``, one row per
        (user, recommended fund) pair.
    """
    Frame = Frame.loc[:, 'User':'******']
    users = Frame.User.unique()
    funds = Frame.FundCode.unique()

    reader = Reader(rating_scale=(0, 2))
    trainset = Dataset.load_from_df(Frame, reader=reader).build_full_trainset()
    model = SVDpp()
    model.fit(trainset)

    recommended_users = []
    recommended_funds = []
    for user in users:
        scores = [model.predict(user, fund).est for fund in funds]
        # Positions of the ten highest estimates, best first.
        best_positions = np.argsort(scores)[::-1][0:10]
        for position in best_positions:
            recommended_users.append(user)
            recommended_funds.append(funds[position])

    return pd.DataFrame({
        "User": recommended_users,
        "RecommendFundCode": recommended_funds
    })
def svd_model(df):
    """
    Apply SVD++ to fill in the missing target values of a wide table.

    The frame is melted to (smiles, Target, TargetValue) rows, the model is
    cross-validated on the known values and then used to estimate every
    missing one. Returns the rows with missing values, carrying the estimate
    in ``ToxicProb`` instead of ``TargetValue``.
    """
    target_columns = list(df.columns[1:])
    df = pd.melt(df, id_vars='smiles', value_vars=target_columns,
                 var_name='Target', value_name='TargetValue')

    is_missing = df.TargetValue.isna()
    unknown = df.loc[is_missing]
    known = df.loc[~is_missing]

    reader = Reader(rating_scale=(0, 1))
    data = Dataset.load_from_df(known[['smiles', 'Target', 'TargetValue']],
                                reader)

    kf = KFold(n_splits=3, random_state=57)
    algo = SVDpp(n_factors=12, reg_all=0.003, lr_all=0.006, random_state=132)
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        fold_predictions = algo.test(testset)
        rmse = round(accuracy.rmse(fold_predictions, verbose=True), 3)
        print('RMSE of SVD model for cross validation' + str(rmse))

    def predict_row(row):
        return algo.predict(row.smiles, row.Target).est

    result = unknown.copy()
    result['ToxicProb'] = result.apply(predict_row, axis=1)
    return result.drop(columns='TargetValue')
# Sample prediction for the model fitted just above this fragment (algo1).
uid = str(196)
iid = str(302)
algo1.predict(uid, iid, r_ui=4, verbose=True)  # print the prediction of uid for iid
print('-'*30)

"""SVDbias"""
# Fit algo2 on the train split, report RMSE on the test split and the
# wall-clock time taken, then print one sample prediction.
print('SVDbias结果:')
time1=time.time()
algo2.fit(train_s)
pred = algo2.test(test_s)
accuracy.rmse(pred, verbose=True)
time2=time.time()
print('SVDbias用时: %.2fs' % (time2-time1))
uid = str(196)
iid = str(302)
algo2.predict(uid, iid, r_ui=4, verbose=True)
print('-'*30)

"""SVD++"""
# Same procedure for algo3.
print('SVD++结果:')
time1=time.time()
algo3.fit(train_s)
pred = algo3.test(test_s)
accuracy.rmse(pred, verbose=True)
time2=time.time()
print('SVD++用时: %.2fs' % (time2-time1))
uid = str(196)
iid = str(302)
algo3.predict(uid, iid, r_ui=4, verbose=True)
print('-'*30)
# Load the movielens-100k dataset UserID::MovieID::Rating::Timestamp data = Dataset.load_builtin('ml-1m') trainset, testset = train_test_split(data, test_size=.15) # Configura o algoritmo. K = número de vizinhos. Name = Tipo de medida de similiradade. User based = filtragem por usuário ou item. algoritmo = SVDpp(n_epochs=5) algoritmo.fit(trainset) # Selecionamos o usuário e o filme que será analisado # User 49. Tem entre 18 e 24 anos. É programador e mora em Huston, Texas uid = str(49) # Filme visto e avaliado: Negotiator, The (1998)::Action|Thriller. Avaliação 4 iid = str(2058) # raw item id # get a prediction for specific users and items. pred = algoritmo.predict(uid, iid, r_ui=4, verbose=True) # run the trained model against the testset test_pred = algoritmo.test(testset) # Avalia RMSE print("Avaliação RMSE: ") accuracy.rmse(test_pred, verbose=True) # Avalia MAE print("Avaliação MAE: ") accuracy.mae(test_pred, verbose=True)
# Usage: <script> <ratings.csv> <test.csv>
# Trains SVD++ on the ratings file and writes one predicted rating per test
# row to SVDOutput.csv.
dfRatings = pd.read_csv(sys.argv[1])
dfTest = pd.read_csv(sys.argv[2])

# Delete unused columns
del dfRatings['date']
del dfRatings['train_id']
del dfTest['date']
del dfTest['test_id']

# Set the rating scale and create the data for Surprise to use
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    dfRatings[['user_id', 'business_id', 'rating']], reader)
train_set = data.build_full_trainset()

# Use SVD++ with Surprise. ``train()`` is the old name of ``fit()`` and no
# longer exists in current Surprise releases.
algo = SVDpp()
algo.fit(train_set)

# The context manager guarantees the file is flushed and closed even if a
# prediction raises half-way through.
with open('SVDOutput.csv', 'w') as f:
    f.write("test_id,rating\n")
    for i in range(len(dfTest)):
        # The row position doubles as the test id in the output file.
        prediction = algo.predict(dfTest.at[i, 'user_id'],
                                  dfTest.at[i, 'business_id'],
                                  r_ui=4,
                                  verbose=True)
        predRating = prediction.est
        f.write(str(i) + "," + str(predRating) + '\n')
# Train SVD++ on the complete data set and write one rounded prediction per
# line of ../test1.csv to result_SVD++.txt.
trainset = data.build_full_trainset()
#testset = data1.build_full_trainset()

# Use the SVD++ algorithm.
algo = SVDpp()

# Run 5-fold cross-validation and print results.
#cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
algo.fit(trainset)

# Disabled experiments kept from the original:
# predictions = algo.test(testset)
# print(predictions)
#
# uid = str(0)  # raw user id (as in the ratings file). They are **strings**!
# iid = str(35546)  # raw item id (as in the ratings file). They are **strings**!
# # get a prediction for specific users and items.
# pred = algo.predict(uid, iid, verbose=True)
# print(pred[3])

# Both files are closed (and the output flushed) when the block exits; the
# original never closed either handle.
with open("../test1.csv", "r") as testset, open("result_SVD++.txt", "w") as result:
    for line in testset:
        if not line.strip():
            # Skip blank lines instead of failing on the missing second field.
            continue
        # Drop the line terminator so it cannot stay attached to the item id
        # when the item is the last field of the line.
        temp = line.rstrip("\r\n").split(",")
        pred = algo.predict(temp[0], temp[1], verbose=True)
        score = round(pred[3])
        result.write(str(score) + "\n")
# Reload _, algo = dump.load(filename) # Show distribution of ratings by users df_users['user'].value_counts() df_users['title'].value_counts() df_users[df_users['user'] == 'lschmidt'] # For a given user and recipe, compare true rating with predicted rating uid = 'lschmidt' iid = 'acorn-squash-with-kale-and-sausage-51203850' r = float(df_users.loc[(df_users['user'] == uid) & (df_users['title'] == iid), 'rating'].values) # get a prediction for specific users and items. pred = algo.predict(uid, iid, r_ui=r, verbose=True) # I can try this for all recipes this user liked def show_user_predictions(uid, df, algo): rated_recipes = df.loc[df['user'] == uid, 'title'].values for iid in rated_recipes: r = float(df.loc[(df['user'] == uid) & (df['title'] == iid), 'rating'].values) pred = algo.predict(uid, iid, r_ui=r, verbose=True) print(pred) show_user_predictions('lschmidt', df_users, algo)
results_df.to_csv('svdpp_grid_search.csv')

# Train on everything with the best parameters.
# ``train()`` is the old name of ``fit()`` and no longer exists in current
# Surprise releases.
algo = SVD(n_epochs=100, lr_all=0.002, reg_all=0.2)
trainset = data.build_full_trainset()
algo.fit(trainset)

# Predict on test.
# The id columns are converted to strings once instead of once per row, and
# the estimates are collected in a list: growing a DataFrame with ``append``
# is quadratic and ``DataFrame.append`` no longer exists in pandas 2.
usuarios = test.id_usuario.astype(str)
restaurantes = test.id_restaurante.astype(str)
test_ambiente = pd.DataFrame([
    algo.predict(usuarios[i], restaurantes[i]).est
    for i in range(0, len(test.index))
])

## search for rating_comida
# Write the columns Surprise needs to a CSV file and load it back through a
# Reader that skips the header line.
train[['id_usuario', 'id_restaurante', 'rating_comida',
       'fecha']].to_csv('surprise_comida.csv', index=False)
file_path = 'surprise_comida.csv'
reader = Reader(line_format='user item rating timestamp',
                sep=',',
                skip_lines=1)
data = Dataset.load_from_file(file_path, reader=reader)
## first model and training reader = Reader(rating_scale=(0, 10)) data = Dataset.load_from_df(user_book_rate, reader) trainset = data.build_full_trainset() algo = SVDpp(n_factors=100,n_epochs=300,lr_all=0.01,reg_all=0.2) algo.fit(trainset) # user_latent = algo.pu # book_latent = algo.qi ## final result of first model final_df = user_data[['User ID', 'User Read Books (2017)', 'User Read Books (2018)', 'Average Rating (2017)']] final_ar = [[a[0], (len(a[1].split(', ')) * float(a[3]) + sum([algo.predict(a[0], int(b)).est for b in a[2].split(', ')])) / (len(a[1].split(', ')) + len(a[2].split(', ')))] for a in final_df.values] ## user difficulty embedding user_diff_ar = user_data['User Difficulty Choice'].values mlb = MultiLabelBinarizer(classes = [1,2,3,4,5]) user_diff_code = mlb.fit_transform([([int(a)]) if a in '12345' else (1,2,3,4,5) for a in user_diff_ar]) dic_user_diff = dict(zip(user_data['User ID'].values, user_diff_code)) ## book difficulty embedding book_diff_ar = book_info['Difficulty (Reader suggested)'].values mlb = MultiLabelBinarizer(classes = [1,2,3,4,5]) book_diff_code = mlb.fit_transform([([int(a)]) for a in book_diff_ar]) dic_book_diff = dict(zip(book_info['Book ID'].values, book_diff_code)) ## book genre embedding
# Fit the selected algorithm on the complete training data and report the
# elapsed time (``start`` is set before this fragment).
alg.fit(data_train.build_full_trainset())
end = time.time()
print("***********************************************")
print("Exe time:")
print(end - start)

# %% Loading Test Data
file_path = "Data/sample_submission.csv"
data_test = utils.load_data_desired(file_path)

# %% Prediction
# Each entry of data_test is indexed as (item, user): element 1 is passed as
# the user id and element 0 as the item id.
Predict_Test = []
for line in data_test:
    Predict_Test.append(alg.predict(str(line[1]), str(line[0])).est)

# %% Save Prediction
# The context manager closes the report even if one of the writes fails; the
# original relied on reaching an explicit close() call.
with open("Details.txt", "w") as file:
    file.write("+ Best Score: \n \n")
    file.write(str(Train_CV.best_score) + "\n \n")
    file.write("************************************************************ \n")
    file.write("+ Best Param: \n \n")
    file.write(str(Train_CV.best_params) + "\n \n")
    file.write("************************************************************ \n")
    file.write("+ CV Summary: \n \n")
    file.write(str(Train_CV.cv_results) + "\n \n")
    file.write("************************************************************ \n")
# Compare the songs one user really rated with the top predicted songs.
user_inner_id = 300
print('用户内部id', user_inner_id)
user_rating = trainset.ur[user_inner_id]
print('用户评价过的歌曲数量', len(user_rating))

# Songs the user has rated: ``ur`` holds (inner item id, rating) pairs.
items = map(lambda x:x[0], user_rating)
real_song_id=[]
real_song_name=[]
for song in items:
    raw_song_id = algo.trainset.to_raw_iid(song)
    real_song_id.append(raw_song_id)
    real_song_name.append(song_id_name_dict[raw_song_id])

t_l=10  # length of the top list
song_list1=list(song_id_name_dict.keys())

# predict() expects RAW ids. The original passed str(user_inner_id), i.e. the
# inner id, which addresses a different (or unknown) user than the one whose
# ratings are listed above.
user_raw_id = trainset.to_raw_uid(user_inner_id)
rank = Series([algo.predict(user_raw_id, str(song))[3] for song in song_list1])
rank1=rank.sort_values(ascending=False)

predict_song_id=[]
predict_song_name=[]
# Slicing the index copes with fewer than t_l songs and avoids rebuilding the
# index list on every iteration.
for position in rank1.index[:t_l]:
    predict_song_id.append(song_list1[position])
    predict_song_name.append(song_id_name_dict[song_list1[position]])

#from pandas import Series
a=Series(real_song_name)
b=Series(predict_song_name)
c=pd.DataFrame({'real':a,'predict':b})
#t_l=20 # length of the top list
#if len(user_rating)<=t_l:
#    pre_song=list(rank1.index[range(t_l)])
# %% [markdown] # ## Deployed App # # # [Link to the deployed streamlit app](###) # %% # Prepare Kaggle submission test = pd.read_csv('test.csv') # Make predictions on test data pred_list = [] for _, row in test.iterrows(): x = (SVDpp_model.predict(row.userId, row.movieId)) pred = x[3] pred_list.append(pred) # %% # Convert values to strings test['userId'] = test['userId'].astype(str) test['movieId'] = test['movieId'].astype(str) # %% # Create submission column test['Id'] = test['userId'] + '_' + test['movieId'] # %%
def post(self, request):
    """Render the stored recommendations of a user, computing them if absent.

    The user id comes from the ``usuario`` POST field. When the
    ``RecommendedMovies`` table has no rows for that user, an SVD++ model is
    evaluated on a 75/25 split (its RMSE is stored in ``Errores``), re-trained
    on all ratings, and the ten best unseen movies are stored before the page
    is rendered.

    Fixes compared to the previous version:
      * the user id is bound as a query parameter instead of being formatted
        into the SQL text (it is untrusted form input);
      * watched movies are matched by value (``in`` on a Series tests the
        index) and the HIGHEST predictions are kept, not the lowest;
      * the real ``movieId`` is stored instead of the DataFrame row index;
      * ``title`` is part of the template context instead of being passed as
        ``content_type`` to ``render``;
      * the connection and cursor are always closed.
    """
    heydict = dict(request.POST.lists())
    user = heydict['usuario'][0]

    # Database
    server = 'LOCALHOST\\SQLEXPRESS'
    database = 'MoviesHub'
    username = '******'
    password = '******'
    cnxn = pyodbc.connect('DRIVER={SQL Server};SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password)

    def recommend_system(userId, dataframe, algorithm, n_commends):
        """Return movieId/title/genres of the best unseen movies for a user."""
        # Compare ids as text so a form value such as "5" matches an integer
        # column, then predict with the id exactly as stored in the data.
        own_rows = dataframe[dataframe["userId"].astype(str) == str(userId)]
        raw_uid = own_rows["userId"].tolist()[0] if len(own_rows) else userId
        movies_watched = set(own_rows["movieId"].tolist())

        # Each movie is scored once even though it appears once per rating.
        movies_no_watched = [
            movie for movie in dataframe["movieId"].drop_duplicates().tolist()
            if movie not in movies_watched
        ]
        commends_ratting = {
            movie: algorithm.predict(uid=raw_uid, iid=movie).est
            for movie in movies_no_watched
        }
        top_predictions = sorted(commends_ratting, key=commends_ratting.get,
                                 reverse=True)[:n_commends]
        selected = dataframe[dataframe["movieId"].isin(top_predictions)]
        return selected[["movieId", "title", "genres"]].drop_duplicates()

    try:
        queryMoviesRatings = "SELECT * FROM MoviesRatings WHERE movieId<250;"
        df_movies = pd.read_sql(queryMoviesRatings, cnxn)
        # This frame must always hold user id, movie id and the rating given
        # by the user, in that order (the last three columns are dropped).
        df_movies_to_model = df_movies[df_movies.columns[:-3]]

        queryRecommend = "SELECT title, genres FROM RecommendedMovies WHERE userId = ?;"
        df_recommend = pd.read_sql(queryRecommend, cnxn, params=[user])
        queryErrores = "SELECT rmse FROM Errores WHERE userId = ?;"
        df_errores = pd.read_sql(queryErrores, cnxn, params=[user])

        if len(df_recommend.index) == 0:
            # Reader() puts the data in the format the algorithms expect.
            reader = Reader()
            data = Dataset.load_from_df(df_movies_to_model, reader)

            # Evaluate on a hold-out split first.
            train, test = train_test_split(data, test_size=0.25)
            svd = SVDpp()
            svd.fit(train)
            preds = svd.test(test)
            rmse = format(accuracy.rmse(preds) * 100, '.2f')

            cursor = cnxn.cursor()
            try:
                cursor.execute("INSERT INTO Errores (userId,rmse) values(?,?)", user, rmse)

                # Train again on the complete data set (train + test).
                trainfull = data.build_full_trainset()
                svd = SVDpp()
                svd.fit(trainfull)
                # One prediction as a smoke test that the model works.
                svd.predict(uid=1, iid=1)

                movies_recommended = recommend_system(user, df_movies, svd, 10)
                for _, row in movies_recommended.iterrows():
                    cursor.execute("INSERT INTO RecommendedMovies (userId,movieId,title,genres) values(?,?,?,?)", user, int(row.movieId), row.title, row.genres)
                cnxn.commit()
            finally:
                cursor.close()

            # Read back what has just been stored.
            df_recommend = pd.read_sql(queryRecommend, cnxn, params=[user])
            df_errores = pd.read_sql(queryErrores, cnxn, params=[user])
    finally:
        cnxn.close()

    context = {
        'title': "Recomendaciones",
        'df': df_recommend.values,
        'user': user,
        'dfErrores': df_errores.values,
    }
    return render(request, "home/recomendacionesUser.html", context)
#%% reader = surprise.Reader(rating_scale=(0, 1)) data = Dataset.load_from_df(merge, reader) del merge train, test = train_test_split(data, random_state=123, test_size=0.1) #%%训练模型(未调参) algo = SVDpp() #声明模型 algo.biased = False algo.fit(train) predictions = algo.test(test) accuracy.mae(predictions) a = algo.predict('15cbc496d67626ad90514b4243e7c045', '2204590') print(a) dump.dump(file_name='SVDmodel.pkl', algo=algo) #%% algo = dump.load('best_model.pkl')[1] #%%瞎猜模型(供对比) algocompare = surprise.NormalPredictor() algocompare.fit(train) preCompare = algocompare.test(test) accuracy.mae(preCompare) #%%计算precision and recall ## code from scikit-surprise documentation FAQs from collections import defaultdict
# Build the complete user x movie table: known ratings are copied, missing
# ones are filled with the rounded model estimate.
users = matrix.userid.unique()
movies = matrix.movieid.unique()
movies.sort()
users.sort()

# Getting estimations and creating the output file according to the guidelines
my_recs = []
for uid in users:
    # Ids are 1-based while rows/columns of the matrix are addressed by
    # 0-based position.
    user_rats = np.array(user_item_matrix.iloc[uid - 1])
    for iid in movies:
        rating = user_rats[iid - 1]
        if rating not in range(1, 6):  # not a valid 1-5 rating: predict it
            estimation = algo.predict(uid=uid, iid=iid).est
            # The estimate may be an int (when clipped to the rating bounds),
            # a Python float or a NumPy scalar. Built-in round() handles all
            # of them, whereas ``.round()`` does not exist on plain floats.
            my_recs.append((uid, iid, int(round(estimation))))
        else:
            my_recs.append((uid, iid, rating))

output = pd.DataFrame(my_recs,
                      columns=['uid', 'iid', 'predictions']).sort_values(
                          ['uid', 'iid'], ascending=True)
np.savetxt(r'submit_sample.txt', output.values, fmt='%d')
print("submit_sample.txt created.")
trainset = data.build_full_trainset() # algo = SVDpp(n_factors=2,n_epochs=75,lr_all=0.05,reg_all=0.1) algo = SVDpp(n_factors=3, n_epochs=300, lr_all=0.01, reg_all=0.2) algo.fit(trainset) user_latent = algo.pu book_latent = algo.qi ## final result of first model final_df = user_data[[ 'User ID', 'User Read Books (2017)', 'User Read Books (2018)', 'Average Rating (2017)' ]] final_ar = [[ a[0], (len(a[1].split(', ')) * float(a[3]) + sum([algo.predict(a[0], int(b)).est for b in a[2].split(', ')])) / (len(a[1].split(', ')) + len(a[2].split(', '))) ] for a in final_df.values] ## user difficulty embedding user_diff_ar = user_data['User Difficulty Choice'].values mlb = MultiLabelBinarizer(classes=[1, 2, 3, 4, 5]) user_diff_code = mlb.fit_transform([([int(a)]) if a in '12345' else (1, 2, 3, 4, 5) for a in user_diff_ar]) dic_user_diff = dict(zip(user_data['User ID'].values, user_diff_code)) ## book difficulty embedding book_diff_ar = book_info['Difficulty (Reader suggested)'].values mlb = MultiLabelBinarizer(classes=[1, 2, 3, 4, 5]) book_diff_code = mlb.fit_transform([([int(a)]) for a in book_diff_ar]) dic_book_diff = dict(zip(book_info['Book ID'].values, book_diff_code))
# cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True) # # Run 5-fold cross-validation and print results. # algo.fit(trainset) # print(algo.predict('5218791','100642618')) # # Use the famous SVD algorithm. # algo = SlopeOne() # cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True) # # Run 5-fold cross-validation and print results. # algo.fit(trainset) # print(algo.predict('5218791','100642618')) # algo = KNNWithMeans(sim_options=sim_options) # cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True) # # Run 5-fold cross-validation and print results. # algo.fit(trainset) # print(algo.predict('5218791','100642618')) # algo = KNNWithZScore(sim_options=sim_options) # cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True) # # Run 5-fold cross-validation and print results. # algo.fit(trainset) # print(algo.predict('5218791','100642618')) # algo = CoClustering(n_cltr_u=300, n_cltr_i=600, n_epochs=100, verbose=True) # cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True) # Run 5-fold cross-validation and print results. # algo.fit(trainset) print(algo.predict('5218791', '100648984')) print(algo.estimate('5218791', '100648984')) print(algo.predict('52550', '100644648')) print(algo.estimate('52550', '100644648')) print(algo.predict('10663402', '100651469')) print(algo.estimate('10663402', '100651469'))
1): #for loop goes from (1 to 1682) in this case #I did try Except here cause in user_item_matrix there are some missing columns which will create error when we write #(user_item_matrix.iloc[user_id][movie_id]) since there is no data about the missing columns #Since my model can predict everycolumn eventhough it is missing I wrote except: part to just take out the predictions of #those missing columns to output file. This way I did not need to create the actual columns for the missing data. #Instead I output the prediction from model.predict to output file. try: #https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iloc.html for iloc[][] usage. if pd.isnull( user_item_matrix.iloc[user_id][movie_id] ): #user_item_matrix.iloc[0][1] means first users rating for 1 movie #model.predict documentation https://surprise.readthedocs.io/en/stable/getting_started.html #predict(1,1) means the prediction that user 1 made for movie 1 thats why I wrote user_id+1 pred = model.predict( user_id + 1, movie_id ).est #doing prediction with SVDpp model for each user to each movie one by one if type( pred ) == int: #if the prediction is int. It gives error I put the condition to handle the error rating = pred else: rating = pred.round() rating = int( rating) #to be able to put integers instead of floats else: #if the user already gave rating to movie we just take that instead of predicting the rating. rating = int(user_item_matrix.iloc[user_id][movie_id]) except: # if the movie does not exists in dataframe the codes gives error so instead I handled the error by making the prediction and insert it to rating value for that spesific user to non existed movie.