def collaborative(self, ratings, user_id):
    reader = Reader()
    temp_ratings = ratings
    data = Dataset.load_from_df(temp_ratings[['user_id', 'book_id', 'rating']], reader)

    ## Training the data ##
    svd = SVD()
    # data.split()/evaluate() were removed from Surprise; cross_validate replaces them
    cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=2)

    trainset = data.build_full_trainset()
    algo = SVD()
    algo.fit(trainset)

    ## Testing the data ##
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    # Append the estimated rating of every unseen book for this user
    count = 0
    for uid, iid, true_r, est, _ in predictions:
        if uid == user_id:
            count = count + 1
            temp_ratings.loc[len(temp_ratings) + 1] = [uid, iid, est]

    cb = temp_ratings[temp_ratings['user_id'] == user_id][['book_id', 'rating']]
    return cb
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


total_review_df = pd.read_csv("../data/total_review_df.csv")
total_df = pd.read_csv("../data/total_df.csv")

# Load the dataset (download it if needed)
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(total_review_df[["user_name", "res_id", "rating"]], reader)

trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)

# Then predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])
# Requires NumPy and scikit-surprise installed
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

# Read data into an array of strings
with open('./ml-100k/u.data') as f:
    all_lines = f.readlines()

# Let's prepare data to be used in Surprise
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file('./ml-100k/u.data', reader=reader)

# Choose the algorithm
algo = SVD()  # our chosen algorithm

# Train and test over 5 folds, reporting the RMSE and MAE scores
# (data.split()/evaluate() were removed from Surprise; cross_validate replaces them)
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Retrieving the trainset.
trainset = data.build_full_trainset()
algo.fit(trainset)  # algo.train() was renamed to fit()

# Predict a sample item
userid = str(196)
itemid = str(302)
actual_rating = 4

# Printing out our prediction
print(algo.predict(userid, itemid, actual_rating))
      'With fewest ratings:',
      df.groupby('book_id')['rating'].count().reset_index().sort_values(
          'rating', ascending=True)[:10])

from surprise.model_selection import cross_validate
from surprise import Reader, Dataset, NormalPredictor, KNNBasic, KNNWithMeans
from surprise import KNNWithZScore, KNNBaseline, SVD, BaselineOnly, SVDpp
from surprise import NMF, SlopeOne, CoClustering

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user_id', 'book_id', 'rating']], reader)

benchmark = []
# Benchmark every algorithm
for algoritimo in [
        SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(),
        KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()
]:
    print('Starting algorithm', algoritimo)
    # Cross-validation
    resultados = cross_validate(algoritimo, data,
# Initialize a random seed so that the train/test split is the same across runs
seed = np.random.RandomState(42)

# Split the data into 80% training / 20% validation.
# Note we pass in the random seed as the second argument to ensure that the
# 'randomness' of the split is consistent every time the file is run.
trainset, testset = train_test_split(data, test_size=.2, random_state=seed)

# Initialize the algorithm that we are going to use to train on the dataset.
# Here we use the standard SVD algorithm (matrix factorization with user and item biases).
# n_factors specifies the number of factors to be used, n_epochs specifies the number
# of iterations of stochastic gradient descent, and verbose=True gives us progress on
# the epochs. Check the Surprise documentation on SVD for the full list of parameters.
print("Training model...")
algo = SVD(n_factors=50, n_epochs=10, verbose=True)

# This call to fit() on the trainset actually performs the training of the model
algo.fit(trainset)

# The call to test() on the testset makes predictions on ratings of user-item pairs
# in the testset according to the trained model above
predictions = algo.test(testset)

# This line gives us the accuracy in terms of RMSE of the predictions made above
accuracy.rmse(predictions)

# Test again with different params.
# Note when we don't specify the number of epochs, the default is 20.
print("Training model...")
algo = SVD(n_factors=50, verbose=True)
def get_top_n(predictions, n=10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


# Retrain on the whole set A
algo = SVD()
trainset = data.build_full_trainset()
algo.fit(trainset)  # algo.train() was renamed to fit()

# Compute biased accuracy on A
testset = data.construct_testset(A_raw_ratings)  # testset is now the set A
predictions = algo.test(testset)
print('Biased accuracy on A,', end=' ')
accuracy.rmse(predictions)

# Predict ratings for all pairs (user, item) that are NOT in the training set;
# build_anti_testset() fills the unknown true ratings with the global mean rating.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)
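# A minimal usage sketch for get_top_n on the anti-testset predictions above;
# each uid key is a raw user id, so its type matches the original ratings source.
top_n = get_top_n(predictions, n=10)
for uid, user_ratings in list(top_n.items())[:3]:
    print(uid, [iid for (iid, _) in user_ratings])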
product_idx = dict(zip(products, np.arange(0, len(products))))

# Now transform it to the format expected by the Python recommendation package
# 'surprise'. The current data has columns 'product', 'ip', 'date_logged' and 'url'.
# Surprise requires columns corresponding to user id, item id and rating, in that order.
grouped_series = local_df.groupby(['ip', 'product']).size()
ratings_dict = {
    "userId": [idx[0] for idx in grouped_series.index],
    "itemId": [idx[1] for idx in grouped_series.index],
    "rating": list(grouped_series)
}
surprise_df = pandas.DataFrame(ratings_dict, columns=['userId', 'itemId', 'rating'])

# Load the dataframe into a surprise Dataset object
reader = Reader(rating_scale=(1, surprise_df['rating'].max()))
data = Dataset.load_from_df(surprise_df, reader)

# We'll use the famous SVD algorithm to train the recommender.
algo = SVD()
trainset = data.build_full_trainset()
algo.fit(trainset)

# Save the trained recommender to disk so we can deploy the predictor as a model.
joblib.dump(algo, 'recommender.pkl')
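# Deployment-side sketch, assuming the 'recommender.pkl' written above:
# joblib.load restores the fitted SVD, whose predict() takes raw user and
# item ids (the ids below are hypothetical).
import joblib

algo = joblib.load('recommender.pkl')
pred = algo.predict('203.0.113.7', 'some-product')
print(pred.est)  # estimated rating on the 1..max scale used at training time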
ordering = 'mu'  # rows correspond to movie_ids; cols correspond to user_ids
submit = True  # set to True to save a submission on qual
save_model = False  # set to True to save model parameters for future predictions

print('Loading data...')
df = pd.read_csv(os.path.join('data', 'mu_train.csv'))

# Modify the dataframe to reduce memory
del df['Unnamed: 0']
del df['Date Number']
df = df.astype('int32')

df_val = pd.read_csv(os.path.join('data', 'mu_val.csv'))

print('Solving SVD...')
reader = Reader(rating_scale=(1, 5))
model = SVD(n_epochs=20, verbose=True)
train_raw = Dataset.load_from_df(df[['User Number', 'Movie Number', 'Rating']], reader)
train = train_raw.build_full_trainset()
model.fit(train)
gc.collect()
'''
train_pred = model.test(train.build_testset())
val_raw = Dataset.load_from_df(df_val[['User Number', 'Movie Number', 'Rating']], reader)
val = val_raw.build_full_trainset()
val_pred = model.test(val.build_testset())
print('Train RMSE:', accuracy.rmse(train_pred))
print('Val RMSE:', accuracy.rmse(val_pred))
data.to_csv("abc.txt", index=None, header=None, columns=["users", "items", "rates"]) reader = Reader(line_format='user item rating', rating_scale=(0, 10), sep=',') data = Dataset.load_from_file("abc.txt", reader=reader) data.split(n_folds=10) # sim_options = {'name': 'cosine', # 'user_based': False # compute similarities between items # } # algo = KNNBasic(sim_options=sim_options) # We'll use the famous SVD algorithm. algo = SVD(verbose=True) for _ in range(10): perf = evaluate(algo, data, measures=['RMSE', 'MAE']) print_perf(perf) dump_obj = {'predictions': perf, 'algo': algo} pickle.dump(dump_obj, open(result_path, 'wb')) exit() start_time = time.time() for trainset, testset in data.folds(): # train and test algorithm. algo.train(trainset) predictions = algo.test(testset) # Compute and print Root Mean Squared Error
class surpriseRecommender():
    def __init__(self, stories, reviews, users):
        self.storyLinkToIdDict = {}
        self.IdToStoryDict = {}

        # Create a dict between stories and their ids
        self.lastStoryId = 0
        for story in stories:
            self.storyLinkToIdDict[story['storyLink']] = self.lastStoryId
            self.IdToStoryDict[self.lastStoryId] = story
            self.lastStoryId += 1

        self.userLinkToIdDict = {}
        self.IdToUserDict = {}
        self.lastUserId = 0
        for user in users:
            self.userLinkToIdDict[user['name']] = self.lastUserId
            self.IdToUserDict[self.lastUserId] = user
            self.lastUserId += 1
        for review in reviews:
            if review['r'] not in self.userLinkToIdDict:
                self.userLinkToIdDict[review['r']] = self.lastUserId
                self.IdToUserDict[self.lastUserId] = review['r']
                self.lastUserId += 1

        self.reviewLinkToIdDict = {}
        self.IdToReviewDict = {}
        self.lastReviewId = 0
        for review in reviews:
            self.reviewLinkToIdDict[review['rO'] + '|' + review['r']] = self.lastReviewId
            self.IdToReviewDict[self.lastReviewId] = review
            self.lastReviewId += 1

        ## Make the scores dict
        storyScores = {}
        cnt = 0
        self.minScore = 0
        self.maxScore = 0
        for review in reviews:
            if review['rO'] in self.storyLinkToIdDict:
                userId = self.userLinkToIdDict[review['r']]
                storyId = self.storyLinkToIdDict[review['rO']]
                score = review['sS']
                self.minScore = min(score, self.minScore)
                self.maxScore = max(score, self.maxScore)
                storyScores[(userId, storyId)] = {
                    "storyId": storyId,
                    "userId": userId,
                    "score": score
                }
                cnt += 1
        print(self.minScore, self.maxScore)

        ### Add in favorites data: bias favorites over reviews
        for user in users:
            userId = self.userLinkToIdDict[user['name']]
            for favorite in user['favorites']:
                if favorite['S'] in self.storyLinkToIdDict:
                    storyId = self.storyLinkToIdDict[favorite['S']]
                    score = 10
                    if (userId, storyId) not in storyScores:
                        storyScores[(userId, storyId)] = {
                            "storyId": storyId,
                            "userId": userId,
                            "score": 0
                        }
                    storyScores[(userId, storyId)]['score'] += score

        self.inputScores = list(storyScores.values())

    def train(self):
        df = pd.DataFrame(self.inputScores)
        reader = Reader(rating_scale=(self.minScore, self.maxScore))
        data = Dataset.load_from_df(df[['userId', 'storyId', 'score']], reader)
        trainset = data.build_full_trainset()
        self.algo = SVD()
        self.algo.fit(trainset)

    def getTopPredictions(self, userId, stories):
        df = pd.DataFrame(self.inputScores)
        df_filtered = df.query('userId==' + str(userId))

        test_items = []
        for story in stories:
            storyId = self.storyLinkToIdDict[story['storyLink']]
            test_items.append({"storyId": storyId, "userId": userId, "score": 0})
        df = pd.DataFrame(test_items)

        # Remove stories the user already knows
        mask = np.logical_not(df['storyId'].isin(set(df_filtered['storyId'])))
        df = df[mask]

        reader = Reader(rating_scale=(self.minScore, self.maxScore))
        data = Dataset.load_from_df(df[['userId', 'storyId', 'score']], reader)
        trainset = data.build_full_trainset()
        testset = trainset.build_testset()
        predictions = self.algo.test(testset)

        scores = {}
        for uid, iid, true_r, est, _ in predictions:
            scores[self.IdToStoryDict[iid]['storyLink']] = est
        return scores

    def predict(self, link, stories):
        link = link.replace('https://www.fanfiction.net', '')
        # If we haven't seen this user before, there is nothing to predict from.
        # (Could instead register the new user here and call self.train() again.)
        if link not in self.userLinkToIdDict:
            print('user not found')
            return {}
        return self.getTopPredictions(self.userLinkToIdDict[link], stories)
class Main():
    def __init__(self):
        # All items
        self.items = []
        # All users
        self.users = []
        # Rating data
        self.ratings = []
        # Test dataset
        self.test = []
        # Per-user average ratings
        self.rating_aves = []
        # Map from item id to its index in self.items
        self.item_dic = {}
        # Map from user id to its index in self.users
        self.user_dic = {}

    def getData(self):
        # Read the users
        with open(TRAIN_PATH, 'r') as f:
            user_no = 0
            item_no = 0
            while True:
                line = f.readline()
                if not line or line == '\n':
                    break
                id, item_num = line.split('|')
                item_num = int(item_num[:-1])
                user = User(id, item_num)
                for i in range(item_num):
                    line = f.readline()
                    item_id, score = line.split(" ")[:2]
                    score = int(score)
                    if score == 0:
                        score = 1
                    user.setItems([item_id, score])
                    self.ratings.append([id, item_id, score / 20])
                    if item_id not in self.item_dic:
                        self.item_dic[item_id] = item_no
                        item_no += 1
                        self.items.append(Item(item_id))
                self.user_dic[id] = user_no
                user_no += 1
                self.users.append(user)

        self.user_num = len(self.users)
        self.item_num = len(self.items)
        self.rating_matrix = sparse.dok_matrix((self.user_num, self.item_num))
        for i in range(self.user_num):
            for j in range(self.users[i].item_num):
                self.rating_matrix[
                    self.user_dic[self.users[i].id],
                    self.item_dic[self.users[i].items[j][0]]] = self.users[i].items[j][1]
        for i in range(self.user_num):
            self.rating_aves.append(self.users[i].getAverage())

        # Read the test data
        with open(TEST_PATH, 'r') as f:
            while True:
                line = f.readline()
                if not line or line == '\n':
                    break
                id, item_num = line.split('|')
                item_num = int(item_num[:-1])
                user = User(id, item_num)
                for i in range(item_num):
                    line = f.readline()
                    item_id = line[:-1]
                    user.setItems([item_id])
                self.test.append(user)
        self.test_num = len(self.test)
        print('finish getData')

    def mySVD(self):
        self.reader = Reader(rating_scale=(1, 5))
        self.data = Dataset.load_from_df(pd.DataFrame(self.ratings), self.reader)
        print(self.data)
        trainset, testset = train_test_split(self.data, test_size=.15)
        self.model = SVD(n_factors=SVD_PARAMETER)
        self.model.fit(trainset)
        a_user = "******"
        a_product = "507696"
        print(self.model.predict(a_user, a_product))

    def predict(self):
        for i in range(self.test_num):
            with open(RESULT_PATH, 'a') as f:
                f.write(self.test[i].id)
                f.write('\n')
                for j in range(len(self.test[i].items)):
                    # predict(...)[3] is the estimated rating; rescale back to 0-100
                    self.test[i].items[j].append(
                        self.model.predict(self.test[i].id,
                                           self.test[i].items[j][0])[3] * 20)
                    f.write(self.test[i].items[j][0])
                    f.write(':')
                    f.write(str(self.test[i].items[j][1]))
                    f.write('\n')

    def mainMethod(self):
        self.getData()
        start = time.perf_counter()  # time.clock() was removed in Python 3.8
        self.mySVD()
        self.predict()
        elapsed = time.perf_counter() - start
        print(elapsed)
ratings_mean_count['rating'].hist(bins=50)
plt.show()

# Pivot ratings into a movie-features matrix
df_movie_features = ratings.pivot(index='movies', columns='users', values='rating').fillna(0)
mat_movie_features = csr_matrix(df_movie_features.values)
print(df_movie_features)

# A reader is still needed but only the rating_scale param is required.
# The Reader class is used to parse a file containing ratings.
reader = Reader(rating_scale=(0, 5.0))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[['users', 'movies', 'rating']], reader)

# Choose the algorithm
algo = SVD()

# Train and test reporting the RMSE and MAE scores:
# run 5-fold cross-validation and print the results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Retrieve the trainset.
trainset = data.build_full_trainset()
algo.fit(trainset)

# Predict a certain item
users = str(414)
movies = str(410)
actual_rating = 5
print(algo.predict(users, movies, actual_rating))
df = pandas.DataFrame(trainset)
# For load_from_df only the Reader's rating_scale matters;
# line_format/sep/skip_lines apply to file parsing.
reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
data = Dataset.load_from_df(df, reader)

hyper = {'n_factors': [5, 6, 7], 'reg_all': [0.1, 1, 10]}
clf = GridSearchCV(SVD, hyper, cv=5, measures=['mae', 'rmse'])
clf.fit(data)
print(clf.best_params)
print(clf.best_score['mae'])

# In[2]:

data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
algo = SVD(n_factors=7, reg_all=0.1)
algo.fit(trainset)
testset = trainset.build_testset()
predictions = algo.test(testset)
print('task 1')
mae = accuracy.mae(predictions)
print('accuracy: ', mae)  # task 1

# In[5]:

def get_top_n(predictions, n=5):
    '''Return the top-N recommendation for each user from a set of predictions.
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId')
movies_with_ratings.dropna(inplace=True)

dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

trainset, testset = train_test_split(data, test_size=.15, random_state=42)

algo = SVD(n_factors=20, n_epochs=20)
algo.fit(trainset)

test_pred = algo.test(testset)
print('rmse = ', accuracy.rmse(test_pred, verbose=True))
print('prediction = ', algo.predict(uid=5.0, iid='MortalKombat(1995)'))
then reloaded and can be used again for making predictions.
"""
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os

from surprise import SVD
from surprise import Dataset
from surprise import dump

data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()

algo = SVD()
algo.fit(trainset)

# Compute predictions of the 'original' algorithm.
predictions = algo.test(trainset.build_testset())

# Dump algorithm and reload it.
file_name = os.path.expanduser('~/dump_file')
dump.dump(file_name, algo=algo)
_, loaded_algo = dump.load(file_name)

# We now ensure that the algo is still the same by checking the predictions.
predictions_loaded_algo = loaded_algo.test(trainset.build_testset())
assert predictions == predictions_loaded_algo
print('Predictions are the same')
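# dump.dump() can also persist the predictions alongside the model, which
# avoids recomputing them after a reload; a small sketch following the
# example above.
dump.dump(file_name, predictions=predictions, algo=algo)

# load() returns a (predictions, algo) pair.
loaded_predictions, loaded_algo = dump.load(file_name)
assert loaded_predictions == predictions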
def iniciarfiesta(fiesta_id, nombre_fiesta):
    conn = sqlite3.connect('spotify.db')
    sql = '''SELECT * FROM CancionUsuario'''
    fecha = str(date.today())
    uri_usuarios = []
    c = conn.cursor()
    sqlselectusers = ''' SELECT uri_usuario FROM FiestaUsuario where fiesta_id = ? '''
    cur = conn.cursor()
    cur.execute(sqlselectusers, (str(fiesta_id), ))
    conn.commit()
    select_invitados = cur.fetchall()
    for uri_usuario in select_invitados:
        uri_usuarios.append(uri_usuario[0])
    conn.close()

    # Use the first guest's account to create the party playlist
    invitado_para_sacar_info = uri_usuarios[0][13:]
    scope = 'user-library-read,user-top-read,playlist-modify-public'
    token_info = util.prompt_for_user_token(username=invitado_para_sacar_info, scope=scope)
    playlistid = crearplaylist(token_info, invitado_para_sacar_info, nombre_fiesta)
    playlisturi = 'spotify:playlist:{}'.format(playlistid)

    # Keep only the most recent snapshot of each guest's listening data
    df = fn_database(sql)
    df = df.loc[df['uri_usuario'].isin(uri_usuarios)]
    df_mas_recientes = df.groupby(['uri_usuario'], as_index=False)['date'].max()
    df = df.merge(df_mas_recientes, on='uri_usuario', how='left')
    df = df.loc[df['date_x'] == df['date_y']]
    print(token_info)

    uri_canciones = []
    for index, row in df.iterrows():
        print(row['uri_cancion'])
        uri_canciones.append(row['uri_cancion'])
    ObtenerCaracteristicas(uri_canciones, token_info)

    sqlcaracteristicas = '''SELECT uri AS uri_cancion, danceability*energy*5 AS fiesticidad,
                            duration_ms FROM CancionCaracteristicas'''
    df_caracteristicas = fn_database(sqlcaracteristicas)
    print(df_caracteristicas)

    df['rating'] = df.apply(lambda row: elrating(row), axis=1)
    dfmodelo = df.groupby(['uri_cancion', 'uri_usuario'], as_index=False)['rating'].max()
    dfsimple = dfmodelo.groupby(['uri_cancion'], as_index=False)['rating'].sum()
    dfrecommender = dfmodelo[['uri_usuario', 'uri_cancion', 'rating']].copy()

    reader = Reader(rating_scale=(0, 5))
    data = Dataset.load_from_df(
        dfrecommender[['uri_usuario', 'uri_cancion', 'rating']], reader)

    algo = SVD()
    cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    # Tune the SVD hyperparameters with a grid search
    param_grid = {
        'n_factors': [5, 10, 15, 20],
        'n_epochs': [5, 10, 20, 25, 30],
        'lr_all': [0.001, 0.005, 0.01],
        'reg_all': [0.02, 0.1, 0.2, 0.3]
    }
    gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
    gs.fit(data)
    params = gs.best_params['rmse']
    print(params)
    svdtuned = SVD(n_factors=params['n_factors'],
                   n_epochs=params['n_epochs'],
                   lr_all=params['lr_all'],
                   reg_all=params['reg_all'])

    trainingSet = data.build_full_trainset()
    algo = svdtuned
    print(algo.n_epochs)

    # Record the chosen parameters in the playlist description
    sp = spotipy.Spotify(auth=token_info)
    descripcion_anterior = sp.user_playlist(user=invitado_para_sacar_info,
                                            playlist_id=playlistid,
                                            fields='description')
    descripcion = descripcion_anterior['description'] + ' ' + str(params)
    sp.user_playlist_change_details(user=invitado_para_sacar_info,
                                    playlist_id=playlistid,
                                    description=descripcion)

    algo.fit(trainingSet)
    prediction = algo.predict('spotify:user:jorged_94',
                              'spotify:track:0aZ5EsW90SpCbsYfMQ7HRf',
                              r_ui=0.995, verbose=True)

    folder_path = r'Matrix\{}'.format(nombre_fiesta)
    os.makedirs(folder_path, exist_ok=True)

    # Reconstruct the full rating matrix from the learned factors
    # pu: user factors; qi: item factors; bu: user bias; bi: item bias
    rm = np.dot(algo.pu, algo.qi.T)
    np.savetxt(r'{}\algo.pu.csv'.format(folder_path), algo.pu, delimiter=',')
    np.savetxt(r'{}\algo.qi.csv'.format(folder_path), algo.qi, delimiter=',')
    np.savetxt(r'{}\rm.csv'.format(folder_path), rm, delimiter=',')

    # Average the user factors to get a "group profile"
    group_pu = algo.pu.mean(axis=0)
    latent_factors = np.dot(group_pu, algo.qi.T)
    np.savetxt(r'{}\latent_factors.csv'.format(folder_path), latent_factors, delimiter=',')

    numero_de_canciones = algo.qi.shape[0]
    recomendacion_grupal = []
    for i_iid in range(numero_de_canciones):
        group_estimacion = (latent_factors[i_iid] + algo.bi[i_iid]
                            + dfrecommender['rating'].mean())
        cancion = trainingSet.to_raw_iid(i_iid)
        recomendacion_grupal.append([cancion, group_estimacion])

    def Sort(sub_li):
        # Sort the [song, estimate] pairs by the estimate, descending
        sub_li.sort(key=lambda x: x[1], reverse=True)
        return sub_li

    recomendacion_grupal_ordenada = Sort(recomendacion_grupal)
    df_final = pd.DataFrame(recomendacion_grupal_ordenada,
                            columns=['uri_cancion', 'estimacion'])
    print(df_final)

    df_final = df_final.merge(df_caracteristicas, on='uri_cancion', how='left')
    print(df_final)

    # Songs at or below the "party factor" threshold get a final score of zero
    fiesticidad_threshold = 2
    df_final.loc[df_final['fiesticidad'] <= fiesticidad_threshold, 'puntaje_final'] = 0
    df_final.loc[df_final['fiesticidad'] > fiesticidad_threshold,
                 'puntaje_final'] = df_final['estimacion']
    print(df_final)

    # Songs longer than 7 minutes are excluded as well
    df_final.loc[df_final.duration_ms > 420000, 'puntaje_final'] = 0
    df_final = df_final.sort_values(by='puntaje_final', ascending=False)
    print(df_final)
    df_final.to_csv(r'{}\estimacion_final.csv'.format(folder_path),
                    index=False, header=True)

    dfsimple = dfsimple.sort_values(by='rating', ascending=False)
    dfsimple.to_csv(r'{}\estimacion_simple.csv'.format(folder_path),
                    index=False, header=True)

    # Randomly pick either the model-based or the simple popularity ranking
    aleatorio = random.choice([1, 2])
    if aleatorio == 1:
        cancionesasonar = df_final
    elif aleatorio == 2:
        cancionesasonar = dfsimple

    canciones = []
    for index, row in cancionesasonar.iterrows():
        canciones.append(row['uri_cancion'])
    print(canciones)

    # The Spotify API accepts at most 100 tracks per request
    canciones_seccionado = split_list(canciones, 100)
    sp = spotipy.Spotify(auth=token_info)
    for seccion in canciones_seccionado:
        snapshot = sp.user_playlist_add_tracks(user=invitado_para_sacar_info,
                                               playlist_id=playlistid,
                                               tracks=seccion)
    sp.shuffle(state=False)

    # Start playback of the new playlist
    headers = {
        'Accept': 'application/json',
        'Content-Type': 'application/json',
        'Authorization': 'Bearer {}'.format(token[0]),
    }
    data = '{{"context_uri":"{}","offset":{{"position":{}}},"position_ms":0}}'.format(
        playlisturi, 0)
    response = requests.put('https://api.spotify.com/v1/me/player/play',
                            headers=headers, data=data)
    if response:
        print('Se reprodujo la playlist')
    else:
        print(response, response.text)

    conn = sqlite3.connect('spotify.db')
    while True:
        # The try block swallows errors from keys other than the expected one
        try:
            if keyboard.is_pressed('n'):  # 'n' skips to the next song
                print('Next song!')
                headers = {
                    'Accept': 'application/json',
                    'Content-Type': 'application/json',
                    'Authorization': 'Bearer {}'.format(token[0]),
                }
                responsecancion = requests.get(
                    'https://api.spotify.com/v1/me/player/currently-playing',
                    headers=headers)
                # Record how far into the song the skip happened
                sql = ''' INSERT INTO Salto(uri,porcentaje) VALUES(?,?) '''
                cur = conn.cursor()
                porce = (responsecancion.json()['progress_ms']
                         / responsecancion.json()['item']['duration_ms'])
                t = (responsecancion.json()['item']['uri'], porce)
                cur.execute(sql, t)
                conn.commit()
                response = requests.post(
                    'https://api.spotify.com/v1/me/player/next',
                    headers=headers)
        except Exception as e:
            print(e)
            break  # any error (e.g. another key) ends the loop
    print('Bien hecho campeón')
def trainModel(data):
    trainset = data.build_full_trainset()
    model = SVD(n_epochs=20, n_factors=50, verbose=True)
    model.fit(trainset)
    return model
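# Usage sketch for trainModel(), with a tiny hypothetical ratings frame;
# only the (user, item, rating) column layout matters to Surprise.
import pandas as pd
from surprise import Dataset, Reader

ratings = pd.DataFrame({
    'user_id': [1, 1, 2],
    'item_id': [10, 20, 10],
    'rating': [4.0, 3.5, 5.0],
})
data = Dataset.load_from_df(ratings, Reader(rating_scale=(1, 5)))
model = trainModel(data)
print(model.predict(2, 20).est)  # estimate for a pair the model never saw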
# Unzip ml-100k.zip (zf avoids shadowing the zipfile module)
zf = zipfile.ZipFile('ml-100k.zip', 'r')
zf.extractall()
zf.close()

# Read data into an array of strings
with open('./ml-100k/u.data') as f:
    all_lines = f.readlines()

# Prepare the data to be used in Surprise
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file('./ml-100k/u.data', reader=reader)

# Choose the algorithm
algo = SVD()

# Train and test over 5 folds, reporting the RMSE and MAE scores
# (the removed data.split()/evaluate() API is replaced by cross_validate)
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Retrieve the trainset.
trainset = data.build_full_trainset()
algo.fit(trainset)  # algo.train() was renamed to fit()

# Predict a certain item
userid = str(196)
itemid = str(302)
actual_rating = 4
print(algo.predict(userid, itemid, actual_rating))
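# predict() returns a Prediction namedtuple rather than a bare number; a small
# sketch of pulling out its fields from the call above.
pred = algo.predict(userid, itemid, actual_rating)
# uid/iid are the raw ids, r_ui the supplied true rating, est the estimate
print(pred.uid, pred.iid, pred.r_ui, round(pred.est, 2))
print(pred.details['was_impossible'])  # True when no estimate could be made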
print("searching for the best parameters for svd...") param_grid = { 'n_epochs': [14, 14], 'lr_all': [0.005, 0.005], 'n_factors': [10, 5] } gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3) gs.fit(evaluationData) print("Best RMSE score attained: ", gs.best_score['rmse']) # combination of parameters that gave the best RMSE score print("Best parameters: ", gs.best_params['rmse']) evaluator = Evaluator(evaluationData, rankings) params = gs.best_params['rmse'] SVDtuned = SVD(n_epochs=params['n_epochs'], lr_all=params['lr_all'], n_factors=params['n_factors']) evaluator.AddAlgorithm(SVDtuned, "SVD - Tuned") SVDUntuned = SVD() evaluator.AddAlgorithm(SVDUntuned, "SVD - Untuned") evaluator.Evaluate(True) evaluator.SampleTopNRecs(ml)
f.close()

# Re-emit the user|count / item rating blocks as flat user,item,rating rows
for train in train_file:
    line = train.strip()
    if line.find('|') != -1:
        user_id, user_item_count = line.split('|')
    else:
        if line == "":
            continue
        item_id, rate_str = line.split()
        write_file.write('%s,%s,%s\n' % (user_id, item_id, rate_str))
write_file.close()

print("reading......")
reader = Reader(line_format='user item rating', sep=',', rating_scale=(0, 100))
data = Dataset.load_from_file("train.csv", reader=reader)

algo = SVD(n_factors=10, n_epochs=10, lr_all=0.015, reg_all=0.01)
'''
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
'''
# algo = BaselineOnly(bsl_options=bsl_options)
'''
kf = KFold(n_splits=3)
print('------begin train user cf model------------')
for trainset, testset in kf.split(train_cf):
    # Train and test the algorithm
    print("aaa")
    algo.fit(trainset)
def Cal_Svd(filepath, user_id):
    # 1. Raw dataset
    rating = pd.read_csv(filepath)
    rating['userId'].value_counts()
    rating['placeId'].value_counts()

    # Visited vs. not visited
    tab = pd.crosstab(rating['userId'], rating['placeId'])

    # Group the remaining ratings by the two grouping variables
    rating_g = rating.groupby(['userId', 'placeId'])
    rating_g.sum()
    tab = rating_g.sum().unstack()  # reshape into a matrix
    # e.g. places user 2 has not visited: 1, 15, 39, ...

    # 2. Build the rating dataset
    reader = Reader(rating_scale=(1, 5))  # rating range
    data = Dataset.load_from_df(df=rating, reader=reader)

    # 3. Train/test sets
    train = data.build_full_trainset()  # training set
    test = train.build_testset()        # test set

    # 4. Build the model
    model = SVD(n_factors=100, n_epochs=20, random_state=123)
    model.fit(train)

    # 5. Score every place for the given user
    item_ids = range(0, 2106)  # placeId range
    actual_rating = 0
    predict_result = []
    for item_id in item_ids:
        if actual_rating not in tab:
            actual_rating = 0
        predict_result.append(model.predict(user_id, item_id, actual_rating))
    ddff = pd.DataFrame(predict_result)

    # Top 5 recommended destinations for this user
    result = ddff.sort_values(by='est', ascending=False)[:5]
    results.append(result)
    return result

# if __name__ == '__main__':
#     Cal_Svd(filepath, user_id)
#     print(results[0])          # a DataFrame
#     print(results[0]['iid'])   # the recommended placeIds
import pandas as pd

path = '../Datasets/BookCrossings'
os.chdir(path)

trans = pd.read_csv('BX-Book-Ratings.csv', sep=';',
                    error_bad_lines=False, encoding="latin-1")
trans.columns = ['user', 'item', 'rating']

# Drop implicit (zero) ratings, then keep only sufficiently popular items
# and sufficiently active users
trans = trans[trans.rating != 0]

min_item_ratings = 10
popular_items = trans['item'].value_counts() >= min_item_ratings
popular_items = popular_items[popular_items].index.tolist()

min_user_ratings = 10
active_users = trans['user'].value_counts() >= min_user_ratings
active_users = active_users[active_users].index.tolist()

trans = trans[(trans['item'].isin(popular_items)) & (trans['user'].isin(active_users))]

reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(trans, reader)

trainset, testset = train_test_split(data, test_size=0.002)
algo = SVD(n_factors=50)
algo.fit(trainset)
preds = algo.test(testset)
accuracy.mae(preds)
                        stop_words='english')
count_matrix = count.fit_transform(books['soup'])
cosine_sim = cosine_similarity(count_matrix, count_matrix)

indices = pd.Series(books.index, index=books['title'])
titles = books['title']

books.to_csv('ob.csv', columns=books.columns.tolist())
ratings.to_csv('or.csv', columns=ratings.columns.tolist())
book_tags.to_csv('obt.csv', columns=book_tags.columns.tolist())
tags.to_csv('ot.csv', columns=tags.columns.tolist())

print("beforeNY")
svd = SVD()
print("NY")
reader = Reader()
data = Dataset.load_from_df(new_ratings[['user_id', 'book_id', 'rating']], reader)
cross_validate(svd, data, measures=['RMSE', 'MAE'])

trainset = data.build_full_trainset()
svd.fit(trainset)

# Save the model to disk
filename = 'model.sav'
pickle.dump(svd, open(filename, 'wb'))
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

print(improved_recommendations('Chungking Express'))
# print(list(improved_recommendations('Se7en')))
print("end of metadata")

# Collaborative Filtering
reader = Reader()
ratings = pd.read_csv('ratings_small.csv')
print(ratings.head(5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

svd = SVD()
# data.split()/evaluate() were removed from Surprise; cross_validate replaces them
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5)

trainset = data.build_full_trainset()
svd.fit(trainset)  # svd.train() was renamed to fit()

print(ratings[ratings['userId'] == 554])
print(svd.predict(554, 509, 4))
print(type(svd))
# End Collaborative Filtering

def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from surprise import Reader, Dataset, SVD, model_selection
from pprint import pprint

# Predicts solely based on other users' ratings
reader = Reader()
ratings = pd.read_csv('archive/ratings_small.csv')
ratings.head()
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
svd = SVD()
pprint(model_selection.cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5))

trainset = data.build_full_trainset()
svd.fit(trainset)  # was svd.trainset(trainset), which is not a method; fit() trains the model

ratings[ratings['userId'] == 1]
svd.predict(1, 302, 3)

def convert_int(x):
    try:
        return int(x)
    except:
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 24 15:03:41 2019

@author: Jon
"""
from helper import *
from surprise import SVD, NormalPredictor, accuracy
from surprise.model_selection import train_test_split, GridSearchCV, KFold
import random
import numpy as np

np.random.seed(0)
random.seed(0)

data = GetBookData(density_filter=False)
trainset, testset = train_test_split(data, test_size=0.25)

## SVD out of the box
SVD_OOB = SVD()
SVD_OOB.fit(trainset)
oob_predictions = SVD_OOB.test(testset)
oob_rmse = accuracy.rmse(oob_predictions)
oob_mae = accuracy.mae(oob_predictions)

precisions, recalls = precision_recall_at_k(oob_predictions, k=10, threshold=4)
oob_avg_precision = sum(prec for prec in precisions.values()) / len(precisions)
oob_avg_recall = sum(rec for rec in recalls.values()) / len(recalls)
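# precision_recall_at_k comes from the local helper module; in case that
# version is unavailable, here is a sketch in the spirit of the well-known
# Surprise FAQ recipe (helper's exact tie-breaking may differ).
from collections import defaultdict

def precision_recall_at_k_sketch(predictions, k=10, threshold=3.5):
    """Per-user precision@k and recall@k; a rating >= threshold counts as relevant."""
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions, recalls = {}, {}
    for uid, user_ratings in user_est_true.items():
        # Rank this user's items by estimated rating
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum(true_r >= threshold for (_, true_r) in user_ratings)
        n_rec_k = sum(est >= threshold for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum((true_r >= threshold and est >= threshold)
                              for (est, true_r) in user_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel else 0
    return precisions, recalls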
# evaluate()/print_perf()/data.split() were removed from Surprise;
# cross_validate replaces the whole pattern.
from surprise.model_selection import cross_validate
from surprise import Reader, Dataset
from surprise import SVD
from surprise import NMF
from surprise import KNNBasic
import os

# 3
file_path = os.path.expanduser('restaurant_ratings.txt')
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file(file_path, reader=reader)

# 5
print('\n#{} SVD -------------------------------\n'.format(5))
algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

# 6
print('\n#{} PMF-------------------------------\n'.format(6))
algo = SVD(biased=False)  # PMF
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

# 7
print('\n#{} NMF-------------------------------\n'.format(7))
algo = NMF()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)
train_averages['user'] = get_average_ratings(of_users=True)
train_averages['food'] = get_average_ratings(of_users=False)

## Train the SVD
from surprise import SVD, Reader, Dataset, accuracy
import surprise
from surprise.model_selection import train_test_split

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    rating_data_mf[['user_id', 'smallCategory_id', 'rating']], reader)
trainset = data.build_full_trainset()

svd = SVD(n_factors=100, biased=True, random_state=15, verbose=True)
svd.fit(trainset)

## Fill the empty ratings with SVD predictions
rating_data_svd = rating_data.copy()
for user_id in rating_data.index:
    for smallCategory_id in rating_data.columns:
        if rating_data.loc[user_id][smallCategory_id] == 0:
            rating_data_svd.loc[user_id][smallCategory_id] = (svd.test([
                (user_id, smallCategory_id, 0)
            ]))[0].est

## User-based filtering on rating_data
rating_data_svd_t = rating_data_svd.transpose()
user_rating_sim = rating_data_svd_t.corr(method='pearson')
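# Note: svd.predict() is the single-pair API and reads more directly than
# wrapping one tuple in svd.test(); the two calls below give the same estimate.
# A sketch reusing one (user, category) pair from the frames above:
example_uid = rating_data.index[0]
example_iid = rating_data.columns[0]
assert (svd.predict(example_uid, example_iid).est
        == svd.test([(example_uid, example_iid, 0)])[0].est)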
def CollabFilteringModel(data, option=1, gridsearch=True):
    if option == 1:
        sim_options = {
            "name": "pearson_baseline",
            "min_support": 2,
            "user_based": False,
        }
        if gridsearch:
            sim_options = {
                "name": ["pearson_baseline"],
                "min_support": [2],
                "user_based": [False],
            }
            param_grid = {"sim_options": sim_options}
            gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=3)
            gs.fit(data)
            print(gs.best_score["rmse"])
            print(gs.best_params["rmse"])
            print(gs.best_score["mae"])
            print(gs.best_params["mae"])
            sim_options = {
                "name": gs.best_params["rmse"]["sim_options"]["name"],
                "min_support": gs.best_params["rmse"]["sim_options"]["min_support"],
                "user_based": gs.best_params["rmse"]["sim_options"]["user_based"],
            }
        algo = KNNWithMeans(sim_options=sim_options)
        trainingSet = data.build_full_trainset()
        algo.fit(trainingSet)
    elif option == 2:
        n_epochs = 200
        lr_all = .01
        reg_all = .05
        if gridsearch:
            param_grid = {
                "n_epochs": [10, 200],
                "lr_all": [0.002, 0.1],
                "reg_all": [0.05, 0.9]
            }
            gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)
            gs.fit(data)
            print(gs.best_score["rmse"])
            print(gs.best_params["rmse"])
            print(gs.best_score["mae"])
            print(gs.best_params["mae"])
            n_epochs = gs.best_params["mae"]["n_epochs"]
            lr_all = gs.best_params["mae"]["lr_all"]
            reg_all = gs.best_params["mae"]["reg_all"]
        algo = SVD(n_epochs=n_epochs, lr_all=lr_all, reg_all=reg_all)
        trainingSet = data.build_full_trainset()
        algo.fit(trainingSet)
    else:
        n_cltr_u = 3
        n_cltr_i = 3
        n_epochs = 200
        if gridsearch:
            param_grid = {
                "n_epochs": [10, 200],
                "n_cltr_u": [2, 3, 4, 5, 6],
                "n_cltr_i": [2, 3, 4, 5, 6]
            }
            gs = GridSearchCV(CoClustering, param_grid, measures=["rmse", "mae"], cv=3)
            gs.fit(data)
            print(gs.best_score["rmse"])
            print(gs.best_params["rmse"])
            print(gs.best_score["mae"])
            print(gs.best_params["mae"])
            n_epochs = gs.best_params["rmse"]["n_epochs"]
            n_cltr_u = gs.best_params["rmse"]["n_cltr_u"]
            n_cltr_i = gs.best_params["rmse"]["n_cltr_i"]
        algo = CoClustering(n_cltr_u=n_cltr_u, n_epochs=n_epochs, n_cltr_i=n_cltr_i)
        trainingSet = data.build_full_trainset()
        algo.fit(trainingSet)
    return algo
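# Usage sketch for the selector above, assuming a Surprise data object built
# from a ratings DataFrame; option=2 grid-searches SVD hyperparameters with
# 3-fold CV and returns the tuned model fitted on the full trainset.
algo = CollabFilteringModel(data, option=2, gridsearch=True)
print(algo.predict('some-user', 'some-item').est)  # hypothetical raw ids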
    def __init__(self, module_type, baseline_type, cf_type, similar, sim_type, params):
        assert baseline_type in {"ALS", "SGD", "default"}
        assert cf_type in {None, "base_user", "base_item"}
        assert similar in {
            None, "COSINE", "cosine", "MSD", "msd", "PEARSON", "pearson",
            "PEARSON_BASELINE", "pearson_baseline", "JACCARD", "jaccard",
            "EUCLIDEAN", "euclidean"
        }
        assert sim_type in {None, "default"}

        self.module_type = module_type
        self.baseline_type = baseline_type
        self.cf_type = cf_type
        self.similar = similar
        self.sim_type = sim_type
        self.bu = None
        self.bi = None
        self.sim = None

        if self.baseline_type == "ALS":
            bsl_options = {
                'method': params["bsl_options"].get("method", 'als'),
                'n_epochs': params["bsl_options"].get("n_epochs", 10),
                'reg_u': params["bsl_options"].get("reg_u", 15),
                'reg_i': params["bsl_options"].get("reg_i", 10)
            }
        elif self.baseline_type == "SGD":
            bsl_options = {
                'method': params["bsl_options"].get("method", 'sgd'),
                'n_epochs': params["bsl_options"].get("n_epochs", 20),
                'reg': params["bsl_options"].get("reg", 0.02),
                'learning_rate': params["bsl_options"].get("learning_rate", 0.005)
            }
        else:
            # Library defaults
            bsl_options = {}

        params["sim_options"] = {}
        if self.cf_type == "base_user":
            params["sim_options"]["user_based"] = True
        elif self.cf_type == "base_item":
            # sim_options has no "item_based" key; item-based CF is user_based=False
            params["sim_options"]["user_based"] = False
        else:
            params["sim_options"]["user_based"] = True

        if self.similar in ("COSINE", "cosine"):
            params["sim_options"]["name"] = "cosine"
        elif self.similar in ("MSD", "msd"):
            params["sim_options"]["name"] = "msd"
        elif self.similar in ("PEARSON", "pearson"):
            params["sim_options"]["name"] = "pearson"
        elif self.similar in ("PEARSON_BASELINE", "pearson_baseline"):
            params["sim_options"]["name"] = "pearson_baseline"
        elif self.similar in ("JACCARD", "jaccard"):
            params["sim_options"]["name"] = "jaccard"
        elif self.similar in ("EUCLIDEAN", "euclidean"):
            params["sim_options"]["name"] = "euclidean"
        else:
            params["sim_options"]["name"] = "msd"

        if self.sim_type == "default":
            sim_options = {}
        else:
            sim_options = {
                "name": params["sim_options"].get("name", "MSD"),
                "user_based": params["sim_options"].get("user_based", True),
                "min_support": params["sim_options"].get("min_support", 5),
                "shrinkage": params["sim_options"].get("shrinkage", 100)
            }
        """
        'name': the similarity measure to use, as defined in the similarities
            module. Default 'MSD'.
        'user_based': whether similarities are computed between users or between
            items; this has a huge impact on a prediction algorithm's
            performance. Default True.
        'min_support': the minimum number of common items (when 'user_based' is
            True) or common users (when 'user_based' is False) for the similarity
            to be non-zero. Put simply, if |Iuv| < min_support then
            sim(u, v) = 0. The same goes for items.
        'shrinkage': shrinkage parameter for pearson_baseline. Default 100.
        """

        if self.module_type == "KNNmeans":
            # KNNBasic extended to account for the mean rating of each user or item
            self.model = KNNWithMeans(k=params.get("k", 40),
                                      min_k=params.get("min_k", 1),
                                      sim_options=sim_options,
                                      verbose=params.get("verbose", True))
        elif self.module_type == "KNNzscore":
            # Adds z-score normalization of each user or item
            self.model = KNNWithZScore(k=params.get("k", 40),
                                       min_k=params.get("min_k", 1),
                                       sim_options=sim_options,
                                       verbose=params.get("verbose", True))
        elif self.module_type == "KNNbase":
            # Unlike KNNWithMeans, this uses baseline estimates rather than means
            self.model = KNNBaseline(
                k=params.get("k", 40),
                min_k=params.get("min_k", 1),  # minimum number of neighbours
                sim_options=sim_options,
                bsl_options=bsl_options,
                verbose=params.get("verbose", True))
        elif self.module_type == "KNNbasic":
            # The plain KNN algorithm, either user-based or item-based
            self.model = KNNBasic(k=params.get("k", 40),
                                  min_k=params.get("min_k", 1),
                                  sim_options=sim_options,
                                  verbose=params.get("verbose", True))
        elif self.module_type == "SVD":
            self.model = SVD(n_factors=params.get("n_factors", 100),
                             n_epochs=params.get("n_epochs", 20),
                             init_mean=params.get("init_mean", 0),
                             init_std_dev=params.get("init_std_dev", 0.1),
                             lr_all=params.get("lr_all", 0.005),
                             reg_all=params.get("reg_all", 0.02),
                             lr_bu=params.get("lr_bu", None),
                             lr_bi=params.get("lr_bi", None),
                             lr_pu=params.get("lr_pu", None),
                             lr_qi=params.get("lr_qi", None),
                             reg_bu=params.get("reg_bu", None),
                             reg_bi=params.get("reg_bi", None),
                             reg_pu=params.get("reg_pu", None),
                             reg_qi=params.get("reg_qi", None),
                             random_state=params.get("random_state", None),
                             verbose=params.get("verbose", False))
            """
            n_factors - number of factors (default 100).
            n_epochs - number of SGD iterations (default 20).
            biased (bool) - whether to use baselines (biases); default True.
            init_mean / init_std_dev - mean and standard deviation of the normal
                distribution used to initialise the factor vectors (defaults 0 and 0.1).
            lr_all / reg_all - learning rate and regularisation term for all
                parameters (defaults 0.005 and 0.02).
            lr_bu, lr_bi, lr_pu, lr_qi and reg_bu, reg_bi, reg_pu, reg_qi -
                per-parameter learning rates and regularisation terms; they take
                precedence over lr_all / reg_all when set (default None).
            random_state - int, numpy RandomState instance or None; the RNG used
                for initialisation, making repeated fit() calls reproducible
                (default None).
            verbose - if True, print the current epoch (default False).
            """
        elif self.module_type == "SVDpp":
            self.model = SVDpp(n_factors=params.get("n_factors", 100),
                               n_epochs=params.get("n_epochs", 20),
                               init_mean=params.get("init_mean", 0),
                               init_std_dev=params.get("init_std_dev", 0.1),
                               lr_all=params.get("lr_all", 0.005),
                               reg_all=params.get("reg_all", 0.02),
                               lr_bu=params.get("lr_bu", None),
                               lr_bi=params.get("lr_bi", None),
                               lr_pu=params.get("lr_pu", None),
                               lr_qi=params.get("lr_qi", None),
                               reg_bu=params.get("reg_bu", None),
                               reg_bi=params.get("reg_bi", None),
                               reg_pu=params.get("reg_pu", None),
                               reg_qi=params.get("reg_qi", None),
                               random_state=params.get("random_state", None),
                               verbose=params.get("verbose", False))
            """
            Same parameters as SVD, plus lr_yj / reg_yj for the implicit-feedback
            factors. Library defaults differ: n_factors is 20 and lr_all is 0.007.
            """
        elif self.module_type == "NMF":
            # Non-negative matrix factorisation: both the p and q factor matrices
            # are constrained to be positive.
            # Note: NMF takes per-parameter options only (no lr_all / reg_all /
            # init_mean / init_std_dev), so its keyword list differs from SVD's.
            self.model = NMF(n_factors=params.get("n_factors", 15),
                             n_epochs=params.get("n_epochs", 50),
                             biased=params.get("biased", False),
                             reg_pu=params.get("reg_pu", 0.06),
                             reg_qi=params.get("reg_qi", 0.06),
                             reg_bu=params.get("reg_bu", 0.02),
                             reg_bi=params.get("reg_bi", 0.02),
                             lr_bu=params.get("lr_bu", 0.005),
                             lr_bi=params.get("lr_bi", 0.005),
                             init_low=params.get("init_low", 0),
                             init_high=params.get("init_high", 1),
                             random_state=params.get("random_state", None),
                             verbose=params.get("verbose", False))
            """
            n_factors - number of factors (default 15).
            n_epochs - number of SGD iterations (default 50).
            biased (bool) - whether to use baselines (default False).
            reg_pu - regularisation term for users, lambda_u (default 0.06).
            reg_qi - regularisation term for items, lambda_i (default 0.06).
            reg_bu / reg_bi - bias regularisation, only relevant for the biased
                version (default 0.02).
            lr_bu / lr_bi - bias learning rates, only relevant for the biased
                version (default 0.005).
            init_low - lower bound for the random initialisation of the factors;
                must be greater than or equal to 0 to ensure non-negative factors
                (default 0).
            init_high - upper bound for the random initialisation (default 1).
            random_state / verbose - as for SVD.
            """
        elif self.module_type == "SlopeOne":
            self.model = SlopeOne(**params)
        elif self.module_type == "cc":
            # Clustering-based collaborative filtering
            self.model = CoClustering(n_cltr_u=params.get("n_cltr_u", 3),
                                      n_cltr_i=params.get("n_cltr_i", 3),
                                      n_epochs=params.get("n_epochs", 20),
                                      random_state=params.get("random_state", None),
                                      verbose=params.get("verbose", False))
            """
            n_cltr_u (int) - number of user clusters (default 3).
            n_cltr_i (int) - number of item clusters (default 3).
            n_epochs (int) - number of optimisation iterations (default 20).
            random_state / verbose - as for SVD.
            """
        elif self.module_type == "BaselineOnly":
            # Predicts from the baselines alone, ignoring individual preferences
            self.model = BaselineOnly(bsl_options=bsl_options, verbose=True)
        elif self.module_type == "Np":
            # Random predictor: assumes test ratings follow a normal distribution
            # and samples predictions from it
            self.model = NormalPredictor()
import pandas as pd
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import train_test_split
from collections import defaultdict

data = pd.read_csv('train_triplets.txt', sep="\t", header=None)
data.columns = ['user', 'song', 'plays']
data = data[:30000]

song_df = pd.read_csv('song_data.csv')

data_surprise = Dataset.load_from_df(
    data, Reader(rating_scale=(1, data['plays'].max())))

# trainset, testset = train_test_split(data_surprise, test_size=.25)
trainset = data_surprise.build_full_trainset()

svd = SVD()
svd.fit(trainset)

testset = trainset.build_anti_testset()
predictions = svd.test(testset)


def get_top_n(user_id, n=10):
    '''Return the top-N recommendation for user from a set of predictions.

    Args:
        user_id: User ID
        n(int): The number of recommendation to output for each user. Default is 10.

    Returns:
# We reuse the rating matrix from above for prediction.
# As before, we use the SVD implementation from the surprise library
# for the matrix-factorization approach.

# In[40]:

# Matrix factorization (SVD)
# Reader
reader = Reader(line_format='user item rating', sep=',')
# Load the data
raw_data = Dataset.load_from_df(user_item_rating, reader=reader)
# Split the dataset
kf = KFold(n_splits=5)
# Build the model
algo = SVD(n_factors=40, biased=True)
# Train on each fold and report the RMSE
for trainset, testset in kf.split(raw_data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)

# In[41]:

# Matrix-factorization recommendation
def recommendation_basedonMF(userID, N=5):
    # Songs the user has already listened to
    used_items = user_songs[userID]
def hybrid(userId, train_rd):
    # get_ipython().magic('matplotlib inline')
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import stats
    from ast import literal_eval
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
    from nltk.stem.snowball import SnowballStemmer
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import wordnet
    from surprise import Reader, Dataset, SVD
    from surprise.model_selection import cross_validate
    from collections import defaultdict
    import warnings
    warnings.simplefilter('ignore')

    ## Popularity ##
    md = pd.read_csv('CustomData/FinalData.csv')
    fd = pd.read_csv('avg_ratings1.csv')
    fd[fd['rating'].notnull()]['rating'] = fd[fd['rating'].notnull()]['rating'].astype('float')
    vote_averages = fd[fd['rating'].notnull()]['rating']
    C = vote_averages.mean()

    fd1 = pd.read_csv('ratings_count.csv')
    fd1[fd1['rating'].notnull()]['rating'] = fd1[fd1['rating'].notnull()]['rating'].astype('float')
    vote_counts = fd1[fd1['rating'].notnull()]['rating']

    m = vote_counts.quantile(0.75)

    md['ratings_count'] = fd1['rating']
    md['average_rating'] = fd['rating']

    qualified = md[md['ratings_count'].notnull()][
        ['book_id', 'title', 'authors', 'ratings_count', 'average_rating']]
    qualified['ratings_count'] = qualified['ratings_count'].astype('float')
    qualified['average_rating'] = qualified['average_rating'].astype('float')

    def weighted_rating(x):
        v = x['ratings_count']
        R = x['average_rating']
        return (v / (v + m) * R) + (m / (m + v) * C)

    qualified['popularity_rating'] = qualified.apply(weighted_rating, axis=1)
    pop = qualified[['book_id', 'popularity_rating']]

    ## Collaborative ##
    reader = Reader()
    ratings = train_rd
    temp_ratings = ratings[0:1000]
    data = Dataset.load_from_df(temp_ratings[['user_id', 'book_id', 'rating']], reader)

    svd = SVD()
    # data.split()/evaluate() were removed from Surprise; cross_validate replaces them
    cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=2)

    trainset = data.build_full_trainset()
    algo = SVD()
    algo.fit(trainset)

    def get_top_n(predictions, n=10):
        '''Return the top-N recommendation for each user from a set of predictions.

        Args:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            n(int): The number of recommendation to output for each user. Default
                is 10.

        Returns:
            A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of size n.
        '''
        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            top_n[uid].append((iid, est))

        # Then sort the predictions for each user and retrieve the k highest ones.
        for uid, user_ratings in top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]

        return top_n

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    # Append the estimated rating of every unseen book for this user
    count = 0
    for uid, iid, true_r, est, _ in predictions:
        if uid == userId:
            count = count + 1
            temp_ratings.loc[len(temp_ratings) + 1] = [uid, iid, est]

    ## Content ##
    md = pd.read_csv('CustomData/FinalData.csv')
    rd = train_rd
    md['book_id'] = md['book_id'].astype('int')
    rd['book_id'] = rd['book_id'].astype('int')
    rd['user_id'] = rd['user_id'].astype('int')
    rd['rating'] = rd['rating'].astype('int')

    # Build a "soup" of authors and genres for each book
    md['authors'] = md['authors'].str.replace(' ', '')
    md['authors'] = md['authors'].str.lower()
    md['authors'] = md['authors'].str.replace(',', ' ')
    md['authors'] = md['authors'].apply(lambda x: [x, x])
    md['Genres'] = md['Genres'].str.split(';')
    md['soup'] = md['authors'] + md['Genres']
    md['soup'] = md['soup'].str.join(' ')

    # (renamed from `count` to avoid shadowing the counter above)
    count_vec = CountVectorizer(analyzer='word', ngram_range=(1, 1),
                                min_df=0, stop_words='english')
    count_matrix = count_vec.fit_transform(md['soup'])
    cosine_sim = cosine_similarity(count_matrix, count_matrix)

    def build_user_profiles():
        user_profiles = np.zeros((53421, 999))
        for i in range(0, 1000):
            u = rd.iloc[i]['user_id']
            b = rd.iloc[i]['book_id']
            user_profiles[u][b - 1] = rd.iloc[i]['rating']
        return user_profiles

    user_profiles = build_user_profiles()

    def _get_similar_items_to_user_profile(person_id):
        # Computes the cosine similarity between the user profile and all item profiles
        user_ratings = np.empty((999, 1))
        cnt = 0
        for i in range(0, 998):
            book_sim = cosine_sim[i]
            user_sim = user_profiles[person_id]
            user_ratings[i] = book_sim.dot(user_sim) / sum(cosine_sim[i])
        # Rescale the scores to a 0-5 range
        maxval = max(user_ratings)
        for i in range(0, 998):
            user_ratings[i] = user_ratings[i] * 5.0 / maxval
            if user_ratings[i] > 3:
                cnt += 1
        return user_ratings

    content_ratings = _get_similar_items_to_user_profile(userId)

    num = md[['book_id']]
    num1 = pd.DataFrame(data=content_ratings[0:, 0:])
    frames = [num, num1]
    # join_axes was removed from pandas; reindex on num's index instead
    mer = pd.concat(frames, axis=1).reindex(num.index)
    mer.columns = ['book_id', 'content_rating']

    ## Combine the three signals for this user ##
    cb = temp_ratings[temp_ratings['user_id'] == userId][['book_id', 'rating']]

    hyb = md[['book_id']]
    hyb = hyb.merge(cb, on='book_id')
    hyb = hyb.merge(pop, on='book_id')
    hyb = hyb.merge(mer, on='book_id')

    def weighted_rating(x):
        v = x['rating']
        R = x['popularity_rating']
        c = x['content_rating']
        return 0.4 * v + 0.2 * R + 0.4 * c

    print(hyb)
    hyb['final'] = hyb.apply(weighted_rating, axis=1)
    hyb = hyb.sort_values('final', ascending=False).head(999)
    print(hyb)
    return hyb
import os

from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

# Path to dataset folder
files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')

# This time, we'll use the built-in reader; it already defines the 1-5 rating scale
# (load_from_folds itself takes no rating_scale argument).
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

algo = SVD()

for trainset, testset in pkf.split(data):
    # Train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
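# To summarise the per-fold scores one can collect and average them; a small
# sketch re-running the predefined folds (numpy assumed available as np).
import numpy as np

fold_rmses = []
for trainset, testset in pkf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    fold_rmses.append(accuracy.rmse(predictions, verbose=False))
print('Mean RMSE over the 5 predefined folds:', np.mean(fold_rmses))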