def build_user_item_sim_CF(self):
    """Build user-user and item-item Pearson similarity matrices over the
    training ratings, then cache each user's / item's k nearest neighbors.

    Results are stored on self.user_sim / self.item_sim (SimMatrix) and
    self.user_k_neibor / self.item_k_neibor ({id: {neighbor: sim}}), and
    pickled under ../data/sim and ../data/neibor.
    """
    import os
    from collections import defaultdict
    self.user_sim = SimMatrix()
    self.item_sim = SimMatrix()
    self.user_k_neibor = defaultdict(dict)
    self.item_k_neibor = defaultdict(dict)

    # --- user-user similarity (Pearson over co-rated items) ---
    print('constructing user-user similarity matrix...')
    for u1 in self.rg.user:
        for u2 in self.rg.user:
            if u1 != u2:
                if self.user_sim.contains(u1, u2):
                    continue  # symmetric pair already computed
                sim = pearson_sp(self.rg.get_row(u1), self.rg.get_row(u2))
                sim = round(sim, 5)
                self.user_sim.set(u1, u2, sim)
    # guard against a missing cache directory (mirrors the TriCFBias variant)
    if not os.path.exists('../data/sim'):
        os.makedirs('../data/sim')
    util.save_data(self.user_sim, '../data/sim/ft_08_uu_tricf_cv0.pkl')

    # --- k nearest neighbors of every user ---
    for user in self.rg.user:
        matchUsers = sorted(self.user_sim[user].items(),
                            key=lambda x: x[1],
                            reverse=True)[:self.config.user_near_num]
        self.user_k_neibor[user] = dict(matchUsers)
    if not os.path.exists('../data/neibor'):
        os.makedirs('../data/neibor')
    util.save_data(
        self.user_k_neibor, '../data/neibor/ft_08_uu_' +
        str(self.config.user_near_num) + '_neibor_tricf_cv0.pkl')

    # --- item-item similarity (Pearson over co-rating users) ---
    print('constructing item-item similarity matrix...')
    for i1 in self.rg.item:
        for i2 in self.rg.item:
            if i1 != i2:
                if self.item_sim.contains(i1, i2):
                    continue
                sim = pearson_sp(self.rg.get_col(i1), self.rg.get_col(i2))
                sim = round(sim, 5)
                self.item_sim.set(i1, i2, sim)
    util.save_data(self.item_sim, '../data/sim/ft_08_ii_tricf_cv0.pkl')

    # --- k nearest neighbors of every item ---
    for item in self.rg.item:
        matchItems = sorted(self.item_sim[item].items(),
                            key=lambda x: x[1],
                            reverse=True)[:self.config.item_near_num]
        self.item_k_neibor[item] = dict(matchItems)
    util.save_data(
        self.item_k_neibor, '../data/neibor/ft_08_ii_' +
        str(self.config.item_near_num) + '_neibor_tricf_cv0.pkl')
def __init__(self):
    """Initialize empty similarity matrices and neighbor tables."""
    super(SimBase, self).__init__()
    self.config = ConfigX()  # global experiment configuration
    self.user_sim = SimMatrix()  # user-user similarity matrix
    self.item_sim = SimMatrix()  # item-item similarity matrix
    self.user_k_neibor = defaultdict(dict)  # user -> {neighbor: sim}
    self.item_k_neibor = defaultdict(dict)  # item -> {neighbor: sim}
class ItemCF(MF):
    """Item-based collaborative filtering (ItemCF).

    Sarwar B, Karypis G, Konstan J, et al. Item-based collaborative filtering
    recommendation algorithms[C]//Proceedings of the 10th international
    conference on World Wide Web. ACM, 2001: 285-295.
    """

    def __init__(self):
        super(ItemCF, self).__init__()
        self.config.n = 50  # number of item neighbors used at prediction time

    def init_model(self, k):
        """Pre-compute Pearson similarity between every test item and every
        training item for fold k; results go into self.item_sim."""
        super(ItemCF, self).init_model(k)
        self.item_sim = SimMatrix()
        for i_test in self.rg.testSet_i:
            for i_train in self.rg.item:
                if i_test != i_train:
                    if self.item_sim.contains(i_test, i_train):
                        continue  # symmetric pair already stored
                    sim = pearson_sp(self.rg.get_col(i_test),
                                     self.rg.get_col(i_train))
                    self.item_sim.set(i_test, i_train, sim)

    def predict(self, u, i):
        """Predict user u's rating of item i from the top-n most similar
        items u has rated; falls back to the item mean (or global mean for
        unknown items) when no neighbor contributes."""
        matchItems = sorted(self.item_sim[i].items(),
                            key=lambda x: x[1], reverse=True)
        itemCount = min(self.config.n, len(matchItems))
        # numer/denom replace the original local name `sum`, which shadowed
        # the builtin.
        numer, denom = 0.0, 0.0
        for n in range(itemCount):
            similarItem, similarity = matchItems[n]
            if self.rg.containsUserItem(u, similarItem):
                rating = self.rg.trainSet_u[u][similarItem]
                numer += similarity * (rating - self.rg.itemMeans[similarItem])
                denom += similarity
        # denom == 0 guard: negative Pearson weights can cancel out even when
        # numer != 0, which previously raised ZeroDivisionError.
        if numer == 0 or denom == 0:
            if not self.rg.containsItem(i):
                return self.rg.globalMean
            return self.rg.itemMeans[i]
        return self.rg.itemMeans[i] + numer / float(denom)
def init_model(self):
    """Fill self.item_sim with cosine similarity between each test item and
    every training item (symmetric pairs computed once)."""
    self.item_sim = SimMatrix()
    for target in self.rg.testSet_i:
        for candidate in self.rg.item:
            if target == candidate or self.item_sim.contains(target, candidate):
                continue
            score = cosine_sp(self.rg.get_col(target),
                              self.rg.get_col(candidate))
            self.item_sim.set(target, candidate, score)
def init_model(self):
    """Fill self.user_sim with Pearson similarity between each test user and
    every training user (symmetric pairs computed once)."""
    self.user_sim = SimMatrix()
    for target in self.rg.testSet_u:
        for candidate in self.rg.user:
            if target == candidate or self.user_sim.contains(target, candidate):
                continue
            score = pearson_sp(self.rg.get_row(target),
                               self.rg.get_row(candidate))
            self.user_sim.set(target, candidate, score)
def init_model(self, k):
    """Initialize the base MF model for fold k, then build the Pearson
    item-item similarity matrix for all test items."""
    super(ItemCF, self).init_model(k)
    self.item_sim = SimMatrix()
    for target in self.rg.testSet_i:
        for candidate in self.rg.item:
            if target == candidate or self.item_sim.contains(target, candidate):
                continue
            score = pearson_sp(self.rg.get_col(target),
                               self.rg.get_col(candidate))
            self.item_sim.set(target, candidate, score)
def build_user_item_sim_CF(self):
    """Load cached user/item similarity matrices and k-neighbor tables.

    Fast-path variant of the computing version: instead of recomputing
    Pearson similarities it deserializes previously saved pickles, then
    re-saves the item neighbor table under the cv1 name.
    """
    from collections import defaultdict
    self.user_sim = SimMatrix()
    self.item_sim = SimMatrix()
    self.user_k_neibor = defaultdict(dict)
    self.item_k_neibor = defaultdict(dict)

    # users: similarity matrix and k nearest neighbors
    self.user_sim = util.load_data('../data/sim/ft_08_uu_tricf.pkl')
    user_neibor_path = ('../data/neibor/ft_08_uu_' +
                        str(self.config.user_near_num) + '_neibor_tricf.pkl')
    self.user_k_neibor = util.load_data(user_neibor_path)

    # items: similarity matrix and k nearest neighbors
    self.item_sim = util.load_data('../data/sim/ft_08_ii_tricf.pkl')
    item_neibor_path = ('../data/neibor/ft_08_ii_' +
                        str(self.config.item_near_num) + '_neibor_tricf.pkl')
    self.item_k_neibor = util.load_data(item_neibor_path)

    util.save_data(self.item_k_neibor,
                   '../data/neibor/ft_08_ii_' +
                   str(self.config.item_near_num) + '_neibor_tricf_cv1.pkl')
def init_model(self, k):
    """Initialize the base MF model for fold k, then compute a trust-scoped
    user similarity: one entry per (user, followee) pair."""
    super(SocialReg, self).init_model(k)
    from collections import defaultdict
    self.user_sim = SimMatrix()
    print('constructing user-user similarity matrix...')
    for u in self.rg.user:
        for followee in self.tg.get_followees(u):
            if not self.user_sim.contains(u, followee):
                self.user_sim.set(u, followee, self.get_sim(u, followee))
def __init__(self):
    """Set up random-walk hyperparameters and empty similarity containers."""
    super(SimGe, self).__init__()
    self.config = ConfigX()
    self.config.walkCount = 30  # random walks started per user
    self.config.walkLength = 20  # steps per walk
    self.config.walkDim = 20  # embedding dimensionality
    self.config.winSize = 5  # word2vec context window size
    self.config.topK = 50  # neighbors kept per user
    self.user_sim = SimMatrix()  # user-user similarity matrix
    self.item_sim = SimMatrix()  # item-item similarity matrix
    self.user_k_neibor = defaultdict(dict)  # user -> {neighbor: sim}
    self.item_k_neibor = defaultdict(dict)  # item -> {neighbor: sim}
def init_model(self, k):
    """Build user embeddings via deep walks on the co-rating network, then
    keep each user's topK most cosine-similar peers (self.topKSim) and the
    inverted followed-by table (self.topKSimBy)."""
    super(CUNE, self).init_model(k)
    self.user_sim = SimMatrix()
    self.generate_cu_net()  # build the user-user co-rating network
    self.deep_walk()
    print('Constructing similarity matrix...')
    self.topKSim = defaultdict(dict)
    done = 0
    for u1 in self.CUNet:
        scores = {}
        for u2 in self.CUNet:
            if u1 == u2:
                continue
            # cosine similarity between the two users' walk embeddings
            scores[u2] = cosine(self.model[str(u1)], self.model[str(u2)])
        # keep only the topK most similar peers, best first
        self.topKSim[u1] = sorted(scores.items(),
                                  key=lambda d: d[1],
                                  reverse=True)[:self.config.topK]
        done += 1
        if done % 200 == 0:
            # report progress every 200 users
            print('progress:', done, '/', len(self.CUNet))
    # invert the "follows" table into a "followed-by" table:
    # topKSimBy[v][u] = sim whenever v appears in topKSim[u]
    print('Constructing desimilarity matrix...')
    self.topKSimBy = defaultdict(dict)
    for follower, peers in self.topKSim.items():
        for peer in peers:
            self.topKSimBy[peer[0]][follower] = peer[1]
    print('Similarity matrix finished.')
class UserCF(MF):
    """User-based collaborative filtering (UserCF).

    Resnick P, Iacovou N, Suchak M, et al. GroupLens: an open architecture
    for collaborative filtering of netnews[C]//Proceedings of the 1994 ACM
    conference on Computer supported cooperative work. ACM, 1994: 175-186.
    """

    def __init__(self):
        super(UserCF, self).__init__()
        self.config.n = 10  # number of user neighbors used at prediction time

    def init_model(self, k):
        """Pre-compute Pearson similarity between every test user and every
        training user for fold k; results go into self.user_sim."""
        super(UserCF, self).init_model(k)
        self.user_sim = SimMatrix()
        for u_test in self.rg.testSet_u:
            for u_train in self.rg.user:
                if u_test != u_train:
                    if self.user_sim.contains(u_test, u_train):
                        continue  # symmetric pair already stored
                    sim = pearson_sp(self.rg.get_row(u_test),
                                     self.rg.get_row(u_train))
                    self.user_sim.set(u_test, u_train, sim)

    def predict(self, u, i):
        """Predict user u's rating of item i from the top-n most similar
        users who rated i; falls back to the user mean (or global mean for
        unknown users) when no neighbor contributes."""
        matchUsers = sorted(self.user_sim[u].items(),
                            key=lambda x: x[1], reverse=True)
        userCount = min(self.config.n, len(matchUsers))
        # numer/denom replace the original local name `sum`, which shadowed
        # the builtin.
        numer, denom = 0.0, 0.0
        for n in range(userCount):
            similarUser, similarity = matchUsers[n]
            if self.rg.containsUserItem(similarUser, i):
                rating = self.rg.trainSet_u[similarUser][i]
                numer += similarity * (rating - self.rg.userMeans[similarUser])
                denom += similarity
        # denom == 0 guard: negative Pearson weights can cancel out even when
        # numer != 0, which previously raised ZeroDivisionError.
        if numer == 0 or denom == 0:
            if not self.rg.containsUser(u):
                return self.rg.globalMean
            return self.rg.userMeans[u]
        return self.rg.userMeans[u] + numer / float(denom)
def getSimMatrix(self, sim_func=pearson_sp):
    """Build an item-item similarity matrix using sim_func.

    sim_func is called with the two items' rating-dict key views (the sets
    of raters), which suits set-based measures such as jaccard_sim.
    NOTE(review): the default pearson_sp expects rating dicts, not key
    views -- confirm before relying on the default.

    Zero similarities are not stored; values are rounded to 5 decimals.
    Returns a SimMatrix.
    """
    # fixed the 'gettting' typo in the log message
    self.log.info(
        "getting sim matrix with '%s()' ... (will take some time) " %
        sim_func.__name__)
    sim_matrix = SimMatrix()
    for i1 in tqdm(self.item):
        for i2 in self.item:
            if i1 != i2:
                if sim_matrix.contains(i1, i2):
                    continue  # symmetric pair already stored
                a, b = self.get_col(i1), self.get_col(i2)
                sim = sim_func(a.keys(), b.keys())
                sim = round(sim, 5)
                if sim != 0:
                    sim_matrix.set(i1, i2, sim)
    self.log.info("'%s()' get %s sims " %
                  (sim_func.__name__, sim_matrix.size()))
    return sim_matrix
class SocialReg(MF):
    """ docstring for SocialReg

    Matrix factorization with social regularization: a user's latent vector
    is pulled toward the vectors of trusted friends, weighted by rating
    similarity.

    Ma H, Zhou D, Liu C, et al. Recommender systems with social
    regularization[C]//Proceedings of the fourth ACM international conference
    on Web search and data mining. ACM, 2011: 287-296.
    """

    def __init__(self):
        super(SocialReg, self).__init__()
        # self.config.lambdaP = 0.001
        # self.config.lambdaQ = 0.001
        self.config.alpha = 0.1  # weight of the social regularization term
        self.tg = TrustGetter()  # provides followee/follower lookups
        # self.init_model()

    def init_model(self, k):
        """Initialize the MF model for fold k and pre-compute similarity
        for each (user, followee) trust pair."""
        super(SocialReg, self).init_model(k)
        from collections import defaultdict
        self.user_sim = SimMatrix()
        print('constructing user-user similarity matrix...')
        # self.user_sim = util.load_data('../data/sim/ft_cf_soreg08_cv1.pkl')
        for u in self.rg.user:
            for f in self.tg.get_followees(u):
                if self.user_sim.contains(u, f):
                    continue  # pair already computed
                sim = self.get_sim(u, f)
                self.user_sim.set(u, f, sim)
        # util.save_data(self.user_sim,'../data/sim/ft_cf_soreg08.pkl')

    def get_sim(self, u, k):
        """Pearson similarity between users u and k, rescaled from [-1, 1]
        into [0.0, 1.0] so it can serve as a non-negative weight."""
        sim = (pearson_sp(self.rg.get_row(u), self.rg.get_row(k)) +
               1.0) / 2.0  # fit the value into range [0.0,1.0]
        return sim

    def train_model(self, k):
        """SGD training with social regularization on fold k.

        Per rating: standard MF gradient plus two social terms --
        social_term_p pulls the user toward followees, social_term_m toward
        followers; both are scaled by config.alpha.
        """
        super(SocialReg, self).train_model(k)
        iteration = 0
        while iteration < self.config.maxIter:
            self.loss = 0
            for index, line in enumerate(self.rg.trainSet()):
                user, item, rating = line
                u = self.rg.user[user]  # internal user index
                i = self.rg.item[item]  # internal item index
                error = rating - self.predict(user, item)
                self.loss += 0.5 * error**2
                p, q = self.P[u], self.Q[i]

                # followee (out-link) social term and its loss contribution
                social_term_p, social_term_loss = np.zeros(
                    (self.config.factor)), 0.0
                followees = self.tg.get_followees(user)
                for followee in followees:
                    if self.rg.containsUser(followee):
                        s = self.user_sim[user][followee]
                        uf = self.P[self.rg.user[followee]]
                        social_term_p += s * (p - uf)
                        social_term_loss += s * ((p - uf).dot(p - uf))

                # follower (in-link) social term
                social_term_m = np.zeros((self.config.factor))
                followers = self.tg.get_followers(user)
                for follower in followers:
                    if self.rg.containsUser(follower):
                        s = self.user_sim[user][follower]
                        ug = self.P[self.rg.user[follower]]
                        social_term_m += s * (p - ug)

                # update latent vectors
                self.P[u] += self.config.lr * (
                    error * q - self.config.alpha *
                    (social_term_p + social_term_m) -
                    self.config.lambdaP * p)
                self.Q[i] += self.config.lr * (error * p -
                                               self.config.lambdaQ * q)
                self.loss += 0.5 * self.config.alpha * social_term_loss

            # L2 regularization added once per iteration
            self.loss += 0.5 * self.config.lambdaP * (self.P * self.P).sum(
            ) + 0.5 * self.config.lambdaQ * (self.Q * self.Q).sum()
            iteration += 1
            if self.isConverged(iteration):
                break
class SimGe():
    """Graph-embedding similarity builder.

    Item similarity is Jaccard over metadata (MetaGetter.getSimMatrix);
    user similarity is cosine over word2vec embeddings learned from random
    deep walks on the co-rating user network. Results and k-neighbor tables
    are pickled under ../data/sim and ../data/neibor.
    """

    def __init__(self):
        """Set up random-walk hyperparameters and empty similarity containers."""
        super(SimGe, self).__init__()
        self.config = ConfigX()
        self.config.walkCount = 30  # random walks started per user
        self.config.walkLength = 20  # steps per walk
        self.config.walkDim = 20  # embedding dimensionality
        self.config.winSize = 5  # word2vec context window size
        self.config.topK = 50  # neighbors kept per user
        self.user_sim = SimMatrix()
        self.item_sim = SimMatrix()
        self.user_k_neibor = defaultdict(dict)
        self.item_k_neibor = defaultdict(dict)

    def check_dataset(self):
        super(SimGe, self).check_dataset()

    def build_user_item_sim(self, kfold, user_near_num=50, item_near_num=50,
                            load_save_sim=False):
        """Compute user and item similarities for fold `kfold`.

        load_save_sim: reload previously saved pickles instead of
        recomputing, to speed up repeated experiments.
        """
        # Currently only the CF similarity is used; TODO: blend several
        # similarity measures.
        self.build_user_item_sim_CF(kfold,
                                    user_near_num=user_near_num,
                                    item_near_num=item_near_num,
                                    load_save_sim=load_save_sim)

    def build_user_item_sim_CF(self, kfold, user_near_num=50,
                               item_near_num=50, load_save_sim=False):
        self.rg = RatingGetter(kfold)
        self.mg = MetaGetter(kfold)
        from collections import defaultdict

        # ---- item-item similarity (Jaccard over metadata) ----
        print('构建 item-item 相似度矩阵 ...')
        if load_save_sim:
            self.item_sim = util.load_data(
                '../data/sim/%s_08_ii_gemf_cv0.pkl' %
                self.config.dataset_name)
        else:
            self.item_sim = self.mg.getSimMatrix(jaccard_sim)
            util.save_data(
                self.item_sim,
                '../data/sim/%s_08_ii_gemf_cv0.pkl' %
                self.config.dataset_name)

        # ---- k nearest neighbors of each item ----
        if load_save_sim:
            self.item_k_neibor = util.load_data(
                '../data/neibor/%s_08_ii_%s_neibor_gemf_cv0.pkl' %
                (self.config.dataset_name, item_near_num))
        for item in self.mg.item:
            matchItems = sorted(self.item_sim[item].items(),
                                key=lambda x: x[1],
                                reverse=True)[:item_near_num]
            self.item_k_neibor[item] = dict(matchItems)
        util.save_data(
            self.item_k_neibor,
            '../data/neibor/%s_08_ii_%s_neibor_gemf_cv0.pkl' %
            (self.config.dataset_name, item_near_num))

        # ---- user-user similarity (cosine over walk embeddings) ----
        print('构建 user-user 相似度矩阵 ...')
        if load_save_sim:
            self.user_sim = util.load_data(
                '../data/sim/%s_08_uu_gemf_cv0.pkl' %
                self.config.dataset_name)
        else:
            # keep only items rated by more than one user
            itemNet = {}
            for item in self.rg.trainSet_i:
                if len(self.rg.trainSet_i[item]) > 1:
                    itemNet[item] = self.rg.trainSet_i[item]
            # user -> list of positively rated items
            filteredRatings = defaultdict(list)
            for item in itemNet:
                for user in itemNet[item]:
                    if itemNet[item][user] > 0:
                        filteredRatings[user].append(item)
            # co-rating user network: an edge for each pair sharing an item
            self.CUNet = defaultdict(list)
            for user1 in tqdm(filteredRatings):
                s1 = set(filteredRatings[user1])
                for user2 in filteredRatings:
                    if user1 != user2:
                        s2 = set(filteredRatings[user2])
                        weight = len(s1.intersection(s2))
                        if weight > 0:
                            self.CUNet[user1] += [user2]

            print('Generating random deep walks...')
            self.walks = []
            self.visited = defaultdict(dict)
            for user in tqdm(self.CUNet):
                for t in range(self.config.walkCount):
                    path = [str(user)]
                    lastNode = user
                    for i in range(1, self.config.walkLength):
                        nextNode = choice(self.CUNet[lastNode])
                        count = 0
                        # re-sample already-visited successors; break the
                        # potential infinite loop after walkLength tries
                        while nextNode in self.visited[lastNode]:
                            nextNode = choice(self.CUNet[lastNode])
                            count += 1
                            if count == self.config.walkLength:
                                break
                        path.append(str(nextNode))
                        self.visited[user][nextNode] = 1
                        lastNode = nextNode
                    self.walks.append(path)
            self.model = w2v.Word2Vec(self.walks,
                                      size=self.config.walkDim,
                                      window=5,
                                      min_count=0,
                                      iter=3)

            progress = 0
            for u1 in tqdm(self.CUNet):
                for u2 in self.CUNet:
                    # BUG FIX: this condition previously tested
                    # `user1 != user2` -- stale variables left over from the
                    # CUNet-construction loop -- so the pair filter never
                    # applied to u1/u2.
                    if u1 != u2:
                        if self.user_sim.contains(u1, u2):
                            continue
                        wu1 = self.model[str(u1)]
                        wu2 = self.model[str(u2)]
                        self.user_sim.set(u1, u2, cosine(wu1, wu2))
                progress += 1
                if progress % 200 == 0:
                    print('progress:', progress, '/', len(self.CUNet))
            if not os.path.exists('../data/sim'):
                os.makedirs('../data/sim')
                print('../data/sim folder has been established.')
            util.save_data(
                self.user_sim,
                '../data/sim/%s_08_uu_gemf_cv0.pkl' %
                self.config.dataset_name)

        # ---- k nearest neighbors of each user ----
        if load_save_sim:
            self.user_k_neibor = util.load_data(
                '../data/neibor/%s_08_uu_%s_neibor_gemf_cv0.pkl' %
                (self.config.dataset_name, user_near_num))
        for user in self.rg.user:
            # BUG FIX: this loop previously reused the stale `sims`/`u1`
            # from the similarity loop above (a NameError when
            # load_save_sim=True); it now ranks each user's own
            # similarities, mirroring the item branch.
            matchUsers = sorted(self.user_sim[user].items(),
                                key=lambda x: x[1],
                                reverse=True)[:user_near_num]
            self.user_k_neibor[user] = dict(matchUsers)
        if not os.path.exists('../data/neibor'):
            os.makedirs('../data/neibor')
            print('../data/neibor folder has been established.')
        util.save_data(
            self.user_k_neibor,
            '../data/neibor/%s_08_uu_%s_neibor_gemf_cv0.pkl' %
            (self.config.dataset_name, user_near_num))
class SimBase():
    """Compute and cache user-user / item-item similarities plus k-neighbor
    tables.

    Item similarity is Jaccard over metadata (MetaGetter.getSimMatrix);
    user similarity is Pearson over rating rows. Results are pickled under
    ../data/sim and ../data/neibor so later runs can reload them.
    """

    def __init__(self):
        """Initialize empty similarity matrices and neighbor tables."""
        super(SimBase, self).__init__()
        self.config = ConfigX()
        self.user_sim = SimMatrix()
        self.item_sim = SimMatrix()
        self.user_k_neibor = defaultdict(dict)
        self.item_k_neibor = defaultdict(dict)

    def check_dataset(self):
        super(SimBase, self).check_dataset()

    def build_user_item_sim(self, kfold, user_near_num=50, item_near_num=50,
                            load_save_sim=False):
        """Compute user and item similarities for fold `kfold`.

        load_save_sim: reload previously saved pickles instead of
        recomputing, to speed up repeated experiments.
        """
        # Currently only the CF similarity is used; TODO: blend several
        # similarity measures.
        self.build_user_item_sim_CF(kfold,
                                    user_near_num=user_near_num,
                                    item_near_num=item_near_num,
                                    load_save_sim=load_save_sim)

    def build_user_item_sim_CF(self, kfold, user_near_num=50,
                               item_near_num=50, load_save_sim=False):
        self.rg = RatingGetter(kfold)
        self.mg = MetaGetter(kfold)
        from collections import defaultdict

        # ---- item-item similarity (Jaccard over metadata) ----
        print('构建 item-item 相似度矩阵 ...')
        if load_save_sim:
            self.item_sim = util.load_data(
                '../data/sim/%s_08_ii_cucmemf_cv0.pkl' %
                self.config.dataset_name)
        else:
            self.item_sim = self.mg.getSimMatrix(jaccard_sim)
            util.save_data(
                self.item_sim,
                '../data/sim/%s_08_ii_cucmemf_cv0.pkl' %
                self.config.dataset_name)

        # ---- k nearest neighbors of each item ----
        if load_save_sim:
            self.item_k_neibor = util.load_data(
                '../data/neibor/%s_08_ii_%s_neibor_cucmemf_cv0.pkl' %
                (self.config.dataset_name, item_near_num))
        for item in self.mg.item:
            matchItems = sorted(self.item_sim[item].items(),
                                key=lambda x: x[1],
                                reverse=True)[:item_near_num]
            self.item_k_neibor[item] = dict(matchItems)
        util.save_data(
            self.item_k_neibor,
            '../data/neibor/%s_08_ii_%s_neibor_cucmemf_cv0.pkl' %
            (self.config.dataset_name, item_near_num))

        # ---- user-user similarity (Pearson over rating rows) ----
        print('构建 user-user 相似度矩阵 ...')
        if load_save_sim:
            self.user_sim = util.load_data(
                '../data/sim/%s_08_uu_cucmemf_cv0.pkl' %
                self.config.dataset_name)
        else:
            for u1 in tqdm(self.rg.user):
                for u2 in self.rg.user:
                    if u1 != u2:
                        if self.user_sim.contains(u1, u2):
                            continue  # symmetric pair already computed
                        sim = pearson_sp(self.rg.get_row(u1),
                                         self.rg.get_row(u2))
                        sim = round(sim, 5)
                        self.user_sim.set(u1, u2, sim)
            if not os.path.exists('../data/sim'):
                os.makedirs('../data/sim')
                print('../data/sim folder has been established.')
            util.save_data(
                self.user_sim,
                '../data/sim/%s_08_uu_cucmemf_cv0.pkl' %
                self.config.dataset_name)

        # ---- k nearest neighbors of each user ----
        if load_save_sim:
            self.user_k_neibor = util.load_data(
                '../data/neibor/%s_08_uu_%s_neibor_cucmemf_cv0.pkl' %
                (self.config.dataset_name, user_near_num))
        for user in self.rg.user:
            # BUG FIX: the slice previously started at `kfold`
            # ([kfold:user_near_num]), silently dropping the `kfold` most
            # similar users; it now mirrors the item branch.
            matchUsers = sorted(self.user_sim[user].items(),
                                key=lambda x: x[1],
                                reverse=True)[:user_near_num]
            self.user_k_neibor[user] = dict(matchUsers)
        if not os.path.exists('../data/neibor'):
            os.makedirs('../data/neibor')
            print('../data/neibor folder has been established.')
        util.save_data(
            self.user_k_neibor,
            '../data/neibor/%s_08_uu_%s_neibor_cucmemf_cv0.pkl' %
            (self.config.dataset_name, user_near_num))
class TriCFBias(MF):
    """ docstring for TriCFBias

    Matrix factorization with user/item bias terms plus neighborhood
    regularization: each user (item) latent vector is pulled toward its k
    most similar users (items) from Pearson similarity matrices.
    """

    def __init__(self):
        super(TriCFBias, self).__init__()
        # self.config.lr=0.001
        self.config.lambdaU = 0.002  # weight of the user-neighborhood term
        self.config.lambdaI = 0.001  # weight of the item-neighborhood term
        self.config.lambdaP = 0.02  # L2 on user factors
        self.config.lambdaQ = 0.03  # L2 on item factors
        self.config.lambdaB = 0.01  # L2 on bias terms
        self.config.user_near_num = 50  # k nearest users kept
        self.config.item_near_num = 50  # k nearest items kept
        # self.init_model()

    def init_model(self, k):
        """Initialize the base MF model for fold k, the bias vectors, and the
        similarity / neighbor structures."""
        super(TriCFBias, self).init_model(k)
        np.random.seed(seed=self.config.random_state)
        self.Bu = np.random.rand(
            self.rg.get_train_size()[0])  # bias value of user
        np.random.seed(seed=self.config.random_state)  # fix the random seed
        self.Bi = np.random.rand(
            self.rg.get_train_size()[1])  # bias value of item
        self.build_user_item_sim_CF(
        )  # construct the u-u,i-i similarity matirx and their's k neighbors

    def build_user_item_sim_CF(self):
        """Build user-user and item-item Pearson similarity matrices and the
        k-neighbor tables; pickle everything under ../data."""
        from collections import defaultdict
        self.user_sim = SimMatrix()
        self.item_sim = SimMatrix()
        self.user_k_neibor = defaultdict(dict)
        self.item_k_neibor = defaultdict(dict)

        # user-user similarity (Pearson over co-rated items)
        print('constructing user-user similarity matrix...')
        # self.user_sim = util.load_data('../data/sim/ft_08_uu_tricf.pkl')
        for u1 in self.rg.user:
            for u2 in self.rg.user:
                if u1 != u2:
                    if self.user_sim.contains(u1, u2):
                        continue  # symmetric pair already computed
                    sim = pearson_sp(self.rg.get_row(u1), self.rg.get_row(u2))
                    sim = round(sim, 5)
                    self.user_sim.set(u1, u2, sim)
        if not os.path.exists('../data/sim'):
            os.makedirs('../data/sim')
            print('../data/sim folder has been established.')
        print("save user sims size = %s" % (self.user_sim.size()))
        util.save_data(self.user_sim, '../data/sim/ft_08_uu_tricf_cv0.pkl')

        # compute the k neighbors of user
        # self.user_k_neibor = util.load_data(
        #     '../data/neibor/ft_08_uu_' + str(self.config.user_near_num) + '_neibor_tricf.pkl')
        for user in self.rg.user:
            matchUsers = sorted(self.user_sim[user].items(),
                                key=lambda x: x[1],
                                reverse=True)[:self.config.user_near_num]
            matchUsers = matchUsers[:self.config.user_near_num]
            self.user_k_neibor[user] = dict(matchUsers)
        if not os.path.exists('../data/neibor'):
            os.makedirs('../data/neibor')
            print('../data/neibor folder has been established.')
        util.save_data(
            self.user_k_neibor, '../data/neibor/ft_08_uu_' +
            str(self.config.user_near_num) + '_neibor_tricf_cv0.pkl')

        # item-item similarity (Pearson over co-rating users)
        print('constructing item-item similarity matrix...')
        # self.item_sim = util.load_data('../data/sim/ft_08_ii_tricf.pkl')
        for i1 in self.rg.item:
            for i2 in self.rg.item:
                if i1 != i2:
                    if self.item_sim.contains(i1, i2):
                        continue
                    sim = pearson_sp(self.rg.get_col(i1), self.rg.get_col(i2))
                    sim = round(sim, 5)
                    self.item_sim.set(i1, i2, sim)
        print("save item sims size = %s" % (self.item_sim.size()))
        util.save_data(self.item_sim, '../data/sim/ft_08_ii_tricf_cv0.pkl')

        # compute the k neighbors of item
        # self.item_k_neibor = util.load_data(
        #     '../data/neibor/ft_08_ii_' + str(self.config.item_near_num) + '_neibor_tricf.pkl')
        for item in self.rg.item:
            matchItems = sorted(self.item_sim[item].items(),
                                key=lambda x: x[1],
                                reverse=True)[:self.config.item_near_num]
            matchItems = matchItems[:self.config.item_near_num]
            self.item_k_neibor[item] = dict(matchItems)
        util.save_data(
            self.item_k_neibor, '../data/neibor/ft_08_ii_' +
            str(self.config.item_near_num) + '_neibor_tricf_cv0.pkl')
        pass

    def train_model(self, k):
        """SGD training on fold k: biased MF plus neighborhood regularization
        toward each user's / item's k nearest neighbors."""
        super(TriCFBias, self).train_model(k)
        print('training model...')
        iteration = 0
        # faflag=True
        while iteration < self.config.maxIter:
            self.loss = 0
            self.u_near_total_dict = defaultdict()
            self.i_near_total_dict = defaultdict()
            for index, line in enumerate(self.rg.trainSet()):
                user, item, rating = line
                u = self.rg.user[user]  # internal user index
                i = self.rg.item[item]  # internal item index
                error = rating - self.predict(user, item)
                self.loss += error**2
                p, q = self.P[u], self.Q[i]

                # get the k neighbors of user and item
                matchUsers = self.user_k_neibor[user]
                matchItems = self.item_k_neibor[item]

                # similarity-weighted pull of p toward its user neighbors
                u_near_sum, u_near_total, s = np.zeros(
                    (self.config.factor)), 0.0, 0.0
                for suser in matchUsers.keys():
                    near_user, sim_value = suser, matchUsers[suser]
                    if sim_value != 0.0:
                        s += sim_value
                        pn = self.P[self.rg.user[near_user]]
                        u_near_sum += sim_value * (pn - p)
                        u_near_total += sim_value * ((pn - p).dot(pn - p))
                if s != 0.0:
                    u_near_sum /= s  # normalize by total similarity weight

                # similarity-weighted pull of q toward its item neighbors
                i_near_sum, i_near_total, ss = np.zeros(
                    (self.config.factor)), 0.0, 0.0
                for sitem in matchItems:
                    near_item, sim_value = sitem, matchItems[sitem]
                    if sim_value != 0.0:
                        ss += sim_value
                        qn = self.Q[self.rg.item[near_item]]
                        i_near_sum += sim_value * (qn - q)
                        i_near_total += sim_value * ((qn - q).dot(qn - q))
                if ss != 0.0:
                    i_near_sum /= ss

                # cache the first neighborhood-loss contribution per index
                if u not in self.u_near_total_dict:
                    self.u_near_total_dict[u] = u_near_total
                if i not in self.i_near_total_dict:
                    self.i_near_total_dict[i] = i_near_total

                # gradient steps for biases and latent factors
                self.Bu[u] += self.config.lr * (
                    error - self.config.lambdaB * self.Bu[u])
                self.Bi[i] += self.config.lr * (
                    error - self.config.lambdaB * self.Bi[i])
                self.P[u] += self.config.lr * (
                    error * q - self.config.lambdaU * u_near_sum -
                    self.config.lambdaP * p)
                self.Q[i] += self.config.lr * (
                    error * p - self.config.lambdaI * i_near_sum -
                    self.config.lambdaQ * q)
                self.loss += 0.5 * (self.config.lambdaU * u_near_total +
                                    self.config.lambdaI * i_near_total)

            # L2 regularization added once per iteration
            self.loss += self.config.lambdaP * (self.P * self.P).sum() + self.config.lambdaQ * (self.Q * self.Q).sum() \
                + self.config.lambdaB * ((self.Bu * self.Bu).sum() + (self.Bi * self.Bi).sum())
            iteration += 1
            if self.isConverged(iteration):
                break

    # test cold start users among test set
    def predict_model_cold_users_improved(self):
        """RMSE over cold-start test users, using predict_improved.

        NOTE(review): predict_improved is not defined in this view --
        presumably provided elsewhere in the class/file; confirm.
        """
        res = []
        for user in self.rg.testColdUserSet_u.keys():
            for item in self.rg.testColdUserSet_u[user].keys():
                rating = self.rg.testColdUserSet_u[user][item]
                pred = self.predict_improved(user, item)
                # denormalize
                pred = denormalize(pred, self.config.min_val,
                                   self.config.max_val)
                pred = self.checkRatingBoundary(pred)
                res.append([user, item, rating, pred])
        rmse = Metric.RMSE(res)
        return rmse
def init_model(self, k):
    """Initialize the base model for fold k, then derive social similarities
    from deep walks on the collaborative user network."""
    super(CUNE, self).init_model(k)
    self.user_sim = SimMatrix()
    self.generate_cu_net()  # build the user-user co-rating network
    self.deep_walk()
    self.compute_social_sim()