def __init__(self, dim, alpha, lambda_, num_user_cluster, num_woi_cluster):
    """Create the per-cluster state and load the item similarity tables.

    Args:
        dim: Length of one raw article feature vector.
        alpha: Stored as ``self.alpha``.
        lambda_: Forwarded to every ``OriginalUCBUserStruct``.
        num_user_cluster: Number of user-cluster models to create.
        num_woi_cluster: Not read by this constructor.
    """
    latent_dim = dim * 3

    self.dim = dim
    self.d = dim
    self.latent = latent_dim
    self.alpha = alpha

    # One model per user cluster, a fixed table of 1000 item slots and
    # 25 item-cluster slots.
    self.users = [
        OriginalUCBUserStruct(latent_dim, lambda_, cluster_idx)
        for cluster_idx in range(num_user_cluster)
    ]
    self.items = [OriginalUCBItemStruct(dim) for _ in range(1000)]
    self.item_cluster = [OriginalUCBItemClusterStruct(dim) for _ in range(25)]

    # Bookkeeping that clac_itemsim() fills in, so it must exist first.
    self.dic = {}
    self.item_dic = {}
    self.item_count = 0
    self.now_vec = None

    self.sim_dic = self.clac_itemsim()
    self.sim_tuple = self.choice_itemsim()
    self.first_flg = False
    self.item_model = UserCluster(25).model_load('item50_25.pkl')
def user_clustering():
    """Cluster the distinct users found in the first input log and save the model.

    Each line of ``FILE_DIR + file_path[0]`` is parsed with
    ``InputData.split_data``; the fourth field is taken as the user's data.
    Every distinct value is collected once, a ``UserCluster`` with
    ``N_CLUSTERING`` clusters is fitted on the collected values, and the
    fitted model is dumped to ``kmeans.pkl`` with ``joblib``.

    Returns:
        The fitted ``UserCluster`` instance.
    """
    users = []
    with open(FILE_DIR + file_path[0]) as f:
        for line in f:
            _, _, _, user_data, _ = InputData.split_data(line)
            if user_data in users:
                continue
            users.append(user_data)
            # Progress report every 10000 distinct users.
            if len(users) % 10000 == 0:
                print(len(users))
    print(len(users))

    kmeans = UserCluster(N_CLUSTERING)
    # Fit on the vectors gathered above; nothing else in this function
    # provides training data.
    kmeans.fit(users)
    joblib.dump(kmeans, 'kmeans.pkl')
    return kmeans
def user_clustering():
    """Cluster the distinct users found in the user-cluster CSV and save the model.

    Each line of ``../analytics/stdev/usercluster200.csv`` is parsed with
    ``InputData.split_data``; the fourth field is taken as the user's data.
    Every distinct value is collected once, a ``UserCluster`` with
    ``N_CLUSTERING`` clusters is fitted on the collected values, and the
    fitted model is dumped to ``kmeans.pkl`` with ``joblib``.

    Returns:
        The fitted ``UserCluster`` instance.
    """
    users = []
    with open('../analytics/stdev/usercluster200.csv') as f:
        for line in f:
            _, _, _, user_data, _ = InputData.split_data(line)
            if user_data in users:
                continue
            users.append(user_data)
            # Progress report every 10000 distinct users.
            if len(users) % 10000 == 0:
                print(len(users))
    print(len(users))

    kmeans = UserCluster(N_CLUSTERING)
    # Fit on the vectors gathered above; nothing else in this function
    # provides training data.
    kmeans.fit(users)
    joblib.dump(kmeans, 'kmeans.pkl')
    return kmeans
if __name__ == "__main__":
    # Experiment settings shared by every algorithm below.
    dimension = 6
    alpha = 0.3
    lambda_ = 0.1
    cluster_tag = str(N_CLUSTERING)

    # Instantiate the methods under evaluation.
    # algorithms['Random'] = Random(dimension)
    algorithms = {
        'LinearedUCB': LinUCBAlgorithm(dimension, alpha, lambda_,
                                       N_CLUSTERING),
        'OriginalUCB': OriginalUCBAlgorithm(dimension, alpha, lambda_,
                                            N_CLUSTERING),
    }

    # Restore saved weights and reset the per-algorithm counters.
    for name, algorithm in algorithms.items():
        algorithm.load_weight(name + '_weight_' + cluster_tag + '_3.csv')
        reward[name] = 0
        count[name] = 1

    # cluster_model = user_clustering()
    print("=====Enviroment Start=====")
    cluster_model = UserCluster(N_CLUSTERING).model_load(
        'model' + cluster_tag + '_3.pkl')
    run_enviroment(algorithms, cluster_model=cluster_model)
# Experiment settings (``dimension`` and ``start`` are defined before this
# point).
alpha = 1.0
lambda_ = 0.1
n_stdev_cluster = 50
# model_file = 'model'+str(N_CLUSTERING)+'_3000000.pkl'
model_file = 'model' + str(N_CLUSTERING) + '_3.pkl'
global reward
global count

# Instantiate the methods under evaluation; the alternatives are kept as
# commented-out toggles.
# algorithms['Random'] = Random(dimension)
# algorithms['LinearedUCB'] = LinUCBAlgorithm(dimension, alpha, lambda_, N_CLUSTERING)
# algorithms['CLUB'] = CLUBAlgorithm(dimension, alpha, lambda_, N_CLUSTERING, alpha)
# algorithms['GOBLin'] = GOBLinAlgorithm(dimension, alpha, lambda_, N_CLUSTERING)
# algorithms['CoLinUCB'] = CoLinUCBAlgorithm(dimension, alpha, lambda_, N_CLUSTERING)
# algorithms['OriginalUCB'] = OriginalUCBAlgorithm(dimension, alpha, lambda_, N_CLUSTERING, n_stdev_cluster)
algorithms = {
    'Hybrid_LinUCB': Hybrid_LinUCBAlgorithm(dimension, alpha, lambda_),
}

# Both counters start at 1 for every algorithm.
for name in algorithms:
    reward[name] = 1
    count[name] = 1

# cluster_model = user_clustering()
print("=====Enviroment Start=====")
loaded_cluster_model = UserCluster(N_CLUSTERING).model_load(model_file)
run_enviroment(algorithms, cluster_model=loaded_cluster_model)

# Report the wall-clock time since ``start``.
elapsed_time = time.time() - start
print("elapsed_time:{0}".format(elapsed_time) + "[sec]")
class OriginalUCBAlgorithm:
    """Article selector that scores candidates with per-user-cluster models.

    Each candidate's feature vector is replaced by the item's own
    ``get_itemvec`` output, extended with the parameter vector of the item
    cluster the article is assigned to, and scored by the user cluster's
    ``get_prob``.  ``decide_try`` additionally adds a term computed from the
    most similar items (cosine similarity between item vectors).
    """

    def __init__(self, dim, alpha, lambda_, num_user_cluster,
                 num_woi_cluster=None):
        """Create the per-cluster state and load the item similarity tables.

        Args:
            dim: Length of one raw article feature vector.
            alpha: Value forwarded to ``get_prob`` (presumably the
                exploration weight -- confirm against the user struct).
            lambda_: Forwarded to every ``OriginalUCBUserStruct``.
            num_user_cluster: Number of user-cluster models to create.
            num_woi_cluster: Accepted for call compatibility; this class
                does not read it, so it may be omitted.
        """
        self.latent = dim * 3
        self.d = dim
        self.dim = dim
        self.alpha = alpha

        # One model per user cluster, a fixed table of 1000 item slots and
        # 25 item-cluster slots (matching the 25-cluster item model below).
        self.users = [
            OriginalUCBUserStruct(self.latent, lambda_, cluster_idx)
            for cluster_idx in range(num_user_cluster)
        ]
        self.items = [OriginalUCBItemStruct(dim) for _ in range(1000)]
        self.item_cluster = [
            OriginalUCBItemClusterStruct(dim) for _ in range(25)
        ]

        # article id -> slot in self.items / article id -> item vector.
        # clac_itemsim() fills these, so they must exist before it runs.
        self.dic = {}
        self.item_dic = {}
        self.item_count = 0
        self.now_vec = None

        self.sim_dic = self.clac_itemsim()
        self.sim_tuple = self.choice_itemsim()
        self.first_flg = False
        self.item_model = UserCluster(25).model_load('item50_25.pkl')

    def decide(self, userID, user_data, pool_articles):
        """Return the id of the best-scoring article in ``pool_articles``.

        Args:
            userID: Index into ``self.users``.
            user_data: Not used by this method.
            pool_articles: Mapping of article id to feature vector.

        Returns:
            The winning article id as ``int``, or ``None`` when the pool is
            empty or no score exceeds -10000.
        """
        best_score = -10000
        best_id = None
        for raw_id, features in pool_articles.items():
            article_id = int(raw_id)
            features = np.array(features)
            item = self.items[self.get_itemID(article_id)]
            # A newly seen item borrows the extra context of a similar item.
            if item.time == 0:
                self.copy_vecs(article_id, item)
            cluster = self.item_cluster[self.get_itemClusterID(features)]
            context = np.append(item.get_itemvec(features),
                                cluster.item_cluster_theta)
            score = self.users[userID].get_prob(self.alpha, context)
            if best_score < score:
                best_score = score
                best_id = article_id
        return best_id

    def decide_try(self, userID, user_data, pool_articles):
        """Like ``decide``, plus a similarity-based term from similar items.

        New article ids are registered through ``regist_itemID`` (which may
        refresh the similarity tables), and the score becomes
        ``prob_lin + 0.8 * prob_sim_item`` where ``prob_sim_item`` comes
        from the item's ``get_CBF`` over its most similar items.

        Args:
            userID: Index into ``self.users``.
            user_data: Not used by this method.
            pool_articles: Mapping of article id to feature vector.

        Returns:
            The winning article id as ``int``, or ``None`` when the pool is
            empty or no score exceeds -10000.
        """
        best_score = -10000
        best_id = None
        for raw_id, features in pool_articles.items():
            article_id = int(raw_id)
            self.now_vec = features
            features = np.array(features)
            item = self.items[self.regist_itemID(article_id, features)]
            if item.time == 0:
                self.copy_vecs(article_id, item)
            cluster = self.item_cluster[self.get_itemClusterID(features)]
            context = np.append(item.get_itemvec(features),
                                cluster.item_cluster_theta)
            user = self.users[userID]
            prob_lin = user.get_prob(self.alpha, context, cluster)

            neighbours = [
                self.items[slot]
                for slot in self.get_itemsimID(self.sim_tuple[article_id])
            ]
            prob_sim_item = item.get_CBF(neighbours, user.user_theta[:6],
                                         self.sim_dic)

            # Experiment log (translated):
            # prob = prob_lin
            #   extra vectors: clustervec, stdevvec -- best as of 8/31,
            #   final 1.253, best 1.253
            # prob = prob_lin + 0.8 * prob_sim_item
            #   extra vectors: clustervec, stdevvec + CBF-style approach
            #   (uses the top-5 most similar items) + vector copy for newly
            #   seen items -- best as of 9/1, final 1.272, best 1.272,
            #   second run 1.261
            score = prob_lin + 0.8 * prob_sim_item
            if best_score < score:
                best_score = score
                best_id = article_id
        return best_id

    def copy_vecs(self, itemID, item):
        """Seed ``item`` with the click statistics of a similar item.

        Walks the similar items of article ``itemID`` in ``self.sim_tuple``
        order (most similar first) and copies from the first one whose
        ``high_n`` exceeds 10.  Nothing is copied when none qualifies.

        Args:
            itemID: Article id used as key into ``self.sim_tuple``.
            item: The item struct to overwrite.
        """
        for similar_id in self.sim_tuple[itemID]:
            donor = self.items[self.get_itemID(int(similar_id))]
            if donor.high_n > 10:
                item.high_n = donor.high_n
                item.high_average_context = copy.deepcopy(
                    donor.high_average_context)
                item.stdev_context = copy.deepcopy(donor.stdev_context)
                item.ave_distance_click = donor.ave_distance_click
                # Stop at the best-ranked donor; continuing would let a
                # less similar item overwrite what was just copied.
                break

    def get_itemID(self, article):
        """Return the slot index for ``article``, allocating one if new."""
        if article not in self.dic:
            self.dic[article] = self.item_count
            self.item_count += 1
        return self.dic[article]

    def regist_itemID(self, article, article_vec):
        """Return the slot index for ``article``, registering it if new.

        When a new article brings the number of known ids above 50, its
        vector is stored and both similarity tables are rebuilt.
        """
        if article not in self.dic:
            self.dic[article] = self.item_count
            self.item_count += 1
            if len(self.dic) > 50:
                self.item_dic[article] = article_vec
                self.sim_dic = self.sim_calc()
                self.sim_tuple = self.choice_itemsim()
        return self.dic[article]

    def get_itemClusterID(self, article):
        """Return the first cluster predicted by the item model."""
        return self.item_model.predict_cluster(article)[0]

    def get_itemsimID(self, sim_tuples):
        """Map a sequence of article ids to their slot indices."""
        return [self.get_itemID(article_id) for article_id in sim_tuples]

    def choice_itemsim(self):
        """Return, per item, the ids of its most similar items.

        Returns:
            Dict mapping each id in ``self.sim_dic`` to a tuple of up to
            five other ids, ordered from most to least similar.  Ties keep
            the insertion order of ``self.sim_dic`` (the sort is stable).
        """
        return {
            item_id: tuple(sorted(sims, key=sims.get, reverse=True)[:5])
            for item_id, sims in self.sim_dic.items()
        }

    def update(self, userID, user_data, article_feature, click, article_id):
        """Feed one observed (article, click) event into the models.

        Updates the user-cluster model and the item-cluster model; when
        ``click`` is 1 the item's ``update_stdev`` is called as well.

        Args:
            userID: Index into ``self.users``.
            user_data: User data, converted with ``np.array``.
            article_feature: Raw feature vector of the shown article.
            click: Click indicator; compared via ``int(click) == 1``.
            article_id: Id of the shown article (used as-is as the key).
        """
        user_data = np.array(user_data)
        article_feature = np.array(article_feature)
        user = self.users[userID]
        cluster = self.item_cluster[self.get_itemClusterID(article_feature)]
        item = self.items[self.get_itemID(article_id)]

        article_feature = np.append(item.get_itemvec(article_feature),
                                    cluster.get_itemcluster_vec())
        user.update_parameters(article_feature, click, user_data)
        cluster.update(article_feature, click, user)
        if int(click) == 1:
            # user.update_stdev(article_feature)
            item.update_stdev(article_feature, user)

    def clac_itemsim(self):
        """Load ``../data/itemdata.csv`` and return the similarity table.

        Each line is ``id,v1,v2,...``.  The vector is stored in
        ``self.item_dic`` under the id as read (a string) and the integer id
        is given a slot in ``self.dic`` if it has none.

        Returns:
            The result of ``sim_calc()`` over the loaded vectors.
        """
        with open('../data/itemdata.csv') as f:
            for line in f:
                fields = line.split(',')
                raw_id = fields[0]
                self.item_dic[raw_id] = [float(val) for val in fields[1:]]
                item_id = int(raw_id)
                if item_id not in self.dic:
                    self.dic[item_id] = self.item_count
                    self.item_count += 1
        return self.sim_calc()

    def sim_calc(self):
        """Compute pairwise cosine similarity of all vectors in ``item_dic``.

        Returns:
            Nested dict ``{id: {other_id: similarity}}`` keyed by integer
            ids, without self-pairs.  A NaN similarity is replaced by 0.2.
        """
        def cosine(v1, v2):
            return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

        sim_dic = {}
        for raw_id, vec in self.item_dic.items():
            item_id = int(raw_id)
            row = sim_dic[item_id] = {}
            for raw_other, other_vec in self.item_dic.items():
                other_id = int(raw_other)
                if other_id == item_id:
                    continue
                sim = cosine(vec, other_vec)
                row[other_id] = 0.2 if math.isnan(sim) else sim
        return sim_dic

    def save_weight(self, filename):
        """Write every user model to ``./weight/<filename>``.

        Two comma-separated lines are written per user: the entries of
        ``w`` and then the 36 entries of ``sigma`` in row-major order.
        """
        with open('./weight/' + filename, 'w') as f:
            for user in self.users:
                f.write(','.join(str(float(val)) for val in user.w) + '\n')
                flat_sigma = user.sigma.reshape(1, 36)[0]
                f.write(','.join(str(float(val)) for val in flat_sigma)
                        + '\n')

    def load_weight(self, filename):
        """Restore user models from ``./weight/<filename>``.

        Lines alternate between a weight vector and a flattened 6x6 sigma
        matrix; each pair is assigned to the next user in ``self.users``.
        """
        with open('./weight/' + filename) as f:
            for line_no, line in enumerate(f):
                values = np.array([float(val) for val in line.split(',')])
                user = self.users[line_no // 2]
                if line_no % 2 == 0:
                    user.w = values
                else:
                    user.sigma = values.reshape(6, 6)

    def memory_item_num(self):
        """Write ``slot,high_n`` for every item to num_item_click.csv."""
        with open('../data/num_item_click.csv', 'w') as f:
            for slot, item in enumerate(self.items):
                f.write(str(slot) + ',' + str(item.high_n) + '\n')
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import os
import sys

# Make the sibling ``system`` directory importable before loading the
# project modules below.
path = os.path.join(os.path.dirname(__file__), '../system/')
sys.path.append(path)
from input_data import InputData
from user_clustering import UserCluster

# user cluster id -> feature vectors of the articles that cluster clicked.
clicks = {}
model = UserCluster(40).model_load('model40.pkl')

with open('/Users/chan-p/Desktop/R6/ydata-fp-td-clicks-v1_0.20090501') as f:
    for line in f:
        (_, click_article_id, click,
         user_data, article_pool) = InputData.split_data(line)
        userID = model.predict_cluster(user_data)[0]
        if click != 1:
            continue

        clicked = clicks.setdefault(userID, [])
        clicked.append(article_pool[str(click_article_id)])
        n_clicked = len(clicked)
        # Stop reading as soon as any cluster has collected 5000 clicks.
        if n_clicked == 5000:
            break
        # Progress report every 100 clicks of a cluster.
        if n_clicked % 100 == 0:
            print(userID)
            print(n_clicked)

# List the clusters that received at least one click.
for id_ in clicks:
    print(id_)