def __init__(self, train_filename, topic_num, split_sig, time_format,
             uin, iin, timein, read_from_file=False):
    """Load the check-in data and obtain the model parameters.

    The parameters are read from the cache files under
    ``mid_data/<train name>/tim-plsa<topic_num>t/`` when all five files
    exist. Otherwise they come from ``self.init_data()`` followed by
    ``self.em_loop()`` and are then written to that directory.

    Args:
        train_filename: Path of the training file; its name (without the
            last extension) also names the cache directory.
        topic_num: Stored on the instance and used in the directory name.
        split_sig: Field separator forwarded to the file readers.
        time_format: Time format forwarded to ``read_checks_table``.
        uin: Column index forwarded as ``uin`` / ``oin`` to the readers.
        iin: Column index forwarded as ``iin`` / ``ain`` to the readers.
        timein: Column index forwarded as ``timein``.
        read_from_file: Not used by this constructor.
    """
    self.train_filename = train_filename
    self.topic_num = topic_num
    self.checks = read_checks_table(train_filename,
                                    split_sig=split_sig,
                                    time_format=time_format,
                                    uin=uin, iin=iin, timein=timein)
    self.check_set = read_dic_set(train_filename, split_tag=split_sig,
                                  oin=uin, ain=iin)
    print(len(self.check_set))

    dir_name = ('mid_data/' + '-'.join(train_filename.split('.')[:-1])
                + '/tim-plsa' + str(topic_num) + 't/')
    u_in_z_filename = dir_name + 'pr_u_in_z.txt'
    i_in_z_filename = dir_name + 'pr_i_in_z.txt'
    z_filename = dir_name + 'pz.txt'
    pr_filename = dir_name + 'pr.txt'
    t_in_z_filename = dir_name + 'pr_t_in_z.txt'

    # makedirs also creates missing parents and tolerates an existing
    # directory, so the constructor works on a fresh checkout.
    os.makedirs(dir_name, exist_ok=True)

    cache_files = (u_in_z_filename, i_in_z_filename, z_filename,
                   pr_filename, t_in_z_filename)
    if all(os.path.exists(name) for name in cache_files):
        # pr.txt must exist for the cache to count as complete, but
        # self.pr itself is not restored here.
        self.pr_u_in_z, self.pr_i_in_z, self.pz, self.pr_t_in_z = (
            read_obj(u_in_z_filename),
            read_obj(i_in_z_filename),
            read_obj(z_filename),
            read_obj(t_in_z_filename),
        )
    else:
        (self.pr_u_in_z, self.pr_i_in_z, self.pr_t_in_z,
         self.pz, self.pr) = self.init_data()
        self.em_loop()
        write_obj(u_in_z_filename, self.pr_u_in_z)
        write_obj(i_in_z_filename, self.pr_i_in_z)
        write_obj(z_filename, self.pz)
        write_obj(pr_filename, self.pr)
        write_obj(t_in_z_filename, self.pr_t_in_z)
    print(self.pz)
def __init__(self, train_filename, topic_num, split_sig, time_format,
             uin, iin, timein, read_from_file=False):
    """Load the check-in data and obtain the model parameters.

    The parameters are read from the cache files under
    ``mid_data/<train name>/plsa<topic_num>t/`` when all five files exist.
    Otherwise they come from ``MyLDA.init_data`` and ``MyLDA.em_loop``,
    ``pr_z_in_u`` is derived from ``self.pr``, and everything is written
    to that directory.

    Args:
        train_filename: Path of the training file; its name (without the
            last extension) also names the cache directory.
        topic_num: Stored on the instance, passed to ``MyLDA.init_data``
            and used in the directory name.
        split_sig: Field separator forwarded to ``read_checks_table``.
        time_format: Time format forwarded to ``read_checks_table``.
        uin: Column index forwarded as ``uin``.
        iin: Column index forwarded as ``iin``.
        timein: Column index forwarded as ``timein``.
        read_from_file: Not used by this constructor.
    """
    self.train_filename = train_filename
    self.topic_num = topic_num
    self.checks = read_checks_table(train_filename,
                                    split_sig=split_sig,
                                    time_format=time_format,
                                    uin=uin, iin=iin, timein=timein)

    dir_name = ('mid_data/' + '-'.join(train_filename.split('.')[:-1])
                + '/plsa' + str(topic_num) + 't/')
    u_in_z_filename = dir_name + 'pr_u_in_z.txt'
    i_in_z_filename = dir_name + 'pr_i_in_z.txt'
    z_filename = dir_name + 'pz.txt'
    pr_filename = dir_name + 'pr.txt'
    z_in_u_filename = dir_name + 'pr_z_in_u.txt'

    # makedirs also creates missing parents and tolerates an existing
    # directory, unlike a bare os.mkdir.
    os.makedirs(dir_name, exist_ok=True)

    cache_files = (u_in_z_filename, i_in_z_filename, z_filename,
                   pr_filename, z_in_u_filename)
    if all(os.path.exists(name) for name in cache_files):
        # pr.txt must exist for the cache to count as complete, but
        # self.pr is deliberately not loaded back.
        self.pr_u_in_z, self.pr_i_in_z, self.pz, self.pr_z_in_u = (
            read_obj(u_in_z_filename),
            read_obj(i_in_z_filename),
            read_obj(z_filename),
            read_obj(z_in_u_filename),
        )
    else:
        self.pr_u_in_z, self.pr_i_in_z, self.pz, self.pr = MyLDA.init_data(
            topic_num, self.checks)
        MyLDA.em_loop(self.pr_u_in_z, self.pr_i_in_z, self.pz, self.pr,
                      self.checks)

        # For every user, sum self.pr[(u, i)][z] over the user's checks
        # for each key z of self.pz, then pass the per-user dict to
        # dic_value_reg_one (presumably normalises it to sum to one).
        self.pr_z_in_u = {}
        for u, user_checks in self.checks.items():
            per_topic = {}
            self.pr_z_in_u[u] = per_topic
            for z in self.pz:
                per_topic[z] = 0
                for check in user_checks:
                    per_topic[z] += self.pr[(u, check[0])][z]
            dic_value_reg_one(per_topic)

        write_obj(u_in_z_filename, self.pr_u_in_z)
        write_obj(i_in_z_filename, self.pr_i_in_z)
        write_obj(z_filename, self.pz)
        write_obj(pr_filename, self.pr)
        write_obj(z_in_u_filename, self.pr_z_in_u)
    print(self.pz)
def __init__(self, checks, K, dirname):
    """Load the ``p`` and ``q`` objects from ``dirname`` or compute them.

    When both ``<dirname>p.txt`` and ``<dirname>q.txt`` exist they are read
    with ``read_obj``; otherwise ``MyMFModel.gradAscent(checks, K)`` is
    called and its two results are written to those files.

    Args:
        checks: Check-in table, stored on the instance and passed to
            ``MyMFModel.gradAscent``.
        K: Stored on the instance and passed to ``MyMFModel.gradAscent``.
        dirname: Path prefix (expected to end with a separator) for the
            two cache files.
    """
    self.checks = checks
    self.K = K

    p_path = dirname + 'p.txt'
    q_path = dirname + 'q.txt'
    have_cache = os.path.exists(p_path) and os.path.exists(q_path)

    if not have_cache:
        self.p, self.q = MyMFModel.gradAscent(self.checks, K)
        write_obj(p_path, self.p)
        write_obj(q_path, self.q)
        return

    loaded_p = read_obj(p_path)
    loaded_q = read_obj(q_path)
    self.p, self.q = loaded_p, loaded_q
def main(train_file, test_file, feature_num, topks):
    """Build or load a ``MyMFModel``, recommend, and score the result.

    Precision and recall are computed for every value in ``topks`` and
    written to ``pr.txt`` / ``re.txt`` in the model directory
    ``mid_data/<train name>/pmf<feature_num>t/``.

    Args:
        train_file: Training file read with ``read_checks_table``.
        test_file: Test file read with ``read_checks_table``.
        feature_num: Passed to ``MyMFModel`` as ``K`` and used in the
            directory name.
        topks: Iterable of cut-offs passed to ``precision`` / ``recall``.

    Returns:
        ``(nprs, nres)``: each a list holding one list of values rounded
        to four decimals, one value per entry of ``topks``.
    """
    base_dir = 'mid_data/' + '-'.join(train_file.split('.')[:-1]) + '/'
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)

    nprs, nres = [], []

    print('read_table')
    checks = read_checks_table(train_file, uin=0, iin=1)
    test = read_checks_table(test_file, uin=0, iin=1)

    dir_name = base_dir + 'pmf' + str(feature_num) + 't/'
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)

    mf_model = MyMFModel(checks, K=feature_num, dirname=dir_name)

    ex_rec_name = dir_name + '-'.join(['ex_rec']) + '.txt'
    if not os.path.exists(ex_rec_name):
        print('recommend')
        rec = mf_model.recommend()
        write_obj(ex_rec_name, rec)
    else:
        print('read recommend result from file')
        rec = read_obj(ex_rec_name)
        # NOTE(review): cached lists are reversed in place while freshly
        # computed ones are used as-is - confirm this is intended.
        for ranked in rec.values():
            ranked.reverse()

    precisions = []
    recalls = []
    for cutoff in topks:
        p_value = precision(rec, test, cutoff)
        print(p_value)
        r_value = recall(rec, test, cutoff)
        precisions.append(float('%.4f' % p_value))
        recalls.append(float('%.4f' % r_value))

    nprs.append(list(precisions))
    nres.append(list(recalls))
    out_json_to_file(dir_name + 'pr.txt', nprs)
    out_json_to_file(dir_name + 're.txt', nres)
    return nprs, nres
def cf_main(train_file, test_file, topns=None, topks=None, topic_num=8):
    """Run the 'tim-plsa' recommendation experiment and score it.

    Reads the train/test tables, builds a ``MyLDA`` model, produces (or
    loads cached) recommendations and evaluates precision/recall for each
    ``topk``. Results are written to ``nprs.txt`` / ``nres.txt`` under
    ``mid_data/<train name>/tim-plsa<topic_num>t/``.

    Args:
        train_file: Tab-separated training file (columns 0, 1 and 7 are
            read as ``uin``, ``iin`` and ``timein``).
        test_file: Test file in the same layout.
        topns: Values used only to name the cached recommendation file;
            defaults to ``[20]``.
        topks: Cut-offs passed to ``precision`` / ``recall``; defaults to
            ``[20]``.
        topic_num: Passed to ``MyLDA`` and used in the directory name.

    Returns:
        ``(nprs, nres)``: one list per ``topn`` holding the rounded
        precision (resp. recall) for every ``topk``.
    """
    start = datetime.now()
    if topks is None:
        topks = [20]
    if topns is None:
        topns = [20]
    nprs = []
    nres = []

    print('read_table')
    # Other layouts previously used here (kept for reference):
    #   split_sig='\t', iin=4, timein=1, time_format='%Y-%m-%dT%H:%M:%SZ'
    #   split_sig=',',  iin=4, timein=3, time_format='%Y-%m-%d %H:%M:%S'
    time_format = '%a %b %d %H:%M:%S %z %Y'
    table = read_checks_table(train_file, split_sig='\t', uin=0, iin=1,
                              timein=7, scorein=None,
                              time_format=time_format)
    test = read_checks_table(test_file, split_sig='\t', uin=0, iin=1,
                             timein=7, scorein=None,
                             time_format=time_format)

    base_dir = 'mid_data/' + '-'.join(train_file.split('.')[:-1]) + '/'
    # makedirs creates 'mid_data' too and tolerates an existing directory.
    os.makedirs(base_dir, exist_ok=True)

    lda = MyLDA(train_filename=train_file, topic_num=topic_num,
                split_sig='\t', uin=0, iin=1, timein=7,
                time_format=time_format)
    predict_fun = lda.predict

    sim_fun_name = 'tim-plsa' + str(topic_num) + 't'
    dir_name = base_dir + sim_fun_name + '/'
    os.makedirs(dir_name, exist_ok=True)

    for topn in topns:
        ex_rec_name = (dir_name
                       + '-'.join(['ex_rec', sim_fun_name, str(topn)])
                       + '.txt')
        if os.path.exists(ex_rec_name):
            print('read recommend result from file')
            rec = read_obj(ex_rec_name)
        else:
            print('recommend')
            users = set(table.keys())
            items = set()
            # One queue shared by all topics, fed every (item, weight) pair.
            zp = ZPriorityQ(maxsize=1000)
            for z in range(len(lda.pr_i_in_z)):
                for i in range(len(lda.pr_i_in_z[z, :])):
                    zp.enQ(KVTtem(i, lda.pr_i_in_z[z, i]))
            # NOTE(review): the original source lost its indentation; this
            # update may instead belong inside the ``for z`` loop - confirm.
            items.update([e.k for e in zp.items])
            print(len(items))
            rec = exclude_recommend(table, users, items, predict_fun)
            write_obj(ex_rec_name, rec)

        prs = []
        res = []
        for topk in topks:
            print('precision')
            pr = precision(rec, test, topk)
            print(pr)
            re = recall(rec, test, topk)
            print('recall')
            prs.append(float('%.4f' % pr))
            res.append(float('%.4f' % re))
        nprs.append(prs.copy())
        nres.append(res.copy())

    out_json_to_file(dir_name + 'nprs.txt', nprs)
    out_json_to_file(dir_name + 'nres.txt', nres)
    end = datetime.now()
    # total_seconds() includes whole days; ``.seconds`` alone drops them.
    print('the cost time is ', int((end - start).total_seconds()))
    return nprs, nres
def __init__(self, checks, K, dirname):
    """Build the user/item indices and ``R``, then load or train ``p``/``q``.

    When ``p.txt``, ``q.txt``, ``user-list.txt`` and ``item-list.txt`` all
    exist under ``dirname`` they are loaded with ``read_obj``. Otherwise
    the user and item lists are collected from ``checks``,
    ``self.gradAscent()`` is run and the four objects are written out.

    Args:
        checks: Mapping from original user id to that user's checks;
            ``check[0]`` is taken as the item id.
        K: Stored on the instance as ``self.K``.
        dirname: Path prefix (expected to end with a separator) for the
            cache files.

    Raises:
        KeyError: If ``checks`` contains a user or item that is missing
            from the cached user/item lists.
    """
    self.K = K
    p_name = dirname + 'p.txt'
    q_name = dirname + 'q.txt'
    user_list_name = dirname + 'user-list.txt'
    item_list_name = dirname + 'item-list.txt'

    cached = all(
        os.path.exists(name)
        for name in (p_name, q_name, user_list_name, item_list_name)
    )

    if cached:
        self.p, self.q, self.users, self.items = (
            read_obj(p_name),
            read_obj(q_name),
            read_obj(user_list_name),
            read_obj(item_list_name),
        )
    else:
        print('init user, items index')
        users = set()
        items = set()
        for user_id, user_checks in checks.items():
            users.add(user_id)
            for check in user_checks:
                items.add(check[0])
        self.users = list(users)
        self.items = list(items)

    self.M = len(self.users)
    self.N = len(self.items)
    print(self.M, self.N)
    # Original id -> dense position in self.users / self.items.
    self.user_index = {user: pos for pos, user in enumerate(self.users)}
    self.item_index = {item: pos for pos, item in enumerate(self.items)}

    self.R = {}
    # ``checks`` is keyed by the original user id, so look the checks up
    # with ``ou``; the dense index ``u`` is only valid as a key of self.R.
    for ou, user_checks in checks.items():
        u = self.user_index[ou]
        row = self.R.setdefault(u, {})
        for check in user_checks:
            i = self.item_index[check[0]]
            if cached:
                # NOTE(review): the cached path stores a 0/1 flag while
                # the training path stores a count - confirm intended.
                row[i] = 1
            else:
                row[i] = row.get(i, 0) + 1

    if not cached:
        # Initialise the parameter lists as matrices.
        self.p, self.q = self.gradAscent()
        write_obj(p_name, self.p)
        write_obj(q_name, self.q)
        write_obj(user_list_name, self.users)
        write_obj(item_list_name, self.items)
def save(self):
    """Write ``self.cache`` to the location ``self.name`` via ``write_obj``."""
    target = self.name
    payload = self.cache
    write_obj(target, payload)
def cf_main(train_file, test_file, topns=None, topks=None, topic_num=8):
    """Run the 'lda' recommendation experiment and score it.

    Reads the train/test tables, builds a ``MyLDA`` model, produces (or
    loads cached) recommendations and evaluates precision/recall for each
    ``topk``. Results are written to ``nprs.txt`` / ``nres.txt`` under
    ``mid_data/<train name>/lda<topic_num>t/``.

    Args:
        train_file: Tab-separated training file (columns 0, 4 and 1 are
            read as ``uin``, ``iin`` and ``timein``).
        test_file: Test file in the same layout.
        topns: Values used only to name the cached recommendation file;
            defaults to ``[20]``.
        topks: Cut-offs passed to ``precision`` / ``recall``; defaults to
            ``[20]``.
        topic_num: Passed to ``MyLDA`` and used in the directory name.

    Returns:
        ``(nprs, nres)``: one list per ``topn`` holding the rounded
        precision (resp. recall) for every ``topk``.
    """
    if topks is None:
        topks = [20]
    if topns is None:
        topns = [20]
    nprs = []
    nres = []

    print('read_table')
    # Alternative layout previously used here (kept for reference):
    #   split_sig=',', iin=4, timein=3, time_format='%Y-%m-%d %H:%M:%S'
    time_format = '%Y-%m-%dT%H:%M:%SZ'
    table = read_checks_table(train_file, split_sig='\t', uin=0, iin=4,
                              timein=1, scorein=None,
                              time_format=time_format)
    test = read_checks_table(test_file, split_sig='\t', uin=0, iin=4,
                             timein=1, scorein=None,
                             time_format=time_format)

    base_dir = 'mid_data/' + '-'.join(train_file.split('.')[:-1]) + '/'
    # makedirs creates 'mid_data' too and tolerates an existing directory.
    os.makedirs(base_dir, exist_ok=True)

    lda = MyLDA(train_filename=train_file, topic_num=topic_num,
                split_sig='\t', uin=0, iin=4, timein=1,
                time_format=time_format)
    predict_fun = lda.predict

    sim_fun_name = 'lda' + str(topic_num) + 't'
    dir_name = base_dir + sim_fun_name + '/'
    os.makedirs(dir_name, exist_ok=True)

    for topn in topns:
        ex_rec_name = (dir_name
                       + '-'.join(['ex_rec', sim_fun_name, str(topn)])
                       + '.txt')
        if os.path.exists(ex_rec_name):
            print('read recommend result from file')
            rec = read_obj(ex_rec_name)
        else:
            print('recommend')
            users = set(table.keys())
            items = set()
            # Candidates: the first 1000 entries that sort_dict returns
            # for each topic's item table.
            for zis in lda.pr_i_in_z.values():
                items.update([e[0] for e in sort_dict(zis)[:1000]])
            print(len(items))
            rec = exclude_recommend(table, users, items, predict_fun)
            write_obj(ex_rec_name, rec)

        prs = []
        res = []
        for topk in topks:
            print('precision')
            pr = precision(rec, test, topk)
            print(pr)
            re = recall(rec, test, topk)
            print('recall')
            prs.append(float('%.4f' % pr))
            res.append(float('%.4f' % re))
        nprs.append(prs.copy())
        nres.append(res.copy())

    out_json_to_file(dir_name + 'nprs.txt', nprs)
    out_json_to_file(dir_name + 'nres.txt', nres)
    return nprs, nres
def __init__(self, train_filename, friends_file, topic_num, split_sig,
             time_format, uin, iin, timein, read_from_file=False):
    """Load check-ins and friend links, then load or train the parameters.

    The parameters are read from the cache files under
    ``mid_data/<train name>/<self.name><topic_num>t/`` when all five files
    exist. Otherwise they come from ``self.init_data()`` followed by
    ``self.em_loop()`` and are then written to that directory.

    Args:
        train_filename: Path of the training file; its name (without the
            last extension) also names the cache directory.
        friends_file: File read with ``read_dic_set`` (columns 0 and 1).
        topic_num: Stored on the instance and used in the directory name.
        split_sig: Field separator forwarded to the file readers.
        time_format: Time format forwarded to ``read_checks_table``.
        uin: Column index forwarded as ``uin`` / ``oin`` to the readers.
        iin: Column index forwarded as ``iin`` / ``ain`` to the readers.
        timein: Column index forwarded as ``timein``.
        read_from_file: Not used by this constructor.
    """
    self.train_filename = train_filename
    self.topic_num = topic_num
    self.checks = read_checks_table(train_filename,
                                    split_sig=split_sig,
                                    time_format=time_format,
                                    uin=uin, iin=iin, timein=timein)
    self.friends = read_dic_set(friends_file, split_tag=split_sig,
                                oin=0, ain=1)
    # Read the same columns as read_checks_table above instead of the
    # hard-coded 0 / 4, so the two views of the training file agree.
    self.check_set = read_dic_set(train_filename, split_tag=split_sig,
                                  oin=uin, ain=iin)

    dir_name = ('mid_data/' + '-'.join(train_filename.split('.')[:-1])
                + '/' + self.name + str(topic_num) + 't/')
    f_in_z_filename = dir_name + 'pr_f_in_z.txt'
    i_in_z_filename = dir_name + 'pr_i_in_z.txt'
    u_in_f_filename = dir_name + 'pr_u_in_f.txt'
    z_filename = dir_name + 'pz.txt'
    pr_filename = dir_name + 'pr.txt'

    # Every id from 0 to the largest key of check_set gets itself added
    # to its own friend set (range() requires those keys to be integers).
    max_user = max(self.check_set.keys())
    for i in range(max_user + 1):
        self.friends.setdefault(i, set()).add(i)

    # makedirs also creates missing parents and tolerates an existing
    # directory, unlike a bare os.mkdir.
    os.makedirs(dir_name, exist_ok=True)

    cache_files = (f_in_z_filename, i_in_z_filename, z_filename,
                   pr_filename, u_in_f_filename)
    if all(os.path.exists(name) for name in cache_files):
        # pr.txt must exist for the cache to count as complete, but
        # self.pr is not restored (nor is self.pr_z_in_u set) here.
        self.pr_f_in_z, self.pr_i_in_z, self.pr_u_in_f, self.pz = (
            read_obj(f_in_z_filename),
            read_obj(i_in_z_filename),
            read_obj(u_in_f_filename),
            read_obj(z_filename),
        )
    else:
        (self.pr_f_in_z, self.pr_i_in_z, self.pr_u_in_f,
         self.pz, self.pr) = self.init_data()
        self.em_loop()
        # Left empty: the per-user topic aggregation is disabled here.
        self.pr_z_in_u = {}
        write_obj(f_in_z_filename, self.pr_f_in_z)
        write_obj(i_in_z_filename, self.pr_i_in_z)
        write_obj(z_filename, self.pz)
        write_obj(pr_filename, self.pr)
        write_obj(u_in_f_filename, self.pr_u_in_f)
    print(self.pz)