def show(self):
    # Dump the ten highest-probability words for every topic to z_w.txt.
    labels = [e[0] for e in sort_dict(self.labels, reverse=False)]
    with open(self.dir_name + 'z_w.txt', 'w') as f:
        for z in range(self.topic_num):
            temp = sort_dict({labels[w]: self.pr_w_in_z[z][w]
                              for w in range(len(self.pr_w_in_z[z]))})
            print(temp[:10])
            f.write(str(temp[:10]))
def cal_sim_mat(table, similar_fun, reg_one=True, sort_change=True):
    # Build a pairwise similarity matrix over all ids in `table`.
    # Only positive similarities are stored, so the matrix stays sparse.
    count = 0
    ids = list(table.keys())
    total = len(ids) * len(ids)
    l = len(ids)
    print(l)
    TM = {}
    for u1 in ids:
        if u1 not in TM:
            TM[u1] = {}
        for u2 in ids:
            if u1 == u2:
                continue
            count += 1
            # if count % 10000 == 0:
            #     print(count, total)
            s = similar_fun(u1, u2)
            if s > 0:
                TM[u1][u2] = s
        if TM[u1] is None:
            print(u1, TM[u1])
        if reg_one:
            # Normalize each row so its values sum to one.
            dic_value_reg_one(TM[u1])
    if sort_change:
        # Convert each row into a list of (id, score) pairs sorted by score.
        TM = {k: sort_dict(v) for k, v in TM.items()}
    return TM
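
# --- Illustrative usage sketch for cal_sim_mat (not part of the original code).
# `table` maps user ids to check-in lists and `similar_fun` scores a pair of
# user ids. The Jaccard-style similarity below is a hypothetical stand-in for
# the LDA-based similarity used elsewhere in this project.
def make_jaccard_sim(table):
    item_sets = {u: set(c[0] for c in checks) for u, checks in table.items()}

    def sim(u1, u2):
        a, b = item_sets[u1], item_sets[u2]
        union = a | b
        return len(a & b) / len(union) if union else 0.0

    return sim

# sim_mat = cal_sim_mat(table, similar_fun=make_jaccard_sim(table))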
def cal_sim_mat_for_async(worknum, user_queue, users, similar_fun, reg_one=True, sort_change=True):
    sim_mat = {}
    print('run thread' + str(worknum))
    count = 0
    while user_queue:
        count += 1
        user = user_queue.pop()
        print(len(user_queue), worknum)
        if user is not None:
            sim_mat[user] = {}
            for u in users:
                if u == user:
                    continue
                s = similar_fun(user, u)
                if s > 0:
                    sim_mat[user][u] = s
            if reg_one:
                dic_value_reg_one(sim_mat[user])
            if sort_change:
                sim_mat[user] = sort_dict(sim_mat[user])
        # if count % 20 == 0:
        #     gc.collect()
    return sim_mat
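
# --- Hypothetical driver for cal_sim_mat_for_async (an assumption, not part of
# the original code): a plain shared list serves as the work queue, each worker
# pops users from it, and the per-worker results are merged afterwards. The
# names run_sim_workers and `workers` are made up for this sketch.
from concurrent.futures import ThreadPoolExecutor

def run_sim_workers(users, similar_fun, workers=4):
    user_queue = list(users)
    merged = {}
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = [pool.submit(cal_sim_mat_for_async, i, user_queue, users, similar_fun)
                   for i in range(workers)]
        for fut in futures:
            merged.update(fut.result())
    return merged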
def exclude_recommend(checks, users, locs, predic_fun):
    # Score every candidate location a user has not visited yet and keep the
    # top-100 scored locations per user.
    rec = {}
    c = 0
    for u in users:
        old_items = set([int(i[0]) for i in checks.get(u, [])])
        rec[u] = {}
        c += 1
        for l in locs:
            if l not in old_items:
                rec[u][l] = predic_fun(int(u), int(l))
        del old_items
        if c % 100 == 0:
            print(c / 100)
        rec[u] = sort_dict(rec[u])[:100]
    return rec
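
# --- Minimal self-contained example of exclude_recommend (the toy data and the
# dummy scorer are assumptions; the real pipeline passes lda.predict).
def _demo_exclude_recommend():
    checks = {'1': [(10, None)], '2': [(11, None)]}   # user -> [(item, timestamp), ...]
    users = {'1', '2'}
    locs = {10, 11, 12}
    dummy_score = lambda u, l: 1.0 / (u + l + 1)      # arbitrary score, just for the demo
    return exclude_recommend(checks, users, locs, dummy_score)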
orec = read_obj(rec_file)
geo_inf = GeoInf(a=0.84534522188, b=-1.61667304945, checks=table, loc_center=loc_center,
                 user_center=user_center)
# geo_inf = GeoInf(a=0.651, b=-1.628, checks=table, loc_center=loc_center, user_center=user_center)
# locs = set(loc_center.keys())
# users = ["11823", "10362", "11588", "16457", "2738", "7380", "1676", "2270", "9429", "10650", "9488", "10320", "2461", "4330", "9565", "8895", "16248", "16201", "16633", "14710", "9632", "4962", "10579", "16057", "7836", "4971", "12417", "6791", "16181", "6533", "322", "132", "11998", "2882", "10184", "15244", "15469", "9210", "15982", "685", "1147", "7313", "6390", "11391", "13552", "4421", "11881", "2953", "10025", "4610", "15455", "7744", "11512", "13107", "11328", "2153", "2150", "13310", "10554", "17003", "4343", "17836", "13097", "3510", "7806", "15655", "70", "15838", "17717", "17390", "4282", "16446", "15078", "6074", "9504", "12785", "740", "8525", "16427", "2188", "11119"]
# rate = 0
pre = []
re = []
for rate in [3]:
    rate /= 10
    rec = {}
    for u in orec.keys():
        rec[u] = {p[0]: p[1] for p in orec[u]}
        inf = geo_inf(u, rec[u].keys())
        for l in inf.keys():
            # Linear blend: rate * CF score + (1 - rate) * geographical influence.
            rec[u][l] = rate * rec[u].get(l, 0) + (1 - rate) * inf[l]
        rec[u] = sort_dict(rec[u])
    pre.append(precision(rec, test_table=test, topk=10))
    re.append(recall(rec, test_table=test, topk=10))
print(pre)
print(re)
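
# --- Worked micro-example of the score blend above (toy numbers, an assumption):
# with rate = 0.3, a CF score of 0.8 and a geo-influence score of 0.2 combine to
# 0.3 * 0.8 + (1 - 0.3) * 0.2 = 0.38.
def blend(cf_score, geo_score, rate=0.3):
    return rate * cf_score + (1 - rate) * geo_score

assert abs(blend(0.8, 0.2) - 0.38) < 1e-9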
def recommend(locs, centers, visited):
    rec = {}
    for loc in locs - visited:
        rec[loc] = kde(centers, visited, loc)
    return sort_dict(rec)
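
# --- Illustrative call of recommend (an assumption, not from the original code):
# `locs` is the candidate location set, `centers` maps location ids to the
# coordinates consumed by kde, and `visited` is the user's check-in history.
# rec = recommend(locs=set(loc_center.keys()),
#                 centers=loc_center,
#                 visited=set(int(c[0]) for c in table.get(user_id, [])))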
def cf_main(train_file, test_file, topns=None, topks=None, topic_num=8):
    if topks is None:
        topks = [20]
    if topns is None:
        topns = [20]
    nprs = []
    nres = []
    print('read_table')
    table = read_checks_table(train_file, split_sig='\t', uin=0, iin=4, timein=1, scorein=None,
                              time_format='%Y-%m-%dT%H:%M:%SZ')
    test = read_checks_table(test_file, split_sig='\t', uin=0, iin=4, timein=1, scorein=None,
                             time_format='%Y-%m-%dT%H:%M:%SZ')
    # table = read_checks_table(train_file, split_sig=',', uin=0, iin=4, timein=3, scorein=None,
    #                           time_format='%Y-%m-%d %H:%M:%S')
    # test = read_checks_table(test_file, split_sig=',', uin=0, iin=4, timein=3, scorein=None,
    #                          time_format='%Y-%m-%d %H:%M:%S')
    # '''
    # friends_dic = read_dic_set('Gowalla_edges.txt')
    if not os.path.exists('mid_data/' + '-'.join(train_file.split('.')[:-1]) + '/'):
        os.mkdir('mid_data/' + '-'.join(train_file.split('.')[:-1]) + '/')

    # ========= LDA ================
    lda = MyLDA(train_filename=train_file, topic_num=topic_num, split_sig='\t', uin=0, iin=4,
                timein=1, time_format='%Y-%m-%dT%H:%M:%SZ')
    # lda = MyLDA(train_filename=train_file, topic_num=topic_num, split_sig=',', uin=0, iin=4,
    #             timein=3, time_format='%Y-%m-%d %H:%M:%S')
    # sim_fun = lambda u1, u2: lda.sim(u1, u2)
    predict_fun = lda.predict
    # '''
    sim_fun_name = 'lda' + str(topic_num) + 't'
    dir_name = 'mid_data/' + '-'.join(train_file.split('.')[:-1]) + '/' + sim_fun_name + '/'
    sim_name = dir_name + 'sim.txt'
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    # if os.path.exists(sim_name):
    #     print('read sim metrics from file')
    #     sim_metrics = read_obj(sim_name)
    # else:
    #     print('cal_sim_mat')
    #     sim_metrics = cal_sim_mat(table, similar_fun=sim_fun)
    #     write_obj(sim_name, sim_metrics)
    for topn in topns:
        rec_name = dir_name + '-'.join(['rec', sim_fun_name, str(topn)]) + '.txt'
        ex_rec_name = dir_name + '-'.join(['ex_rec', sim_fun_name, str(topn)]) + '.txt'
        if os.path.exists(ex_rec_name):
            print('read recommend result from file')
            rec = read_obj(ex_rec_name)
        else:
            print('recommend')
            users = set(table.keys())
            # Candidate items: the 1000 most probable items of every topic.
            items = set()
            for z, zis in lda.pr_i_in_z.items():
                items.update([e[0] for e in sort_dict(lda.pr_i_in_z[z])[:1000]])
            print(len(items))
            # for item, v in lda.pr_i_in_z[0].items():
            #     items.add(item)
            rec = exclude_recommend(table, users, items, predict_fun)
            # write_obj(rec_name, rec)
            # exclude_dup(table, rec)
            write_obj(ex_rec_name, rec)
        prs = []
        res = []
        for topk in topks:
            print('precision')
            pr = precision(rec, test, topk)
            print(pr)
            re = recall(rec, test, topk)
            print('recall')
            print(re)
            prs.append(float('%.4f' % pr))
            res.append(float('%.4f' % re))
        # print('y1=', prs)
        # print('y2=', res)
        nprs.append(prs.copy())
        nres.append(res.copy())
    out_json_to_file(dir_name + 'nprs.txt', nprs)
    out_json_to_file(dir_name + 'nres.txt', nres)
    return nprs, nres
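
# --- Hypothetical entry point for cf_main (the file names are assumptions; the
# loader above expects tab-separated Gowalla-style check-in files).
if __name__ == '__main__':
    nprs, nres = cf_main('Gowalla_train.txt', 'Gowalla_test.txt',
                         topns=[20], topks=[5, 10, 20], topic_num=8)
    print(nprs)
    print(nres)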