def show(self):
    """Print and save the ten leading labelled weights of every topic.

    Label names are taken from ``sort_dict(self.labels, reverse=False)``
    (first element of each returned entry) and are matched by position with
    the entries of ``self.pr_w_in_z[z]``.  For each topic the labelled
    weights are passed through ``sort_dict`` and the first ten results are
    printed and appended to ``<self.dir_name>z_w.txt``.

    NOTE(review): ``sort_dict`` is defined elsewhere; it presumably returns
    a list of (key, value) pairs -- confirm its ordering there.
    """
    ordered_labels = sort_dict(self.labels, reverse=False)
    names = [entry[0] for entry in ordered_labels]
    out_path = self.dir_name + 'z_w.txt'
    with open(out_path, 'w') as out:
        for topic in range(self.topic_num):
            row = self.pr_w_in_z[topic]
            weighted = {names[pos]: row[pos] for pos in range(len(row))}
            head = sort_dict(weighted)[:10]
            print(head)
            # Records are written back to back, with no separator between topics.
            out.write(str(head))
Пример #2
0
def cal_sim_mat(table, similar_fun, reg_one=True, sort_change=True):
    """Build a pairwise similarity table for every key of ``table``.

    Args:
        table: Mapping whose keys are the ids to compare; the values are
            not read here.
        similar_fun: Callable invoked as ``similar_fun(u1, u2)`` for every
            ordered pair of distinct ids; only results ``> 0`` are stored.
        reg_one: When true, each id's row is handed to
            ``dic_value_reg_one`` (its return value is ignored, so it is
            presumably an in-place normalisation -- confirm).
        sort_change: When true, every row is replaced by
            ``sort_dict(row)`` before returning.

    Returns:
        A dict mapping each id to its row.  A row is a ``{other_id: score}``
        dict, or whatever ``sort_dict`` returns when ``sort_change`` is set.
    """
    ids = list(table.keys())
    print(len(ids))
    TM = {}
    for u1 in ids:
        row = TM.setdefault(u1, {})
        for u2 in ids:
            if u1 == u2:
                continue
            s = similar_fun(u1, u2)
            if s > 0:
                row[u2] = s
        if reg_one:
            dic_value_reg_one(row)

    if sort_change:
        TM = {k: sort_dict(v) for k, v in TM.items()}

    return TM
Пример #3
0
def cal_sim_mat_for_async(worknum,
                          user_queue,
                          users,
                          similar_fun,
                          reg_one=True,
                          sort_change=True):
    """Worker loop: drain ``user_queue`` and compute similarity rows.

    Args:
        worknum: Worker identifier, used only in progress output.
        user_queue: Mutable sequence of pending users; items are removed
            with ``pop()`` (from the end).  ``None`` entries are skipped.
        users: Iterable of all users each popped user is compared against.
        similar_fun: Callable invoked as ``similar_fun(user, u)``; only
            results ``> 0`` are stored.
        reg_one: When true, each finished row is handed to
            ``dic_value_reg_one`` (return value ignored).
        sort_change: When true, each row is replaced by ``sort_dict(row)``.

    Returns:
        A dict mapping every user this worker processed to its row.
    """
    sim_mat = {}
    print('run thread' + str(worknum))
    while user_queue:
        try:
            user = user_queue.pop()
        except IndexError:
            # Another worker emptied the shared queue between the emptiness
            # check above and this pop; there is nothing left to do.
            break
        print(len(user_queue), worknum)
        if user is None:
            continue
        row = {}
        sim_mat[user] = row
        for u in users:
            if u == user:
                continue
            s = similar_fun(user, u)
            if s > 0:
                row[u] = s
        if reg_one:
            dic_value_reg_one(row)
        if sort_change:
            sim_mat[user] = sort_dict(row)
    return sim_mat
Пример #4
0
def exclude_recommend(checks, users, locs, predic_fun):
    """Score unvisited locations for each user and keep the first 100.

    Args:
        checks: Mapping ``user -> iterable of records``; the first element
            of each record is read as the visited location id.
        users: Iterable of users to produce recommendations for.
        locs: Iterable of candidate location ids (must be convertible with
            ``int``).
        predic_fun: Callable invoked as ``predic_fun(int(user), int(loc))``
            returning the predicted score.

    Returns:
        A dict mapping each user to ``sort_dict(scores)[:100]`` where
        ``scores`` maps each unvisited candidate (original key type) to its
        predicted score.
    """
    rec = {}
    for c, u in enumerate(users, start=1):
        old_items = {int(i[0]) for i in checks.get(u, [])}
        scores = {}
        for l in locs:
            # Visited ids are stored as ints, so compare on the int form;
            # otherwise string location ids would never be excluded.
            loc_id = int(l)
            if loc_id not in old_items:
                scores[l] = predic_fun(int(u), loc_id)
        # Progress output once per 100 users.
        if c % 100 == 0:
            print(c / 100)
        rec[u] = sort_dict(scores)[:100]
    return rec
Пример #5
0
# Script: blend previously stored recommendation scores with geographic
# influence scores and report precision/recall at top-10.
# `rec_file`, `table`, `test`, `loc_center` and `user_center` are expected to
# be defined earlier in the file (not visible in this fragment).
orec = read_obj(rec_file)

# Geographic-influence scorer; `a` and `b` are its model parameters
# (NOTE(review): their exact meaning is defined by GeoInf -- confirm there).
geo_inf = GeoInf(a=0.84534522188,
                 b=-1.61667304945,
                 checks=table,
                 loc_center=loc_center,
                 user_center=user_center)
# geo_inf = GeoInf(a=0.651, b=-1.628, checks=table, loc_center=loc_center, user_center=user_center)

# locs = set(loc_center.keys())

# users = ["11823", "10362", "11588", "16457", "2738", "7380", "1676", "2270", "9429", "10650", "9488", "10320", "2461", "4330", "9565", "8895", "16248", "16201", "16633", "14710", "9632", "4962", "10579", "16057", "7836", "4971", "12417", "6791", "16181", "6533", "322", "132", "11998", "2882", "10184", "15244", "15469", "9210", "15982", "685", "1147", "7313", "6390", "11391", "13552", "4421", "11881", "2953", "10025", "4610", "15455", "7744", "11512", "13107", "11328", "2153", "2150", "13310", "10554", "17003", "4343", "17836", "13097", "3510", "7806", "15655", "70", "15838", "17717", "17390", "4282", "16446", "15078", "6074", "9504", "12785", "740", "8525", "16427", "2188", "11119"]

# rate = 0
# One precision / recall value is collected per blending rate tried below.
pre = []
re = []
for rate in [3]:
    # Rates are listed in tenths: 3 -> 0.3 weight on the stored score.
    rate /= 10
    rec = {}
    for u in orec.keys():
        # Each stored entry is indexed as p[0] (location) and p[1] (score).
        rec[u] = {p[0]: p[1] for p in orec[u]}
        # Geographic scores for this user, requested for the stored locations.
        inf = geo_inf(u, rec[u].keys())
        for l in inf.keys():
            # Weighted blend; a location missing from the stored scores
            # counts as 0 on that side.
            rec[u][l] = rate * rec[u].get(l, 0) + (1 - rate) * inf[l]
        rec[u] = sort_dict(rec[u])

    pre.append(precision(rec, test_table=test, topk=10))
    re.append(recall(rec, test_table=test, topk=10))
print(pre)
print(re)
Пример #6
0
def recommend(locs, centers, visited):
    """Score every location not yet visited and return the sorted result.

    Each candidate in ``locs - visited`` is scored with
    ``kde(centers, visited, candidate)``; the ``{location: score}`` dict is
    then passed through ``sort_dict`` and returned.
    """
    candidates = locs - visited
    scores = {place: kde(centers, visited, place) for place in candidates}
    return sort_dict(scores)
Пример #7
0
def cf_main(train_file, test_file, topns=None, topks=None, topic_num=8):
    """Run the LDA-based recommender and evaluate precision / recall.

    Steps: read the train and test check-in tables, fit ``MyLDA`` on the
    training file, obtain recommendations (loaded from a cached file when
    it exists, otherwise computed with ``exclude_recommend`` and cached),
    then evaluate them for every ``topk``.

    Args:
        train_file: Training check-in file (tab-separated; user in column
            0, item in column 4, time in column 1 formatted as
            ``%Y-%m-%dT%H:%M:%SZ``).
        test_file: Test check-in file in the same layout.
        topns: Values used only to name the cached recommendation file;
            defaults to ``[20]``.
        topks: Cut-offs passed to ``precision`` / ``recall``; defaults to
            ``[20]``.
        topic_num: Number of LDA topics.

    Returns:
        ``(nprs, nres)``: for each ``topn`` a list with one precision
        (respectively recall) value per ``topk``, rounded to 4 decimals.
        Both are also written to ``nprs.txt`` / ``nres.txt`` in the
        working directory ``mid_data/<train name>/lda<topic_num>t/``.
    """
    if topks is None:
        topks = [20]
    if topns is None:
        topns = [20]
    nprs = []
    nres = []

    # Train and test files share one layout, so the reader options are
    # declared once.
    read_options = dict(split_sig='\t',
                        uin=0,
                        iin=4,
                        timein=1,
                        scorein=None,
                        time_format='%Y-%m-%dT%H:%M:%SZ')
    print('read_table')
    table = read_checks_table(train_file, **read_options)
    test = read_checks_table(test_file, **read_options)

    # makedirs (not mkdir) so a missing 'mid_data' parent or a nested
    # train_file path does not abort the run; exist_ok avoids the
    # check-then-create race.
    base_dir = 'mid_data/' + '-'.join(train_file.split('.')[:-1]) + '/'
    os.makedirs(base_dir, exist_ok=True)

    # ========= LDA ================
    lda = MyLDA(train_filename=train_file,
                topic_num=topic_num,
                split_sig='\t',
                uin=0,
                iin=4,
                timein=1,
                time_format='%Y-%m-%dT%H:%M:%SZ')
    predict_fun = lda.predict

    sim_fun_name = 'lda' + str(topic_num) + 't'
    dir_name = base_dir + sim_fun_name + '/'
    os.makedirs(dir_name, exist_ok=True)

    for topn in topns:
        ex_rec_name = dir_name + '-'.join(['ex_rec', sim_fun_name,
                                           str(topn)]) + '.txt'
        if os.path.exists(ex_rec_name):
            print('read recommend result from file')
            rec = read_obj(ex_rec_name)
        else:
            print('recommend')
            users = set(table.keys())
            # Candidate items: the first 1000 entries of every topic's
            # sorted item distribution (ordering is decided by sort_dict).
            items = set()
            for item_probs in lda.pr_i_in_z.values():
                items.update(e[0] for e in sort_dict(item_probs)[:1000])
            print(len(items))
            rec = exclude_recommend(table, users, items, predict_fun)
            write_obj(ex_rec_name, rec)

        prs = []
        res = []
        for topk in topks:
            print('precision')
            pr = precision(rec, test, topk)
            print(pr)
            rc = recall(rec, test, topk)
            print('recall')
            # Previously only the label was printed, never the value.
            print(rc)
            prs.append(float('%.4f' % pr))
            res.append(float('%.4f' % rc))
        nprs.append(prs)
        nres.append(res)
    out_json_to_file(dir_name + 'nprs.txt', nprs)
    out_json_to_file(dir_name + 'nres.txt', nres)

    return nprs, nres