Пример #1
0
    def __init__(self, train_filename, topic_num, split_sig, time_format, uin, iin, timein, read_from_file=False):
        """Load cached model parameters from disk, or train and cache them.

        The cache lives in ``mid_data/<train file stem>/tim-plsa<topic_num>t/``.
        When all five parameter files exist they are read back; otherwise the
        parameters come from ``self.init_data()`` followed by
        ``self.em_loop()`` and are written to that directory.

        Args:
            train_filename: Path of the check-in file; also determines the
                cache directory name.
            topic_num: Number of topics (part of the cache directory name).
            split_sig: Field separator forwarded to the file readers.
            time_format: Timestamp format forwarded to ``read_checks_table``.
            uin: Column index of the user field.
            iin: Column index of the item field.
            timein: Column index of the timestamp field.
            read_from_file: Not used by this constructor; kept so existing
                callers keep working.
        """
        self.train_filename = train_filename
        self.topic_num = topic_num
        self.checks = read_checks_table(train_filename, split_sig=split_sig, time_format=time_format, uin=uin, iin=iin,
                                        timein=timein)
        self.check_set = read_dic_set(train_filename, split_tag=split_sig, oin=uin, ain=iin)
        # Number of distinct keys in the check set.
        print(len(self.check_set))

        dir_name = 'mid_data/' + '-'.join(train_filename.split('.')[:-1]) + '/tim-plsa' + str(topic_num) + 't/'
        u_in_z_filename = dir_name + 'pr_u_in_z.txt'
        i_in_z_filename = dir_name + 'pr_i_in_z.txt'
        z_filename = dir_name + 'pz.txt'
        pr_filename = dir_name + 'pr.txt'
        t_in_z_filename = dir_name + 'pr_t_in_z.txt'

        # makedirs (not mkdir): the path is nested, and mkdir raises
        # FileNotFoundError when 'mid_data/<stem>/' does not exist yet.
        os.makedirs(dir_name, exist_ok=True)

        cache_files = (u_in_z_filename, i_in_z_filename, z_filename, pr_filename, t_in_z_filename)
        if all(os.path.exists(path) for path in cache_files):
            # NOTE: pr.txt must exist for a cache hit but is not loaded, so
            # self.pr is only available after a fresh training run.
            self.pr_u_in_z, self.pr_i_in_z, self.pz, self.pr_t_in_z = (
                read_obj(u_in_z_filename),
                read_obj(i_in_z_filename),
                read_obj(z_filename),
                read_obj(t_in_z_filename),
            )
        else:
            self.pr_u_in_z, self.pr_i_in_z, self.pr_t_in_z, self.pz, self.pr = self.init_data()
            self.em_loop()
            write_obj(u_in_z_filename, self.pr_u_in_z)
            write_obj(i_in_z_filename, self.pr_i_in_z)
            write_obj(z_filename, self.pz)
            write_obj(pr_filename, self.pr)
            write_obj(t_in_z_filename, self.pr_t_in_z)
        print(self.pz)
Пример #2
0
    def __init__(self,
                 train_filename,
                 topic_num,
                 split_sig,
                 time_format,
                 uin,
                 iin,
                 timein,
                 read_from_file=False):
        """Load cached model parameters from disk, or train and cache them.

        The cache lives in ``mid_data/<train file stem>/plsa<topic_num>t/``.
        When all five parameter files exist they are read back; otherwise the
        parameters come from ``MyLDA.init_data`` and ``MyLDA.em_loop``, the
        per-user topic weights ``pr_z_in_u`` are derived from ``self.pr``,
        and everything is written to the cache directory.

        Args:
            train_filename: Path of the check-in file; also determines the
                cache directory name.
            topic_num: Number of topics.
            split_sig: Field separator forwarded to ``read_checks_table``.
            time_format: Timestamp format forwarded to ``read_checks_table``.
            uin: Column index of the user field.
            iin: Column index of the item field.
            timein: Column index of the timestamp field.
            read_from_file: Not used by this constructor; kept so existing
                callers keep working.
        """
        self.train_filename = train_filename
        self.topic_num = topic_num
        self.checks = read_checks_table(train_filename,
                                        split_sig=split_sig,
                                        time_format=time_format,
                                        uin=uin,
                                        iin=iin,
                                        timein=timein)

        dir_name = 'mid_data/' + '-'.join(
            train_filename.split('.')[:-1]) + '/plsa' + str(topic_num) + 't/'
        u_in_z_filename = dir_name + 'pr_u_in_z.txt'
        i_in_z_filename = dir_name + 'pr_i_in_z.txt'
        z_filename = dir_name + 'pz.txt'
        pr_filename = dir_name + 'pr.txt'
        z_in_u_filename = dir_name + 'pr_z_in_u.txt'

        # makedirs (not mkdir): the path is nested, and mkdir raises
        # FileNotFoundError when 'mid_data/<stem>/' does not exist yet.
        os.makedirs(dir_name, exist_ok=True)

        cache_files = (u_in_z_filename, i_in_z_filename, z_filename,
                       pr_filename, z_in_u_filename)
        if all(os.path.exists(path) for path in cache_files):
            # NOTE: pr.txt must exist for a cache hit but is deliberately not
            # loaded, so self.pr is only available after a fresh training run.
            self.pr_u_in_z, self.pr_i_in_z, self.pz, self.pr_z_in_u = (
                read_obj(u_in_z_filename),
                read_obj(i_in_z_filename),
                read_obj(z_filename),
                read_obj(z_in_u_filename),
            )
        else:
            self.pr_u_in_z, self.pr_i_in_z, self.pz, self.pr = MyLDA.init_data(
                topic_num, self.checks)
            MyLDA.em_loop(self.pr_u_in_z, self.pr_i_in_z, self.pz, self.pr,
                          self.checks)

            # For every user, sum the per-check-in topic values stored in
            # self.pr, then let dic_value_reg_one process the dict in place
            # (presumably normalising it to sum to one -- confirm in helper).
            self.pr_z_in_u = {}
            for u, user_checks in self.checks.items():
                topic_weights = {}
                for z in self.pz.keys():
                    topic_weights[z] = sum(
                        self.pr[(u, check[0])][z] for check in user_checks)
                self.pr_z_in_u[u] = topic_weights
                dic_value_reg_one(topic_weights)

            write_obj(u_in_z_filename, self.pr_u_in_z)
            write_obj(i_in_z_filename, self.pr_i_in_z)
            write_obj(z_filename, self.pz)
            write_obj(pr_filename, self.pr)
            write_obj(z_in_u_filename, self.pr_z_in_u)
        print(self.pz)
Пример #3
0
 def __init__(self, checks, K, dirname):
     """Restore the factor objects ``p``/``q`` from ``dirname`` or compute them.

     If both ``<dirname>p.txt`` and ``<dirname>q.txt`` exist they are read
     back; otherwise ``MyMFModel.gradAscent(checks, K)`` produces them and
     the results are written to those two files.

     Args:
         checks: Check-in data handed to ``MyMFModel.gradAscent``.
         K: Second argument handed to ``MyMFModel.gradAscent``.
         dirname: Directory prefix (used verbatim) for the cache files.
     """
     self.checks = checks
     self.K = K
     p_name = dirname + 'p.txt'
     q_name = dirname + 'q.txt'

     have_cache = os.path.exists(p_name) and os.path.exists(q_name)
     if not have_cache:
         # Nothing cached yet: train, then persist both objects.
         self.p, self.q = MyMFModel.gradAscent(self.checks, K)
         write_obj(p_name, self.p)
         write_obj(q_name, self.q)
         return

     loaded_p = read_obj(p_name)
     loaded_q = read_obj(q_name)
     self.p, self.q = loaded_p, loaded_q
Пример #4
0
 def __init__(self, h):
     """Store ``h`` and restore the cache from ``self.name`` when it exists.

     ``self.name`` is not set here; it must be provided by the class.
     ``self.size`` records the number of cache entries at construction time.
     """
     self.h = h
     # Start from the persisted cache if there is one, else from empty.
     self.cache = read_obj(self.name) if os.path.exists(self.name) else {}
     self.size = len(self.cache)
Пример #5
0
 def __init__(self, dir_name):
     """Store ``dir_name`` and restore the cache from ``self.name`` if present.

     ``self.name`` is not set here; it must be provided by the class.
     ``self.size`` records the number of cache entries at construction time.
     """
     self.dir_name = dir_name
     # Default to an empty cache; replace it when a persisted one is found.
     self.cache = {}
     if os.path.exists(self.name):
         self.cache = read_obj(self.name)
     self.size = len(self.cache)
Пример #6
0
def main(train_file, test_file, feature_num, topks):
    """Build (or load) the PMF model, recommend, and score the result.

    Intermediate files live in ``mid_data/<train file stem>/pmf<feature_num>t/``.
    Recommendations are cached in ``ex_rec.txt`` there; precision and recall
    for every cut-off in ``topks`` are written to ``pr.txt`` and ``re.txt``.

    Args:
        train_file: Training check-in file (user in column 0, item in column 1).
        test_file: Test check-in file with the same layout.
        feature_num: Passed to ``MyMFModel`` as ``K``.
        topks: Iterable of cut-offs to evaluate.

    Returns:
        ``(nprs, nres)``: each a one-element list holding the list of
        precision (resp. recall) values, formatted to 4 decimals.
    """
    base_dir = 'mid_data/' + '-'.join(train_file.split('.')[:-1]) + '/'
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    nprs, nres = [], []

    print('read_table')
    checks = read_checks_table(train_file, uin=0, iin=1)
    test = read_checks_table(test_file, uin=0, iin=1)

    sim_fun_name = 'pmf' + str(feature_num) + 't/'
    dir_name = base_dir + sim_fun_name
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)

    mf_model = MyMFModel(checks, K=feature_num, dirname=dir_name)

    ex_rec_name = dir_name + 'ex_rec' + '.txt'
    if not os.path.exists(ex_rec_name):
        print('recommend')
        rec = mf_model.recommend()
        write_obj(ex_rec_name, rec)
    else:
        print('read recommend result from file')
        rec = read_obj(ex_rec_name)
        # NOTE(review): cached lists are reversed in place while freshly
        # computed ones are not -- confirm the stored order is intentional.
        for ranked in rec.values():
            ranked.reverse()

    precisions, recalls = [], []
    for topk in topks:
        pr = precision(rec, test, topk)
        print(pr)
        rc = recall(rec, test, topk)
        precisions.append(float('%.4f' % pr))
        recalls.append(float('%.4f' % rc))

    nprs.append(list(precisions))
    nres.append(list(recalls))
    out_json_to_file(dir_name + 'pr.txt', nprs)
    out_json_to_file(dir_name + 're.txt', nres)
    return nprs, nres
Пример #7
0
def cf_main(train_file, test_file, topns=None, topks=None, topic_num=8):
    """Run the ``tim-plsa`` recommender end to end and evaluate it.

    Input files are read as tab-separated with the user in column 0, the item
    in column 1 and a ``'%a %b %d %H:%M:%S %z %Y'`` timestamp in column 7.
    (Other dataset layouts were previously kept here as commented-out calls:
    tab-separated with ``iin=4, timein=1`` and ``'%Y-%m-%dT%H:%M:%SZ'``, or
    comma-separated with ``iin=4, timein=3`` and ``'%Y-%m-%d %H:%M:%S'``.)

    Args:
        train_file: Training check-in file; its stem names the cache directory.
        test_file: Test check-in file.
        topns: Values used only to name the cached recommendation file;
            defaults to ``[20]``.
        topks: Cut-offs for precision/recall; defaults to ``[20]``.
        topic_num: Number of topics passed to ``MyLDA``.

    Returns:
        ``(nprs, nres)``: for each entry of ``topns``, the list of precision
        (resp. recall) values per ``topk``, formatted to 4 decimals.
    """
    start = datetime.now()
    if topks is None:
        topks = [20]
    if topns is None:
        topns = [20]
    nprs = []
    nres = []

    split_sig = '\t'
    time_format = '%a %b %d %H:%M:%S %z %Y'

    print('read_table')
    table = read_checks_table(train_file, split_sig=split_sig, uin=0, iin=1, timein=7, scorein=None,
                              time_format=time_format)
    test = read_checks_table(test_file, split_sig=split_sig, uin=0, iin=1, timein=7, scorein=None,
                             time_format=time_format)

    # makedirs (not mkdir): mkdir raises FileNotFoundError when 'mid_data'
    # itself is missing; exist_ok makes the call safe to repeat.
    base_dir = 'mid_data/' + '-'.join(train_file.split('.')[:-1]) + '/'
    os.makedirs(base_dir, exist_ok=True)

    # ========= LDA ================
    lda = MyLDA(train_filename=train_file, topic_num=topic_num, split_sig=split_sig, uin=0, iin=1, timein=7,
                time_format=time_format)
    predict_fun = lda.predict

    sim_fun_name = 'tim-plsa' + str(topic_num) + 't'
    dir_name = base_dir + sim_fun_name + '/'
    os.makedirs(dir_name, exist_ok=True)

    for topn in topns:
        ex_rec_name = dir_name + '-'.join(['ex_rec', sim_fun_name, str(topn)]) + '.txt'
        if os.path.exists(ex_rec_name):
            print('read recommend result from file')
            rec = read_obj(ex_rec_name)
        else:
            print('recommend')
            users = set(table.keys())
            items = set()
            # Candidate items: feed every (item index, value) pair of each
            # topic row into a bounded priority queue and collect its keys.
            # NOTE(review): the queue is shared across topics rather than
            # reset per topic -- confirm this is intended.
            zp = ZPriorityQ(maxsize=1000)
            for z in range(len(lda.pr_i_in_z)):
                for i in range(len(lda.pr_i_in_z[z, :])):
                    zp.enQ(KVTtem(i, lda.pr_i_in_z[z, i]))
                items.update([e.k for e in zp.items])
            print(len(items))
            rec = exclude_recommend(table, users, items, predict_fun)
            write_obj(ex_rec_name, rec)

        prs = []
        res = []
        for topk in topks:
            print('precision')
            pr = precision(rec, test, topk)
            print(pr)
            rc = recall(rec, test, topk)
            print('recall')
            prs.append(float('%.4f' % pr))
            res.append(float('%.4f' % rc))
        nprs.append(prs.copy())
        nres.append(res.copy())
    out_json_to_file(dir_name + 'nprs.txt', nprs)
    out_json_to_file(dir_name + 'nres.txt', nres)

    # total_seconds() covers the whole interval; timedelta.seconds silently
    # drops whole days on long runs.
    elapsed = datetime.now() - start
    print('the cost time is ', int(elapsed.total_seconds()))

    return nprs, nres
from pprint import pprint

from numpy import array, arange
import matplotlib.pyplot as plt

from rec_lib.utils import read_obj

# Plot every row of the stored pr_t_in_z object as one curve over 24 x-values
# (presumably hours of the day -- confirm against the training code).
t_in_z = read_obj(
    'mid_data/trainid-id-dataset_TSMC2014_NYC/tim-plsa7t/pr_t_in_z.txt')

x_axis = arange(24)
for topic_curve in t_in_z:
    plt.plot(x_axis, topic_curve)

plt.show()
Пример #9
0
    def __init__(self, checks, K, dirname):
        """Load cached factors and id lists from ``dirname`` or train them.

        With all four cache files present (``p.txt``, ``q.txt``,
        ``user-list.txt``, ``item-list.txt``) the factors and id lists are
        read back. Otherwise the id lists are collected from ``checks``,
        ``self.gradAscent()`` is run, and all four objects are written out.

        In both cases ``self.user_index`` / ``self.item_index`` map original
        ids to list positions and ``self.R[u][i]`` is filled from ``checks``
        using those positions.

        Args:
            checks: Mapping of original user id to an iterable of check-ins;
                element 0 of each check-in is the item id.
            K: Stored as ``self.K``.
            dirname: Directory prefix (used verbatim) for the cache files.
        """
        self.K = K
        p_name = dirname + 'p.txt'
        q_name = dirname + 'q.txt'
        user_list_name = dirname + 'user-list.txt'
        item_list_name = dirname + 'item-list.txt'

        cache_files = (p_name, q_name, user_list_name, item_list_name)
        cached = all(os.path.exists(path) for path in cache_files)

        if cached:
            self.p, self.q, self.users, self.items = (
                read_obj(p_name),
                read_obj(q_name),
                read_obj(user_list_name),
                read_obj(item_list_name),
            )
        else:
            print('init user, items index')
            users = set()
            items = set()
            for user, user_checks in checks.items():
                users.add(user)
                for check in user_checks:
                    items.add(check[0])
            self.users = list(users)
            self.items = list(items)

        self.M = len(self.users)
        self.N = len(self.items)
        print(self.M, self.N)
        self.user_index = {user: pos for pos, user in enumerate(self.users)}
        self.item_index = {item: pos for pos, item in enumerate(self.items)}

        self.R = {}
        for ou, user_checks in checks.items():
            u = self.user_index[ou]
            row = self.R.setdefault(u, {})
            # Bug fix: iterate the check-ins of the ORIGINAL user key `ou`.
            # The previous code indexed `checks` with the dense index `u`,
            # which is only correct when ids happen to equal list positions.
            for check in user_checks:
                i = self.item_index[check[0]]
                if cached:
                    # NOTE(review): the cached path stores a 0/1 flag while
                    # the training path stores visit counts -- kept as in the
                    # original; confirm which one consumers expect.
                    row[i] = 1
                else:
                    row[i] = row.get(i, 0) + 1

        if not cached:
            # Initialise the parameter lists as matrices and train.
            self.p, self.q = self.gradAscent()
            write_obj(p_name, self.p)
            write_obj(q_name, self.q)
            write_obj(user_list_name, self.users)
            write_obj(item_list_name, self.items)
Пример #10
0
# Script setup: load location/user centres and a stored recommendation
# result, then build the GeoInf object used below.
# Alternative (SH-Foursquare) centre files:
# loc_center = read_center('trainRF-SH-FoursquareLocationCenter.csv')
# user_center = read_center('trainRF-SH-FoursquareUserCenter.csv')

# Active dataset: NA-Gowalla centre files.
loc_center = read_center('trainRF-NA-Gowalla_LocCenter.txt')
user_center = read_center('trainRF-NA-Gowalla_UserCenter.txt')

# It seems this popularity score cannot be fused with distance.
# loc_users = read_location_users(train_file)
# maxu = max([len(users) for loc, users in loc_users.items()])
# pop_inf = {loc: len(users)/maxu for loc, users in loc_users.items()}

# Previously stored recommendation result to load (SH-Foursquare variant kept for reference).
# rec_file = 'mid_data/trainRF-SH-FoursquareCheckins/0.5-0.3-soc0.5-sq_score1d-cosine_1/[0.3, 0.2, 0.5]/ex_rec-5.txt'
rec_file = 'mid_data/trainRF-NA-Gowalla_totalCheckins/0.5-0.3-soc0.5-sq_score1d-cosine_1/[0.3, 0.1, 0.6]/ex_rec-5.txt'

orec = read_obj(rec_file)

# `table` is not defined in this part of the file -- it must be set earlier.
# NOTE(review): a and b are hard-coded constants; presumably fitted for this
# dataset (a different pair is kept in the commented call below) -- confirm.
geo_inf = GeoInf(a=0.84534522188,
                 b=-1.61667304945,
                 checks=table,
                 loc_center=loc_center,
                 user_center=user_center)
# geo_inf = GeoInf(a=0.651, b=-1.628, checks=table, loc_center=loc_center, user_center=user_center)

# locs = set(loc_center.keys())

# users = ["11823", "10362", "11588", "16457", "2738", "7380", "1676", "2270", "9429", "10650", "9488", "10320", "2461", "4330", "9565", "8895", "16248", "16201", "16633", "14710", "9632", "4962", "10579", "16057", "7836", "4971", "12417", "6791", "16181", "6533", "322", "132", "11998", "2882", "10184", "15244", "15469", "9210", "15982", "685", "1147", "7313", "6390", "11391", "13552", "4421", "11881", "2953", "10025", "4610", "15455", "7744", "11512", "13107", "11328", "2153", "2150", "13310", "10554", "17003", "4343", "17836", "13097", "3510", "7806", "15655", "70", "15838", "17717", "17390", "4282", "16446", "15078", "6074", "9504", "12785", "740", "8525", "16427", "2188", "11119"]

# rate = 0
# Result accumulators filled by later code (not visible here).
# NOTE: the name `re` would shadow the stdlib `re` module if it is imported.
pre = []
re = []
Пример #11
0
def cf_main(train_file, test_file, topns=None, topks=None, topic_num=8):
    """Run the ``lda`` recommender end to end and evaluate it.

    Input files are read as tab-separated with the user in column 0, the item
    in column 4 and a ``'%Y-%m-%dT%H:%M:%SZ'`` timestamp in column 1.
    (A comma-separated layout with ``timein=3`` and ``'%Y-%m-%d %H:%M:%S'``
    was previously kept here as a commented-out alternative.)

    Args:
        train_file: Training check-in file; its stem names the cache directory.
        test_file: Test check-in file.
        topns: Values used only to name the cached recommendation file;
            defaults to ``[20]``.
        topks: Cut-offs for precision/recall; defaults to ``[20]``.
        topic_num: Number of topics passed to ``MyLDA``.

    Returns:
        ``(nprs, nres)``: for each entry of ``topns``, the list of precision
        (resp. recall) values per ``topk``, formatted to 4 decimals.
    """
    if topks is None:
        topks = [20]
    if topns is None:
        topns = [20]
    nprs = []
    nres = []

    split_sig = '\t'
    time_format = '%Y-%m-%dT%H:%M:%SZ'

    print('read_table')
    table = read_checks_table(train_file,
                              split_sig=split_sig,
                              uin=0,
                              iin=4,
                              timein=1,
                              scorein=None,
                              time_format=time_format)
    test = read_checks_table(test_file,
                             split_sig=split_sig,
                             uin=0,
                             iin=4,
                             timein=1,
                             scorein=None,
                             time_format=time_format)

    # makedirs (not mkdir): mkdir raises FileNotFoundError when 'mid_data'
    # itself is missing; exist_ok makes the call safe to repeat.
    base_dir = 'mid_data/' + '-'.join(train_file.split('.')[:-1]) + '/'
    os.makedirs(base_dir, exist_ok=True)

    # ========= LDA ================
    lda = MyLDA(train_filename=train_file,
                topic_num=topic_num,
                split_sig=split_sig,
                uin=0,
                iin=4,
                timein=1,
                time_format=time_format)
    predict_fun = lda.predict

    sim_fun_name = 'lda' + str(topic_num) + 't'
    dir_name = base_dir + sim_fun_name + '/'
    os.makedirs(dir_name, exist_ok=True)

    for topn in topns:
        ex_rec_name = dir_name + '-'.join(['ex_rec', sim_fun_name,
                                           str(topn)]) + '.txt'
        if os.path.exists(ex_rec_name):
            print('read recommend result from file')
            rec = read_obj(ex_rec_name)
        else:
            print('recommend')
            users = set(table.keys())
            # Candidate items: the first 1000 entries of sort_dict() for each
            # topic's item distribution, merged over all topics.
            items = set()
            for item_probs in lda.pr_i_in_z.values():
                items.update([e[0] for e in sort_dict(item_probs)[:1000]])
            print(len(items))
            rec = exclude_recommend(table, users, items, predict_fun)
            write_obj(ex_rec_name, rec)

        prs = []
        res = []
        for topk in topks:
            print('precision')
            pr = precision(rec, test, topk)
            print(pr)
            rc = recall(rec, test, topk)
            print('recall')
            prs.append(float('%.4f' % pr))
            res.append(float('%.4f' % rc))
        nprs.append(prs.copy())
        nres.append(res.copy())
    out_json_to_file(dir_name + 'nprs.txt', nprs)
    out_json_to_file(dir_name + 'nres.txt', nres)

    return nprs, nres
Пример #12
0
    def __init__(self,
                 train_filename,
                 friends_file,
                 topic_num,
                 split_sig,
                 time_format,
                 uin,
                 iin,
                 timein,
                 read_from_file=False):
        """Load cached model parameters from disk, or train and cache them.

        The cache lives in
        ``mid_data/<train file stem>/<self.name><topic_num>t/``. When all five
        parameter files exist they are read back; otherwise the parameters
        come from ``self.init_data()`` followed by ``self.em_loop()`` and are
        written to that directory.

        Every user id in ``0..max(check_set keys)`` is added to its own
        friend set, so user ids are expected to be integers.

        Args:
            train_filename: Path of the check-in file; also determines the
                cache directory name.
            friends_file: Edge file read with columns 0 and 1.
            topic_num: Number of topics (part of the cache directory name).
            split_sig: Field separator forwarded to the file readers.
            time_format: Timestamp format forwarded to ``read_checks_table``.
            uin: Column index of the user field.
            iin: Column index of the item field.
            timein: Column index of the timestamp field.
            read_from_file: Not used by this constructor; kept so existing
                callers keep working.
        """
        self.train_filename = train_filename
        self.topic_num = topic_num
        self.checks = read_checks_table(train_filename,
                                        split_sig=split_sig,
                                        time_format=time_format,
                                        uin=uin,
                                        iin=iin,
                                        timein=timein)
        self.friends = read_dic_set(friends_file,
                                    split_tag=split_sig,
                                    oin=0,
                                    ain=1)

        # Use the caller-supplied columns; these were hard-coded to 0 and 4,
        # which disagrees with self.checks for any other file layout.
        self.check_set = read_dic_set(train_filename,
                                      split_tag=split_sig,
                                      oin=uin,
                                      ain=iin)

        dir_name = 'mid_data/' + '-'.join(train_filename.split(
            '.')[:-1]) + '/' + self.name + str(topic_num) + 't/'
        f_in_z_filename = dir_name + 'pr_f_in_z.txt'
        i_in_z_filename = dir_name + 'pr_i_in_z.txt'
        u_in_f_filename = dir_name + 'pr_u_in_f.txt'
        z_filename = dir_name + 'pz.txt'
        pr_filename = dir_name + 'pr.txt'

        # Make every user a friend of themselves, creating the friend set
        # for users that have none.
        max_user = max(self.check_set.keys())
        for i in range(max_user + 1):
            self.friends.setdefault(i, set()).add(i)

        # makedirs (not mkdir): the path is nested, and mkdir raises
        # FileNotFoundError when 'mid_data/<stem>/' does not exist yet.
        os.makedirs(dir_name, exist_ok=True)

        cache_files = (f_in_z_filename, i_in_z_filename, z_filename,
                       pr_filename, u_in_f_filename)
        if all(os.path.exists(path) for path in cache_files):
            # NOTE: pr.txt must exist for a cache hit but is not loaded, so
            # self.pr is only available after a fresh training run.
            self.pr_f_in_z, self.pr_i_in_z, self.pr_u_in_f, self.pz = (
                read_obj(f_in_z_filename),
                read_obj(i_in_z_filename),
                read_obj(u_in_f_filename),
                read_obj(z_filename),
            )
            # Keep the attribute set identical to the training path below.
            self.pr_z_in_u = {}
        else:
            (self.pr_f_in_z, self.pr_i_in_z, self.pr_u_in_f, self.pz,
             self.pr) = self.init_data()
            self.em_loop()
            # Per-user topic weights are intentionally left empty here (the
            # computation was disabled in the original code).
            self.pr_z_in_u = {}
            write_obj(f_in_z_filename, self.pr_f_in_z)
            write_obj(i_in_z_filename, self.pr_i_in_z)
            write_obj(z_filename, self.pz)
            write_obj(pr_filename, self.pr)
            write_obj(u_in_f_filename, self.pr_u_in_f)
        print(self.pz)
Пример #13
0
    keys = list(keys)
    index = {}
    for i in range(len(keys)):
        index[keys[i]] = i
    return index


def dic_to_mat(index, dic):
    """Convert a nested ``{row_key: {col_key: value}}`` dict to a square matrix.

    Args:
        index: Mapping from key to row/column position; its length sets the
            matrix size.
        dic: Nested mapping of values to place in the matrix.

    Returns:
        A ``len(index) x len(index)`` float array, zero wherever ``dic`` has
        no entry.

    Raises:
        KeyError: If a key that actually carries a value is missing from
            ``index``.
    """
    size = len(index)
    matrix = np.zeros((size, size))
    # Flatten the nested dict lazily into (row, col, value) triples.
    cells = (
        (index[row_key], index[col_key], value)
        for row_key, row in dic.items()
        for col_key, value in row.items()
    )
    for row_pos, col_pos, value in cells:
        matrix[row_pos, col_pos] = value
    return matrix


if __name__ == '__main__':
    # Average two stored similarity maps with equal weight and print the
    # resulting matrix. The index is built from the first map's keys only.
    group0_path = '../mid_data/trainRF-SH-FoursquareCheckins/1-0.5-0.3-soc-group0-soc-group1-soc-group2/soc-group0'
    group1_path = '../mid_data/trainRF-SH-FoursquareCheckins/1-0.5-0.3-soc-group0-soc-group1-soc-group2/soc-group1'

    sim_map1 = read_obj(group0_path)
    sim_map2 = read_obj(group1_path)
    # sim_map2 = {u: {f[0]: f[1] for f in fs} for u, fs in sim_map2.items()}
    index = keys_to_index(sim_map1.keys())

    m1, m2 = (dic_to_mat(index, sim_map) for sim_map in (sim_map1, sim_map2))
    m = 0.5 * m1 + 0.5 * m2
    print(m)