def train_model(self, k):
        """Train the model with per-rating stochastic gradient descent.

        Calls the parent ``train_model(k)`` first, then sweeps
        ``self.rg.trainSet()`` for at most ``self.config.maxIter`` epochs.
        Every rating updates the biases ``Bu``/``Bi``, the factor rows
        ``P``/``Q`` and the neighbourhood weights ``W``. ``self.loss`` is reset
        at the start of each epoch and accumulates the squared errors plus the
        regularisation terms. The loop ends early as soon as
        ``self.isConverged`` returns a truthy value. Afterwards
        ``self.user_item_nei`` is written to disk with ``util.save_data``.

        The updates for ``Y`` and ``C`` are commented out in the original
        implementation and are therefore not performed here either.

        :param k: forwarded unchanged to the parent ``train_model``.
        """
        super(IntegSVD, self).train_model(k)
        epoch = 0
        while epoch < self.config.maxIter:
            # Hyper-parameters are re-read once per epoch.
            cfg = self.config
            lr = cfg.lr
            self.loss = 0
            for user, item, rating in self.rg.trainSet():
                u_idx = self.rg.user[user]
                i_idx = self.rg.item[item]
                neighbors = self.get_neighbor(user, item)
                n_neighbors = len(neighbors)
                err = rating - self.predict(user, item)
                self.loss += err**2

                # NOTE(review): if these rows are views into the matrices, the
                # Q update below sees the already-updated P row - confirm that
                # this is intended.
                p_u, q_i = self.P[u_idx], self.Q[i_idx]

                # bias terms
                self.Bu[u_idx] += lr * (err - cfg.lambdaB * self.Bu[u_idx])
                self.Bi[i_idx] += lr * (err - cfg.lambdaB * self.Bi[i_idx])

                # latent factor rows
                self.P[u_idx] += lr * (err * q_i - cfg.lambdaP * p_u)
                self.Q[i_idx] += lr * (err * p_u - cfg.lambdaQ * q_i)

                # neighbourhood weights W (the Y and C updates are disabled)
                for neighbor in neighbors:
                    j_idx = self.rg.item[neighbor]
                    r_uj = self.rg.trainSet_u[user][neighbor]
                    b_uj = (self.rg.globalMean + self.Bu[u_idx] +
                            self.Bi[j_idx])
                    residual = r_uj - b_uj
                    self.W[i_idx][j_idx] += lr * (
                        err / (n_neighbors**0.5) * residual -
                        cfg.lambdaW * self.W[i_idx][j_idx])

            # regularisation terms, added in the same order as before
            reg_p = cfg.lambdaP * (self.P * self.P).sum()
            reg_q = cfg.lambdaQ * (self.Q * self.Q).sum()
            reg_b = cfg.lambdaB * ((self.Bu * self.Bu).sum() +
                                   (self.Bi * self.Bi).sum())
            reg_w = cfg.lambdaW * (self.W * self.W).sum()
            self.loss += reg_p + reg_q + reg_b + reg_w

            epoch += 1
            if self.isConverged(epoch):
                break

        util.save_data(self.user_item_nei,
                       '../data/neibor/ft_intsvd_useritemnei_08.pkl')
Пример #2
0
    def init_model(self):
        """Prepare the social-walk corpus, vocabulary and sampling table.

        After the parent initialisation this method:

        1. loads the weighted edge list at ``self.implict_trust_path``,
           generates walks with ``netwalker.deepwalk_without_alpha``, shuffles
           them and saves them to ``self.config.walk_result_path``;
        2. calls ``self.inverted_index()``;
        3. builds ``self.social_vocab`` from the saved walk file;
        4. calls ``self.reset_index()`` and ``self.init_net()``;
        5. builds a ``UnigramTable`` and caches it at
           ``self.config.table_path``, or loads the cached table when that
           file already exists.
        """
        super(GEMF, self).init_model()
        print('starting initialization...')

        # 1. extract the user corpus from the users' social network.
        print('=' * 5 + 'extracting user corpus with users social network' +
              '=' * 5)
        # Alternatives that were tried and left disabled in the original:
        # netwalker.build_deepwalk_corpus (on the graph loaded with or without
        # weights) and netwalker.deepwalk_with_alpha.
        weight_dic, G = netwalker.load_edgelist_with_weight(
            self.implict_trust_path, undirected=self.config.undirected)
        self.walks = netwalker.deepwalk_without_alpha(G, weight_dic,
                                                      self.config.number_walks,
                                                      self.config.path_length,
                                                      self.config.restart_pro,
                                                      self.config.random_state)

        # shuffle the walks in place before persisting them
        np.random.shuffle(self.walks)
        netwalker.save_walks(self.walks, self.config.walk_result_path)
        print('=' * 5 + 'generating inverted index...' + '=' * 5)
        self.inverted_index()

        # 2. initialise w and w' of the graph embedding.
        print('=' * 5 + 'read social corpus' + '=' * 5)
        # The corpus handle is closed as soon as the vocabulary is built.
        # NOTE(review): assumes Vocab reads the whole file inside its
        # constructor and does not keep the handle for later - confirm.
        with open(self.config.walk_result_path, 'r') as corpus:
            # user nodes and their indices
            self.social_vocab = Vocab(corpus, self.config.min_count)

        # Open question from the original author: do all social-network users
        # also appear in the user-item matrix? If they are a subset this is
        # straightforward; otherwise those users need a random initialisation.
        print('=' * 5 + 'initialize network for word2vec' + '=' * 5)
        self.reset_index()
        self.init_net()

        print('=' * 5 + 'generate the unigram table for word2vec' + '=' * 5)
        if os.path.exists(self.config.table_path):
            # reuse the cached table
            self.table = util.load_data(self.config.table_path)
        else:
            self.table = UnigramTable(self.social_vocab)
            util.save_data(self.table, self.config.table_path)
Пример #3
0
    def build_user_item_sim_CF(self):
        """Build user/item similarity matrices and their top-k neighbours.

        Fresh ``SimMatrix`` objects are stored in ``self.user_sim`` and
        ``self.item_sim`` and filled with ``pearson_sp`` scores (rounded to 5
        decimals) for every pair of distinct users / items that is not already
        contained in the matrix. For each user (item) the
        ``self.config.user_near_num`` (``item_near_num``) highest-scored
        entries are stored in ``self.user_k_neibor`` (``self.item_k_neibor``).
        All four structures are saved with ``util.save_data`` under
        ``../data/sim`` and ``../data/neibor``; both folders are created when
        they do not exist yet.
        """
        import os
        from collections import defaultdict

        self.user_sim = SimMatrix()
        self.item_sim = SimMatrix()
        self.user_k_neibor = defaultdict(dict)
        self.item_k_neibor = defaultdict(dict)

        # Make sure the output folders exist before anything is saved.
        for folder in ('../data/sim', '../data/neibor'):
            if not os.path.exists(folder):
                os.makedirs(folder)
                print('%s folder has been established.' % folder)

        # compute user-user similarity matrix
        print('constructing user-user similarity matrix...')
        for u1 in self.rg.user:
            for u2 in self.rg.user:
                # skip the diagonal and pairs that were already scored
                if u1 == u2 or self.user_sim.contains(u1, u2):
                    continue
                sim = pearson_sp(self.rg.get_row(u1), self.rg.get_row(u2))
                self.user_sim.set(u1, u2, round(sim, 5))
        util.save_data(self.user_sim, '../data/sim/ft_08_uu_tricf_cv0.pkl')

        # compute the k neighbors of user
        user_near_num = self.config.user_near_num
        for user in self.rg.user:
            ranked = sorted(self.user_sim[user].items(),
                            key=lambda pair: pair[1],
                            reverse=True)
            self.user_k_neibor[user] = dict(ranked[:user_near_num])
        util.save_data(
            self.user_k_neibor, '../data/neibor/ft_08_uu_' +
            str(user_near_num) + '_neibor_tricf_cv0.pkl')

        # compute item-item similarity matrix
        print('constructing item-item similarity matrix...')
        for i1 in self.rg.item:
            for i2 in self.rg.item:
                if i1 == i2 or self.item_sim.contains(i1, i2):
                    continue
                sim = pearson_sp(self.rg.get_col(i1), self.rg.get_col(i2))
                self.item_sim.set(i1, i2, round(sim, 5))
        util.save_data(self.item_sim, '../data/sim/ft_08_ii_tricf_cv0.pkl')

        # compute the k neighbors of item
        item_near_num = self.config.item_near_num
        for item in self.rg.item:
            ranked = sorted(self.item_sim[item].items(),
                            key=lambda pair: pair[1],
                            reverse=True)
            self.item_k_neibor[item] = dict(ranked[:item_near_num])
        util.save_data(
            self.item_k_neibor, '../data/neibor/ft_08_ii_' +
            str(item_near_num) + '_neibor_tricf_cv0.pkl')
Пример #4
0
    def build_user_item_sim_CF(self,
                               kfold,
                               user_near_num=50,
                               item_near_num=50,
                               load_save_sim=False):
        """Build item/user similarities and their nearest-neighbour tables.

        Item similarities come from ``self.mg.getSimMatrix(jaccard_sim)``.
        User similarities are ``cosine`` scores between Word2Vec vectors that
        are trained on random walks over a user graph; two users are linked
        when they share at least one positively rated item that has more than
        one rating. Similarity matrices and neighbour tables are saved with
        ``util.save_data`` under ``../data/sim`` and ``../data/neibor``.

        ``self.user_sim`` (when ``load_save_sim`` is False),
        ``self.item_k_neibor`` and ``self.user_k_neibor`` are not created
        here; unless they are loaded they must already exist on the instance.

        :param kfold: fold identifier handed to ``RatingGetter`` and
            ``MetaGetter``.
        :param user_near_num: maximum number of neighbours kept per user
            (additionally capped by ``self.config.topK``).
        :param item_near_num: maximum number of neighbours kept per item.
        :param load_save_sim: when True, load the similarity matrices and
            neighbour tables from their pickle files instead of recomputing
            the similarities. The neighbour tables are still re-derived and
            re-saved afterwards.
        """
        self.rg = RatingGetter(kfold)
        self.mg = MetaGetter(kfold)

        from collections import defaultdict

        # Create the output folders up front: the item pickles are written
        # before the point where the folders used to be created.
        for folder in ('../data/sim', '../data/neibor'):
            if not os.path.exists(folder):
                os.makedirs(folder)
                print('%s folder has been established.' % folder)

        dataset = self.config.dataset_name
        item_sim_path = '../data/sim/%s_08_ii_gemf_cv0.pkl' % dataset
        item_nei_path = ('../data/neibor/%s_08_ii_%s_neibor_gemf_cv0.pkl' %
                         (dataset, item_near_num))
        user_sim_path = '../data/sim/%s_08_uu_gemf_cv0.pkl' % dataset
        user_nei_path = ('../data/neibor/%s_08_uu_%s_neibor_gemf_cv0.pkl' %
                         (dataset, user_near_num))

        # compute item-item similarity matrix
        print('构建 item-item 相似度矩阵  ...')
        if load_save_sim:
            self.item_sim = util.load_data(item_sim_path)
            # The loaded table only seeds the attribute; every item in
            # self.mg.item is recomputed below.
            self.item_k_neibor = util.load_data(item_nei_path)
        else:
            # the item similarity computation is encapsulated in MetaGetter
            self.item_sim = self.mg.getSimMatrix(jaccard_sim)
            util.save_data(self.item_sim, item_sim_path)

        # compute the k neighbors of item
        for item in self.mg.item:
            ranked = sorted(self.item_sim[item].items(),
                            key=lambda pair: pair[1],
                            reverse=True)
            self.item_k_neibor[item] = dict(ranked[:item_near_num])
        util.save_data(self.item_k_neibor, item_nei_path)

        # compute user-user similarity matrix
        print('构建 user-user 相似度矩阵 ...')
        if load_save_sim:
            self.user_sim = util.load_data(user_sim_path)
        else:
            # keep only items that were rated by more than one user
            itemNet = {}
            for item in self.rg.trainSet_i:
                if len(self.rg.trainSet_i[item]) > 1:
                    itemNet[item] = self.rg.trainSet_i[item]

            # user -> items of itemNet the user rated with a value > 0
            filteredRatings = defaultdict(list)
            for item in itemNet:
                for user in itemNet[item]:
                    if itemNet[item][user] > 0:
                        filteredRatings[user].append(item)

            # Link users that share at least one item. The item sets are
            # built once instead of once per user pair.
            item_sets = dict(
                (user, set(items)) for user, items in filteredRatings.items())
            self.CUNet = defaultdict(list)
            for user1 in tqdm(filteredRatings):
                s1 = item_sets[user1]
                for user2 in filteredRatings:
                    if user1 != user2 and not s1.isdisjoint(item_sets[user2]):
                        self.CUNet[user1].append(user2)

            print('Generating random deep walks...')
            self.walks = []
            self.visited = defaultdict(dict)
            for user in tqdm(self.CUNet):
                for _ in range(self.config.walkCount):
                    path = [str(user)]
                    lastNode = user
                    for _step in range(1, self.config.walkLength):
                        nextNode = choice(self.CUNet[lastNode])
                        # Re-draw a bounded number of times to avoid an
                        # endless loop.
                        # NOTE(review): membership is tested against
                        # visited[lastNode] but recorded in visited[user] -
                        # kept as in the original, confirm the intent.
                        count = 0
                        while nextNode in self.visited[lastNode]:
                            nextNode = choice(self.CUNet[lastNode])
                            count += 1
                            if count == self.config.walkLength:
                                break
                        path.append(str(nextNode))
                        self.visited[user][nextNode] = 1
                        lastNode = nextNode
                    self.walks.append(path)

            # NOTE(review): the `size=` / `iter=` keywords and indexing the
            # model directly look like the pre-4.0 gensim API - confirm the
            # installed version.
            self.model = w2v.Word2Vec(self.walks,
                                      size=self.config.walkDim,
                                      window=5,
                                      min_count=0,
                                      iter=3)

            for done, u1 in enumerate(tqdm(self.CUNet), 1):
                wu1 = self.model[str(u1)]
                for u2 in self.CUNet:
                    # Compare the users of THIS loop. The stale variables of
                    # the graph-building loop used to be tested here, so no
                    # pair was ever scored.
                    if u1 == u2 or self.user_sim.contains(u1, u2):
                        continue
                    # The original author left an open question here: what if
                    # a vector is missing?
                    wu2 = self.model[str(u2)]
                    self.user_sim.set(u1, u2, cosine(wu1, wu2))
                if done % 200 == 0:
                    print('progress:', done, '/', len(self.CUNet))
            util.save_data(self.user_sim, user_sim_path)

        # compute the k neighbors of user
        if load_save_sim:
            self.user_k_neibor = util.load_data(user_nei_path)
        self.topKSim = defaultdict(dict)
        for user in self.rg.user:
            # Rank this user's own similarities.
            # NOTE(review): assumes self.user_sim[user] yields an empty
            # mapping for users without any similarity entry - confirm
            # against SimMatrix.
            ranked = sorted(self.user_sim[user].items(),
                            key=lambda pair: pair[1],
                            reverse=True)
            # both caps of the original are kept: config.topK, then
            # user_near_num
            top = ranked[:self.config.topK][:user_near_num]
            self.topKSim[user] = top
            self.user_k_neibor[user] = dict(top)

        util.save_data(self.user_k_neibor, user_nei_path)
Пример #5
0
    def build_user_item_sim_CF(self,
                               kfold,
                               user_near_num=50,
                               item_near_num=50,
                               load_save_sim=False):
        """Build item/user similarities and their nearest-neighbour tables.

        Item similarities come from ``self.mg.getSimMatrix(jaccard_sim)``;
        user similarities are ``pearson_sp`` scores of the users' rating rows,
        rounded to 5 decimals. Similarity matrices and neighbour tables are
        saved with ``util.save_data`` under ``../data/sim`` and
        ``../data/neibor``.

        ``self.user_sim`` (when ``load_save_sim`` is False),
        ``self.item_k_neibor`` and ``self.user_k_neibor`` are not created
        here; unless they are loaded they must already exist on the instance.

        :param kfold: fold identifier handed to ``RatingGetter`` and
            ``MetaGetter``.
        :param user_near_num: number of most similar users kept per user.
        :param item_near_num: number of most similar items kept per item.
        :param load_save_sim: when True, load the similarity matrices and
            neighbour tables from their pickle files instead of recomputing
            the similarities. The neighbour tables are still re-derived and
            re-saved afterwards.
        """
        self.rg = RatingGetter(kfold)
        self.mg = MetaGetter(kfold)

        from collections import defaultdict

        # Create the output folders up front: the item pickles are written
        # before the point where the folders used to be created.
        for folder in ('../data/sim', '../data/neibor'):
            if not os.path.exists(folder):
                os.makedirs(folder)
                print('%s folder has been established.' % folder)

        dataset = self.config.dataset_name
        item_sim_path = '../data/sim/%s_08_ii_cucmemf_cv0.pkl' % dataset
        item_nei_path = ('../data/neibor/%s_08_ii_%s_neibor_cucmemf_cv0.pkl' %
                         (dataset, item_near_num))
        user_sim_path = '../data/sim/%s_08_uu_cucmemf_cv0.pkl' % dataset
        user_nei_path = ('../data/neibor/%s_08_uu_%s_neibor_cucmemf_cv0.pkl' %
                         (dataset, user_near_num))

        # compute item-item similarity matrix
        print('构建 item-item 相似度矩阵  ...')
        if load_save_sim:
            self.item_sim = util.load_data(item_sim_path)
            # The loaded table only seeds the attribute; every item in
            # self.mg.item is recomputed below.
            self.item_k_neibor = util.load_data(item_nei_path)
        else:
            # the item similarity computation is encapsulated in MetaGetter
            self.item_sim = self.mg.getSimMatrix(jaccard_sim)
            util.save_data(self.item_sim, item_sim_path)

        # compute the k neighbors of item
        for item in self.mg.item:
            ranked = sorted(self.item_sim[item].items(),
                            key=lambda pair: pair[1],
                            reverse=True)
            self.item_k_neibor[item] = dict(ranked[:item_near_num])
        util.save_data(self.item_k_neibor, item_nei_path)

        # compute user-user similarity matrix
        print('构建 user-user 相似度矩阵 ...')
        if load_save_sim:
            self.user_sim = util.load_data(user_sim_path)
        else:
            for u1 in tqdm(self.rg.user):
                for u2 in self.rg.user:
                    # skip the diagonal and pairs that were already scored
                    if u1 == u2 or self.user_sim.contains(u1, u2):
                        continue
                    # Open question from the original author: keep Pearson
                    # similarity or switch to cosine similarity?
                    sim = pearson_sp(self.rg.get_row(u1),
                                     self.rg.get_row(u2))
                    self.user_sim.set(u1, u2, round(sim, 5))
            util.save_data(self.user_sim, user_sim_path)

        # compute the k neighbors of user
        if load_save_sim:
            self.user_k_neibor = util.load_data(user_nei_path)
        for user in self.rg.user:
            ranked = sorted(self.user_sim[user].items(),
                            key=lambda pair: pair[1],
                            reverse=True)
            # Take the top entries starting at index 0. The slice used to
            # start at `kfold`, which dropped the most similar users for
            # every fold other than 0.
            self.user_k_neibor[user] = dict(ranked[:user_near_num])

        util.save_data(self.user_k_neibor, user_nei_path)
Пример #6
0
    def build_user_item_sim_CF(self,
                               kfold,
                               user_near_num=50,
                               item_near_num=50,
                               load_save_sim=False):
        """Build user/item Pearson similarities and nearest-neighbour tables.

        Similarities are ``pearson_sp`` scores of rating rows (users) and
        rating columns (items), rounded to 5 decimals. Only pairs that are
        not yet contained in the matrix are computed. Similarity matrices and
        neighbour tables are saved with ``util.save_data`` under
        ``../data/sim`` and ``../data/neibor``.

        ``self.user_sim`` / ``self.item_sim`` (when ``load_save_sim`` is
        False), ``self.user_k_neibor`` and ``self.item_k_neibor`` are not
        created here and must already exist on the instance.

        :param kfold: fold identifier handed to ``RatingGetter``.
        :param user_near_num: number of most similar users kept per user.
        :param item_near_num: number of most similar items kept per item.
        :param load_save_sim: when True, load the two similarity matrices
            from their pickle files instead of recomputing them. The
            neighbour tables are always recomputed and saved.
        """
        self.rg = RatingGetter(kfold)

        from collections import defaultdict

        # compute user-user similarity matrix
        print('构建 user-user 相似度矩阵 ...')
        if load_save_sim:
            self.user_sim = util.load_data(
                '../data/sim/db_08_uu_tricf_cv0.pkl')
        else:
            for u1 in self.rg.user:
                for u2 in self.rg.user:
                    # skip the diagonal and pairs that were already scored
                    if u1 == u2 or self.user_sim.contains(u1, u2):
                        continue
                    # Open question from the original author: keep Pearson
                    # similarity or switch to cosine similarity?
                    sim = pearson_sp(self.rg.get_row(u1),
                                     self.rg.get_row(u2))
                    self.user_sim.set(u1, u2, round(sim, 5))
            if not os.path.exists('../data/sim'):
                os.makedirs('../data/sim')
                print('../data/sim folder has been established.')
            util.save_data(self.user_sim, '../data/sim/db_08_uu_tricf_cv0.pkl')

        # compute the k neighbors of user
        for user in self.rg.user:
            ranked = sorted(self.user_sim[user].items(),
                            key=lambda pair: pair[1],
                            reverse=True)
            # Take the top entries starting at index 0. The slice used to
            # start at `kfold`, which dropped the most similar users for
            # every fold other than 0.
            self.user_k_neibor[user] = dict(ranked[:user_near_num])

        if not os.path.exists('../data/neibor'):
            os.makedirs('../data/neibor')
            print('../data/neibor folder has been established.')

        util.save_data(
            self.user_k_neibor, '../data/neibor/db_08_uu_' +
            str(user_near_num) + '_neibor_tricf_cv0.pkl')

        # compute item-item similarity matrix
        print('构建 item-item 相似度矩阵  ...')
        if load_save_sim:
            self.item_sim = util.load_data(
                '../data/sim/db_08_ii_tricf_cv0.pkl')
        else:
            for i1 in self.rg.item:
                for i2 in self.rg.item:
                    if i1 == i2 or self.item_sim.contains(i1, i2):
                        continue
                    sim = pearson_sp(self.rg.get_col(i1),
                                     self.rg.get_col(i2))
                    self.item_sim.set(i1, i2, round(sim, 5))
            util.save_data(self.item_sim, '../data/sim/db_08_ii_tricf_cv0.pkl')

        # compute the k neighbors of item
        for item in self.rg.item:
            ranked = sorted(self.item_sim[item].items(),
                            key=lambda pair: pair[1],
                            reverse=True)
            self.item_k_neibor[item] = dict(ranked[:item_near_num])
        util.save_data(
            self.item_k_neibor, '../data/neibor/db_08_ii_' +
            str(item_near_num) + '_neibor_tricf_cv0.pkl')