예제 #1
0
    def __init__(self, p=None):
        """
        Build the settings dict, load the purchase data and cache it on the instance.

        Data-split convention (per the original author's notes):
            * 'valid': the first 80% of the data is divided 6:2 into train/valid
              to pick hyper-parameters; the remaining 20% is ignored.
            * 'test': the first 80% is train and the rest is test, re-using the
              hyper-parameters chosen during validation.
            The program is identical for both modes; only the data fed in differs.

        :param p: optional mapping of settings. When falsy (None or empty) the
            defaults below are built and printed. A caller-supplied mapping must
            provide at least 'dataset', 'mode', 'split' and 'intervals', which
            are read here.
        :raises ValueError: if p['mode'] is neither 'valid' nor 'test'.
        :return: None
        """
        # 1. Build the settings. Everything tunable lives in p; other functions
        #    are hard-coded. PATH is a module-level name that is only read here,
        #    so no `global` declaration is needed.
        if not p:
            v = 1  # developer toggle: 1 -> 'valid', 0 -> 'test'
            assert 0 == v or 1 == v  # no other case
            p = OrderedDict([
                ('dataset', 'user_buys.txt'),
                ('fea_image', 'normalized_features_image/'),
                ('fea_text', 'normalized_features_text/'),
                ('mode', 'valid' if 1 == v else 'test'),
                ('split', 0.8),  # valid: 6/2/2. test: 8/2.
                ('at_nums', [10, 20, 30, 50]),
                # Original note: step of 2 purchases, 10 intervals, evaluated on
                # auc/recall@30 (later changed to 10).
                ('intervals', [2, 10, 30]),
                ('epochs', 30 if 'taobao' in PATH else 50),
                ('fea_random_zero', 0.0),  # 0.2 / 0.4
                ('latent_size', [20, 1024, 100]),
                ('alpha', 0.1),
                ('lambda', 0.0),  # open question: separate lambdas for self.lt and self.ux/wh/bi?
                ('lambda_ev', 0.0),  # for the image/text dimension-reduction matrices; keep 0.0
                ('lambda_ae', None),  # for the reconstruction error
                ('mini_batch', None),  # 0: one_by_one, 1: mini_batch
                ('mvgru', 0),  # 0: bpr, 1: vbpr
                ('batch_size_train', 1),  # performance drops sharply with larger sizes
                ('batch_size_test', 768),  # user*item matrix is too big for one pass; 768 measured fastest on a5
            ])
            for i in p.items():
                print(i)
        # Validate for every p (defaults and caller-supplied alike) with a real
        # exception: an assert would vanish under `python -O`.
        if p['mode'] not in ('valid', 'test'):
            raise ValueError(
                "p['mode'] must be 'valid' or 'test', got %r" % (p['mode'],))

        # 2. Load the data.
        # Original note: the per-user sequences in train/test have unequal
        # lengths, so they cannot be turned into a full (n, m) matrix and
        # `shared` would raise.
        [(user_num, item_num), aliases_dict,
         (test_i_cou, test_i_intervals_cumsum, test_i_cold_active),
         (tra_buys, tes_buys)] = \
            load_data(os.path.join(PATH, p['dataset']),
                      p['mode'], p['split'], p['intervals'])
        # Positive samples plus masks; item_num is passed as the tail value.
        tra_buys_masks, tra_masks = fun_data_buys_masks(
            tra_buys, tail=[item_num])  # used to compute user representations at prediction time
        tes_buys_masks, tes_masks = fun_data_buys_masks(
            tes_buys, tail=[item_num])  # used at prediction time
        # Negative samples plus masks.
        tra_buys_neg_masks = fun_random_neg_tra(
            item_num, tra_buys_masks)  # used in training (one-by-one or mini-batch)
        tes_buys_neg_masks = fun_random_neg_tes(
            item_num, tra_buys_masks, tes_buys_masks)  # used at prediction time

        # 3. Store everything as instance attributes.
        self.p = p
        self.user_num, self.item_num = user_num, item_num
        self.aliases_dict = aliases_dict
        self.tic, self.tiic, self.tica = test_i_cou, test_i_intervals_cumsum, test_i_cold_active
        self.tra_buys, self.tes_buys = tra_buys, tes_buys
        self.tra_buys_masks, self.tra_masks, self.tra_buys_neg_masks = tra_buys_masks, tra_masks, tra_buys_neg_masks
        self.tes_buys_masks, self.tes_masks, self.tes_buys_neg_masks = tes_buys_masks, tes_masks, tes_buys_neg_masks
예제 #2
0
def train_valid_or_test(p=None):
    """
    Evaluate the "Pop" (most popular) and "Random" recommendation baselines.

    Data-split convention (per the original author's notes):
        * 'valid': the first 80% of the data is divided 6:2 into train/valid
          to pick hyper-parameters; the remaining 20% is ignored.
        * 'test': the first 80% is train and the rest is test, re-using the
          hyper-parameters chosen during validation.
        The program is identical for both modes; only the data fed in differs.

    :param p: optional mapping of settings. When falsy (None or empty) the
        defaults below are built and printed. A caller-supplied mapping must
        provide at least 'dataset', 'mode', 'split', 'intervals' and
        'at_nums', which are read here (it is also passed on to
        fun_predict_pop_random).
    :raises ValueError: if p['mode'] is neither 'valid' nor 'test'.
    :return: None. Results are reported through GlobalBest.fun_print_best.
    """
    # 1. Build the settings. Everything tunable lives in p; other functions are
    #    hard-coded. PATH is a module-level name that is only read here, so no
    #    `global` declaration is needed.
    if not p:
        v = 1  # developer toggle: 1 -> 'valid', 0 -> 'test'
        assert 0 == v or 1 == v  # no other case
        p = OrderedDict([
            ('dataset', 'user_buys.txt'),
            ('fea_image', 'normalized_features_image/'),
            ('fea_text', 'normalized_features_text/'),
            ('mode', 'valid' if 1 == v else 'test'),
            ('split', 0.8),  # valid: 6/2/2. test: 8/2.
            ('at_nums', [10, 20, 30, 50]),  # 5, 15
            # Original note: step of 2 purchases, 10 intervals, evaluated on
            # auc/recall@30 (later changed to 10).
            ('intervals', [2, 10, 30]),
            ('batch_size_train', 4),  # performance drops sharply with larger sizes
            ('batch_size_test', 768),  # user*item matrix is too big for one pass; 768 measured fastest on a5
        ])
        for e in p.items():
            print(e)
    # Validate for every p (defaults and caller-supplied alike) with a real
    # exception: an assert would vanish under `python -O`.
    if p['mode'] not in ('valid', 'test'):
        raise ValueError(
            "p['mode'] must be 'valid' or 'test', got %r" % (p['mode'],))

    # 2. Load the data.
    # Original note: the per-user sequences in train/test have unequal lengths,
    # so they cannot be turned into a full (n, m) matrix and `shared` would raise.
    [(user_num, item_num), aliases_dict,
     (test_i_cou, test_i_intervals_cumsum, test_i_cold_active),
     (tra_buys, tes_buys)] = \
        load_data(os.path.join(PATH, p['dataset']),
                  p['mode'], p['split'], p['intervals'])
    # Positive samples plus masks; item_num is passed as the tail value.
    tra_buys_masks, tra_masks = fun_data_buys_masks(tra_buys, tail=[item_num])
    tes_buys_masks, tes_masks = fun_data_buys_masks(tes_buys, tail=[item_num])
    # Negative samples plus masks for the test side only: this function has no
    # training step, so no training negatives are drawn.
    tes_buys_neg_masks = fun_random_neg_tes(item_num, tra_buys_masks,
                                            tes_buys_masks)

    # --------------------------------------------------------------------------
    # Rank items by how often they occur in the training purchases, most
    # frequent first; items with the same count are put in random order.
    # Counter returns 0 for items never seen in training.
    train_i_cou = Counter(item for buy in tra_buys for item in buy)
    items_by_count = defaultdict(list)  # {count: [item1, item2, ...]}
    for item, count in train_i_cou.items():
        items_by_count[count].append(item)
    sequence = []
    for count in sorted(items_by_count, reverse=True):
        same_count = items_by_count[count]
        sequence.extend(random.sample(same_count, len(same_count)))

    def fun_judge_tes_and_neg(tes, mark, tes_neg):
        """Return a 0/1 list as long as mark: 1 where the position is unmasked
        and the bought item occurs strictly more often in training than the
        paired negative item (an unseen item counts as 0 occurrences)."""
        return [
            1 if flag != 0 and train_i_cou[i] > train_i_cou[j] else 0
            for i, flag, j in zip(tes, mark, tes_neg)
        ]

    def evaluate(all_upqs, all_ranks):
        """Score one baseline and print its metrics."""
        best = GlobalBest(at_nums=p['at_nums'], intervals=p['intervals'])
        fun_predict_pop_random(p, best, all_upqs, all_ranks, tes_buys_masks,
                               tes_masks, test_i_cou, test_i_intervals_cumsum,
                               test_i_cold_active)
        best.fun_print_best(epoch=0)  # only ever prints the current best result

    top_n = p['at_nums'][-1]  # length of every recommendation list

    # --------------------------------------------------------------------------
    print("\tPop ...")
    # One row per user. Built with a plain comprehension instead of
    # np.apply_along_axis over np.array(zip(...)): on Python 3 zip() is an
    # iterator, so that array would be 0-d and axis=1 would not exist.
    all_upqs = np.array([
        fun_judge_tes_and_neg(tes, mark, tes_neg)
        for tes, mark, tes_neg in zip(tes_buys_masks, tes_masks,
                                      tes_buys_neg_masks)
    ])
    recom = sequence[:top_n]  # every user gets the same top_n most popular items
    all_ranks = np.array([recom for _ in range(user_num)])
    evaluate(all_upqs, all_ranks)

    # --------------------------------------------------------------------------
    print("\tRandom ...")
    all_upqs = None  # original note: random AUC is 0.5, taken from the literature
    # Shuffle the whole item sequence once, then draw top_n items per user.
    seq_random = random.sample(sequence, len(sequence))
    all_ranks = np.array(
        [random.sample(seq_random, top_n) for _ in range(user_num)])
    evaluate(all_upqs, all_ranks)