Example #1
def test(train_file=train_file,
         test_file=test_file,
         uid_voc=uid_voc,
         mid_voc=mid_voc,
         cat_voc=cat_voc,
         batch_size=128,
         maxlen=100):
    # sample_io
    sample_io = SampleIO(train_file, test_file, uid_voc, mid_voc, cat_voc,
                         batch_size, maxlen, EMBEDDING_DIM)

    if xdl.get_config('model') == 'din':
        model = Model_DIN(EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
    elif xdl.get_config('model') == 'dien':
        model = Model_DIEN(EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
    else:
        raise Exception('only support din and dien model')

    # test
    # datas = sample_io.next_test()
    # test_ops = tf_test_model(*model.xdl_embedding(datas, EMBEDDING_DIM, *sample_io.get_n()))
    # print('='*10,'start test','='*10)
    test_ops = model.build_final_net(EMBEDDING_DIM, sample_io, is_train=False)
    print('=' * 10 + 'start test' + '=' * 10)
    saver = xdl.Saver()
    checkpoint_version = "ckpt-...............12000"
    saver.restore(version=checkpoint_version)
    eval_sess = xdl.TrainSession()
    print(
        'test_auc: %.4f ----test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f'
        % eval_model(eval_sess, test_ops))
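
Note: eval_model is not included in any of these snippets. Based on how it is called here (returning test_auc, loss, accuracy and aux_loss), a rough sketch of such a helper could look like the following; the per-batch structure of test_ops and the value ordering are assumptions, not taken from the original code.

import numpy as np
from sklearn.metrics import roc_auc_score

def eval_model(sess, test_ops):
    # Sketch only: drain the test reader and aggregate metrics.
    # Assumes each sess.run(test_ops) yields (prob, label, loss, accuracy, aux_loss)
    # per batch -- this ordering is hypothetical.
    probs, labels, losses, accs, aux_losses = [], [], [], [], []
    while not sess.should_stop():
        values = sess.run(test_ops)
        if values is None:
            break
        prob, label, loss, acc, aux_loss = values
        probs.extend(np.asarray(prob).reshape(-1))
        labels.extend(np.asarray(label).reshape(-1))
        losses.append(loss)
        accs.append(acc)
        aux_losses.append(aux_loss)
    return (roc_auc_score(labels, probs),
            float(np.mean(losses)), float(np.mean(accs)), float(np.mean(aux_losses)))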
Example #2
def test(train_file=train_file,
         test_file=test_file,
         uid_voc=uid_voc,
         mid_voc=mid_voc,
         cat_voc=cat_voc,
         batch_size=128,
         maxlen=100):
    # sample_io
    sample_io = SampleIO(train_file, test_file, uid_voc, mid_voc,
                         cat_voc, batch_size, maxlen, EMBEDDING_DIM)

    if xdl.get_config('model') == 'din':
        model = Model_DIN(
            EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
    elif xdl.get_config('model') == 'dien':
        model = Model_DIEN(
            EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
    else:
        raise Exception('only support din and dien model')

    # test
    datas = sample_io.next_test()
    test_ops = tf_test_model(
        *model.xdl_embedding(datas, EMBEDDING_DIM, *sample_io.get_n()))
    eval_sess = xdl.TrainSession()
    print('test_auc: %.4f ----test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f' %
          eval_model(eval_sess, test_ops))
Example #3
def train(train_file=train_file,
          test_file=test_file,
          uid_voc=uid_voc,
          mid_voc=mid_voc,
          cat_voc=cat_voc,
          item_info=item_info,
          reviews_info=reviews_info,
          batch_size=128,
          maxlen=100,
          test_iter=700):
    if xdl.get_config('model') == 'din':
        model = Model_DIN(
            EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
    elif xdl.get_config('model') == 'gwen':
        model = Model_GwEN(
            EMBEDDING_DIM, HIDDEN_SIZE)
    elif xdl.get_config('model') == 'dien':
        model = Model_DIEN(
            EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
    else:
        raise Exception('only support din, gwen and dien')

    sample_io = SampleIO(train_file, test_file, uid_voc, mid_voc,
                         cat_voc, item_info, reviews_info,
                         batch_size, maxlen, EMBEDDING_DIM)
    with xdl.model_scope('train'):
        train_ops = model.build_final_net(EMBEDDING_DIM, sample_io)
        lr = 0.001
        # optimizer: Adam (Adagrad is an alternative)
        train_ops.append(xdl.Adam(lr).optimize())
        log_format = "[%(time)s] lstep[%(lstep)s] gstep[%(gstep)s] lqps[%(lqps)s] gqps[%(gqps)s] loss[%(loss)s]"
        hooks = [QpsMetricsHook(), MetricsPrinterHook(log_format)]
        if xdl.get_task_index() == 0:
            hooks.append(xdl.CheckpointHook(xdl.get_config('checkpoint', 'save_interval')))
        train_sess = xdl.TrainSession(hooks=hooks)

    with xdl.model_scope('test'):
        test_ops = model.build_final_net(
            EMBEDDING_DIM, sample_io, is_train=False)
        test_sess = xdl.TrainSession()

    model.run(train_ops, train_sess, test_ops, test_sess, test_iter=test_iter)
Example #4
def train(train_file=train_file,
          test_file=test_file,
          uid_voc=uid_voc,
          mid_voc=mid_voc,
          cat_voc=cat_voc,
          item_info=item_info,
          reviews_info=reviews_info,
          batch_size=128,
          maxlen=100,
          test_iter=700):
    if xdl.get_config('model') == 'din':
        model = Model_DIN(
            EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
    elif xdl.get_config('model') == 'dien':
        model = Model_DIEN(
            EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
    else:
        raise Exception('only support din and dien')

    sample_io = SampleIO(train_file, test_file, uid_voc, mid_voc,
                         cat_voc, item_info, reviews_info,
                         batch_size, maxlen, EMBEDDING_DIM)
    with xdl.model_scope('train'):
        train_ops = model.build_final_net(EMBEDDING_DIM, sample_io)
        lr = 0.001
        # optimizer: Adam (Adagrad is an alternative)
        train_ops.append(xdl.Adam(lr).optimize())
        log_format = "[%(time)s] lstep[%(lstep)s] gstep[%(gstep)s] lqps[%(lqps)s] gqps[%(gqps)s] loss[%(loss)s]"
        hooks = [QpsMetricsHook(), MetricsPrinterHook(log_format)]
        if xdl.get_task_index() == 0:
            hooks.append(xdl.CheckpointHook(xdl.get_config('checkpoint', 'save_interval')))
        train_sess = xdl.TrainSession(hooks=hooks)

    with xdl.model_scope('test'):
        test_ops = model.build_final_net(
            EMBEDDING_DIM, sample_io, is_train=False)
        test_sess = xdl.TrainSession()

    model.run(train_ops, train_sess, test_ops, test_sess, test_iter=test_iter)
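
Note: model.run above interleaves training with periodic evaluation every test_iter steps. Its implementation is not included in these snippets; a minimal sketch of such a loop, assuming the usual TrainSession semantics (run returns None when the reader is exhausted), might be:

def run(train_ops, train_sess, test_ops, test_sess, test_iter=700):
    # Sketch only: one training step per iteration, evaluation every test_iter steps.
    step = 0
    while not train_sess.should_stop():
        values = train_sess.run(train_ops)  # loss/metrics plus the optimizer update op
        if values is None:
            break
        step += 1
        if step % test_iter == 0:
            # evaluate with the separately scoped test graph built above
            # (a single test batch here; the real helper may drain the whole test set)
            print('step %d, test metrics: %s' % (step, test_sess.run(test_ops)))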
Example #5
def use_rocket_training():
    return xdl.get_config('rocket_training')
Example #6
def get_data_prefix():
    return xdl.get_config('data_dir')
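
Note: the xdl.get_config keys used throughout these examples ('model', 'job_type', 'seed', 'data_dir', 'rocket_training', and the nested 'checkpoint' settings) come from the JSON config passed to the job. A hypothetical config illustrating the expected shape, shown as a Python dict; only the keys are grounded in the snippets, the values are placeholders.

config = {
    "job_type": "train",          # or "test"
    "model": "dien",              # "din", "gwen" or "dien", depending on the script
    "seed": 3,
    "data_dir": "../data/",
    "rocket_training": False,     # read by use_rocket_training()
    "checkpoint": {
        "save_interval": 12000,   # read via xdl.get_config('checkpoint', 'save_interval')
        "ckpt": ""                # non-empty to restore, as in the ESMM example below
    }
}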
Example #7
    # snippet truncated above; the head of this call follows Example #11:
    model = Model_DIEN(EMBEDDING_DIM,
                       HIDDEN_SIZE,
                       ATTENTION_SIZE,
                       LIGHT_EMBEDDING_DIM,
                       LIGHT_HIDDEN_SIZE,
                       LIGHT_ATTENTION_SIZE,
                       use_rocket_training=use_rocket_training())
    # test
    datas = sample_io.next_test()
    test_ops = tf_test_model(*model.xdl_embedding(
        datas, EMBEDDING_DIM, LIGHT_EMBEDDING_DIM, *sample_io.get_n()))
    eval_sess = xdl.TrainSession()
    print(
        'test_auc: %.4f ----test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f'
        % eval_model(eval_sess, test_ops))


if __name__ == '__main__':
    SEED = xdl.get_config("seed")
    if SEED is None:
        SEED = 3
    tf.set_random_seed(SEED)
    numpy.random.seed(SEED)
    random.seed(SEED)

    job_type = xdl.get_config("job_type")
    if job_type == 'train':
        train()
    elif job_type == 'test':
        test()
    else:
        print('job type must be train or test, do nothing...')
Example #8
def test(train_file=train_file,
         test_file=test_file,
         uid_voc=uid_voc,
         mid_voc=mid_voc,
         cat_voc=cat_voc,
         item_info=item_info,
         reviews_info=reviews_info,
         batch_size=99,
         maxlen=100):

    if xdl.get_config('model') == 'din':
        model = Model_DIN(EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
    elif xdl.get_config('model') == 'dien':
        model = Model_DIEN(EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
    else:
        raise Exception('only support din and dien model')

    # build an item -> category dict from item_c
    # (item_c is assumed to hold the lines of the item_info file, read earlier in the original script)
    i_c = {}
    for i in item_c:
        ii = i.strip().split('\t')
        i_c[ii[0]] = ii[1]

    saver = xdl.Saver()
    checkpoint_version = "ckpt-...............20000"
    saver.restore(version=checkpoint_version)

    last_hist = []
    target_list = []
    seq = []
    test_set = pkl.load(open(test_file, 'rb'))
    knn_table = pkl.load(
        open('../data/ali_knn_table/knn' + str(test_file[-5]) + '_no_pro2.pkl',
             'rb'))
    print('test_set length before processing:', len(test_set))
    test_knn = open('../data/test_knn', 'w')
    count22 = 0
    for i in test_set:
        # knn
        ss = i.strip().split('\t')
        last = ss[4].split('/')[-1]

        # append last, target, and seq
        last_hist.append(last)
        target_list.append(ss[2])
        seq.append((ss[1], ss[4]))  # uid and hist
        knn = knn_table[last]

        for k in knn:
            count22 += 1
            if k in i_c:
                tmp = '1\t' + ss[1] + '\t' + k + '\t' + i_c[k] + '\t' + ss[
                    4] + '\t' + ss[5]
            else:
                tmp = '1\t' + ss[1] + '\t' + k + '\t' + 'UNK' + '\t' + ss[
                    4] + '\t' + ss[5]
            test_knn.write(tmp + '\n')

    test_knn.close()

    print('last_hist length:', len(last_hist))
    print('total test_knn lines:', count22)

    # sample_io
    test_knn_f = os.path.join(get_data_prefix(), 'test_knn')
    sample_io = SampleIO(train_file, test_knn_f, uid_voc, mid_voc, cat_voc,
                         item_info, reviews_info, batch_size, maxlen,
                         EMBEDDING_DIM)

    print('all length:', len(last_hist))

    test_ops = model.build_final_net(EMBEDDING_DIM, sample_io, is_train=False)
    print('=' * 10 + 'start test' + '=' * 10)
    eval_sess = xdl.TrainSession()
    pro_all, test_auc, loss_sum, accuracy_sum, aux_loss_sum = eval_model(
        eval_sess, test_ops)
    print(
        'test_auc: %.4f ----test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f'
        % (test_auc, loss_sum, accuracy_sum, aux_loss_sum))

    print('pro_all length:', len(pro_all))
    print('=' * 50)

    # sort the knn with prob
    rank_all_knn = {}
    rank = []
    for i in range(len(last_hist)):
        knn = knn_table[last_hist[i]]
        pro = pro_all[i]

        c = list(zip(knn, pro))
        c = sorted(c, key=lambda t: t[1], reverse=True)
        rank_all = [sss[0] for sss in c]
        rank_all_knn[seq[i][0]] = rank_all

        if target_list[i] in rank_all:
            rank.append(rank_all.index(target_list[i]) + 1)
        else:
            rank.append(100)

    # print(rank_all_knn)
    # save the result of re-rank
    user = [i[0] for i in seq]
    hist = [i[1] for i in seq]
    assert len(last_hist) == len(user)
    results = list(zip(user, hist, last_hist, target_list, rank))
    # results = pd.DataFrame(results, columns = ['last','target','rank'])
    # results.to_csv('ali_dien_rank.csv', index=False)
    with open('ali_dien_rank_4days' + test_file[-11:], 'wb') as d:
        pkl.dump(results, d)
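
Note: the pickled results above hold (user, hist, last_item, target, rank) tuples, with rank 100 marking a target that did not appear in the knn candidate list. A small sketch of scoring such a dump offline; the file name is a placeholder for whatever 'ali_dien_rank_4days' + test_file[-11:] resolved to.

import pickle as pkl

# placeholder path; substitute the actual dump written above
with open('ali_dien_rank_4days.pkl', 'rb') as f:
    results = pkl.load(f)

ranks = [r[-1] for r in results]  # rank of the true target among the re-ranked knn items
for k in (1, 5, 10, 20):
    hits = sum(1 for r in ranks if r <= k)
    print('recall@%d: %.4f' % (k, hits / float(len(ranks))))
print('MRR: %.4f' % (sum(1.0 / r for r in ranks) / len(ranks)))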
Example #9
def use_rocket_training():
    return xdl.get_config('rocket_training')
Example #10
def get_data_prefix():
    return xdl.get_config('data_dir')
Example #11
    sample_io = SampleIO(train_file, test_file, uid_voc, mid_voc,
                         cat_voc, batch_size, maxlen,
                         embedding_dim=EMBEDDING_DIM,
                         light_embedding_dim=LIGHT_EMBEDDING_DIM)
    model = Model_DIEN(
        EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, LIGHT_EMBEDDING_DIM,
        LIGHT_HIDDEN_SIZE, LIGHT_ATTENTION_SIZE, use_rocket_training=use_rocket_training())
    # test
    datas = sample_io.next_test()
    test_ops = tf_test_model(
        *model.xdl_embedding(datas, EMBEDDING_DIM, LIGHT_EMBEDDING_DIM, *sample_io.get_n()))
    eval_sess = xdl.TrainSession()
    print('test_auc: %.4f ----test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f' %
          eval_model(eval_sess, test_ops))

if __name__ == '__main__':
    SEED = xdl.get_config("seed")
    if SEED is None:
        SEED = 3
    tf.set_random_seed(SEED)
    numpy.random.seed(SEED)
    random.seed(SEED)

    job_type = xdl.get_config("job_type")
    if job_type == 'train':
        train()
    elif job_type == 'test':
        test()
    else:
        print('job type must be train or test, do nothing...')
Example #12
def run(is_training, files):

    data_io = reader("esmm", files, 2, batch_size, 2, user_fn, ad_fn)
    batch = data_io.read()

    user_embs = list()
    for fn in user_fn:
        emb = xdl.embedding('u_' + fn,
                            batch[fn],
                            xdl.TruncatedNormal(stddev=0.001),
                            embed_size,
                            1000,
                            'sum',
                            vtype='hash')
        user_embs.append(emb)

    ad_embs = list()
    for fn in ad_fn:
        emb = xdl.embedding('a_' + fn,
                            batch[fn],
                            xdl.TruncatedNormal(stddev=0.001),
                            embed_size,
                            1000,
                            'sum',
                            vtype='hash')
        ad_embs.append(emb)

    var_list = model(is_training)(ad_embs, user_embs, batch["indicators"][0],
                                  batch["label"])
    keys = [
        'loss', 'ctr_prop', 'ctcvr_prop', 'cvr_prop', 'ctr_label',
        'ctcvr_label', 'cvr_label'
    ]
    run_vars = dict(zip(keys, list(var_list)))

    hooks = []
    if is_training:
        train_op = xdl.Adam(lr).optimize()
        hooks = get_collection(READER_HOOKS)
        if hooks is None:
            hooks = []
        if xdl.get_task_index() == 0:
            ckpt_hook = xdl.CheckpointHook(1000)
            hooks.append(ckpt_hook)

        # keep the train op in run_vars (no metric name) so it executes on every sess.run
        run_vars.update({None: train_op})

    if is_debug > 1:
        print("=========gradients")
        grads = xdl.get_gradients()
        grads_keys = sorted(grads[''].keys())
        for key in grads_keys:
            run_vars.update({"grads {}".format(key): grads[''][key]})

    hooks.append(QpsMetricsHook())
    log_format = "lstep[%(lstep)s] gstep[%(gstep)s] " \
                 "lqps[%(lqps)s] gqps[%(gqps)s]"
    hooks.append(MetricsPrinterHook(log_format, 100))

    ckpt = xdl.get_config("checkpoint", "ckpt")
    if ckpt is not None and len(ckpt) > 0:
        if int(xdl.get_task_index()) == 0:
            from xdl.python.training.saver import Saver
            saver = Saver()
            print("restore from %s" % ckpt)
            saver.restore(ckpt)
        else:
            time.sleep(120)  # non-chief workers wait while worker 0 restores the checkpoint

    sess = xdl.TrainSession(hooks)

    if is_training:
        itr = 1
        ctr_auc = Auc('ctr')
        ctcvr_auc = Auc('ctcvr')
        cvr_auc = Auc('cvr')
        while not sess.should_stop():
            print('iter=', itr)
            values = sess.run(run_vars.values())
            if not values:
                continue
            value_map = dict(zip(run_vars.keys(), values))
            print('loss=', value_map['loss'])
            ctr_auc.add(value_map['ctr_prop'], value_map['ctr_label'])
            ctcvr_auc.add(value_map['ctcvr_prop'], value_map['ctcvr_label'])
            cvr_auc.add_with_filter(value_map['cvr_prop'],
                                    value_map['cvr_label'],
                                    np.where(value_map['ctr_label'] == 1))
            itr += 1
        ctr_auc.show()
        ctcvr_auc.show()
        cvr_auc.show()
    else:
        ctr_test_auc = Auc('ctr')
        ctcvr_test_auc = Auc('ctcvr')
        cvr_test_auc = Auc('cvr')
        for i in range(test_batch_num):
            print('iter=', i + 1)
            values = sess.run(run_vars.values())
            value_map = dict(zip(run_vars.keys(), values))
            print('test_loss=', value_map['loss'])
            ctr_test_auc.add(value_map['ctr_prop'], value_map['ctr_label'])
            ctcvr_test_auc.add(value_map['ctcvr_prop'],
                               value_map['ctcvr_label'])
            cvr_test_auc.add_with_filter(value_map['cvr_prop'],
                                         value_map['cvr_label'],
                                         np.where(value_map['ctr_label'] == 1))
        ctr_test_auc.show()
        ctcvr_test_auc.show()
        cvr_test_auc.show()
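
Note: the Auc helper ('add', 'add_with_filter', 'show') is part of the example's own utilities and is not shown here. A rough stand-in with the same interface, built on scikit-learn; this is an approximation of the interface, not the actual implementation.

import numpy as np
from sklearn.metrics import roc_auc_score

class Auc(object):
    """Accumulates per-batch predictions and labels, then prints a single AUC."""
    def __init__(self, name):
        self.name = name
        self.probs = []
        self.labels = []

    def add(self, prob, label):
        self.probs.append(np.asarray(prob).reshape(-1))
        self.labels.append(np.asarray(label).reshape(-1))

    def add_with_filter(self, prob, label, idx):
        # idx is an np.where(...) result on an array shaped like prob/label,
        # e.g. np.where(ctr_label == 1) to score CVR only on clicked samples
        self.probs.append(np.asarray(prob)[idx].reshape(-1))
        self.labels.append(np.asarray(label)[idx].reshape(-1))

    def show(self):
        probs = np.concatenate(self.probs)
        labels = np.concatenate(self.labels)
        print('%s auc: %.6f' % (self.name, roc_auc_score(labels, probs)))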
Example #13
        # snippet truncated above; loss_r is reconstructed here from the MakeLoss
        # arguments on the following lines and the return statement below:
        loss_r = mx.symbol.MakeLoss(- mx.symbol.sum_axis( \
            mx.symbol.log(ctr_prop) * ctr_label + \
            mx.symbol.log(ctcvr_prop) * ctcvr_label, \
            axis=1, keepdims=True), \
            normalization="null", \
            grad_scale = 1.0 / batch_size, \
            name="esmm_loss")
        ctr_loss = - mx.symbol.sum(mx.symbol.log(ctr_prop) * ctr_label ) \
                / batch_size
        ctcvr_loss = - mx.symbol.sum(mx.symbol.log(ctcvr_prop) * ctcvr_label) \
                / batch_size

        cnt_cvr_sample = mx.symbol.sum_axis(ctr_clk)
        cnt_ctcvr_sample = mx.symbol.sum_axis(ctcvr_buy)

        cvr_loss = - mx.symbol.sum(mx.symbol.sum_axis( \
            mx.symbol.log(cvr_prop) * ctcvr_label, \
            axis=1, keepdims=True) * ctr_clk) / cnt_cvr_sample

        # ctcvr_label and cvr_label share the same purchase indicator (ctcvr_buy);
        # CVR is later evaluated only on clicked samples via add_with_filter
        return loss_r, mx.sym.BlockGrad(ctr_prop_one), \
               mx.sym.BlockGrad(ctcvr_prop_one), \
               mx.sym.BlockGrad(cvr_prop_one), mx.sym.BlockGrad(ctr_clk), \
               mx.sym.BlockGrad(ctcvr_buy), mx.sym.BlockGrad(ctcvr_buy)

    return _model


if __name__ == '__main__':
    is_training = xdl.get_config("is_training")
    files = xdl.get_config("files")
    run(is_training, files)
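
Note: the symbolic loss above keeps only the positive-label log-likelihood terms for CTR and CTCVR, scaled by 1/batch_size, with pCTCVR = pCTR * pCVR. A small numpy cross-check with illustrative numbers:

import numpy as np

# toy batch: click labels, purchase labels, and predicted probabilities (illustrative values)
ctr_label   = np.array([[1.], [0.], [1.]])
ctcvr_label = np.array([[1.], [0.], [0.]])               # a purchase implies a click
ctr_prop    = np.array([[0.8], [0.3], [0.6]])
cvr_prop    = np.array([[0.5], [0.2], [0.1]])
ctcvr_prop  = ctr_prop * cvr_prop                        # pCTCVR = pCTR * pCVR

batch_size = ctr_label.shape[0]
# same form as the MakeLoss expression above: negative summed log-likelihood per sample
per_sample = -np.sum(np.log(ctr_prop) * ctr_label +
                     np.log(ctcvr_prop) * ctcvr_label, axis=1, keepdims=True)
print('per-sample loss:', per_sample.ravel())
print('batch loss (grad_scale 1/batch_size):', per_sample.sum() / batch_size)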