# Shared imports for the PaddlePaddle Fluid 1.x examples below.
# IS_SPARSE, EMB_LEN and NUM_FILTERS are module-level constants in the
# source repos; the values given here are illustrative assumptions.
import numpy as np

import paddle
import paddle.fluid as fluid
from paddle.fluid import layers, nets

IS_SPARSE = True
EMB_LEN = 16
NUM_FILTERS = 32


def get_mov_combined_features():

    MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1

    mov_id = layers.data(name='movie_id', shape=[1], dtype='int64')

    mov_emb = layers.embedding(input=mov_id,
                               dtype='float32',
                               size=[MOV_DICT_SIZE, 32],
                               param_attr='movie_table',
                               is_sparse=IS_SPARSE)

    mov_fc = layers.fc(input=mov_emb, size=32)

    CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())

    category_id = layers.data(name='category_id',
                              shape=[1],
                              dtype='int64',
                              lod_level=1)

    mov_categories_emb = layers.embedding(input=category_id,
                                          size=[CATEGORY_DICT_SIZE, 32],
                                          is_sparse=IS_SPARSE)

    mov_categories_hidden = layers.sequence_pool(input=mov_categories_emb,
                                                 pool_type="sum")

    MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())

    mov_title_id = layers.data(name='movie_title',
                               shape=[1],
                               dtype='int64',
                               lod_level=1)

    mov_title_emb = layers.embedding(input=mov_title_id,
                                     size=[MOV_TITLE_DICT_SIZE, 32],
                                     is_sparse=IS_SPARSE)

    # The movie title is a sequence of integers, each being the index of a
    # word in the title vocabulary. The sequence is fed into a
    # sequence_conv_pool layer, which applies convolution and pooling along
    # the time dimension. As a result, the output has a fixed length even
    # though the input sequences vary in length.
    mov_title_conv = nets.sequence_conv_pool(input=mov_title_emb,
                                             num_filters=32,
                                             filter_size=3,
                                             act="tanh",
                                             pool_type="sum")

    concat_embed = layers.concat(
        input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)

    mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")

    return mov_combined_features
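
A minimal feeding sketch (an assumption, not from the source repo): category_id and movie_title are LoD tensors, so their values are packed together with per-sequence lengths; all ids below are hypothetical.

mov_features = get_mov_combined_features()
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

movie_id = np.array([[783]], dtype='int64')    # hypothetical movie id
category_id = fluid.create_lod_tensor(         # one sequence of 3 category ids
    np.array([[10], [8], [9]], dtype='int64'), [[3]], place)
movie_title = fluid.create_lod_tensor(         # one sequence of 3 title word ids
    np.array([[1069], [4140], [2923]], dtype='int64'), [[3]], place)

feats, = exe.run(feed={'movie_id': movie_id,
                       'category_id': category_id,
                       'movie_title': movie_title},
                 fetch_list=[mov_features])
print(feats.shape)  # (1, 200)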
Example #2
def textcnn(token_ids, vocab_size, num_classes, emb_dim, num_filters,
            mlp_hid_dim):
    """TextCNN模型前向过程实现
    Args:
        token_ids: 包含不同长度样本的lod tensor,形状为[-1,1]
        vocab_size: 词典大小
        num_classes: 类别数量
        emb_dim: 词向量维度
        num_filters: 每种尺寸的卷积核数量
        mlm_hid_dim: MLP的隐层维度
    Returns:
        prediction: 预测结果,各个类别的概率分布
    """
    emb = layers.embedding(  # look up word embeddings for the input tokens
        input=token_ids, size=[vocab_size, emb_dim])

    res_size3 = nets.sequence_conv_pool(  # conv (filter size 3) + max pooling
        input=emb,
        num_filters=num_filters,
        filter_size=3,
        act="tanh",
        pool_type="max")
    res_size4 = nets.sequence_conv_pool(  # conv (filter size 4) + max pooling
        input=emb,
        num_filters=num_filters,
        filter_size=4,
        act="tanh",
        pool_type="max")
    res_size5 = nets.sequence_conv_pool(  # conv (filter size 5) + max pooling
        input=emb,
        num_filters=num_filters,
        filter_size=5,
        act="tanh",
        pool_type="max")
    hidden = layers.fc(  # map the concatenated features to the MLP hidden layer
        input=[res_size3, res_size4, res_size5],
        size=mlp_hid_dim)
    prediction = fluid.layers.fc(  # hidden layer to class probabilities
        input=hidden, size=num_classes, act="softmax")
    return prediction
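
A minimal wiring sketch for textcnn (the hyperparameter values are assumptions):

token_ids = layers.data(name='token_ids', shape=[1], dtype='int64', lod_level=1)
prediction = textcnn(token_ids, vocab_size=10000, num_classes=2,
                     emb_dim=128, num_filters=64, mlp_hid_dim=96)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

# Two samples of lengths 4 and 2, packed into a single LoD tensor.
words = fluid.create_lod_tensor(
    np.array([[5], [23], [7], [99], [42], [8]], dtype='int64'), [[4, 2]], place)
probs, = exe.run(feed={'token_ids': words}, fetch_list=[prediction])
print(probs.shape)  # (2, 2): one class distribution per sample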
Example #5
def model():
    """model"""
    user_phone_brand_id = layers.data(name='user_phone_brand', shape=[1], dtype='int64')
    user_gender_id = layers.data(name='user_gender', shape=[1], dtype='int64')
    user_age_id = layers.data(name='user_age', shape=[1], dtype='int64')
    user_status_id = layers.data(name='user_status', shape=[1], dtype="int64")
    user_trade_id = fluid.layers.data(name='user_trade', shape=[1], dtype='int64')
    user_cater_id = fluid.layers.data(name='user_cater', shape=[1], dtype='int64')
    user_income_id = fluid.layers.data(name='user_income', shape=[1], dtype='int64')

    user_city_id = fluid.layers.data(name='user_city', shape=[1], dtype='int64')

    user_click_id = fluid.layers.data(name='user_click', shape=[1], dtype='int64')
    user_b_click_id = fluid.layers.data(name='user_b_click', shape=[1], dtype='int64')
    user_c_click_id = fluid.layers.data(name='user_c_click', shape=[1], dtype='int64')
    user_d_click_id = fluid.layers.data(name='user_d_click', shape=[1], dtype='int64')

    week_id = layers.data(name='week', shape=[1], dtype="int64")
    hour_id = layers.data(name='hour', shape=[1], dtype='int64')

    content_b_c_d_id = layers.data(name='content_b_c_d', shape=[1], dtype='int64')
    content_tags_id = layers.data(name='content_tags', shape=[1], dtype='int64', lod_level=1)
    content_subtags_id = layers.data(name='content_subtags', shape=[1], dtype='int64', lod_level=1)

    user_content_tag_click_id = layers.data(name='user_content_tag_click', shape=[1], dtype='int64')
    user_content_subtag_click_id = layers.data(name='user_content_subtag_click', shape=[1], dtype='int64')

    content_pctr_discrete_id = layers.data(name='content_pctr_discrete', shape=[1], dtype='int64')
    # dnn_score_discrete_id = layers.data(name='dnn_score_discrete', shape=[1], dtype='int64')

    content_pctr = layers.data(name='content_pctr', shape=[1], dtype='float32')
    # dnn_score = layers.data(name='dnn_score', shape=[1], dtype='float32')
    # content_emb = layers.data(name='content_emb', shape=[64], dtype='float32')
    # user_emb = layers.data(name='user_emb', shape=[64], dtype='float32')

    user_click_tags_id = layers.data(
        name='user_click_tags_id', shape=[1], dtype='int64', lod_level=1)
    user_click_subtags_id = layers.data(
        name='user_click_subtags_id', shape=[1], dtype='int64', lod_level=1)
    candidate_title_word = layers.data(name='candidate_title', shape=[1], dtype='int64', lod_level=1)
    candidate_subtitle_word = layers.data(name='candidate_subtitle', shape=[1], dtype='int64', lod_level=1)
    candidate_title_len_id = layers.data(name='candidate_title_len', shape=[1], dtype='int64')
    candidate_subtitle_len_id = layers.data(name='candidate_subtitle_len', shape=[1], dtype='int64')

    click_title_list = layers.data(name='click_title_list', shape=[1], dtype='int64', lod_level=2)
    click_subtitle_list = layers.data(name='click_subtitle_list', shape=[1], dtype='int64', lod_level=2)
    click_title_len_list = layers.data(name='click_title_len_list', shape=[1], dtype='int64', lod_level=1)
    click_subtitle_len_list = layers.data(name='click_subtitle_len_list', shape=[1], dtype='int64', lod_level=1)

    label = layers.data(name='label', shape=[1], dtype='int64')
    # excluded (commented out above): dnn_score_discrete_id, dnn_score, content_emb, user_emb
    load_list = [user_phone_brand_id, user_gender_id, user_age_id,
                  user_status_id, user_trade_id, user_cater_id, user_income_id,
                  user_city_id, user_click_id, user_b_click_id, user_c_click_id,
                  user_d_click_id, week_id, hour_id, content_b_c_d_id,
                  content_tags_id, content_subtags_id, user_content_tag_click_id,
                  user_content_subtag_click_id, content_pctr_discrete_id,
                  content_pctr,
                  user_click_tags_id, user_click_subtags_id, candidate_title_word,
                  candidate_subtitle_word, candidate_title_len_id, candidate_subtitle_len_id,
                  click_title_list, click_subtitle_list,
                  click_title_len_list, click_subtitle_len_list,
                  label]
    feed_order = [x.name for x in load_list]

    user_phone_brand_emb = layers.embedding(
        input=user_phone_brand_id, dtype='float32',
        size=[7, EMB_LEN], param_attr='user_phone_brand_emb', is_sparse=True)
    user_gender_emb = layers.embedding(
        input=user_gender_id, dtype='float32',
        size=[3, EMB_LEN], param_attr='user_gender_emb', is_sparse=True)
    user_age_emb = layers.embedding(
        input=user_age_id, dtype='float32',
        size=[8, EMB_LEN], param_attr='user_age_emb', is_sparse=True)
    user_status_emb = layers.embedding(
        input=user_status_id, dtype='float32',
        size=[3, EMB_LEN], is_sparse=True, param_attr='user_status_emb')
    user_trade_emb = layers.embedding(
        input=user_trade_id, dtype='float32',
        size=[24, EMB_LEN], is_sparse=True, param_attr='user_trade_emb')
    user_cater_emb = layers.embedding(
        input=user_cater_id, dtype='float32',
        size=[4, EMB_LEN], is_sparse=True, param_attr='user_cater_emb')
    user_income_emb = layers.embedding(
        input=user_income_id, dtype='float32',
        size=[6, EMB_LEN], is_sparse=True, param_attr='user_income_emb')

    user_city_emb = layers.embedding(
        input=user_city_id, dtype='float32',
        size=[4000, EMB_LEN], is_sparse=True, param_attr='user_city_emb')

    user_click_emb = layers.embedding(
        input=user_click_id, dtype='float32',
        size=[6, EMB_LEN], is_sparse=True, param_attr='user_click_emb')
    user_b_click_emb = layers.embedding(
        input=user_b_click_id, dtype='float32',
        size=[6, EMB_LEN], is_sparse=True, param_attr='user_b_click_emb')
    user_c_click_emb = layers.embedding(
        input=user_c_click_id, dtype='float32',
        size=[6, EMB_LEN], is_sparse=True, param_attr='user_c_click_emb')
    user_d_click_emb = layers.embedding(
        input=user_d_click_id, dtype='float32',
        size=[6, EMB_LEN], is_sparse=True, param_attr='user_d_click_emb')

    week_emb = layers.embedding(
        input=week_id, dtype='float32',
        size=[8, EMB_LEN], is_sparse=True, param_attr='week_emb')
    hour_emb = layers.embedding(
        input=hour_id, dtype='float32',
        size=[24, EMB_LEN], is_sparse=True, param_attr='hour_emb')

    content_b_c_d_emb = layers.embedding(
        input=content_b_c_d_id, dtype='float32',
        size=[3, EMB_LEN], is_sparse=True, param_attr='content_b_c_d_emb')

    content_tags_emb = layers.embedding(
        input=content_tags_id, size=[11, EMB_LEN], dtype='float32', is_sparse=True,
        param_attr=fluid.ParamAttr(
            name="content_tags_emb", learning_rate=0.5, regularizer=fluid.regularizer.L2Decay(1.0))
    )
    content_tags_emb_avg = fluid.layers.sequence_pool(input=content_tags_emb, pool_type='average')

    content_subtags_emb = layers.embedding(
        input=content_subtags_id, size=[65, EMB_LEN], dtype='float32', is_sparse=True,
        param_attr=fluid.ParamAttr(
            name="content_subtags_emb", learning_rate=0.5,
            regularizer=fluid.regularizer.L2Decay(1.0))
    )
    content_subtags_emb_avg = fluid.layers.sequence_pool(
        input=content_subtags_emb, pool_type='average')

    user_content_tag_click_emb = layers.embedding(
        input=user_content_tag_click_id, dtype='float32',
        size=[11 * 6, EMB_LEN], is_sparse=True, param_attr='user_content_tag_click_emb')
    user_content_subtag_click_emb = layers.embedding(
        input=user_content_subtag_click_id, dtype='float32',
        size=[65 * 6, EMB_LEN], is_sparse=True, param_attr='user_content_subtag_click_emb')

    content_pctr_discrete_emb = layers.embedding(
        input=content_pctr_discrete_id, dtype='float32',
        size=[55, EMB_LEN], is_sparse=True, param_attr='content_pctr_discrete_emb')
    # dnn_score_discrete_emb = layers.embedding(
    #     input=dnn_score_discrete_id, dtype='float32',
    #     size=[21, EMB_LEN], is_sparse=True, param_attr='dnn_score_discrete_emb')

    user_click_tags_id_emb = layers.embedding(
        input=user_click_tags_id, size=[11 * 6, EMB_LEN], dtype='float32', is_sparse=True,
        param_attr="user_content_tag_click_emb")
    user_click_tags_id_emb_avg = fluid.layers.sequence_pool(
        input=user_click_tags_id_emb, pool_type='average')
    user_click_subtags_id_emb = layers.embedding(
        input=user_click_subtags_id, size=[65 * 6, EMB_LEN], dtype='float32', is_sparse=True,
        param_attr="user_content_subtag_click_emb")
    user_click_subtags_id_emb_avg = fluid.layers.sequence_pool(
        input=user_click_subtags_id_emb, pool_type='average')

    # Build features for the candidate content.
    cand_title_emb = layers.embedding(input=candidate_title_word, size=[19962, EMB_LEN], dtype='float32',
                                      is_sparse=False, param_attr='word_embedding')
    cand_title_conv_pool = nets.sequence_conv_pool(
        input=cand_title_emb, num_filters=NUM_FILTERS, filter_size=3,
        act="relu", pool_type="average", param_attr='title_emb_conv', bias_attr='title_emb_conv_b')

    cand_subtitle_emb = layers.embedding(input=candidate_subtitle_word, size=[19962, EMB_LEN], dtype='float32',
                                         is_sparse=False, param_attr='word_embedding')
    cand_subtitle_conv_pool = nets.sequence_conv_pool(
        input=cand_subtitle_emb, num_filters=NUM_FILTERS, filter_size=3,
        act="relu", pool_type="average", param_attr='subtitle_emb_conv', bias_attr='subtitle_emb_conv_b')

    cand_title_len_emb = layers.embedding(input=candidate_title_len_id, size=[100, EMB_LEN], dtype='float32',
                                          is_sparse=True, param_attr='title_len_emb')
    cand_subtitle_len_emb = layers.embedding(input=candidate_subtitle_len_id, size=[100, EMB_LEN], dtype='float32',
                                             is_sparse=True, param_attr='subtitle_len_emb')

    cand_title_inf = layers.concat(
        input=[cand_title_conv_pool, cand_subtitle_conv_pool,
               cand_title_len_emb, cand_subtitle_len_emb], axis=-1)
    cand_title_feature = layers.fc(
        input=cand_title_inf, size=32, act="relu", param_attr='title_feature_list')  # shared parameters

    # Build features from the user's historically clicked content.
    click_title_emb = layers.embedding(input=click_title_list, size=[19962, EMB_LEN], dtype='float32',
                                       is_sparse=False, param_attr='word_embedding')
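    # click_title_list has lod_level=2: a batch of users, each with a list of
    # clicked titles, each title a word sequence. DynamicRNN steps over the
    # outer LoD level, so each step consumes one clicked title per user as a
    # lod_level=1 sequence and runs the shared conv-pool over it.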
    click_title_drnn = fluid.layers.DynamicRNN()
    with click_title_drnn.block():
        title_emb = click_title_drnn.step_input(click_title_emb)
        click_title_conv_pool = nets.sequence_conv_pool(
            input=title_emb, num_filters=NUM_FILTERS, filter_size=3,
            act="relu", pool_type="average", param_attr='title_emb_conv', bias_attr='title_emb_conv_b')
        click_title_drnn.output(click_title_conv_pool)
    click_title_conv_pool_list = click_title_drnn()

    click_subtitle_emb = layers.embedding(input=click_subtitle_list, size=[19962, EMB_LEN], dtype='float32',
                                          is_sparse=False, param_attr='word_embedding')
    click_subtitle_drnn = fluid.layers.DynamicRNN()
    with click_subtitle_drnn.block():
        subtitle_emb = click_subtitle_drnn.step_input(click_subtitle_emb)
        click_subtitle_conv_pool = nets.sequence_conv_pool(
            input=subtitle_emb, num_filters=NUM_FILTERS, filter_size=3,
            act="relu", pool_type="average", param_attr='subtitle_emb_conv', bias_attr='subtitle_emb_conv_b')
        click_subtitle_drnn.output(click_subtitle_conv_pool)
    click_subtitle_conv_pool_list = click_subtitle_drnn()

    click_title_len_emb_list = layers.embedding(input=click_title_len_list, size=[100, EMB_LEN], dtype='float32',
                                                is_sparse=True, param_attr='title_len_emb')
    click_subtitle_len_emb_list = layers.embedding(input=click_subtitle_len_list, size=[100, EMB_LEN], dtype='float32',
                                                   is_sparse=True, param_attr='subtitle_len_emb')

    click_title_inf_list = layers.concat(
        input=[click_title_conv_pool_list, click_subtitle_conv_pool_list,
               click_title_len_emb_list, click_subtitle_len_emb_list], axis=-1)
    click_title_feature_list = layers.fc(
        input=click_title_inf_list, size=32, act="relu", param_attr='title_feature_list')  # shared parameters
    user_click_title_feature = layers.sequence_pool(input=click_title_feature_list, pool_type="average")

    user_emb_feature = layers.concat(
        input=[user_phone_brand_emb, user_gender_emb, user_age_emb, user_status_emb, user_trade_emb,
               user_cater_emb, user_income_emb, user_city_emb,
               user_click_emb, user_b_click_emb, user_c_click_emb, user_d_click_emb], axis=1)
    content_emb_feature = layers.concat(
        input=[content_b_c_d_emb, content_tags_emb_avg, content_subtags_emb_avg,
               content_pctr_discrete_emb, cand_title_feature], axis=1)
    cross_emb_feature = layers.concat(
        input=[user_content_tag_click_emb, user_content_subtag_click_emb,
               user_click_tags_id_emb_avg, user_click_subtags_id_emb_avg,
               user_click_title_feature], axis=1)
    env_emb_feature = layers.concat(
        input=[week_emb, hour_emb], axis=1)

    combined_features = layers.concat(input=[
        user_emb_feature, content_emb_feature, cross_emb_feature, env_emb_feature], axis=1)

    fc1 = layers.fc(input=combined_features, size=200, act='relu', param_attr='fc1', bias_attr='fc1_b')
    fc2 = layers.fc(input=fc1, size=200, act="relu", param_attr='fc2', bias_attr='fc2_b')
    fc3 = layers.fc(input=fc2, size=200, act="relu", param_attr='fc3', bias_attr='fc3_b')

    content_pctr_discrete_id_one_hot = layers.one_hot(
        content_pctr_discrete_id, 55, allow_out_of_range=False)

    final_layer = layers.concat(input=[fc3, content_pctr, content_pctr_discrete_id_one_hot], axis=1)
    predict = layers.fc(
        input=final_layer, size=2, act="softmax",
        param_attr='final_predict', bias_attr='final_predict_b')

    # fluid.layers.auc returns (global auc, batch auc, state variables)
    auc_var, batch_auc_var, auc_states = fluid.layers.auc(
        input=predict, label=label, num_thresholds=2 ** 12)
    cost = layers.cross_entropy(input=predict, label=label)
    avg_cost = layers.reduce_mean(cost)

    loader = fluid.io.DataLoader.from_generator(
        feed_list=load_list, capacity=256, use_double_buffer=True, iterable=True)

    return {'predict': predict, 'avg_cost': avg_cost, 'feed_order': feed_order, 'loader': loader, 'auc': auc_var}
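
A minimal training-loop sketch using the returned handles. batch_reader is a hypothetical batched sample reader matching feed_order; the source repo supplies its own.

model_dict = model()
fluid.optimizer.Adam(learning_rate=1e-3).minimize(model_dict['avg_cost'])

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

loader = model_dict['loader']
loader.set_sample_list_generator(batch_reader, places=place)  # hypothetical reader

for pass_id in range(2):
    for batch in loader():
        loss_val, auc_val = exe.run(feed=batch,
                                    fetch_list=[model_dict['avg_cost'],
                                                model_dict['auc']])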