Example #1
def create_emb(vocab_name, emb_name=None):
  if emb_name is None:
    emb_name = vocab_name
  
  vs = gezi.get('vocab_sizes')
  
  # Note: keras.layers.Embedding is not used here, because trainable=False has no effect unless a keras optimizer is used
  # /juypter/dump-doc-lookup.ipynb
  # melt maintains the variable itself (verified to work) and also supports pretrain mapping and partially trainable embeddings
  # https://github.com/keras-team/keras/issues/4952
  Embedding = mt.layers.PrEmbedding
  embeddings_initializer = 'uniform'
  pretrain_path = None
  trainable = True
  
  if vocab_name == 'uid':
    trainable = FLAGS.train_uid_emb

  if vocab_name == 'did':
    trainable = FLAGS.train_did_emb

  kwargs = {}

  def _load_emb(name, path, trainable):
    emb = np.load(path)
    logging.info(name, emb.shape, vs[name][0])
    assert emb.shape[0] == vs[name][0], f'{emb.shape[0]} {vs[name][0]}'
    # emb = emb[:vs[name][0]]
    # embeddings_initializer = tf.constant_initializer(emb)
    embeddings_initializer = emb
    pretrain_path = path
    # vs[name][0] = emb.shape[0]  # change here to reset if the vocab size is larger than the emb height
    # This is expected behavior. trainable=False is a keras concept; it doesn't automatically override the graph and op behavior of tf 1.x. 
    # In order to do what you want, you should use the new style tf.keras.optimizers optimizers (or GradientTape) 
    # which accept a list of variables to differentiate with respect to and the .trainable_weights attribute of Layers and Models which will filter based on .trainable.
    # TODO: PrEmbedding seems slow, so just use Embedding; however, with Embedding in non-keras mode the keras optimizer path is bypassed and trainable=False has no effect
    Embedding = mt.layers.PrEmbedding
    kwargs['base_dim'] = emb.shape[1]
    # Embedding = keras.layers.Embedding
    return Embedding, embeddings_initializer, pretrain_path, trainable

  if FLAGS.use_entity_pretrain and vocab_name == 'entity':
    Embedding, embeddings_initializer, pretrain_path, trainable = _load_emb('entity', FLAGS.entity_pretrain, FLAGS.train_entity_emb)
  if FLAGS.use_word_pretrain and vocab_name == 'word':
    Embedding, embeddings_initializer, pretrain_path, trainable = _load_emb('word', FLAGS.word_pretrain, FLAGS.train_word_emb)
  if FLAGS.use_did_pretrain and vocab_name == 'did':
    Embedding, embeddings_initializer, pretrain_path, trainable = _load_emb('did', FLAGS.did_pretrain, FLAGS.train_did_emb)

  emb_height = vs[vocab_name][0] if not FLAGS.slim_emb_height else vs[vocab_name][1]
  if vocab_name == 'uid': 
    emb_height = vs[vocab_name][1]

  logging.info(vocab_name, vs[vocab_name][0], vs[vocab_name][1], f'({emb_height}, {FLAGS.emb_size})', pretrain_path, embeddings_initializer, trainable)
  # TODO: with keras Embedding the layer name is fine, but PrEmbedding duplicates one scope level, e.g. cat_emb/cat_emb
  return Embedding(emb_height, FLAGS.emb_size, 
                   embeddings_initializer=embeddings_initializer, 
                   trainable=trainable, 
                   train_size=vs[vocab_name][1],
                   name=f'{emb_name}_emb',
                   **kwargs)
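
The comments in _load_emb above point at a Keras/TF1 pitfall: trainable=False is only enforced when optimization goes through the Keras machinery (tf.keras optimizers, or a GradientTape over .trainable_weights). A minimal, self-contained sketch of that point, separate from the melt/PrEmbedding code itself:

import tensorflow as tf

emb = tf.keras.layers.Embedding(1000, 32, trainable=False)  # frozen table
dense = tf.keras.layers.Dense(1)
opt = tf.keras.optimizers.Adam()

x = tf.constant([[1, 2, 3]])
y = tf.constant([[0.5]])
with tf.GradientTape() as tape:
    h = tf.reduce_mean(emb(x), axis=1)              # [1, 32]
    loss = tf.reduce_mean(tf.square(dense(h) - y))

# .trainable_weights filters by .trainable, so the frozen embedding contributes
# no variables here and is never updated.
variables = emb.trainable_weights + dense.trainable_weights
grads = tape.gradient(loss, variables)
opt.apply_gradients(zip(grads, variables))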
Example #2
def create_emb(vocab_name, emb_name=None):
  if emb_name is None:
    emb_name = vocab_name
  
  vs = gezi.get('vocab_sizes')
  
  # Embedding = melt.layers.VEmbedding if FLAGS.use_vocab_emb else keras.layers.Embedding
  Embedding = melt.layers.PrEmbedding
  embeddings_initializer = 'uniform'
  trainable = True
  
  if vocab_name == 'uid':
    trainable = FLAGS.train_uid_emb

  if vocab_name == 'did':
    trainable = FLAGS.train_did_emb

  kwargs = {}

  def _load_emb(name, path, trainable):
    emb = np.load(path)
    logging.info(name, emb.shape, vs[name][0])
    assert emb.shape[0] == vs[name][0], f'{emb.shape[0]} {vs[name][0]}'
    # emb = emb[:vs[name][0]]
    embeddings_initializer=tf.constant_initializer(emb)
    # vs[name][0] = emb.shape[0]  # change here to reset if the vocab size is larger than the emb height
    # This is expected behavior. trainable=False is a keras concept; it doesn't automatically override the graph and op behavior of tf 1.x. 
    # In order to do what you want, you should use the new style tf.keras.optimizers optimizers (or GradientTape) 
    # which accept a list of variables to differentiate with respect to and the .trainable_weights attribute of Layers and Models which will filter based on .trainable.
    # TODO: PrEmbedding seems slow, so just use Embedding; however, with Embedding in non-keras mode the keras optimizer path is bypassed and trainable=False has no effect
    Embedding = melt.layers.PrEmbedding
    kwargs['base_dim'] = emb.shape[1]
    # Embedding = keras.layers.Embedding
    return Embedding, embeddings_initializer, trainable

  if FLAGS.use_entity_pretrain and vocab_name == 'entity':
    Embedding, embeddings_initializer, trainable = _load_emb('entity', FLAGS.entity_pretrain, FLAGS.train_entity_emb)
  if FLAGS.use_word_pretrain and vocab_name == 'word':
    Embedding, embeddings_initializer, trainable = _load_emb('word', FLAGS.word_pretrain, FLAGS.train_word_emb)
  if FLAGS.use_did_pretrain and vocab_name == 'did':
    Embedding, embeddings_initializer, trainable = _load_emb('did', FLAGS.did_pretrain, FLAGS.train_did_emb)

  emb_height = vs[vocab_name][0] if not FLAGS.slim_emb_height else vs[vocab_name][1]
  logging.info(vocab_name, vs[vocab_name][0], vs[vocab_name][1], emb_height, FLAGS.emb_size, embeddings_initializer, trainable)
  return Embedding(emb_height, FLAGS.emb_size, 
              embeddings_initializer=embeddings_initializer, 
              trainable=trainable, 
              name=f'{emb_name}_emb',
              **kwargs)
Example #3
def adjust(features, subset):
    if 'hist_len' not in features:
        try:
            features['hist_len'] = mt.length(features['history'])
        except Exception:
            features['hist_len'] = tf.ones_like(features['did'])

    if FLAGS.max_history:
        for key in features:
            if 'history' in key:
                max_history = FLAGS.max_history
                if 'enti' in key:
                    max_history *= 2
                if not FLAGS.fixed_pad:
                    features[key] = features[key][:, :max_history]
                else:
                    features[key] = mt.pad(features[key], max_history)

    # Note: news-side information is looked up by nid; did is only used as an id feature and may be masked
    features['ori_did'] = features['did']
    features['ori_history'] = features['history']
    if 'impressions' in features:
        features['ori_impressions'] = features['impressions']

    features['did'] = mask_dids(features['did'], features['did_in_train'],
                                subset, FLAGS.test_all_mask)

    features['uid'] = mask_uids(features['uid'], subset == 'train')

    if 'history' in features:
        features['history'] = unk_aug(features['history'], subset == 'train')

    mask_negative_weights(features, subset == 'train')

    vs = gezi.get('vocab_sizes')
    if FLAGS.min_count_unk and FLAGS.min_count:
        features['uid'] = get_id(features['uid'], vs['uid'][1])
        features['did'] = get_id(features['did'], vs['did'][1])
        if FLAGS.mask_history:
            features['history'] = get_id(features['history'], vs['did'][1])
        if 'impressions' in features:
            features['impressions'] = get_id(features['impressions'],
                                             vs['did'][1])

    if vs['uid'][1] < vs['uid'][0]:
        features['uid'] = get_id(features['uid'], vs['uid'][1])

    return features
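
get_id is not shown in these examples; the calls above suggest it remaps ids at or beyond the trainable vocab size (vs[...][1]) to a single UNK id. A hedged, illustrative sketch under that assumption (the name, the UNK id value, and the exact behavior are guesses, not the source implementation):

import tensorflow as tf

def get_id(ids, limit, unk_id=1):
    # Illustrative only: ids below `limit` pass through, anything else
    # collapses to `unk_id`.
    return tf.where(ids < limit, ids, tf.ones_like(ids) * unk_id)

ids = tf.constant([[3, 120, 7]], dtype=tf.int64)
print(get_id(ids, limit=100))  # [[3, 1, 7]]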
Example #4
def adjust(features, subset):
  if 'hist_len' not in features:
    try:
      features['hist_len'] = melt.length(features['history'])
    except Exception:
      features['hist_len'] = tf.ones_like(features['did'])

  if FLAGS.max_history:
    for key in features:
      if key.startswith('history'):
        max_history = FLAGS.max_history
        if 'entity' in key:
          max_history *= 2
        features[key] = features[key][:,:max_history]

  # Note: news-side information is looked up by nid; did is only used as an id feature and may be masked
  features['ori_did'] = features['did'] 
  features['ori_history'] = features['history']
  if 'impressions' in features:
    features['ori_impressions'] = features['impressions']

  features['did'] = mask_dids(features['did'], features['did_in_train'],
                              subset, FLAGS.test_all_mask)
  features['uid'] = mask_uids(features['uid'], subset=='train')

  try:
    features['history'] = unk_aug(features['history'], subset=='train')
  except Exception:
    pass
  mask_negative_weights(features, subset=='train')

  if FLAGS.min_count_unk and FLAGS.min_count:
    vs = gezi.get('vocab_sizes')
    features['uid'] = get_id(features['uid'], vs['uid'][1])
    features['did'] = get_id(features['did'], vs['did'][1])
    if FLAGS.mask_history:
      features['history'] = get_id(features['history'], vs['did'][1])
    if 'impressions' in features:
      features['impressions'] = get_id(features['impressions'], vs['did'][1])

  return features
Example #5
    def __init__(self):
        super(Model, self).__init__()
        self.mode = 'train'

        self.input_ = {}

        def _emb(vocab_name, emb_name=None):
            return util.create_emb(vocab_name, emb_name)

        self.uemb = _emb('uid')
        self.demb = _emb('did')

        self.cat_emb = _emb('cat')
        self.scat_emb = _emb('sub_cat')
        self.entity_emb = _emb('entity')
        self.entity_type_emb = _emb('entity_type')
        self.word_emb = _emb('word')

        self.hour_emb = Embedding(24, FLAGS.emb_size, name='hour_emb')
        self.weekday_emb = Embedding(7, FLAGS.emb_size, name='weekday_emb')
        self.fresh_hour_emb = Embedding(300,
                                        FLAGS.emb_size,
                                        name='fresh_hour_emb')  # 7 * 24
        self.fresh_day_emb = Embedding(50,
                                       FLAGS.emb_size,
                                       name='fresh_day_emb')
        self.position_emb = Embedding(300, FLAGS.emb_size, name='position_emb')

        self.title_lookup = melt.layers.LookupArray(FLAGS.title_lookup)
        self.doc_lookup = melt.layers.LookupArray(FLAGS.doc_lookup)

        self.title_encoder = TitleEncoder(self.word_emb)
        self.titles_encoder = TitlesEncoder(self.title_encoder)

        self.sum_pooling = melt.layers.SumPooling()
        self.mean_pooling = melt.layers.MeanPooling()
        self.pooling = melt.layers.Pooling(FLAGS.pooling)

        self.feat_pooling = melt.layers.Pooling(FLAGS.feat_pooling)
        self.his_simple_pooling = melt.layers.Pooling(FLAGS.his_simple_pooling)

        self.dense = Dense(
            1) if not FLAGS.use_multi_dropout else melt.layers.MultiDropout(
                1, drop_rate=0.3)
        self.batch_norm = BatchNormalization()
        self.dropout = keras.layers.Dropout(FLAGS.dropout)
        # --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1"
        activation = FLAGS.activation
        mlp_dims = [FLAGS.emb_size *
                    2, FLAGS.emb_size] if not FLAGS.big_mlp else [
                        FLAGS.emb_size * 4, FLAGS.emb_size * 2, FLAGS.emb_size
                    ]
        self.dense_mlp = melt.layers.MLP(mlp_dims,
                                         activation=activation,
                                         drop_rate=FLAGS.mlp_dropout,
                                         name='dense_mlp')

        mlp_dims = [512, 256, 64] if not FLAGS.big_mlp else [1024, 512, 256]
        self.mlp = melt.layers.MLP(mlp_dims,
                                   activation=activation,
                                   drop_rate=FLAGS.mlp_dropout,
                                   name='mlp')

        self.his_encoder = util.get_encoder(FLAGS.his_encoder)
        self.his_dense = keras.layers.Dense(FLAGS.hidden_size)
        self.his_pooling = util.get_att_pooling(FLAGS.his_pooling)
        self.his_pooling2 = util.get_att_pooling(FLAGS.his_pooling2)
        self.cur_dense = keras.layers.Dense(FLAGS.hidden_size)

        if FLAGS.his_strategy.startswith('bst'):
            self.transformer = melt.layers.transformer.Encoder(
                num_layers=1,
                d_model=FLAGS.hidden_size,
                num_heads=FLAGS.num_heads,
                dff=FLAGS.hidden_size,
                maximum_position_encoding=None,
                activation=FLAGS.transformer_activation,
                rate=FLAGS.transformer_dropout)

        self.fusion = melt.layers.SemanticFusion(drop_rate=0.1)

        if FLAGS.feat_pooling == 'cin':
            from deepctr.layers.interaction import CIN
            self.cin = CIN((
                128,
                128,
            ), 'relu', True, 0, 1024)
            self.feat_pooling = self.cin

        if FLAGS.aux_loss_rate or FLAGS.lm_target:
            vsize = gezi.get('vocab_sizes')['vid'][0]
            # hidden_size = FLAGS.hidden_size if FLAGS.his_encoder in ['lstm', 'gru'] else  int(FLAGS.hidden_size / 2)
            hidden_size = int(FLAGS.hidden_size / 2)
            self.sampled_weight = self.add_weight(
                name='sampled_weight',
                shape=(vsize, hidden_size),
                #initializer = keras.initializers.RandomUniform(minval=-10, maxval=10, seed=None),
                dtype=tf.float32,
                trainable=True)

            self.sampled_bias = self.add_weight(
                name='sampled_bias',
                shape=(vsize, ),
                #initializer = keras.initializers.RandomUniform(minval=-10, maxval=10, seed=None),
                dtype=tf.float32,
                trainable=True)

            self.softmax_loss_function = melt.seq2seq.gen_sampled_softmax_loss_function(
                5,
                vsize,
                weights=self.sampled_weight,
                biases=self.sampled_bias,
                log_uniform_sample=True,
                is_predict=False,
                sample_seed=1234)
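
The sampled_weight / sampled_bias pair created above has the shapes expected by TensorFlow's sampled softmax. gen_sampled_softmax_loss_function belongs to melt and is not shown here, but a rough, hedged sketch of the standard TF call it presumably wraps looks like this (all sizes are illustrative):

import tensorflow as tf

vocab_size, hidden_size, batch = 10000, 64, 4
sampled_weight = tf.Variable(tf.random.normal([vocab_size, hidden_size]))
sampled_bias = tf.Variable(tf.zeros([vocab_size]))

labels = tf.constant([[1], [7], [42], [3]], dtype=tf.int64)   # target ids, [batch, 1]
hidden = tf.random.normal([batch, hidden_size])               # encoder outputs

# Per-example loss over 5 sampled negatives instead of a full-vocab softmax.
loss = tf.nn.sampled_softmax_loss(
    weights=sampled_weight,
    biases=sampled_bias,
    labels=labels,
    inputs=hidden,
    num_sampled=5,
    num_classes=vocab_size)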
Example #6
    def call(self, input):
        # TODO: tf2 keras seems to auto-append a last dim, so this is needed
        mt.try_squeeze_dim(input)

        if not FLAGS.batch_parse:
            util.adjust(input, self.mode)

        self.embs = []
        self.feats = {}

        bs = mt.get_shape(input['did'], 0)

        def _add(feat, name):
            if _is_ok(name):
                self.feats[name] = feat
                self.embs += [feat]

        def _adds(feats, names):
            for feat, name in zip(feats, names):
                _add(feat, name)

        # --------------------------  user
        if FLAGS.use_uid:
            uemb = self.uemb(input['uid'])
            _add(uemb, 'uid')
        # --------------------------  doc
        if FLAGS.use_did:
            demb = self.demb(input['did'])
            _add(demb, 'did')

        # ---------------------------  context
        if 'history' in input:
            hlen = mt.length(input['history'])
            hlen = tf.math.maximum(hlen, 1)

        if FLAGS.use_time_emb:
            _add(self.hour_emb(input['hour']), 'hour')
            _add(self.weekday_emb(input['weekday']), 'weekday')

        if FLAGS.use_fresh_emb:
            fresh = input['fresh']
            fresh_day = tf.cast(fresh / (3600 * 12), fresh.dtype)
            fresh_hour = tf.cast(fresh / 3600, fresh.dtype)

            _add(self.fresh_day_emb(fresh_day), 'fresh_day')
            _add(self.fresh_hour_emb(fresh_hour), 'fresh_hour')

        if FLAGS.use_position_emb:
            _add(self.position_emb(input['position']), 'position')

        if FLAGS.use_history:
            dids = input['history']
            if FLAGS.his_strategy == 'bst' or FLAGS.his_pooling == 'mhead':
                mask = tf.cast(tf.equal(dids, 0), dids.dtype)
                dids += mask
                hlen = tf.ones_like(hlen) * 50
            hembs = self.demb(dids)

            his_embs = hembs
            his_embs = self.his_encoder(his_embs, hlen)
            self.his_embs = his_embs

            his_emb = self.his_pooling(demb, his_embs, hlen)

            _add(his_emb, 'his_id')

        # --------------- doc info
        doc_feats = gezi.get('doc_feats')
        doc_feat_lens = gezi.get('doc_feat_lens')
        doc = mt.lookup_feats(input['ori_did'], self.doc_lookup, doc_feats,
                              doc_feat_lens)

        cat = tf.squeeze(doc['cat'], -1)
        sub_cat = tf.squeeze(doc['sub_cat'], -1)

        # title_entities = doc['title_entities']
        # title_entity_types = doc['title_entity_types']
        # abstract_entities = doc['abstract_entities']
        # abstract_entity_types = doc['abstract_entity_types']

        title_entities = input['title_entities']
        title_entity_types = input['title_entity_types']
        abstract_entities = input['abstract_entities']
        abstract_entity_types = input['abstract_entity_types']

        # mt.length: not using it would be slow
        # prev_cat_emb = self.cat_emb(cat)
        # prev_scat_emb = self.scat_emb(cat)
        if _is_ok('cat'):
            cat_emb = self.cat_emb(cat)
            scat_emb = self.scat_emb(sub_cat)
            _adds(
                [
                    # prev_cat_emb,
                    # prev_scat_emb,
                    cat_emb,
                    scat_emb,
                ],
                # ['cat', 'sub_cat', 'title_entity_types', 'abstract_entity_types', 'title_entities', 'abstract_entities']
                [
                    # 'prev_cat', 'prev_scat',
                    'cat',
                    'sub_cat'
                ])

        if _is_ok('enti'):
            title_entities = self.entities_encoder(
                tf.concat([title_entities, title_entity_types], -1))
            abstract_entities = self.entities_encoder(
                tf.concat([abstract_entities, abstract_entity_types], -1))

            _adds(
                [
                    # self.pooling(self.entity_emb(title_entities), mt.length(doc['title_entities'])),
                    # self.pooling(self.entity_type_emb(title_entity_types), mt.length(doc['title_entity_types'])),
                    # self.pooling(self.entity_emb(abstract_entities), mt.length(doc['abstract_entities'])),
                    # self.pooling(self.entity_type_emb(abstract_entity_types), mt.length(doc['abstract_entity_types'])),
                    title_entities,
                    abstract_entities
                ],
                ['title_entities', 'abstract_entities'])

            # _adds(
            #     [
            #       self.his_simple_pooling(self.entity_type_emb(input['history_title_entity_types']), mt.length(input['history_title_entity_types'])),
            #       self.his_simple_pooling(self.entity_type_emb(input['history_abstract_entity_types']), mt.length(input['history_abstract_entity_types']))
            #     ],
            #     ['history_title_entity_merge_types', 'history_abstract_entity_merge_types']
            # )
            input['history_title_entities'] = input[
                'history_title_entities'][:, :FLAGS.max_his_title_entities *
                                          FLAGS.max_lookup_history]
            input['history_title_entity_types'] = input[
                'history_title_entity_types'][:, :FLAGS.
                                              max_his_title_entities *
                                              FLAGS.max_lookup_history]
            input['history_abstract_entities'] = input[
                'history_abstract_entities'][:, :FLAGS.max_his_title_entities *
                                             FLAGS.max_lookup_history]
            input['history_abstract_entity_types'] = input[
                'history_abstract_entity_types'][:, :FLAGS.
                                                 max_his_title_entities *
                                                 FLAGS.max_lookup_history]
            _adds([
                self.his_entity_pooling(
                    title_entities,
                    (self.entity_emb(input['history_title_entities']) +
                     self.entity_type_emb(input['history_title_entity_types'])
                     ), mt.length(input['history_title_entities'])),
                self.his_entity_pooling(
                    abstract_entities,
                    (self.entity_emb(input['history_abstract_entities']) +
                     self.entity_type_emb(
                         input['history_abstract_entity_types'])),
                    mt.length(input['history_abstract_entities']))
            ], ['his_title_merge_entities', 'his_abstract_merge_entities'])

            # --------------- history info
        dids = input['ori_history']
        dids = dids[:, :FLAGS.max_lookup_history]
        hlen = mt.length(input['history'])
        hlen = tf.math.maximum(hlen, 1)

        his = mt.lookup_feats(dids, self.doc_lookup, doc_feats, doc_feat_lens)

        his_cats = his['cat']
        his_cats = tf.squeeze(his_cats, -1)
        his_sub_cats = his['sub_cat']
        his_sub_cats = tf.squeeze(his_sub_cats, -1)

        # his_title_entities = his['title_entities']
        # his_title_entity_types = his['title_entity_types']
        # his_abstract_entities = his['abstract_entities']
        # his_abstract_entity_types = his['abstract_entity_types']

        # his_title_entities = self.his_entities_encoder(tf.concat([his_title_entities, his_title_entity_types], -1),
        #                                                tf.math.minimum(hlen, FLAGS.max_titles), title_entities)
        # his_abstract_entities = self.his_entities_encoder(tf.concat([his_abstract_entities, his_abstract_entity_types], -1),
        #                                                   tf.math.minimum(hlen, FLAGS.max_titles), abstract_entities)

        if _is_ok('cat'):
            # FIXME: flattening directly is currently a problem for mt.length, because everything is 0-padded internally, e.g. 2,3,0,0 1,0,0,0, which loses a lot of information; padding with 1 is one option (v1 did this, at most 1,1)
            # Alternatively an encoder could be used
            _adds(
                [
                    self.his_cat_pooling(self.cat_emb(his_cats),
                                         mt.length(his_cats)),
                    self.his_cat_pooling(self.scat_emb(his_sub_cats),
                                         mt.length(his_sub_cats)),
                    ## For cat, din works worse than att (adding it brings no gain); for title, din beats att, and din is also better for entity
                    # self.his_scat_din_pooling(scat_emb, self.scat_emb(his_sub_cats), mt.length(his_sub_cats)),
                    # his_title_entities,
                    # his_abstract_entities,
                ],
                [
                    'his_cats',
                    'his_sub_cats',
                    #  'history_title_entities', 'history_abstract_entities'
                ])

        if not FLAGS.bert_dir or not FLAGS.bert_only:
            if _is_ok('^cur_title&'):
                cur_title = self.title_encoder(doc['title'])
                his_titles = his['title']
                if FLAGS.max_titles:
                    his_titles = his_titles[:, :FLAGS.max_titles]
                his_title = self.titles_encoder(
                    his_titles, tf.math.minimum(hlen, FLAGS.max_titles),
                    cur_title)
                _adds([cur_title, his_title], ['cur_title', 'his_title'])

            if _is_ok('^abstract&'):
                cur_abstract = self.abstract_encoder(doc['abstract'])
                his_abstracts = his['abstract']
                if FLAGS.max_abstracts:
                    his_abstracts = his_abstracts[:, :FLAGS.max_abstracts]
                his_abstract = self.abstracts_encoder(
                    his_abstracts, tf.math.minimum(hlen, FLAGS.max_abstracts),
                    cur_abstract)
                _adds([cur_abstract, his_abstract],
                      ['cur_abstract', 'his_abstract'])

            if FLAGS.use_body:
                if _is_ok('^body&'):
                    cur_body = self.body_encoder(doc['body'])
                    his_bodies = his['body']
                    if FLAGS.max_bodies:
                        his_bodies = his_bodies[:, :FLAGS.max_bodies]
                    his_body = self.bodies_encoder(
                        his_bodies, tf.math.minimum(hlen, FLAGS.max_bodies),
                        cur_body)
                    _adds([
                        cur_body,
                        his_body,
                    ], ['cur_body', 'his_body'])

        if FLAGS.bert_dir:
            if _is_ok('bert_title'):
                bert_title = self.bert_title_encoder(doc['title_uncased'])
                max_titles = FLAGS.max_bert_titles
                his_bert_title = self.bert_titles_encoder(
                    his['title_uncased'][:, :max_titles],
                    tf.math.minimum(hlen, max_titles), bert_title)
                _adds([
                    bert_title,
                    his_bert_title,
                ], ['bert_title', 'his_bert_title'])
            if _is_ok('bert_abstract') and FLAGS.bert_abstract:
                bert_abstract = self.bert_abstract_encoder(
                    doc['abstract_uncased'])
                max_abstracts = FLAGS.max_bert_abstracts
                his_bert_abstract = self.bert_abstracts_encoder(
                    his['abstract_uncased'][:, :max_abstracts],
                    tf.math.minimum(hlen, max_abstracts), bert_abstract)
                _adds([
                    bert_abstract,
                    his_bert_abstract,
                ], ['bert_abstract', 'his_bert_abstract'])
            if _is_ok('bert_body') and FLAGS.bert_body:
                bert_body = self.bert_body_encoder(doc['body_uncased'])
                max_bodies = FLAGS.max_bert_bodies
                his_bert_body = self.bert_bodies_encoder(
                    his['body_uncased'][:, :max_bodies],
                    tf.math.minimum(hlen, max_bodies), bert_body)
                _adds([
                    bert_body,
                    his_bert_body,
                ], ['bert_body', 'his_bert_body'])

        if FLAGS.use_impression_titles:  # dev +0.4%, but test drops
            his_impression = mt.lookup_feats(input['impressions'],
                                             self.doc_lookup, doc_feats,
                                             doc_feat_lens)
            his_impression_titles = his_impression['title']
            his_impression_title = self.titles_encoder2(
                his_impression_titles, mt.length(input['impressions']),
                cur_title)
            _adds([
                his_impression_title,
            ], ['impression_title'])

        # Using impression ids makes dev and test inconsistent, so the ids are not used directly
        if FLAGS.use_impressions:
            _add(self.mean_pooling(self.demb(input['impressions'])),
                 'impressions')

        if FLAGS.use_dense:
            dense_emb = self.deal_dense(input)
            _add(dense_emb, 'dense')

        embs = self.embs
        # logging.info('-----------embs:', len(embs))
        logging.info(self.feats.keys())
        # logging.debug(self.feats)
        embs = [
            x if len(mt.get_shape(x)) == 2 else tf.squeeze(x, 1) for x in embs
        ]
        embs = tf.stack(embs, axis=1)

        if FLAGS.batch_norm:
            embs = self.batch_norm(embs)

        if FLAGS.l2_normalize_before_pooling:
            x = tf.math.l2_normalize(embs, axis=FLAGS.l2_norm_axis)

        x = self.feat_pooling(embs)

        # if FLAGS.dropout:
        #   x = self.dropout(x)

        if FLAGS.use_dense:
            x = tf.concat([x, dense_emb], axis=1)

        # if FLAGS.use_his_concat:
        #   x = tf.concat([x, his_concat], axis=1)

        x = self.mlp(x)
        self.logit = self.dense(x)

        self.prob = tf.math.sigmoid(self.logit)
        self.impression_id = input['impression_id']
        self.position = input['position']
        self.history_len = input['hist_len']
        self.impression_len = input['impression_len']
        self.input_ = input
        return self.logit
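
For orientation, the end of call() follows a common stack-and-pool pattern: every selected feature becomes a [batch, emb_size] vector, the vectors are stacked to [batch, num_feats, emb_size], pooled, run through the MLP, and squashed to a click probability. A stripped-down, self-contained sketch of that shape flow (the pooling and MLP here are simple placeholders, not melt's layers):

import tensorflow as tf

batch, num_feats, emb_size = 8, 5, 64
embs = [tf.random.normal([batch, emb_size]) for _ in range(num_feats)]

x = tf.stack(embs, axis=1)             # [batch, num_feats, emb_size]
x = tf.reshape(x, [batch, -1])         # simple concat-style pooling
x = tf.keras.layers.Dense(256, activation='relu')(x)
logit = tf.keras.layers.Dense(1)(x)    # [batch, 1]
prob = tf.math.sigmoid(logit)          # click probability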
Example #7
    def __init__(self, **kwargs):
        super(Model, self).__init__(**kwargs)
        self.mode = 'train'

        self.input_ = {}

        def _emb(vocab_name, emb_name=None):
            return util.create_emb(vocab_name, emb_name)

        self.uemb = _emb('uid')
        self.demb = _emb('did')

        self.cat_emb = _emb('cat')
        self.scat_emb = _emb('sub_cat')
        self.entity_emb = _emb('entity')
        self.entity_type_emb = _emb('entity_type')
        if not FLAGS.bert_dir or not FLAGS.bert_only:
            self.word_emb = _emb('word')

        self.hour_emb = Embedding(24, FLAGS.emb_size, name='hour_emb')
        self.weekday_emb = Embedding(7, FLAGS.emb_size, name='weekday_emb')
        self.fresh_hour_emb = Embedding(300,
                                        FLAGS.emb_size,
                                        name='fresh_hour_emb')  # 7 * 24
        self.fresh_day_emb = Embedding(50,
                                       FLAGS.emb_size,
                                       name='fresh_day_emb')
        self.position_emb = Embedding(300, FLAGS.emb_size, name='position_emb')

        # self.title_lookup = mt.layers.LookupArray(FLAGS.title_lookup, name='title_lookup')
        self.doc_lookup = mt.layers.LookupArray(FLAGS.doc_lookup,
                                                name='doc_lookup')

        if _is_ok('enti'):
            self.entities_encoder = Encoders(
                [self.entity_emb, self.entity_type_emb],
                None,
                FLAGS.pooling,
                name='entities_encoder')
            self.his_entities_encoder = SeqsEncoder(
                self.entities_encoder,
                None,
                FLAGS.seqs_pooling,
                name='his_entities_encoder')

        if not FLAGS.bert_dir or not FLAGS.bert_only:
            if _is_ok('^cur_title&') or _is_ok('abstract') or _is_ok('body'):
                if FLAGS.share_words_encoder:
                    words_encoder = Encoder(self.word_emb,
                                            FLAGS.seqs_encoder,
                                            FLAGS.pooling,
                                            name='words_encoder')
                else:
                    words_encoder = None

            if _is_ok('^cur_title&'):
                self.title_encoder = words_encoder or Encoder(
                    self.word_emb,
                    FLAGS.seqs_encoder,
                    FLAGS.pooling,
                    name='title_encoder')
                self.titles_encoder = SeqsEncoder(self.title_encoder,
                                                  FLAGS.seqs_encoder,
                                                  FLAGS.seqs_pooling,
                                                  name='titles_encoder')
                self.titles_encoder2 = SeqsEncoder(self.title_encoder,
                                                   FLAGS.seqs_encoder,
                                                   FLAGS.seqs_pooling2,
                                                   name='titles_encoder2')

            if _is_ok('^abstract&'):
                self.abstract_encoder = words_encoder or Encoder(
                    self.word_emb, FLAGS.seqs_encoder, name='abstract_encoder')
                self.abstracts_encoder = SeqsEncoder(self.abstract_encoder,
                                                     FLAGS.seqs_encoder,
                                                     FLAGS.seqs_pooling,
                                                     name='abstracts_encoder')

            if _is_ok('^body&'):
                self.body_encoder = words_encoder or Encoder(
                    self.word_emb, None, FLAGS.pooling, name='body_encoder')
                self.bodies_encoder = SeqsEncoder(self.body_encoder,
                                                  FLAGS.seqs_encoder,
                                                  FLAGS.seqs_pooling,
                                                  name='bodies_encoder')

        if FLAGS.bert_dir:
            # On TPU a static check is done and bert_encoder fails as having no weights:
            # "Weights for model bert_encoder have not yet been created. Weights are created when the Model is first called on inputs or `build()` is called with an `input_shape`"
            # max_input_length = None if not gezi.get('tpu') else FLAGS.max_bert_input_length
            max_input_length = None
            bert_encoder = mt.models.Bert(
                FLAGS.bert_dir,
                FLAGS.emb_size,
                max_input_length=max_input_length,
                return_sequences=FLAGS.bert_pooling_seqs,
                name='bert_encoder')
            self.bert_title_encoder = bert_encoder
            self.bert_abstract_encoder = bert_encoder
            self.bert_body_encoder = bert_encoder
            if FLAGS.bert_pooling_seqs:
                if FLAGS.share_words_encoder:
                    bert_words_encoder = Encoder(None,
                                                 bert_encoder,
                                                 FLAGS.pooling,
                                                 name='words_encoder')
                else:
                    bert_words_encoder = None

                if _is_ok('bert_title'):
                    self.bert_title_encoder = bert_words_encoder or Encoder(
                        None,
                        bert_encoder,
                        FLAGS.pooling,
                        name='bert_title_encoder')
                if _is_ok('bert_abstract'):
                    self.bert_abstract_encoder = bert_words_encoder or Encoder(
                        None,
                        bert_encoder,
                        FLAGS.pooling,
                        name='bert_abstract_encoder')
                if _is_ok('bert_body'):
                    self.bert_body_encoder = bert_words_encoder or Encoder(
                        None,
                        bert_encoder,
                        FLAGS.pooling,
                        name='bert_body_encoder')

            if _is_ok('bert_title'):
                self.bert_titles_encoder = SeqsEncoder(
                    self.bert_title_encoder,
                    FLAGS.seqs_encoder,
                    FLAGS.seqs_pooling,
                    name='bert_titles_encoder')
            if _is_ok('bert_abstract'):
                self.bert_abstracts_encoder = SeqsEncoder(
                    self.bert_abstract_encoder,
                    FLAGS.seqs_encoder,
                    FLAGS.seqs_pooling,
                    name='bert_abstracts_encoder')
            if _is_ok('bert_body'):
                self.bert_bodies_encoder = SeqsEncoder(
                    self.bert_body_encoder,
                    FLAGS.seqs_encoder,
                    FLAGS.seqs_pooling,
                    name='bert_bodies_encoder')

        self.sum_pooling = mt.layers.SumPooling()
        self.mean_pooling = mt.layers.MeanPooling()
        self.pooling = mt.layers.Pooling(FLAGS.pooling)

        self.feat_pooling = mt.layers.Pooling(FLAGS.feat_pooling,
                                              name='feat_pooling')
        self.his_simple_pooling = mt.layers.Pooling(FLAGS.his_simple_pooling)
        # self.his_entity_pooling = mt.layers.Pooling('att', name='his_entity_pooling')
        self.his_entity_pooling = util.get_att_pooling(
            'din', name='his_entity_pooling')
        self.his_cat_pooling = mt.layers.Pooling('att', name='his_cat_pooling')
        self.his_scat_din_pooling = util.get_att_pooling(
            'din', name='his_scat_din_pooling')

        self.dense = Dense(
            1) if not FLAGS.use_multi_dropout else mt.layers.MultiDropout(
                1, drop_rate=0.3)
        self.batch_norm = BatchNormalization()
        self.dropout = keras.layers.Dropout(FLAGS.dropout)
        # --arch-mlp-bot="13-512-256-64-16" --arch-mlp-top="512-256-1"
        activation = FLAGS.activation
        mlp_dims = [FLAGS.emb_size *
                    2, FLAGS.emb_size] if not FLAGS.big_mlp else [
                        FLAGS.emb_size * 4, FLAGS.emb_size * 2, FLAGS.emb_size
                    ]
        self.dense_mlp = mt.layers.MLP(mlp_dims,
                                       activation=activation,
                                       drop_rate=FLAGS.mlp_dropout,
                                       name='dense_mlp')

        mlp_dims = [512, 256, 64] if not FLAGS.big_mlp else [1024, 512, 256]
        self.mlp = mt.layers.MLP(mlp_dims,
                                 activation=activation,
                                 drop_rate=FLAGS.mlp_dropout,
                                 batch_norm=FLAGS.mlp_bn,
                                 name='mlp')

        self.his_encoder = util.get_encoder(FLAGS.his_encoder)
        self.his_dense = keras.layers.Dense(FLAGS.hidden_size)
        self.his_pooling = util.get_att_pooling(FLAGS.his_pooling)
        self.his_pooling2 = util.get_att_pooling(FLAGS.his_pooling2)
        self.cur_dense = keras.layers.Dense(FLAGS.hidden_size)

        if FLAGS.his_strategy.startswith('bst'):
            self.transformer = mt.layers.transformer.Encoder(
                num_layers=1,
                d_model=FLAGS.hidden_size,
                num_heads=FLAGS.num_heads,
                dff=FLAGS.hidden_size,
                maximum_position_encoding=None,
                activation=FLAGS.transformer_activation,
                rate=FLAGS.transformer_dropout)

        self.fusion = mt.layers.SemanticFusion(drop_rate=0.1)

        if FLAGS.feat_pooling == 'cin':
            from deepctr.layers.interaction import CIN
            self.cin = CIN((
                128,
                128,
            ), 'relu', True, 0, 1024)
            self.feat_pooling = self.cin

        if FLAGS.aux_loss_rate or FLAGS.lm_target:
            vsize = gezi.get('vocab_sizes')['vid'][0]
            # hidden_size = FLAGS.hidden_size if FLAGS.his_encoder in ['lstm', 'gru'] else  int(FLAGS.hidden_size / 2)
            hidden_size = int(FLAGS.hidden_size / 2)
            self.sampled_weight = self.add_weight(
                name='sampled_weight',
                shape=(vsize, hidden_size),
                # initializer = keras.initializers.RandomUniform(minval=-10, maxval=10, seed=None),
                dtype=tf.float32,
                trainable=True)

            self.sampled_bias = self.add_weight(
                name='sampled_bias',
                shape=(vsize, ),
                # initializer = keras.initializers.RandomUniform(minval=-10, maxval=10, seed=None),
                dtype=tf.float32,
                trainable=True)

            self.softmax_loss_function = mt.seq2seq.gen_sampled_softmax_loss_function(
                5,
                vsize,
                weights=self.sampled_weight,
                biases=self.sampled_bias,
                log_uniform_sample=True,
                is_predict=False,
                sample_seed=1234)