def __init__(self):
    super(QCAttention, self).__init__()
    vocabulary.init()
    vocab_size = vocabulary.get_vocab_size()

    self.embedding = wenzheng.Embedding(vocab_size,
                                        FLAGS.emb_dim,
                                        FLAGS.word_embedding_file,
                                        trainable=FLAGS.finetune_word_embedding,
                                        vocab2_size=FLAGS.unk_vocab_size)

    self.num_layers = FLAGS.num_layers
    self.num_units = FLAGS.rnn_hidden_size
    self.keep_prob = FLAGS.keep_prob

    self.encode = wenzheng.Encoder(FLAGS.encoder_type)

    self.att_encode = melt.layers.CudnnRnn(num_layers=1,
                                           num_units=self.num_units,
                                           keep_prob=self.keep_prob)
    self.att_dot_attention = melt.layers.DotAttention(hidden=self.num_units,
                                                      keep_prob=self.keep_prob,
                                                      combiner=FLAGS.att_combiner)

    self.pooling = melt.layers.MaxPooling()

    self.logits = keras.layers.Dense(NUM_CLASSES, activation=None)
    self.logits2 = keras.layers.Dense(NUM_CLASSES, activation=None)

def __init__(self):
    super(Model2, self).__init__()
    vocabulary.init()
    vocab_size = vocabulary.get_vocab_size()

    ## adadelta / adagrad would need the embedding on cpu, so just use adam..
    #with tf.device('/cpu:0'):
    self.embedding = wenzheng.Embedding(vocab_size,
                                        FLAGS.emb_dim,
                                        FLAGS.word_embedding_file,
                                        trainable=FLAGS.finetune_word_embedding,
                                        vocab2_size=FLAGS.unk_vocab_size)

    self.num_layers = FLAGS.num_layers
    self.num_units = FLAGS.rnn_hidden_size
    self.keep_prob = FLAGS.keep_prob

    self.encode = melt.layers.CudnnRnn2(num_layers=self.num_layers,
                                        num_units=self.num_units,
                                        keep_prob=self.keep_prob)

    self.pooling = melt.layers.MaxPooling2()
    #self.pooling = keras.layers.GlobalMaxPool1D()

    self.logits = keras.layers.Dense(NUM_CLASSES, activation=None)
    self.logits2 = keras.layers.Dense(NUM_CLASSES, activation=None)

def get_embedding(name='emb', height=None, emb_dim=None, trainable=True):
    emb_dim = emb_dim or FLAGS.emb_dim
    if height is None:
        vocabulary.init()
        height = vocabulary.get_vocab_size()

    # the google transformer uses the initializer below
    #initializer=tf.random_normal_initializer(0., self.hidden_size ** -0.5)
    # squad uses np.random.normal(scale=0.01)
    if FLAGS.emb_init == 'uniform':
        init_width = 0.5 / emb_dim
        emb = melt.variable.get_weights_uniform(name, [height, emb_dim],
                                                -init_width, init_width,
                                                trainable=trainable)
        logging.info('emb random_uniform init with width', init_width)
    elif FLAGS.emb_init == 'normal' or FLAGS.emb_init == 'random':
        stddev = FLAGS.emb_stddev or emb_dim ** -0.5
        logging.info('emb random_normal init with stddev', stddev)
        emb = melt.variable.get_weights_random(name, [height, emb_dim], stddev,
                                               trainable=trainable)
    else:
        raise ValueError(FLAGS.emb_init)
    # go back to the truncated-normal init below if the above does not work better
    #emb = melt.variable.get_weights_truncated(name, [vocab_size, emb_dim], stddev=FLAGS.weight_stddev)
    return emb

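# A quick numeric sketch (illustration only, not part of the model code) of the two init
# scales chosen above, assuming emb_dim = 300: the uniform scheme draws from a very narrow
# range, while the normal scheme's default stddev follows the 1/sqrt(emb_dim) convention.
_emb_dim = 300
_init_width = 0.5 / _emb_dim     # uniform range: [-0.00167, 0.00167]
_stddev = _emb_dim ** -0.5       # normal stddev: ~0.0577
print(_init_width, _stddev)
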
def __init__(self):
    super(Model, self).__init__()
    vocabulary.init()
    vocab_size = vocabulary.get_vocab_size()

    ## adadelta / adagrad would need the embedding on cpu, so just use adam..
    #with tf.device('/cpu:0'):
    self.embedding = wenzheng.Embedding(vocab_size,
                                        FLAGS.emb_dim,
                                        FLAGS.word_embedding_file,
                                        trainable=FLAGS.finetune_word_embedding,
                                        vocab2_size=FLAGS.unk_vocab_size)

    self.num_layers = FLAGS.num_layers
    self.num_units = FLAGS.rnn_hidden_size
    self.keep_prob = FLAGS.keep_prob

    self.encode = wenzheng.Encoder(FLAGS.encoder_type)

    self.pooling = melt.layers.Pooling(FLAGS.encoder_output_method,
                                       top_k=FLAGS.top_k,
                                       att_activation=getattr(tf.nn, FLAGS.att_activation))

    self.logits = keras.layers.Dense(NUM_CLASSES)
    self.logits2 = keras.layers.Dense(NUM_CLASSES)

def __init__(self):
    super(MwAN, self).__init__()
    vocabulary.init()
    vocab_size = vocabulary.get_vocab_size()
    embedding_size = FLAGS.emb_dim
    encoder_size = FLAGS.rnn_hidden_size

    self.dropout = nn.Dropout(p=(1 - FLAGS.keep_prob))
    self.embedding = wenzheng.pyt.get_embedding(vocab_size,
                                                embedding_size,
                                                FLAGS.word_embedding_file,
                                                FLAGS.finetune_word_embedding)

    self.q_encoder = nn.GRU(input_size=embedding_size,
                            hidden_size=encoder_size,
                            batch_first=True,
                            bidirectional=True)
    self.p_encoder = nn.GRU(input_size=embedding_size,
                            hidden_size=encoder_size,
                            batch_first=True,
                            bidirectional=True)
    self.a_encoder = nn.GRU(input_size=embedding_size,
                            hidden_size=int(embedding_size / 2),
                            batch_first=True,
                            bidirectional=True)
    self.a_attention = nn.Linear(embedding_size, 1, bias=False)

    # Concat Attention
    self.Wc1 = nn.Linear(2 * encoder_size, encoder_size, bias=False)
    self.Wc2 = nn.Linear(2 * encoder_size, encoder_size, bias=False)
    self.vc = nn.Linear(encoder_size, 1, bias=False)
    # Bilinear Attention
    self.Wb = nn.Linear(2 * encoder_size, 2 * encoder_size, bias=False)
    # Dot Attention
    self.Wd = nn.Linear(2 * encoder_size, encoder_size, bias=False)
    self.vd = nn.Linear(encoder_size, 1, bias=False)
    # Minus Attention
    self.Wm = nn.Linear(2 * encoder_size, encoder_size, bias=False)
    self.vm = nn.Linear(encoder_size, 1, bias=False)

    self.Ws = nn.Linear(2 * encoder_size, encoder_size, bias=False)
    self.vs = nn.Linear(encoder_size, 1, bias=False)

    self.gru_agg = nn.GRU(12 * encoder_size,
                          encoder_size,
                          batch_first=True,
                          bidirectional=True)

    """ prediction layer """
    self.Wq = nn.Linear(2 * encoder_size, encoder_size, bias=False)
    self.vq = nn.Linear(encoder_size, 1, bias=False)
    self.Wp1 = nn.Linear(2 * encoder_size, encoder_size, bias=False)
    self.Wp2 = nn.Linear(2 * encoder_size, encoder_size, bias=False)
    self.vp = nn.Linear(encoder_size, 1, bias=False)
    self.prediction = nn.Linear(2 * encoder_size, embedding_size, bias=False)

    self.logits = nn.Linear(3, 3)
    self.logits2 = nn.Linear(3, 3)

    self.initiation()

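# A hedged sketch (assumed, not from this file) of how the Concat Attention parameters above
# are typically used in an MwAN forward pass: score s_jt = vc^T tanh(Wc1 hq_j + Wc2 hp_t),
# softmax over question positions, then a weighted sum of question states. The tensor names
# and shapes are illustrative assumptions.
import torch
import torch.nn.functional as F

def concat_attention(model, hq, hp):
    # hq: (batch, q_len, 2 * encoder_size), hp: (batch, p_len, 2 * encoder_size)
    _s1 = model.Wc1(hq).unsqueeze(1)                    # (batch, 1, q_len, encoder_size)
    _s2 = model.Wc2(hp).unsqueeze(2)                    # (batch, p_len, 1, encoder_size)
    sjt = model.vc(torch.tanh(_s1 + _s2)).squeeze(-1)   # (batch, p_len, q_len)
    ait = F.softmax(sjt, dim=2)                         # attention over question positions
    return ait.bmm(hq)                                  # (batch, p_len, 2 * encoder_size)
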
def __init__(self, embedding=None):
    super(Fastai, self).__init__(embedding)
    vocabulary.init()
    vocab_size = vocabulary.get_vocab_size()
    emb_dim = FLAGS.emb_dim

    self.num_classes = NUM_CLASSES
    self.model = lele.fastai.text.classifier(vocab_size,
                                             NUM_ATTRIBUTES * self.num_classes,
                                             emb_sz=emb_dim,
                                             nl=FLAGS.num_layers,
                                             embedding_weight=FLAGS.word_embedding_file)

def __init__(self):
    super(Gru, self).__init__()
    vocabulary.init()
    vocab_size = vocabulary.get_vocab_size()
    emb_dim = FLAGS.emb_dim

    self.embedding = wenzheng.pyt.get_embedding(vocab_size,
                                                emb_dim,
                                                FLAGS.word_embedding_file,
                                                FLAGS.finetune_word_embedding)

    self.num_layers = FLAGS.num_layers
    self.num_units = FLAGS.rnn_hidden_size
    self.dropout = nn.Dropout(p=(1 - FLAGS.keep_prob))

    #self.encode = nn.GRU(input_size=emb_dim, hidden_size=self.num_units, batch_first=True, bidirectional=True)
    self.encode = lele.layers.StackedBRNN(
        input_size=emb_dim,
        hidden_size=self.num_units,
        num_layers=self.num_layers,
        dropout_rate=1 - FLAGS.keep_prob,
        dropout_output=False,
        concat_layers=False,
        rnn_type=nn.GRU,
        padding=FLAGS.rnn_padding,
    )

    ## supports mask
    #self.pooling = lele.layers.MaxPooling()
    self.pooling = lele.layers.Pooling(FLAGS.encoder_output_method,
                                       input_size=2 * self.num_units,
                                       top_k=FLAGS.top_k,
                                       att_activation=getattr(F, FLAGS.att_activation))

    # the input dim is not as convenient to get as in tf..
    pre_logits_dim = self.pooling.output_size

    if FLAGS.use_type:
        pre_logits_dim += 1

    num_types = 2
    if FLAGS.use_type_emb:
        type_emb_dim = 10
        self.type_embedding = nn.Embedding(num_types, type_emb_dim)
        pre_logits_dim += type_emb_dim

    if FLAGS.use_type_rnn:
        self.type_embedding = nn.Embedding(num_types, emb_dim)

    self.logits = nn.Linear(pre_logits_dim, NUM_CLASSES)

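# A hedged sketch (assumed, not from this file) of how the pre_logits_dim bookkeeping above
# would line up with feature concatenation in forward(): `pooled` is the Pooling output and
# `type_ids` an assumed per-example type-id tensor.
import torch

def build_logits(model, pooled, type_ids):
    feats = [pooled]
    if FLAGS.use_type:
        feats.append(type_ids.float().unsqueeze(-1))     # +1 scalar feature
    if FLAGS.use_type_emb:
        feats.append(model.type_embedding(type_ids))     # +type_emb_dim features
    pre_logits = torch.cat(feats, dim=-1)                # width matches pre_logits_dim
    return model.logits(pre_logits)
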
def __init__(self):
    super(Bow, self).__init__()
    vocabulary.init()
    vocab_size = vocabulary.get_vocab_size()
    emb_dim = FLAGS.emb_dim

    self.embedding = wenzheng.pyt.get_embedding(vocab_size,
                                                emb_dim,
                                                FLAGS.word_embedding_file,
                                                FLAGS.finetune_word_embedding)

    self.num_layers = FLAGS.num_layers
    self.num_units = FLAGS.rnn_hidden_size
    self.dropout = nn.Dropout(p=(1 - FLAGS.keep_prob))

    self.encode = nn.GRU(input_size=emb_dim,
                         hidden_size=self.num_units,
                         batch_first=True,
                         bidirectional=True)

    #self.logits = nn.Linear(2 * self.num_units, NUM_CLASSES)
    self.logits = nn.Linear(emb_dim, NUM_CLASSES)

def __init__(self):
    super(Model, self).__init__()
    vocabulary.init()
    vocab_size = vocabulary.get_vocab_size()

    #self.embedding = keras.layers.Embedding(vocab_size, FLAGS.emb_dim)
    #with tf.device('/cpu:0'):
    self.embedding = wenzheng.utils.Embedding(vocab_size,
                                              FLAGS.emb_dim,
                                              FLAGS.word_embedding_file,
                                              trainable=FLAGS.finetune_word_embedding)

    #self.encode = MyLayer()
    self.num_layers = 1
    self.num_units = FLAGS.rnn_hidden_size
    self.keep_prob = 0.7

    self.encode = melt.layers.CudnnRnn(num_layers=self.num_layers,
                                       num_units=self.num_units,
                                       keep_prob=self.keep_prob)
    #self.encode = keras.layers.CuDNNGRU(units=FLAGS.rnn_hidden_size,
    ##self.encode = keras.layers.CuDNNLSTM(units=FLAGS.rnn_hidden_size,
    #                                    return_sequences=True,
    #                                    return_state=False,
    #                                    recurrent_initializer='glorot_uniform')
    #self.encode = keras.layers.GRU(units=FLAGS.rnn_hidden_size,
    #                               return_sequences=True,
    #                               return_state=False,
    #                               recurrent_activation='sigmoid',
    #                               recurrent_initializer='glorot_uniform')

    #self.pooling = keras.layers.GlobalMaxPool1D()
    self.pooling = melt.layers.MaxPooling()

    self.logits = keras.layers.Dense(NUM_CLASSES, activation=None)
    self.temp = MyModel()

def __init__(self, args=None):
    super(Rnet, self).__init__()

    # Store config
    if args is None:
        args = FLAGS
    self.args = args

    vocabulary.init()
    vocab_size = vocabulary.get_vocab_size()

    # Word embeddings (index 0 is reserved for padding)
    self.embedding = nn.Embedding(vocab_size, args.emb_dim, padding_idx=0)
    if FLAGS.word_embedding_file:
        self.embedding.weight.data.copy_(
            torch.from_numpy(np.load(FLAGS.word_embedding_file)))
        if not FLAGS.finetune_word_embedding:
            self.embedding.weight.requires_grad = False

    doc_input_size = args.emb_dim

    # Encoder
    self.encode_rnn = layers.StackedBRNN(
        input_size=doc_input_size,
        hidden_size=args.rnn_hidden_size,
        num_layers=args.num_layers,
        dropout_rate=1 - args.keep_prob,
        dropout_output=False,
        concat_layers=True,
        rnn_type=self.RNN_TYPES['gru'],
        padding=False,
    )

    # Output sizes of rnn encoder
    doc_hidden_size = 2 * args.rnn_hidden_size
    question_hidden_size = 2 * args.rnn_hidden_size
    #if args.concat_rnn_layers:
    doc_hidden_size *= args.num_layers
    question_hidden_size *= args.num_layers

    # Gated-attention-based RNN over the whole question
    self.question_attn = layers.SeqAttnMatch(question_hidden_size, identity=False)
    self.question_attn_gate = layers.Gate(doc_hidden_size + question_hidden_size)
    self.question_attn_rnn = layers.StackedBRNN(
        input_size=doc_hidden_size + question_hidden_size,
        hidden_size=args.rnn_hidden_size,
        num_layers=1,
        dropout_rate=1 - args.keep_prob,
        dropout_output=False,
        concat_layers=False,
        rnn_type=self.RNN_TYPES['gru'],
        padding=False,
    )
    question_attn_hidden_size = 2 * args.rnn_hidden_size

    # Self-matching-attention-based RNN over the whole doc
    self.doc_self_attn = layers.SelfAttnMatch(question_attn_hidden_size,
                                              identity=False)
    self.doc_self_attn_gate = layers.Gate(question_attn_hidden_size +
                                          question_attn_hidden_size)
    self.doc_self_attn_rnn = layers.StackedBRNN(
        input_size=question_attn_hidden_size + question_attn_hidden_size,
        hidden_size=args.rnn_hidden_size,
        num_layers=1,
        dropout_rate=1 - args.keep_prob,
        dropout_output=False,
        concat_layers=False,
        rnn_type=self.RNN_TYPES['gru'],
        padding=False,
    )
    doc_self_attn_hidden_size = 2 * args.rnn_hidden_size

    self.doc_self_attn_rnn2 = layers.StackedBRNN(
        input_size=doc_self_attn_hidden_size,
        hidden_size=args.rnn_hidden_size,
        num_layers=1,
        dropout_rate=1 - args.keep_prob,
        dropout_output=False,
        concat_layers=False,
        rnn_type=self.RNN_TYPES['gru'],
        padding=False,
    )

    self.logits = nn.Linear(2 * args.rnn_hidden_size, NUM_CLASSES, bias=True)

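# A hedged sketch (assumed, not from this file) of how the gated-attention stage above is
# usually wired in an R-Net style forward(): attend the encoded doc over the encoded question,
# gate the concatenation, then run the attention RNN. `doc`, `question` and the masks are
# illustrative names.
import torch

def question_aware_encode(model, doc, question, doc_mask, question_mask):
    q_aware = model.question_attn(doc, question, question_mask)         # (batch, doc_len, question_hidden_size)
    gated = model.question_attn_gate(torch.cat([doc, q_aware], dim=2))  # gate the concatenated features
    return model.question_attn_rnn(gated, doc_mask)                     # (batch, doc_len, 2 * rnn_hidden_size)
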
def __init__(self):
    super(MnemonicReaderV1, self).__init__()
    vocabulary.init()
    vocab_size = vocabulary.get_vocab_size()

    self.embedding = wenzheng.Embedding(vocab_size,
                                        FLAGS.emb_dim,
                                        FLAGS.word_embedding_file,
                                        trainable=FLAGS.finetune_word_embedding,
                                        vocab2_size=FLAGS.unk_vocab_size,
                                        vocab2_trainable=FLAGS.finetune_unk_vocab)

    self.num_layers = FLAGS.num_layers
    self.num_units = FLAGS.rnn_hidden_size
    self.keep_prob = FLAGS.keep_prob

    logging.info('num_layers:', self.num_layers)
    logging.info('num_units:', self.num_units)
    logging.info('keep_prob:', self.keep_prob)

    self.encode = melt.layers.CudnnRnn(num_layers=self.num_layers,
                                       num_units=self.num_units,
                                       keep_prob=self.keep_prob)

    if FLAGS.use_qc_att or FLAGS.use_bidaf_att:
        assert not (FLAGS.use_qc_att and FLAGS.use_bidaf_att), \
            'use rnet or use bidaf? just choose one!'
        #Attention = melt.layers.DotAttention if FLAGS.use_qc_att else melt.layers.BiDAFAttention
        Attention = melt.layers.SeqAttnMatch if FLAGS.use_qc_att else melt.layers.BiDAFAttention

        # Sharing the att and match attention seems fine (maybe a small improvement?),
        # but just follow squad and use different dot attentions here.
        # NOTICE: for eager-mode checkpoint saving, do not write this as x = [None] * 3,
        # otherwise the variables can not be saved.
        self.att_dot_attentions = []
        self.att_encodes = []
        for _ in range(FLAGS.hop):
            self.att_dot_attentions.append(Attention(hidden=self.num_units,
                                                     keep_prob=self.keep_prob,
                                                     combiner=FLAGS.att_combiner))
            #self.att_dot_attentions.append(Attention(keep_prob=self.keep_prob, combiner=FLAGS.att_combiner, identity=True))
            #self.att_dot_attentions.append(Attention(combiner=FLAGS.att_combiner, identity=True))

            # TODO: unlike layers.Dense, CudnnRnn naming differs between modes: eager mode names
            # these att_encode / match_encode, while graph mode just uses cudnn_rnn, cudnn_rnn_1
            # and ignores name=..
            # TODO: mreader itself does not seem to use att_encode, so check whether it helps.
            self.att_encodes.append(melt.layers.CudnnRnn(num_layers=1,
                                                         num_units=self.num_units,
                                                         keep_prob=self.keep_prob))
        #self.att_encode = melt.layers.CudnnRnn(num_layers=1, num_units=self.num_units, keep_prob=0.5)

    if FLAGS.use_label_emb or FLAGS.use_label_att:
        assert not (FLAGS.use_label_emb and FLAGS.use_label_att)
        self.label_emb_height = NUM_CLASSES if not FLAGS.label_emb_height else FLAGS.label_emb_height
        self.label_embedding = melt.layers.Embedding(self.label_emb_height, FLAGS.emb_dim)
        if not FLAGS.use_label_att:
            # TODO: use an activation here?
            #self.label_dense = keras.layers.Dense(FLAGS.emb_dim, activation=tf.nn.relu)
            self.label_dense = keras.layers.Dense(FLAGS.emb_dim)
        else:
            self.label_att_dot_attention = melt.layers.DotAttention(hidden=self.num_units,
                                                                    keep_prob=self.keep_prob,
                                                                    combiner=FLAGS.att_combiner)
            self.label_att_encode = melt.layers.CudnnRnn(num_layers=1,
                                                         num_units=self.num_units,
                                                         keep_prob=self.keep_prob)
            #self.label_att_encode = melt.layers.CudnnRnn(num_layers=1, num_units=self.num_units, keep_prob=0.5)

    if FLAGS.use_self_match:
        self.match_dot_attentions = []
        self.match_encodes = []
        for _ in range(FLAGS.hop):
            self.match_dot_attentions.append(melt.layers.DotAttention(hidden=self.num_units,
                                                                      keep_prob=self.keep_prob,
                                                                      combiner=FLAGS.att_combiner))
            #self.match_dot_attentions.append(melt.layers.SelfAttnMatch(keep_prob=self.keep_prob, combiner=FLAGS.att_combiner, identity=True, diag=False))
            #self.match_dot_attentions.append(melt.layers.SelfAttnMatch(combiner=FLAGS.att_combiner, identity=True, diag=False))
            self.match_encodes.append(melt.layers.CudnnRnn(num_layers=1,
                                                           num_units=self.num_units,
                                                           keep_prob=self.keep_prob))
        #self.match_encode = melt.layers.CudnnRnn(num_layers=1, num_units=self.num_units, keep_prob=0.5)

    if FLAGS.use_answer_emb:
        self.context_dense = keras.layers.Dense(FLAGS.emb_dim)
        self.answer_dense = keras.layers.Dense(FLAGS.emb_dim)
        # self.context_dense = keras.layers.Dense(FLAGS.emb_dim, activation=tf.nn.relu)
        # self.answer_dense = keras.layers.Dense(FLAGS.emb_dim, activation=tf.nn.relu)

    logging.info('encoder_output_method:', FLAGS.encoder_output_method)
    logging.info('topk:', FLAGS.top_k)
    self.pooling = melt.layers.Pooling(FLAGS.encoder_output_method,
                                       top_k=FLAGS.top_k,
                                       att_activation=getattr(tf.nn, FLAGS.att_activation))

    self.logits = keras.layers.Dense(NUM_CLASSES)
    if FLAGS.split_type:
        self.logits2 = keras.layers.Dense(NUM_CLASSES)

def __init__(self, args=None):
    super(MnemonicReaderV3, self).__init__()
    if args is None:
        args = FLAGS
    # Store config
    self.args = args

    vocabulary.init()
    vocab_size = vocabulary.get_vocab_size()

    self.embedding = wenzheng.pyt.get_embedding(vocab_size,
                                                args.emb_dim,
                                                args.word_embedding_file,
                                                args.finetune_word_embedding)

    doc_input_size = args.emb_dim
    self.dropout_rate = 1 - args.keep_prob
    self.num_layers = 1

    # Encoder
    self.encoding_rnn = layers.CudnnRnn(
        input_size=doc_input_size,
        hidden_size=args.rnn_hidden_size,
        num_layers=1,
        dropout_rate=1 - args.keep_prob,
        dropout_output=False,
        concat_layers=False,
        rnn_type=self.RNN_TYPES['gru'],
        padding=args.rnn_padding,
    )
    doc_hidden_size = 2 * args.rnn_hidden_size

    # Interactive aligning, self aligning and aggregating
    self.interactive_aligners = nn.ModuleList()
    self.interactive_SFUs = nn.ModuleList()
    self.self_aligners = nn.ModuleList()
    self.self_SFUs = nn.ModuleList()
    self.aggregate_rnns = nn.ModuleList()
    for i in range(args.hop):
        # interactive aligner
        self.interactive_aligners.append(
            layers.SeqAttnMatch(doc_hidden_size, identity=True))
        self.interactive_SFUs.append(
            layers.SFU(doc_hidden_size, 3 * doc_hidden_size))
        # self aligner
        self.self_aligners.append(
            layers.SelfAttnMatch(doc_hidden_size, identity=True, diag=False))
        self.self_SFUs.append(
            layers.SFU(doc_hidden_size, 3 * doc_hidden_size))
        # aggregating
        self.aggregate_rnns.append(
            layers.StackedBRNN(
                input_size=doc_hidden_size,
                hidden_size=args.rnn_hidden_size,
                num_layers=1,
                dropout_rate=1 - args.keep_prob,
                dropout_output=False,
                concat_layers=False,
                rnn_type=self.RNN_TYPES['gru'],
                padding=False,
            ))

    self.pooling = lele.layers.Pooling(FLAGS.encoder_output_method,
                                       input_size=2 * args.rnn_hidden_size,
                                       top_k=FLAGS.top_k,
                                       att_activation=getattr(F, FLAGS.att_activation))

    pre_logits_dim = self.pooling.output_size
    if FLAGS.use_type_emb:
        num_types = 2
        type_emb_dim = 10
        self.type_embedding = nn.Embedding(num_types, type_emb_dim)
        pre_logits_dim += type_emb_dim

    self.logits = nn.Linear(pre_logits_dim, NUM_CLASSES)
    self.logits2 = nn.Linear(pre_logits_dim, NUM_CLASSES)

def __init__(self, args=None):
    super(MnemonicReaderV1, self).__init__()
    if args is None:
        args = FLAGS
    # Store config
    self.args = args

    vocabulary.init()
    vocab_size = vocabulary.get_vocab_size()

    self.embedding = wenzheng.pyt.get_embedding(vocab_size,
                                                args.emb_dim,
                                                args.word_embedding_file,
                                                args.finetune_word_embedding)

    doc_input_size = args.emb_dim

    # Encoder
    self.encoding_rnn = layers.StackedBRNN(
        input_size=doc_input_size,
        hidden_size=args.rnn_hidden_size,
        num_layers=1,
        dropout_rate=1 - args.keep_prob,
        dropout_output=False,
        concat_layers=False,
        rnn_type=self.RNN_TYPES['gru'],
        padding=False,
    )
    doc_hidden_size = 2 * args.rnn_hidden_size

    # Interactive aligning, self aligning and aggregating
    self.interactive_aligners = nn.ModuleList()
    self.interactive_SFUs = nn.ModuleList()
    self.self_aligners = nn.ModuleList()
    self.self_SFUs = nn.ModuleList()
    self.aggregate_rnns = nn.ModuleList()
    for i in range(args.hop):
        # interactive aligner
        self.interactive_aligners.append(
            layers.SeqAttnMatch(doc_hidden_size, identity=True))
        self.interactive_SFUs.append(
            layers.SFU(doc_hidden_size, 3 * doc_hidden_size))
        # self aligner
        self.self_aligners.append(
            layers.SelfAttnMatch(doc_hidden_size, identity=True, diag=False))
        self.self_SFUs.append(
            layers.SFU(doc_hidden_size, 3 * doc_hidden_size))
        # aggregating
        self.aggregate_rnns.append(
            layers.StackedBRNN(
                input_size=doc_hidden_size,
                hidden_size=args.rnn_hidden_size,
                num_layers=1,
                dropout_rate=1 - args.keep_prob,
                dropout_output=False,
                concat_layers=False,
                rnn_type=self.RNN_TYPES['gru'],
                padding=False,
            ))

    self.logits = nn.Linear(2 * args.rnn_hidden_size, NUM_CLASSES)
    self.logits2 = nn.Linear(2 * args.rnn_hidden_size, NUM_CLASSES)

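# A hedged sketch (assumed, not from this file) of how one hop of the aligner / SFU / aggregate
# modules defined in the two Mnemonic Reader constructors above is typically applied in forward();
# the 3 * doc_hidden_size SFU fusion input corresponds to [aligned, c * aligned, c - aligned].
# `c`, `q` and the masks are illustrative names for the encoded doc / question states.
import torch

def apply_hops(model, c, q, c_mask, q_mask):
    for i in range(model.args.hop):
        q_tilde = model.interactive_aligners[i](c, q, q_mask)          # align doc to question
        c = model.interactive_SFUs[i](
            c, torch.cat([q_tilde, c * q_tilde, c - q_tilde], dim=2))  # fuse (3 * doc_hidden_size)
        c_tilde = model.self_aligners[i](c, c_mask)                    # self alignment
        c = model.self_SFUs[i](
            c, torch.cat([c_tilde, c * c_tilde, c - c_tilde], dim=2))
        c = model.aggregate_rnns[i](c, c_mask)                         # aggregate with BRNN
    return c
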
def __init__(self):
    super(RNet, self).__init__()
    vocabulary.init()
    vocab_size = vocabulary.get_vocab_size()

    self.embedding = wenzheng.Embedding(vocab_size,
                                        FLAGS.emb_dim,
                                        FLAGS.word_embedding_file,
                                        trainable=FLAGS.finetune_word_embedding,
                                        vocab2_size=FLAGS.unk_vocab_size,
                                        vocab2_trainable=FLAGS.finetune_unk_vocab)

    self.num_layers = FLAGS.num_layers
    self.num_units = FLAGS.rnn_hidden_size
    self.keep_prob = FLAGS.keep_prob

    logging.info('num_layers:', self.num_layers)
    logging.info('num_units:', self.num_units)
    logging.info('keep_prob:', self.keep_prob)

    self.encode = melt.layers.CudnnRnn(num_layers=self.num_layers,
                                       num_units=self.num_units,
                                       keep_prob=self.keep_prob)

    if FLAGS.use_qc_att or FLAGS.use_bidaf_att:
        assert not (FLAGS.use_qc_att and FLAGS.use_bidaf_att), \
            'use rnet or use bidaf? just choose one!'
        Attention = melt.layers.DotAttention if FLAGS.use_qc_att else melt.layers.BiDAFAttention
        # Sharing the att and match attention seems fine (maybe a small improvement?),
        # but just follow squad and use different dot attentions here.
        self.att_dot_attention = Attention(hidden=self.num_units,
                                           keep_prob=self.keep_prob,
                                           combiner=FLAGS.att_combiner)
        # TODO: unlike layers.Dense, CudnnRnn naming differs between modes: eager mode names
        # these att_encode / match_encode, while graph mode just uses cudnn_rnn, cudnn_rnn_1
        # and ignores name=..
        self.att_encode = melt.layers.CudnnRnn(num_layers=1,
                                               num_units=self.num_units,
                                               keep_prob=self.keep_prob)
        #self.att_encode = melt.layers.CudnnRnn(num_layers=1, num_units=self.num_units, keep_prob=0.5)

    if FLAGS.use_label_emb or FLAGS.use_label_att:
        assert not (FLAGS.use_label_emb and FLAGS.use_label_att)
        self.label_emb_height = NUM_CLASSES if not FLAGS.label_emb_height else FLAGS.label_emb_height
        self.label_embedding = melt.layers.Embedding(self.label_emb_height, FLAGS.emb_dim)
        if not FLAGS.use_label_att:
            # TODO: use an activation here?
            #self.label_dense = keras.layers.Dense(FLAGS.emb_dim, activation=tf.nn.relu)
            self.label_dense = keras.layers.Dense(FLAGS.emb_dim, use_bias=False)
        else:
            self.label_att_dot_attention = melt.layers.DotAttention(hidden=self.num_units,
                                                                    keep_prob=self.keep_prob,
                                                                    combiner=FLAGS.att_combiner)
            self.label_att_encode = melt.layers.CudnnRnn(num_layers=1,
                                                         num_units=self.num_units,
                                                         keep_prob=self.keep_prob)
            #self.label_att_encode = melt.layers.CudnnRnn(num_layers=1, num_units=self.num_units, keep_prob=0.5)

    if FLAGS.use_self_match:
        self.match_dot_attention = melt.layers.DotAttention(hidden=self.num_units,
                                                            keep_prob=self.keep_prob,
                                                            combiner=FLAGS.att_combiner)
        self.match_encode = melt.layers.CudnnRnn(num_layers=1,
                                                 num_units=self.num_units,
                                                 keep_prob=self.keep_prob)
        #self.match_encode = melt.layers.CudnnRnn(num_layers=1, num_units=self.num_units, keep_prob=0.5)

    # TODO: might try setting use_bias=True for all of these
    if FLAGS.use_answer_emb:
        self.context_dense = keras.layers.Dense(FLAGS.emb_dim, use_bias=False)
        self.answer_dense = keras.layers.Dense(FLAGS.emb_dim, use_bias=False)
        # self.context_dense = keras.layers.Dense(FLAGS.emb_dim, activation=tf.nn.relu)
        # self.answer_dense = keras.layers.Dense(FLAGS.emb_dim, activation=tf.nn.relu)

    logging.info('encoder_output_method:', FLAGS.encoder_output_method)
    logging.info('topk:', FLAGS.top_k)
    self.pooling = melt.layers.Pooling(FLAGS.encoder_output_method,
                                       top_k=FLAGS.top_k,
                                       att_activation=getattr(tf.nn, FLAGS.att_activation))

    self.logits = keras.layers.Dense(NUM_CLASSES, activation=None)
    if FLAGS.split_type:
        self.logits2 = keras.layers.Dense(NUM_CLASSES, activation=None)