def __init__(self, name='ra', nimg=2048, na=512, nh=512, nw=512, nout=8843, npatch=30, model_file=None):
    self.name = name
    if model_file is not None:
        with h5py.File(model_file, 'r') as f:
            nimg = f.attrs['nimg']
            na = f.attrs['na']
            nh = f.attrs['nh']
            nw = f.attrs['nw']
            nout = f.attrs['nout']
            # npatch = f.attrs['npatch']
    self.config = {'nimg': nimg, 'na': na, 'nh': nh, 'nw': nw, 'nout': nout, 'npatch': npatch}

    # word embedding layer
    self.embedding = Embedding(n_emb=nout, dim_emb=nw, name=self.name+'@embedding')
    # initialization mlp layer
    self.init_mlp = MLP(layer_sizes=[na, 2*nh], output_type='tanh', name=self.name+'@init_mlp')
    self.proj_mlp = MLP(layer_sizes=[nimg, na], output_type='tanh', name=self.name+'@proj_mlp')
    # lstm
    self.lstm = BasicLSTM(dim_x=na+nw, dim_h=nh, name=self.name+'@lstm')
    # prediction mlp
    self.pred_mlp = MLP(layer_sizes=[na+nh+nw, nout], output_type='softmax', name=self.name+'@pred_mlp')
    # attention layer
    self.attention = Attention(dim_item=na, dim_context=na+nw+nh, hsize=nh, name=self.name+'@attention')

    # inputs
    cap = T.imatrix('cap')
    img = T.tensor3('img')
    self.inputs = [cap, img]

    # go through sequence
    feat = self.proj_mlp.compute(img)
    init_e = feat.mean(axis=1)
    init_state = T.concatenate([init_e, self.init_mlp.compute(init_e)], axis=-1)
    (state, self.p, loss, self.alpha), _ = theano.scan(fn=self.scan_func,
                                                       sequences=[cap[0:-1, :], cap[1:, :]],
                                                       outputs_info=[init_state, None, None, None],
                                                       non_sequences=[feat])

    # loss function
    loss = T.mean(loss)
    self.costs = [loss]

    # layers and parameters
    self.layers = [self.embedding, self.init_mlp, self.proj_mlp, self.attention, self.lstm, self.pred_mlp]
    self.params = sum([l.params for l in self.layers], [])

    # load weights from file, if model_file is not None
    if model_file is not None:
        self.load_weights(model_file)

    # these functions and variables are used in test stage
    self._init_func = None
    self._step_func = None
    self._proj_func = None
    self._feat_shared = theano.shared(np.zeros((1, npatch, na)).astype(theano.config.floatX))
def __init__(self, embedding_matrix, opt):
    super(IAN, self).__init__()
    self.opt = opt
    self.embed = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float))
    self.lstm_context = DynamicLSTM(opt.embed_dim, opt.hidden_dim, num_layers=1, batch_first=True)
    self.lstm_aspect = DynamicLSTM(opt.embed_dim, opt.hidden_dim, num_layers=1, batch_first=True)
    self.attention_aspect = Attention(opt.hidden_dim, score_function='bi_linear')
    self.attention_context = Attention(opt.hidden_dim, score_function='bi_linear')
    self.dense = nn.Linear(opt.hidden_dim * 2, opt.polarities_dim)
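# The 'bi_linear' score function above is, in most implementations, a learned bilinear form
# score(k, q) = k^T W q followed by a softmax over the sequence. A minimal, self-contained
# sketch of that idea (a stand-in, not the project's Attention class):
import torch
import torch.nn as nn
import torch.nn.functional as F

class BilinearAttention(nn.Module):
    """Minimal bilinear attention: score(k, q) = k^T W q, softmax over the key positions."""
    def __init__(self, hidden_dim):
        super().__init__()
        self.weight = nn.Parameter(torch.empty(hidden_dim, hidden_dim))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, keys, query):
        # keys: (batch, seq_len, hidden), query: (batch, hidden)
        scores = torch.einsum('bsh,hd,bd->bs', keys, self.weight, query)
        alpha = F.softmax(scores, dim=-1)              # attention weights over positions
        context = torch.bmm(alpha.unsqueeze(1), keys)  # (batch, 1, hidden)
        return context.squeeze(1), alpha

# usage: pool context states with a (e.g. mean-pooled aspect) query vector
keys = torch.randn(4, 10, 128)
query = torch.randn(4, 128)
context, alpha = BilinearAttention(128)(keys, query)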
def __init__(self, params):
    super(Seq2Seq, self).__init__()
    self.params = params
    self.embedding_matrix = load_embedding_matrix()
    self.encoder = Encoder(params["vocab_size"], params["vector_dim"], params["encoder_units"], self.embedding_matrix)
    self.attention = Attention(params["attn_units"])
    self.decoder = Decoder(params["vocab_size"], params["vector_dim"], params["decoder_units"], self.embedding_matrix)
def __init__(self, num_layers, num_heads, embed_dim, ff_dim, dropout=0.):
    super(Decoder, self).__init__()
    self.self_atts = nn.ModuleList([])
    self.enc_dec_atts = nn.ModuleList([])
    self.pos_ffs = nn.ModuleList([])
    self.lnorms = nn.ModuleList([])
    for i in range(num_layers):
        self.self_atts.append(Attention(embed_dim, num_heads, dropout=dropout))
        self.enc_dec_atts.append(Attention(embed_dim, num_heads, dropout=dropout))
        self.pos_ffs.append(PositionWiseFeedForward(embed_dim, ff_dim, dropout=dropout))
        self.lnorms.append(nn.ModuleList([nn.LayerNorm(embed_dim, eps=1e-6) for _ in range(3)]))
    self.last_lnorm = nn.LayerNorm(embed_dim, eps=1e-6)
    self.dropout = dropout
    self.num_layers = num_layers
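# For orientation, each decoder layer above pairs masked self-attention, encoder-decoder
# attention, and a position-wise feed-forward net with the three LayerNorms in self.lnorms[i].
# A minimal sketch of one such layer using torch.nn.MultiheadAttention instead of the
# repository's Attention/PositionWiseFeedForward classes (post-norm ordering assumed; the
# original may use a different ordering):
import torch
import torch.nn as nn

class DecoderLayer(nn.Module):
    """Self-attention + encoder-decoder attention + feed-forward, each with a residual and LayerNorm."""
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.self_att = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)
        self.enc_dec_att = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)
        self.ff = nn.Sequential(nn.Linear(embed_dim, ff_dim), nn.ReLU(), nn.Linear(ff_dim, embed_dim))
        self.norms = nn.ModuleList([nn.LayerNorm(embed_dim, eps=1e-6) for _ in range(3)])
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, memory, tgt_mask=None):
        # masked self-attention over the (shifted) target sequence
        h, _ = self.self_att(x, x, x, attn_mask=tgt_mask)
        x = self.norms[0](x + self.dropout(h))
        # attention over the encoder memory
        h, _ = self.enc_dec_att(x, memory, memory)
        x = self.norms[1](x + self.dropout(h))
        # position-wise feed-forward
        x = self.norms[2](x + self.dropout(self.ff(x)))
        return x

# usage
layer = DecoderLayer(embed_dim=64, num_heads=4, ff_dim=256)
out = layer(torch.randn(2, 7, 64), torch.randn(2, 11, 64))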
def create_Attention_layer(layer_info):
    if logging_enabled:
        print("- Entered layers_factory::create_Attention_layer Global Method")
    if len(layer_info) != 5:
        raise RuntimeError('Attention layer must have 5 specs')
    return Attention(input_dim=int(layer_info['input_dim']),
                     att_times=int(layer_info['att_times']),
                     att_num=int(layer_info['att_num']),
                     att_style=layer_info['att_style'],
                     att_weight=parse_as_bool(layer_info['att_weight']))
def __init__(self, n_feat, n_message_passing, n_hid, n_penultimate, n_class, dropout, embeddings,
             use_master_node, graph_of_sentences):
    super(MPAD, self).__init__()
    self.graph_of_sentences = graph_of_sentences
    self.n_message_passing = n_message_passing
    self.embedding = nn.Embedding(embeddings.shape[0], embeddings.shape[1])
    self.embedding.weight.data.copy_(torch.from_numpy(embeddings))
    self.embedding.weight.requires_grad = False
    self.mps1 = torch.nn.ModuleList()
    self.atts1 = torch.nn.ModuleList()
    for i in range(n_message_passing):
        if i == 0:
            self.mps1.append(MessagePassing(n_feat, n_hid))
        else:
            self.mps1.append(MessagePassing(n_hid, n_hid))
        self.atts1.append(Attention(n_hid, n_hid, use_master_node))
    if use_master_node:
        self.bn = nn.BatchNorm1d(2 * n_message_passing * n_hid, n_hid)
        self.fc1 = nn.Linear(2 * n_message_passing * n_hid, n_hid)
    else:
        self.bn = nn.BatchNorm1d(n_message_passing * n_hid, n_hid)
        self.fc1 = nn.Linear(n_message_passing * n_hid, n_hid)
    if graph_of_sentences == 'sentence_att':
        self.att = Attention(n_hid, n_hid, False)
        self.fc2 = nn.Linear(n_hid, n_penultimate)
    else:
        self.fc2 = nn.Linear(n_message_passing * n_hid, n_penultimate)
    self.mps2 = torch.nn.ModuleList()
    self.atts2 = torch.nn.ModuleList()
    for i in range(n_message_passing):
        self.mps2.append(MessagePassing(n_hid, n_hid))
        self.atts2.append(Attention(n_hid, n_hid, False))
    self.fc3 = nn.Linear(n_penultimate, n_class)
    self.dropout = nn.Dropout(dropout)
    self.relu = nn.ReLU()
def atae_lstm(self):
    input_text = Input(shape=(self.max_len,))
    input_aspect = Input(shape=(1,))
    if self.config.word_embed_type != 'random':
        word_embedding = Embedding(input_dim=self.text_embeddings.shape[0],
                                   output_dim=self.config.word_embed_dim,
                                   weights=[self.text_embeddings],
                                   trainable=self.config.word_embed_trainable,
                                   mask_zero=True)
    else:
        word_embedding = Embedding(input_dim=self.config.text_random_input_dim,
                                   output_dim=self.config.word_embed_dim,
                                   mask_zero=True)
    # dropout rate of 0.2 on the word embeddings
    text_embed = SpatialDropout1D(0.2)(word_embedding(input_text))

    if self.config.aspect_embed_type == 'random':
        asp_embedding = Embedding(input_dim=self.config.aspect_random_input_dim,
                                  output_dim=self.config.aspect_embed_dim)
    else:
        asp_embedding = Embedding(input_dim=self.config.aspect_random_input_dim,  # in practice always 20
                                  output_dim=self.config.aspect_embed_dim,
                                  trainable=self.config.aspect_embed_trainable)
    aspect_embed = asp_embedding(input_aspect)
    aspect_embed = Flatten()(aspect_embed)  # reshape to 2d
    repeat_aspect = RepeatVector(self.max_len)(aspect_embed)  # repeat aspect for every word in sequence

    input_concat = concatenate([text_embed, repeat_aspect], axis=-1)
    hidden_vecs, state_h, _ = LSTM(self.config.lstm_units, return_sequences=True, return_state=True)(input_concat)
    concat = concatenate([hidden_vecs, repeat_aspect], axis=-1)

    # apply attention mechanism
    attend_weight = Attention()(concat)
    attend_weight_expand = Lambda(lambda x: K.expand_dims(x))(attend_weight)
    attend_hidden = multiply([hidden_vecs, attend_weight_expand])
    attend_hidden = Lambda(lambda x: K.sum(x, axis=1))(attend_hidden)

    attend_hidden_dense = Dense(self.config.lstm_units)(attend_hidden)
    last_hidden_dense = Dense(self.config.lstm_units)(state_h)
    final_output = Activation('tanh')(add([attend_hidden_dense, last_hidden_dense]))

    return Model([input_text, input_aspect], final_output)
def __init__(self, args):
    super(modeler, self).__init__()
    self.args = args
    self.gcn = nn.ModuleList([GCN(args.ft_size, args.hid_units, args.activation, args.drop_prob, args.isBias)
                              for _ in range(args.nb_graphs)])
    self.disc = Discriminator(args.hid_units)
    self.H = nn.Parameter(torch.FloatTensor(1, args.nb_nodes, args.hid_units))
    self.readout_func = self.args.readout_func
    if args.isAttn:
        self.attn = nn.ModuleList([Attention(args) for _ in range(args.nheads)])
    if args.isSemi:
        self.logistic = LogReg(args.hid_units, args.nb_classes).to(args.device)
    self.init_weight()
def __init__(self, config, embeddings=None):
    # model inputs
    sent_inputs = Input(shape=(config.max_words,), dtype='float64')
    doc_inputs = Input(shape=(config.max_sents, config.max_words), dtype='float64')

    # embedding layer
    embed = embedding_layers(config, embeddings)(sent_inputs)

    # sentence encoder
    sent_enc = Bidirectional(GRU(config.rnn_units[0],
                                 dropout=config.drop_rate[0],
                                 recurrent_dropout=config.re_drop[0],
                                 return_sequences=True))(embed)
    sent_att = Attention(config.att_size[0], name='AttLayer_1')(sent_enc)
    self.sent_model = Model(sent_inputs, sent_att)

    # document encoder
    doc_emb = TimeDistributed(self.sent_model)(doc_inputs)
    doc_enc = Bidirectional(GRU(config.rnn_units[1],
                                dropout=config.drop_rate[1],
                                recurrent_dropout=config.re_drop[1],
                                return_sequences=True))(doc_emb)
    doc_att = Attention(config.att_size[1], name='AttLayer_2')(doc_enc)

    # fully connected layers
    fc1_drop = Dropout(config.drop_rate[1])(doc_att)
    fc1_bn = BatchNormalization()(fc1_drop)
    fc1 = Dense(config.fc_units[0],
                activation=config.activation_func,
                kernel_initializer='he_normal',
                kernel_regularizer=regularizers.l2(0.01))(fc1_bn)
    fc2_drop = Dropout(config.drop_rate[1])(fc1)

    # output
    doc_pred = Dense(config.ntags, activation=config.classifier)(fc2_drop)

    # final model
    self.model = Model(inputs=doc_inputs, outputs=doc_pred)
    self.config = config
def _setup_layers(self):
    """
    Creating layers of the model.
    1. GCN layers.
    2. Primary capsules.
    3. Attention.
    4. Graph capsules.
    5. Class capsules.
    """
    self.base_layers = [GCNConv(self.number_of_features, self.args.gcn_filters)]
    for _ in range(self.args.gcn_layers - 1):
        self.base_layers.append(GCNConv(self.args.gcn_filters, self.args.gcn_filters))
    self.base_layers = ListModule(*self.base_layers)
    self.first_capsule = PrimaryCapsuleLayer(self.args.gcn_filters,
                                             self.args.gcn_layers,
                                             self.args.gcn_layers,
                                             self.args.capsule_dimensions)
    self.attention = Attention(self.args.gcn_layers * self.args.gcn_filters * self.args.capsule_dimensions,
                               self.args.inner_attention_dimension)
    self.graph_capsule = SecondaryCapsuleLayer(self.args.gcn_layers * self.args.gcn_filters,
                                               self.args.capsule_dimensions,
                                               self.args.number_of_capsules,
                                               self.args.capsule_dimensions)
    self.class_capsule = SecondaryCapsuleLayer(self.args.capsule_dimensions,
                                               self.args.number_of_capsules,
                                               self.number_of_targets,
                                               self.args.capsule_dimensions)
def __init__(self, n_feat, max_resolution, n_classes=0, use_dropout=None, use_attention=False,
             arch=None, return_features=False):
    super().__init__()
    self.max_resolution = max_resolution
    self.use_dropout = use_dropout
    self.return_features = return_features
    self.res1 = ResBlock_D(3, n_feat, downsample=True)
    self.use_attention = use_attention
    if use_attention:
        self.attn = Attention(n_feat)
    self.residual_blocks = nn.ModuleList([])
    n_layers = int(np.log2(self.max_resolution)) - 2
    last_block_factor = 0
    for i in range(n_layers):
        is_last = (i == n_layers - 1)
        if arch is None:
            prev_factor = 2 ** i
            curr_factor = 2 ** (i + 1)
        else:
            prev_factor = arch[i]
            curr_factor = arch[i + 1]
        # print(f"block ({i}): {prev_factor}, {curr_factor}")
        block = ResBlock_D(prev_factor * n_feat, curr_factor * n_feat, downsample=not is_last)
        self.residual_blocks.add_module(f"res_block_{i}", block)
        if is_last:
            last_block_factor = curr_factor
    if self.use_dropout is not None:
        self.dropout = nn.Dropout(self.use_dropout)
    self.fc = nn.utils.spectral_norm(nn.Linear(last_block_factor * n_feat, 1)).apply(init_weight)
    self.embedding = nn.Embedding(num_embeddings=n_classes,
                                  embedding_dim=last_block_factor * n_feat).apply(init_weight)
def build_attention():
    """Build the model architecture for attention output."""
    inputs = Input(shape=(MAX_LEN, 20), name='Input')
    masking = Masking(mask_value=0.0, input_shape=(MAX_LEN, 20), name='Masking')(inputs)
    hidden = Bidirectional(LSTM(512, use_bias=True, dropout=0.5, return_sequences=True),
                           name='Bidirectional-LSTM')(masking)
    hidden = MultiHeadAttention(head_num=32,
                                activation='relu',
                                use_bias=True,
                                return_multi_attention=False,
                                name='Multi-Head-Attention')(hidden)
    hidden = Dropout(0.2, name='Dropout_1')(hidden)
    hidden = Attention(return_attention=True, name='Attention')(hidden)
    model = Model(inputs=inputs, outputs=hidden)
    return model
def build(self):
    if self.opt.match_type == 'pointwise':
        reps = [self.representation_model.get_representation(doc)
                for doc in [self.question, self.answer]]
        if self.opt.onehot:
            output = self.dense_last(Attention()(reps))
        else:
            output = self.distance(reps)
        model = Model([self.question, self.answer], output)
    elif self.opt.match_type == 'pairwise':
        q_rep = self.representation_model.get_representation(self.question)
        score1 = self.distance([q_rep, self.representation_model.get_representation(self.answer)])
        score2 = self.distance([q_rep, self.representation_model.get_representation(self.neg_answer)])
        basic_loss = MarginLoss(self.opt.margin)([score1, score2])
        output = [score1, basic_loss, basic_loss]
        model = Model([self.question, self.answer, self.neg_answer], output)
    else:
        raise ValueError('wrong input of matching type. Please input pairwise or pointwise.')
    return model
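# The pairwise branch trains on a margin between score1 (question vs. correct answer) and
# score2 (question vs. negative answer). A minimal sketch of that objective, assuming the
# usual hinge form max(0, margin - s_pos + s_neg); the project's MarginLoss layer may differ:
import torch

def margin_ranking_loss(score_pos, score_neg, margin=0.1):
    """Penalize cases where the positive answer does not beat the negative by at least `margin`."""
    return torch.clamp(margin - score_pos + score_neg, min=0.0).mean()

# usage with dummy similarity scores
score1 = torch.tensor([0.8, 0.4])   # positive-pair scores
score2 = torch.tensor([0.3, 0.5])   # negative-pair scores
loss = margin_ranking_loss(score1, score2, margin=0.1)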
encoder = [
    Embedding(e_vocab_size, EMB_DIM),
    GRU(EMB_DIM, HID_DIM, m),
    GRU(EMB_DIM, HID_DIM, m[:, ::-1])
]
x_emb = f_props(encoder[:1], x)
h_ef = f_props(encoder[1:2], x_emb)
h_eb = f_props(encoder[2:], x_emb[:, ::-1])[:, ::-1, :]
h_e = tf.concat([h_ef, h_eb], axis=2)

h_d1_0 = tf.reduce_mean(h_e, axis=1)
h_d2_0 = tf.reduce_mean(h_e, axis=1)

decoder = [
    Embedding(d_vocab_size, EMB_DIM),
    GRU(EMB_DIM, 2 * HID_DIM, tf.ones_like(t_in, dtype='float32'), h_0=h_d1_0),
    Attention(2 * HID_DIM, 2 * HID_DIM, h_e, ma),
    GRU(EMB_DIM + 2 * HID_DIM, 2 * HID_DIM, tf.ones_like(t_in, dtype='float32'), h_0=h_d2_0),
    RVAE(EMB_DIM, 2 * HID_DIM, LAT_DIM),
    Dense3d(LAT_DIM + 2 * HID_DIM, HID_DIM, tf.nn.tanh),
    Dense3d(HID_DIM, d_vocab_size, tf.nn.softmax)
]
t_in_emb = f_props(decoder[:1], t_in)
h_d1 = f_props(decoder[1:2], t_in_emb)
h_d1__ = tf.concat([h_d1_0[:, None, :], h_d1], axis=1)[:, :-1, :]
c = f_props(decoder[2:3], h_d1)
h_d2 = f_props(decoder[3:4], tf.concat([t_in_emb, c], axis=2))
z, KL = f_props(decoder[4:5], [h_d1__, t_in_emb])
def __init__(self, blocks_args, global_args):
    super().__init__()
    assert isinstance(blocks_args, list), 'blocks_args should be a list'
    assert len(blocks_args) > 0, 'block args must be greater than 0'
    self._global_args = global_args

    out_channels = 3  # rgb
    self._input_expand = nn.Linear(self._global_args.input_size, self._global_args.seq_size)

    def get_block(constructor, use, *args):
        if use:
            return constructor(*args)
        else:
            return None

    # linear block
    self._base_transformer = get_block(Transformer,
                                       self._global_args.use_base_transformer,
                                       self._global_args.base_transformer_args())
    self._seq_to_image_start = SeqToImageStart(self._global_args.seq_to_image_start_args())

    self._image_blocks = nn.ModuleList([])
    self._image_to_seq_blocks = nn.ModuleList([])
    self._seq_blocks = nn.ModuleList([])

    last_ch = blocks_args[-1].output_ch

    for i, block_args in enumerate(blocks_args):
        input_ch = block_args.input_ch
        output_ch = block_args.output_ch
        for repeat_num in range(block_args.num_repeat):
            block_args.next_block()
            # TODO: consider not constructing blocks which aren't used...
            self._image_blocks.append(MBConvGBlock(block_args.mbconv_args()))
            self._image_to_seq_blocks.append(get_block(ImageToSeq,
                                                       block_args.use_image_to_seq_this_block,
                                                       block_args.image_to_seq_args()))
            self._seq_blocks.append(get_block(Transformer,
                                              block_args.use_seq_this_block,
                                              block_args.transformer_args()))
        if (self._global_args.use_nonlocal and i == self._global_args.nonlocal_index - 1):
            self._attention_index = len(self._image_blocks) - 1
            self._attention = Attention(block_args.output_ch)

    self._swish = MemoryEfficientSwish()
    self.output_bn = ConfigurableNorm(last_ch,
                                      input_gain_bias=False,
                                      norm_style=self._global_args.norm_style)
    self.output_conv = nn.Conv2d(last_ch, out_channels, kernel_size=3, padding=1)
    negative_allowance = 0.05
    # CELU might be a good choice...
    self._output_activation = nn.CELU(alpha=negative_allowance)
class Model(object):
    """ Region Attention model """

    def __init__(self, name='ra', nimg=2048, nnh=512, na=512, nh=512, nw=512, nout=8843, npatch=30, model_file=None):
        self.name = name
        if model_file is not None:
            with h5py.File(model_file, 'r') as f:
                nimg = f.attrs['nimg']
                nnh = f.attrs['nnh']
                na = f.attrs['na']
                nh = f.attrs['nh']
                nw = f.attrs['nw']
                nout = f.attrs['nout']
                # npatch = f.attrs['npatch']
        self.config = {'nimg': nimg, 'nnh': nnh, 'na': na, 'nh': nh, 'nw': nw, 'nout': nout, 'npatch': npatch}

        # word embedding layer
        self.embedding = Embedding(n_emb=nout, dim_emb=nw, name=self.name+'@embedding')
        # initialization mlp layer
        self.init_mlp = MLP(layer_sizes=[na, 2*nh], output_type='tanh', name=self.name+'@init_mlp')
        self.proj_mlp = MLP(layer_sizes=[nimg, na], output_type='tanh', name=self.name+'@proj_mlp')
        # lstm
        self.lstm = BasicLSTM(dim_x=na+nw, dim_h=nh, name=self.name+'@lstm')
        # prediction mlp
        self.pred_mlp = MLP(layer_sizes=[na+nh+nw, nout], output_type='softmax', name=self.name+'@pred_mlp')
        # attention layer
        self.attention = Attention(dim_item=na, dim_context=na+nw+nh, hsize=nnh, name=self.name+'@attention')

        # inputs
        cap = T.imatrix('cap')
        img = T.tensor3('img')
        self.inputs = [cap, img]

        # go through sequence
        feat = self.proj_mlp.compute(img)
        init_e = feat.mean(axis=1)
        init_state = T.concatenate([init_e, self.init_mlp.compute(init_e)], axis=-1)
        (state, self.p, loss, self.alpha), _ = theano.scan(fn=self.scan_func,
                                                           sequences=[cap[0:-1, :], cap[1:, :]],
                                                           outputs_info=[init_state, None, None, None],
                                                           non_sequences=[feat])

        # loss function
        loss = T.mean(loss)
        self.costs = [loss]

        # layers and parameters
        self.layers = [self.embedding, self.init_mlp, self.proj_mlp, self.attention, self.lstm, self.pred_mlp]
        self.params = sum([l.params for l in self.layers], [])

        # load weights from file, if model_file is not None
        if model_file is not None:
            self.load_weights(model_file)

        # these functions and variables are used in test stage
        self._init_func = None
        self._step_func = None
        self._proj_func = None
        self._feat_shared = theano.shared(np.zeros((1, npatch, nimg)).astype(theano.config.floatX))

    def compute(self, state, w_idx, feat):
        # word embedding
        word_vec = self.embedding.compute(w_idx)
        # split states
        e_tm1, c_tm1, h_tm1 = split_state(state, scheme=[(1, self.config['na']), (2, self.config['nh'])])
        # attention
        e_t, alpha = self.attention.compute(feat, T.concatenate([e_tm1, h_tm1, word_vec], axis=1))
        # lstm step
        e_w = T.concatenate([e_t, word_vec], axis=-1)
        c_t, h_t = self.lstm.compute(e_w, c_tm1, h_tm1)  # (mb, nh)
        # merge state
        new_state = T.concatenate([e_t, c_t, h_t], axis=-1)
        # predict word probability
        p = self.pred_mlp.compute(T.concatenate([e_t, h_t, word_vec], axis=-1))
        return new_state, p, alpha

    def scan_func(self, w_tm1, w_t, state, feat):
        # update state
        new_state, p, alpha = self.compute(state, w_tm1, feat)
        # cross-entropy loss
        loss = T.nnet.categorical_crossentropy(p, w_t)
        return new_state, p, loss, alpha

    def init_func(self, img_value):
        if self._proj_func is None:
            img = T.tensor3()
            self._proj_func = theano.function([img], self.proj_mlp.compute(img))
        if self._init_func is None:
            init_e = self._feat_shared.mean(axis=1)
            init_state = T.concatenate([init_e, self.init_mlp.compute(init_e)], axis=-1)
            self._init_func = theano.function([], init_state)
        self._feat_shared.set_value(self._proj_func(img_value))
        return self._init_func()

    def step_func(self, state_value, w_value):
        if self._step_func is None:
            w = T.ivector()
            state = T.matrix()
            new_state, p, _ = self.compute(state, w, self._feat_shared)
            self._step_func = theano.function([state, w], [new_state, T.log(p)])
        return self._step_func(state_value, w_value)

    def save_to_dir(self, save_dir, idx):
        save_file = osp.join(save_dir, self.name+'.h5.'+str(idx))
        for l in self.layers:
            l.save_weights(save_file)
        with h5py.File(save_file) as f:
            for k, v in self.config.items():
                f.attrs[k] = v

    def load_weights(self, model_file):
        for l in self.layers:
            l.load_weights(model_file)
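# The Attention.compute call above returns an attended region feature e_t and weights alpha
# over the npatch regions. A small NumPy sketch of additive soft attention under that reading
# (the toy weights W_item, W_ctx, v and all dimensions are made up for illustration):
import numpy as np

def soft_attention(feat, context, W_item, W_ctx, v):
    """Additive soft attention: score each region, softmax, and take the weighted sum."""
    # feat: (npatch, na) region features; context: (dim_context,) current decoding state
    scores = np.tanh(feat @ W_item + context @ W_ctx) @ v   # (npatch,)
    alpha = np.exp(scores - scores.max())
    alpha /= alpha.sum()                                     # attention weights over regions
    e_t = alpha @ feat                                       # attended region feature
    return e_t, alpha

# usage with toy sizes: 4 regions of size 6, context of size 10, attention hidden size 8
rng = np.random.default_rng(0)
feat = rng.standard_normal((4, 6))
ctx = rng.standard_normal(10)
e_t, alpha = soft_attention(feat, ctx,
                            rng.standard_normal((6, 8)),
                            rng.standard_normal((10, 8)),
                            rng.standard_normal(8))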
def __init__(self, config, mode):
    super(Model, self).__init__()
    self.logger = ut.get_logger(config['log_file'])

    ENC_SCOPE = 'encoder'
    DEC_SCOPE = 'decoder'
    ATT_SCOPE = 'attention'
    OUT_SCOPE = 'outputer'
    SFM_SCOPE = 'softmax'

    batch_size = config['batch_size']
    feed_input = config['feed_input']
    grad_clip = config['grad_clip']
    beam_size = config['beam_size']
    beam_alpha = config['beam_alpha']
    num_layers = config['num_layers']
    rnn_type = config['rnn_type']
    score_func_type = config['score_func_type']
    src_vocab_size = config['src_vocab_size']
    trg_vocab_size = config['trg_vocab_size']
    src_embed_size = config['src_embed_size']
    trg_embed_size = config['trg_embed_size']
    enc_rnn_size = config['enc_rnn_size']
    dec_rnn_size = config['dec_rnn_size']
    input_keep_prob = config['input_keep_prob']
    output_keep_prob = config['output_keep_prob']

    attention_maps = {
        ac.SCORE_FUNC_DOT: Attention.DOT,
        ac.SCORE_FUNC_GEN: Attention.GEN,
        ac.SCORE_FUNC_BAH: Attention.BAH
    }
    score_func_type = attention_maps[score_func_type]

    if mode != ac.TRAINING:
        batch_size = 1
        input_keep_prob = 1.0
        output_keep_prob = 1.0

    # Placeholders
    self.src_inputs = tf.placeholder(tf.int32, [batch_size, None])
    self.src_seq_lengths = tf.placeholder(tf.int32, [batch_size])
    self.trg_inputs = tf.placeholder(tf.int32, [batch_size, None])
    self.trg_targets = tf.placeholder(tf.int32, [batch_size, None])
    self.target_weights = tf.placeholder(tf.float32, [batch_size, None])

    # First, define the src/trg embeddings
    with tf.variable_scope(ENC_SCOPE):
        self.src_embedding = tf.get_variable('embedding',
                                             shape=[src_vocab_size, src_embed_size],
                                             dtype=tf.float32)
    with tf.variable_scope(DEC_SCOPE):
        self.trg_embedding = tf.get_variable('embedding',
                                             shape=[trg_vocab_size, trg_embed_size],
                                             dtype=tf.float32)

    # Then select the RNN cell, reuse if not in TRAINING mode
    if rnn_type != ac.LSTM:
        raise NotImplementedError
    reuse = mode != ac.TRAINING  # if dev/test, reuse cell
    encoder_cell = ut.get_lstm_cell(ENC_SCOPE, num_layers, enc_rnn_size,
                                    output_keep_prob=output_keep_prob,
                                    seed=ac.SEED, reuse=reuse)
    att_state_size = dec_rnn_size
    decoder_cell = ut.get_lstm_cell(DEC_SCOPE, num_layers, dec_rnn_size,
                                    output_keep_prob=output_keep_prob,
                                    seed=ac.SEED, reuse=reuse)

    # The model
    encoder = Encoder(encoder_cell, ENC_SCOPE)
    decoder = Encoder(decoder_cell, DEC_SCOPE)
    outputer = FeedForward(enc_rnn_size + dec_rnn_size, att_state_size, OUT_SCOPE, activate_func=tf.tanh)
    self.softmax = softmax = Softmax(att_state_size, trg_vocab_size, SFM_SCOPE)

    # Encode source sentence
    encoder_inputs = tf.nn.embedding_lookup(self.src_embedding, self.src_inputs)
    encoder_inputs = tf.nn.dropout(encoder_inputs, input_keep_prob, seed=ac.SEED)
    encoder_outputs, last_state = encoder.encode(encoder_inputs,
                                                 sequence_length=self.src_seq_lengths,
                                                 initial_state=None)

    # Define an attention layer over encoder outputs
    attention = Attention(ATT_SCOPE, score_func_type, encoder_outputs, enc_rnn_size, dec_rnn_size,
                          common_dim=enc_rnn_size if score_func_type == Attention.BAH else None)

    # This function takes a decoder output, makes it attend to the encoder outputs, and
    # spits out the attentional state used for predicting the next target word
    def decoder_output_func(h_t):
        alignments, c_t = attention.calc_context(self.src_seq_lengths, h_t)
        c_t_h_t = tf.concat([c_t, h_t], 1)
        output = outputer.transform(c_t_h_t)
        return output, alignments

    # Fit everything in the decoder & start decoding
    decoder_inputs = tf.nn.embedding_lookup(self.trg_embedding, self.trg_inputs)
    decoder_inputs = tf.nn.dropout(decoder_inputs, input_keep_prob, seed=ac.SEED)
    attentional_outputs = decoder.decode(decoder_inputs,
                                         decoder_output_func,
                                         att_state_size,
                                         feed_input=feed_input,
                                         initial_state=last_state,
                                         reuse=False)
    attentional_outputs = tf.reshape(attentional_outputs, [-1, att_state_size])

    # Loss
    logits = softmax.calc_logits(attentional_outputs)
    logits = tf.reshape(logits, [batch_size, -1, trg_vocab_size])
    loss = sequence_loss(logits,
                         self.trg_targets,
                         self.target_weights,
                         average_across_timesteps=False,
                         average_across_batch=False)

    if mode != ac.TRAINING:
        self.loss = tf.stop_gradient(tf.reduce_sum(loss))

        max_output_length = 3 * self.src_seq_lengths[0]
        tensor_to_state = partial(ut.tensor_to_lstm_state, num_layers=config['num_layers'])
        beam_outputs = decoder.beam_decode(self.trg_embedding,
                                           ac.BOS_ID,
                                           ac.EOS_ID,
                                           decoder_output_func,
                                           att_state_size,
                                           softmax.calc_logprobs,
                                           trg_vocab_size,
                                           max_output_length,
                                           tensor_to_state,
                                           alpha=beam_alpha,
                                           beam_size=beam_size,
                                           feed_input=feed_input,
                                           initial_state=last_state,
                                           reuse=True)
        self.probs, self.scores, self.symbols, self.parents, self.alignments = beam_outputs

    # If in training, backpropagate the gradients
    if mode == ac.TRAINING:
        self.loss = tf.reduce_sum(loss)

        # Option to fix some variables
        fixed_vars = config['fixed_var_list'] if config['fixed_var_list'] else []
        if fixed_vars:
            fixed_vars = operator.attrgetter(*fixed_vars)(self)
            if isinstance(fixed_vars, list):
                fixed_var_names = [_fixed_var.name for _fixed_var in fixed_vars]
            else:
                fixed_var_names = [fixed_vars.name]
        else:
            fixed_var_names = []

        tvars = tf.trainable_variables()
        tvars = [_var for _var in tvars if _var.name not in fixed_var_names]
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), grad_clip)

        self.lr = tf.Variable(1.0, trainable=False, name='lr')
        if config['optimizer'] == ac.ADADELTA:
            optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.lr, rho=0.95, epsilon=1e-6)
        else:
            optimizer = tf.train.GradientDescentOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    # Finally, log some of the model's stats
    if mode == ac.TRAINING:
        def num_params(var):
            shape = var.get_shape().as_list()
            var_count = 1
            for dim in shape:
                var_count = var_count * dim
            return var_count

        self.logger.info('{} model:'.format('train' if mode == ac.TRAINING else 'dev/test'))
        self.logger.info('Num trainable variables {}'.format(len(tvars)))
        self.logger.info('Num params: {:,}'.format(sum([num_params(v) for v in tvars])))
        self.logger.info('List of all trainable parameters:')
        for v in tvars:
            self.logger.info('   {}'.format(v.name))
        self.logger.info('List of all fixed parameters')
        for v in fixed_var_names:
            self.logger.info('   {}'.format(v))
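# attention_maps above selects among DOT, GEN, and BAH score functions. A NumPy sketch of the
# three scoring rules and the resulting context vector, assuming the standard Luong dot,
# Luong general, and Bahdanau additive formulations (the project's Attention class may differ):
import numpy as np

def attention_scores(enc_outputs, h_t, score='dot', W=None, W_q=None, W_k=None, v=None):
    """Score every encoder state against the current decoder state h_t."""
    if score == 'dot':         # h_t . h_s
        return enc_outputs @ h_t
    if score == 'general':     # h_t . W h_s
        return enc_outputs @ (W @ h_t)
    if score == 'bahdanau':    # v . tanh(W_k h_s + W_q h_t)
        return np.tanh(enc_outputs @ W_k.T + h_t @ W_q.T) @ v
    raise ValueError(score)

def context_vector(enc_outputs, scores):
    alpha = np.exp(scores - scores.max())
    alpha /= alpha.sum()
    return alpha @ enc_outputs, alpha

# usage with toy sizes: 6 source positions, hidden size 8
enc = np.random.randn(6, 8)
h_t = np.random.randn(8)
ctx, alpha = context_vector(enc, attention_scores(enc, h_t, score='dot'))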
def cnn_lstm_f1():
    with open('vocab.data', 'rb') as fin:
        vocab = pickle.load(fin)
    question1 = Input(shape=(20,))
    question2 = Input(shape=(20,))
    q1 = Embedding(vocab.nb_words + 1, 300, weights=[vocab.embedding],
                   input_length=20, trainable=False)(question1)
    q2 = Embedding(vocab.nb_words + 1, 300, weights=[vocab.embedding],
                   input_length=20, trainable=False)(question2)
    f_rnn = LSTM(30, return_sequences=True, implementation=1)
    b_rnn = LSTM(30, return_sequences=True, implementation=1, go_backwards=True)
    pos = Position_Embedding(mode='concat')
    att = Attention(20)

    q1 = BatchNormalization()(q1)
    qf_rnn = f_rnn(q1)
    qb_rnn = b_rnn(q1)
    q1_rnn = concatenate([qf_rnn, qb_rnn], axis=-1)
    q1_rnn = pos(q1_rnn)
    q1_rnn = concatenate([q1_rnn, att(q1_rnn)])

    q2 = BatchNormalization()(q2)
    af_rnn = f_rnn(q2)
    ab_rnn = b_rnn(q2)
    q2_rnn = concatenate([af_rnn, ab_rnn], axis=-1)
    q2_rnn = pos(q2_rnn)
    q2_rnn = concatenate([q2_rnn, att(q2_rnn)])

    # cnn
    cnns = [Conv1D(kernel_size=kernel_size, filters=100, activation='tanh', padding='same')
            for kernel_size in [1, 2, 3, 5]]
    # qq_cnn = merge([cnn(question_pool) for cnn in cnns], mode='concat')
    q1_cnn = concatenate([cnn(q1_rnn) for cnn in cnns], axis=-1)
    # q2_cnn = merge([cnn(answer_pool) for cnn in cnns], mode='concat')
    q2_cnn = concatenate([cnn(q2_rnn) for cnn in cnns], axis=-1)

    maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False),
                     output_shape=lambda x: (x[0], x[2]))
    maxpool.supports_masking = True
    q1_pool = Dropout(0.05)(maxpool(q1_cnn))
    q2_pool = Dropout(0.05)(maxpool(q2_cnn))

    merged1 = Dense(100, activation='relu')(q1_pool)
    merged2 = Dense(100, activation='relu')(q2_pool)
    merged = concatenate([merged1, merged2])
    is_duplicate = Dense(1, activation='sigmoid')(merged)

    model = Model(inputs=[question1, question2], outputs=is_duplicate)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
embedded = Embedding(input_dim=vocabulary_size, output_dim=embedding_dims)(X)

# Recurrent Layers
if config != 0:
    encoder_output, hidden_state, cell_state = CuDNNLSTM(units=128,
                                                         return_sequences=True,
                                                         return_state=True)(embedded)
    attention_input = [encoder_output, hidden_state]
else:
    encoder_output = CuDNNLSTM(units=128)(embedded)

# Optional Attention Mechanisms
if config == 1:
    encoder_output, attention_weights = SelfAttention(size=128,
                                                      num_hops=10,
                                                      use_penalization=False)(encoder_output)
elif config == 2:
    encoder_output, attention_weights = Attention(context='many-to-one',
                                                  alignment_type='global')(attention_input)
    encoder_output = Flatten()(encoder_output)
elif config == 3:
    encoder_output, attention_weights = Attention(context='many-to-one',
                                                  alignment_type='local-p*',
                                                  window_width=100,
                                                  score_function='scaled_dot')(attention_input)
    encoder_output = Flatten()(encoder_output)

# Prediction Layer
Y = Dense(units=num_categories, activation='softmax')(encoder_output)

# Compile model
model = Model(inputs=X, outputs=Y)
model.compile(loss='sparse_categorical_crossentropy',
                            trainable=True)
print('Embedding matrix completed.')

# -------------- DNN goes after here ---------------------
cinput = Input(shape=(context_maxlen,), dtype='int32')
cembed = embedding_layer(cinput)
clstm1 = Bidirectional(LSTM(100, return_sequences=True))(cembed)

qinput = Input(shape=(question_maxlen,), dtype='int32')
qembed = embedding_layer(qinput)
qlstm1 = Bidirectional(LSTM(100, return_sequences=True))(qembed)

cdecoder = RecurrentContainer(decode=True, output_length=context_maxlen, input_length=context_maxlen)
cdecoder.add(AttentionDecoderCell(output_dim=100, hidden_dim=100))
clstm2 = cdecoder(clstm1)

ch1 = Attention(qlstm1)(clstm1)
clstm2 = Bidirectional(LSTM(100, return_sequences=True))(ch1)
qh1 = Attention(clstm2)(qlstm1)
qlstm2 = Bidirectional(LSTM(100, return_sequences=True))(qh1)
ch2 = Attention(qlstm2)(clstm2)
qh2 = Attention(ch2)(qlstm2)

h = Merge([ch2, qh2], mode='concat')
hlstm = Bidirectional(LSTM(100))(h)
output1 = Dense(context_maxlen, activation='softmax')(hlstm)
hmerge = Merge([hlstm, output1], mode='concat')
output2 = Dense(context_maxlen, activation='softmax')(hmerge)

qnamodel = Model(input=[cinput, qinput], output=[output1, output2])
def attention_lstm():
    with open('vocab.data', 'rb') as fin:
        vocab = pickle.load(fin)
    question1 = Input(shape=(15,))
    question2 = Input(shape=(15,))
    q1 = Embedding(vocab.nb_words + 1, 300, weights=[vocab.embedding],
                   input_length=15, trainable=False)(question1)
    q2 = Embedding(vocab.nb_words + 1, 300, weights=[vocab.embedding],
                   input_length=15, trainable=False)(question2)
    pos = Position_Embedding()
    f_rnn = LSTM(256, return_sequences=True, consume_less='mem')
    b_rnn = LSTM(256, return_sequences=True, consume_less='mem', go_backwards=True)
    maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False),
                     output_shape=lambda x: (x[0], x[2]))
    maxpool.supports_masking = True

    q1 = pos(q1)
    q2 = pos(q2)
    qf_rnn = f_rnn(q1)
    qb_rnn = b_rnn(q1)
    # q1_rnn = merge([qf_rnn, qb_rnn], mode='concat', concat_axis=-1)
    q1_rnn = concatenate([qf_rnn, qb_rnn], axis=-1)
    af_rnn = f_rnn(q2)
    ab_rnn = b_rnn(q2)
    # q2_rnn = merge([af_rnn, ab_rnn], mode='concat', concat_axis=-1)
    q2_rnn = concatenate([af_rnn, ab_rnn], axis=-1)

    att = Attention(20)
    q1_att = maxpool(att([q1_rnn, q1_rnn, q1_rnn]))
    q1 = Dense(200, activation='relu')(q1_att)
    q2_att = maxpool(att([q2_rnn, q2_rnn, q2_rnn]))
    q2 = Dense(200, activation='relu')(q2_att)

    merged = concatenate([q1, q2])
    merged = Dense(200, activation='relu')(merged)
    merged = Dropout(0)(merged)
    merged = BatchNormalization()(merged)
    is_duplicate = Dense(1, activation='sigmoid')(merged)

    model = Model(inputs=[question1, question2], outputs=is_duplicate)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
def __init__(self, inp, oup, expand_ratio, kernel_size, stride, se_reduction, drop_connect_ratio=0.2):
    """Basic building block - Inverted Residual Convolution from the MobileNetV2 architecture.

    Arguments:
        expand_ratio (int): ratio by which the convolution is widened inside the block.
            This is not the same as width_mult in MobileNet, which scales the persistent
            input and output channel counts of a layer rather than the expansion inside the block.
    """
    super().__init__()
    hidden_dim = int(inp * expand_ratio)
    self.use_res_connect = stride == 1 and inp == oup
    if self.use_res_connect:
        self.dropconnect = DropConnect(drop_connect_ratio)

    if expand_ratio == 1:
        self.conv = nn.Sequential(
            # depth-wise
            SamePadConv2d(inp=hidden_dim, oup=hidden_dim, kernel_size=kernel_size,
                          stride=stride, groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim, eps=batch_norm_epsilon, momentum=batch_norm_momentum),
            Swish(),
            Attention(channels=hidden_dim, reduction=4),  # here the reduction is always 4
            # point-wise-linear
            SamePadConv2d(inp=hidden_dim, oup=oup, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm2d(oup, eps=batch_norm_epsilon, momentum=batch_norm_momentum),
        )
    else:
        self.conv = nn.Sequential(
            # point-wise
            SamePadConv2d(inp, hidden_dim, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm2d(hidden_dim, eps=batch_norm_epsilon, momentum=batch_norm_momentum),
            Swish(),
            # depth-wise
            SamePadConv2d(hidden_dim, hidden_dim, kernel_size, stride, groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim, eps=batch_norm_epsilon, momentum=batch_norm_momentum),
            Swish(),
            Attention(channels=hidden_dim, reduction=se_reduction),
            # point-wise-linear
            SamePadConv2d(hidden_dim, oup, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm2d(oup, eps=batch_norm_epsilon, momentum=batch_norm_momentum),
        )
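# Attention(channels=..., reduction=...) in this block looks like a squeeze-and-excitation
# style channel gate sitting between the depth-wise and point-wise convolutions. A minimal,
# self-contained sketch under that assumption (a stand-in, not the project's class):
import torch
import torch.nn as nn

class SqueezeExcite(nn.Module):
    """Channel gate: global-average-pool, bottleneck MLP, sigmoid scale."""
    def __init__(self, channels, reduction=4):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Conv2d(channels, channels // reduction, kernel_size=1),
            nn.SiLU(),  # Swish
            nn.Conv2d(channels // reduction, channels, kernel_size=1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        return x * self.fc(self.pool(x))  # reweight each channel of x

# usage
x = torch.randn(2, 32, 16, 16)
y = SqueezeExcite(32, reduction=4)(x)  # same shape as x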
class Model(object):
    """ Region attention + scene-specific contexts """

    def __init__(self, name='rass', nimg=2048, nh=512, nw=512, na=512, nout=8843, ns=80, npatch=30, model_file=None):
        self.name = name
        if model_file is not None:
            with h5py.File(model_file, 'r') as f:
                nimg = f.attrs['nimg']
                nh = f.attrs['nh']
                nw = f.attrs['nw']
                na = f.attrs['na']
                ns = f.attrs['ns']
                nout = f.attrs['nout']
        self.config = {'nimg': nimg, 'nh': nh, 'nw': nw, 'na': na, 'nout': nout, 'ns': ns, 'npatch': npatch}

        # word embedding layer
        self.embedding = Embedding(n_emb=nout, dim_emb=nw, name=self.name+'@embedding')
        # initialization mlp layer
        self.init_mlp = MLP(layer_sizes=[na, 2*nh], output_type='tanh', name=self.name+'@init_mlp')
        self.proj_mlp = MLP(layer_sizes=[nimg, na], output_type='tanh', name=self.name+'@proj_mlp')
        # attention layer
        self.attention = Attention(dim_item=na, dim_context=na+nw+nh, hsize=nh, name=self.name+'@attention')
        # lstm
        self.lstm = BasicLSTM(dim_x=na+nw+ns, dim_h=nh, name=self.name+'@lstm')
        # prediction mlp
        self.pred_mlp = MLP(layer_sizes=[na+nh+nw+ns, nout], output_type='softmax', name=self.name+'@pred_mlp')

        # inputs
        cap = T.imatrix('cap')
        img = T.tensor3('img')
        scene = T.matrix('scene')
        self.inputs = [cap, img, scene]

        # go through sequence
        feat = self.proj_mlp.compute(img)
        init_e = feat.mean(axis=1)
        init_state = T.concatenate([init_e, self.init_mlp.compute(init_e)], axis=-1)
        (state, self.p, loss, self.alpha), _ = theano.scan(fn=self.scan_func,
                                                           sequences=[cap[0:-1, :], cap[1:, :]],
                                                           outputs_info=[init_state, None, None, None],
                                                           non_sequences=[feat, scene])

        # loss function
        loss = T.mean(loss)
        self.costs = [loss]

        # layers and parameters
        self.layers = [self.embedding, self.init_mlp, self.proj_mlp, self.attention, self.lstm, self.pred_mlp]
        self.params = sum([l.params for l in self.layers], [])

        # load weights from file, if model_file is not None
        if model_file is not None:
            self.load_weights(model_file)

        # initialization for test stage
        self._init_func = None
        self._step_func = None
        self._proj_func = None
        self._feat_shared = theano.shared(np.zeros((1, npatch, na)).astype(theano.config.floatX))
        self._scene_shared = theano.shared(np.zeros((1, ns)).astype(theano.config.floatX))

    def compute(self, state, w_idx, feat, scene):
        # word embedding
        word_vec = self.embedding.compute(w_idx)
        # split states
        e_tm1, c_tm1, h_tm1 = split_state(state, scheme=[(1, self.config['na']), (2, self.config['nh'])])
        # attention
        e_t, alpha = self.attention.compute(feat, T.concatenate([e_tm1, h_tm1, word_vec], axis=1))
        # lstm step
        e_w_s = T.concatenate([e_t, word_vec, scene], axis=-1)
        c_t, h_t = self.lstm.compute(e_w_s, c_tm1, h_tm1)
        # merge state
        new_state = T.concatenate([e_t, c_t, h_t], axis=-1)
        # add w_{t-1} as feature
        e_h_w_s = T.concatenate([e_t, h_t, word_vec, scene], axis=-1)
        # predict probability
        p = self.pred_mlp.compute(e_h_w_s)
        return new_state, p, alpha

    def scan_func(self, w_tm1, w_t, state, feat, scene):
        # update state
        new_state, p, alpha = self.compute(state, w_tm1, feat, scene)
        # cross-entropy loss
        loss = T.nnet.categorical_crossentropy(p, w_t)
        return new_state, p, loss, alpha

    def init_func(self, img_value, scene_value):
        if self._proj_func is None:
            img = T.tensor3()
            self._proj_func = theano.function([img], self.proj_mlp.compute(img))
        if self._init_func is None:
            init_e = self._feat_shared.mean(axis=1)
            init_state = T.concatenate([init_e, self.init_mlp.compute(init_e)], axis=-1)
            self._init_func = theano.function([], init_state)
        self._feat_shared.set_value(self._proj_func(img_value))
        self._scene_shared.set_value(scene_value)
        return self._init_func()

    def step_func(self, state_value, w_value):
        if self._step_func is None:
            w = T.ivector()
            state = T.matrix()
            new_state, p, _ = self.compute(state, w, self._feat_shared, self._scene_shared)
            self._step_func = theano.function([state, w], [new_state, T.log(p)])
        return self._step_func(state_value, w_value)

    def save_to_dir(self, save_dir, idx):
        save_file = osp.join(save_dir, self.name+'.h5.'+str(idx))
        for l in self.layers:
            l.save_weights(save_file)
        with h5py.File(save_file) as f:
            for k, v in self.config.items():
                f.attrs[k] = v

    def load_weights(self, model_file):
        for l in self.layers:
            l.load_weights(model_file)
def _setup_attention(self):
    """
    Creating the attention layer.
    """
    self.attention = Attention(self.args.gcn_layers * self.args.capsule_dimensions,
                               self.args.inner_attention_dimension)
def build_sentiment_classifier(self, x):
    x = Attention(384)(x)
    x = Dropout(0.2)(x)
    return Dense(NUM_SENTIMENTS, activation='softmax', name='sen_output')(x)
def __init__(self, name='ra', nimg=2048, na=512, nh=512, nw=512, nout=8843, npatch=30, model_file=None):
    self.name = name
    if model_file is not None:
        with h5py.File(model_file, 'r') as f:
            nimg = f.attrs['nimg']
            na = f.attrs['na']
            nh = f.attrs['nh']
            nw = f.attrs['nw']
            nout = f.attrs['nout']
            # npatch = f.attrs['npatch']
    self.config = {
        'nimg': nimg,
        'na': na,
        'nh': nh,
        'nw': nw,
        'nout': nout,
        'npatch': npatch
    }

    # word embedding layer
    self.embedding = Embedding(n_emb=nout, dim_emb=nw, name=self.name + '@embedding')
    # initialization mlp layer
    self.init_mlp = MLP(layer_sizes=[na, 2 * nh], output_type='tanh', name=self.name + '@init_mlp')
    self.proj_mlp = MLP(layer_sizes=[nimg, na], output_type='tanh', name=self.name + '@proj_mlp')
    # lstm
    self.lstm = BasicLSTM(dim_x=na + nw, dim_h=nh, name=self.name + '@lstm')
    # prediction mlp
    self.pred_mlp = MLP(layer_sizes=[na + nh + nw, nout], output_type='softmax', name=self.name + '@pred_mlp')
    # attention layer
    self.attention = Attention(dim_item=na, dim_context=na + nw + nh, hsize=nh, name=self.name + '@attention')

    # inputs
    cap = T.imatrix('cap')
    img = T.tensor3('img')
    self.inputs = [cap, img]

    # go through sequence
    feat = self.proj_mlp.compute(img)
    init_e = feat.mean(axis=1)
    init_state = T.concatenate([init_e, self.init_mlp.compute(init_e)], axis=-1)
    (state, self.p, loss, self.alpha), _ = theano.scan(
        fn=self.scan_func,
        sequences=[cap[0:-1, :], cap[1:, :]],
        outputs_info=[init_state, None, None, None],
        non_sequences=[feat])

    # loss function
    loss = T.mean(loss)
    self.costs = [loss]

    # layers and parameters
    self.layers = [
        self.embedding, self.init_mlp, self.proj_mlp, self.attention,
        self.lstm, self.pred_mlp
    ]
    self.params = sum([l.params for l in self.layers], [])

    # load weights from file, if model_file is not None
    if model_file is not None:
        self.load_weights(model_file)

    # these functions and variables are used in test stage
    self._init_func = None
    self._step_func = None
    self._proj_func = None
    self._feat_shared = theano.shared(np.zeros((1, npatch, nimg)).astype(theano.config.floatX))
def char_word_HAN(max_words, max_sents, embed_size, vocab_cnt, gru_units, drop_rate, att_size,
                  re_drop, num_labels, fc_units, classifier, loss_function, activation_func,
                  pre_trained, embedding_matrix):
    # word-level sentence encoder
    word_sent_inputs = Input(shape=(max_words[0],), dtype='float64')
    word_embed = embedding_layers(vocab_cnt[0], embed_size, max_words[0],
                                  embedding_matrix[0], pre_trained)(word_sent_inputs)
    word_sent_enc = Bidirectional(GRU(gru_units[0],
                                      dropout=drop_rate[0],
                                      recurrent_dropout=re_drop[0],
                                      return_sequences=True))(word_embed)
    word_sent_att = Attention(att_size[0], name='AttLayer')(word_sent_enc)
    word_sent_model = Model(word_sent_inputs, word_sent_att)

    # word-level document encoder
    word_doc_inputs = Input(shape=(max_sents[0], max_words[0]), dtype='float64', name='word_inputs')
    word_doc_emb = TimeDistributed(word_sent_model)(word_doc_inputs)
    word_doc_enc = Bidirectional(GRU(gru_units[1],
                                     dropout=drop_rate[1],
                                     recurrent_dropout=re_drop[1],
                                     return_sequences=True))(word_doc_emb)
    word_doc_att = Attention(att_size[1], name='AttLayer_word')(word_doc_enc)
    word_fc1_drop = Dropout(drop_rate[1])(word_doc_att)
    word_fc1 = Dense(fc_units, activation=activation_func, kernel_initializer='he_normal')(word_fc1_drop)
    word_fc2_drop = Dropout(drop_rate[2])(word_fc1)

    # char-level sentence encoder
    char_sent_inputs = Input(shape=(max_words[1],), dtype='float64')
    char_embed = embedding_layers(vocab_cnt[1], embed_size, max_words[1],
                                  embedding_matrix[1], pre_trained)(char_sent_inputs)
    char_sent_enc = Bidirectional(GRU(gru_units[0],
                                      dropout=drop_rate[0],
                                      recurrent_dropout=re_drop[0],
                                      return_sequences=True))(char_embed)
    char_sent_att = Attention(att_size[2], name='AttLayer')(char_sent_enc)
    char_sent_model = Model(char_sent_inputs, char_sent_att)

    # char-level document encoder
    char_doc_inputs = Input(shape=(max_sents[1], max_words[1]), dtype='float64', name='char_inputs')
    char_doc_emb = TimeDistributed(char_sent_model)(char_doc_inputs)
    char_doc_enc = Bidirectional(GRU(gru_units[1],
                                     dropout=drop_rate[1],
                                     recurrent_dropout=re_drop[1],
                                     return_sequences=True))(char_doc_emb)
    char_doc_att = Attention(att_size[3], name='AttLayer_char')(char_doc_enc)
    char_fc1_drop = Dropout(drop_rate[1])(char_doc_att)
    char_fc1 = Dense(fc_units, activation=activation_func, kernel_initializer='he_normal')(char_fc1_drop)
    char_fc2_drop = Dropout(drop_rate[2])(char_fc1)

    # merge the word and char branches
    merge_info = concatenate([word_fc2_drop, char_fc2_drop], axis=1)
    output = Dense(num_labels, activation=classifier, name='out')(merge_info)

    model = Model(inputs=[word_doc_inputs, char_doc_inputs], outputs=output)
    nadam = optimizers.Nadam(clipnorm=1.)
    model.compile(loss=loss_function, optimizer=nadam, metrics=['accuracy'])
    return model
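# The AttLayer-style Attention(att_size) layers above pool a sequence of GRU states into a
# single vector with learned weights. A minimal PyTorch sketch of that additive attention
# pooling (a stand-in, not the Keras layer used here):
import torch
import torch.nn as nn
import torch.nn.functional as F

class AdditiveAttentionPool(nn.Module):
    """Score each timestep against a learned context vector, softmax, and weighted-sum the states."""
    def __init__(self, input_dim, att_size):
        super().__init__()
        self.proj = nn.Linear(input_dim, att_size)
        self.context = nn.Linear(att_size, 1, bias=False)

    def forward(self, h, mask=None):
        # h: (batch, time, input_dim)
        u = torch.tanh(self.proj(h))          # (batch, time, att_size)
        scores = self.context(u).squeeze(-1)  # (batch, time)
        if mask is not None:
            scores = scores.masked_fill(~mask, float('-inf'))
        alpha = F.softmax(scores, dim=-1)
        return torch.bmm(alpha.unsqueeze(1), h).squeeze(1)  # (batch, input_dim)

# usage
pooled = AdditiveAttentionPool(128, 64)(torch.randn(8, 20, 128))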
embedded = Embedding(input_dim=vocabulary_size, output_dim=embedding_dims)(X)

# Recurrent Layer
if config != 0:
    encoder_output, hidden_state, cell_state = CuDNNLSTM(units=512,
                                                         return_sequences=True,
                                                         return_state=True)(embedded)
    attention_input = [encoder_output, hidden_state]
else:
    encoder_output = CuDNNLSTM(units=512)(embedded)

# Optional Attention Mechanisms
if config == 1:
    encoder_output, attention_weights = SelfAttention(size=50,
                                                      num_hops=16,
                                                      use_penalization=False)(encoder_output)
elif config == 2:
    encoder_output, attention_weights = Attention(context='many-to-one',
                                                  alignment_type='global')(attention_input)
    encoder_output = Flatten()(encoder_output)
elif config == 3:
    encoder_output, attention_weights = Attention(context='many-to-one',
                                                  alignment_type='local-p*',
                                                  window_width=25)(attention_input)
    encoder_output = Flatten()(encoder_output)

# Prediction Layer
Y = Dense(units=vocabulary_size, activation='softmax')(encoder_output)

# Compile model
model = Model(inputs=X, outputs=Y)
model.compile(loss=loss, optimizer='adam', metrics=[perplexity, categorical_accuracy])
def build_category_classifier(self, x):
    x = Attention(384)(x)
    x = Dropout(0.2)(x)
    return Dense(NUM_CATEGORIES, activation='softmax', name='cat_output')(x)
def _setup_attention(self):
    self.attention = Attention(self.args.gcn_layers * self.args.capsule_dimensions,
                               self.args.inner_attention_dimension)
embedded_target = Embedding(input_dim=target_vocabulary_size, output_dim=embedding_dim)(X_target)
# NOTE: The embedded target sequences (derived from X_target) let us apply teacher forcing:
# the actual output (correct translation) from the training dataset at the current time step
# is used as input at the next time step, rather than the output generated by the network.

# Recurrent Layers
# i) Encoder
encoder_output = CuDNNLSTM(units=128, return_sequences=True)(embedded_input)
# ii) Decoder
decoder_recurrent_layer = CuDNNLSTM(units=128, return_state=True)
# NOTE: The encoder is always fully vectorized and returns the hidden representations of the whole
# sequence at once, whereas the decoder does this step by step.

# Optional Attention Mechanism
if config == 1:
    attention_layer = Attention(context='many-to-many', alignment_type='global')
elif config == 2:
    attention_layer = Attention(context='many-to-many', alignment_type='local-m')
elif config == 3:
    attention_layer = Attention(context='many-to-many', alignment_type='local-p')

# Prediction Layer
decoder_dense_layer = Dense(units=target_vocabulary_size, activation='softmax')

# Training Loop
outputs = []
for timestep in range(target_sequence_length):
    # Get the current input from the embedded target sequences
    current_word = Lambda(lambda x: x[:, timestep: timestep + 1, :])(embedded_target)
    # Apply optional attention mechanism
    if config != 0:
def __init__(self, hidden_dim, output_dim):
    super().__init__()
    self.embedding = Embedding(output_dim, hidden_dim, mask_zero=True)
    self.lstm = LSTM(hidden_dim, return_state=True, return_sequences=True)
    self.attn = Attention(hidden_dim, hidden_dim)
    self.out = Dense(output_dim, activation='softmax')