def forward(self, sentence):
    # sentence: LongTensor of shape (seq_len, batch_size), e.g. 42 x 64
    x = self.word_embeddings(sentence)
    x = self.dropout_embed(x)  # (seq_len, batch_size, embed_dim), e.g. 42 x 64 x 100
    lstm_out, self.hidden = self.lstm(x, self.hidden)
    # max-pool over time: (seq_len, batch, hidden) -> (batch, hidden, seq_len) -> (batch, hidden)
    lstm_out = torch.transpose(lstm_out, 0, 1)
    lstm_out = torch.transpose(lstm_out, 1, 2)
    lstm_out = F.max_pool1d(lstm_out, lstm_out.size(2))
    lstm_out = lstm_out.squeeze(2)
    y = self.hidden2label1(F.tanh(lstm_out))
    y = self.hidden2label2(F.tanh(y))
    # raw scores are returned; the caller applies log_softmax / the loss
    log_probs = y
    return log_probs
def PeepholeLSTMCell(input: torch.Tensor,
                     hidden: Tuple[torch.Tensor, torch.Tensor],
                     w_ih: torch.Tensor,
                     w_hh: torch.Tensor,
                     w_ip: torch.Tensor,
                     w_fp: torch.Tensor,
                     w_op: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    An LSTM cell with peephole connections and no biases.
    Mostly adapted from the PyTorch autograd LSTM implementation.
    """
    hx, cx = hidden
    gates = F.linear(input, w_ih) + F.linear(hx, w_hh)
    ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)

    # peephole connections: the input and forget gates peek at the previous cell state
    peep_i = w_ip.unsqueeze(0).expand_as(cx) * cx
    ingate = ingate + peep_i
    peep_f = w_fp.unsqueeze(0).expand_as(cx) * cx
    forgetgate = forgetgate + peep_f

    ingate = F.sigmoid(ingate)
    forgetgate = F.sigmoid(forgetgate)
    cellgate = F.tanh(cellgate)

    cy = (forgetgate * cx) + (ingate * cellgate)

    # the output gate peeks at the new cell state
    peep_o = w_op.unsqueeze(0).expand_as(cy) * cy
    outgate = F.sigmoid(outgate + peep_o)

    hy = outgate * F.tanh(cy)
    return hy, cy
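# A minimal usage sketch for PeepholeLSTMCell above. The batch size, input
# size, hidden size, and random weights here are illustrative assumptions,
# not values from the original code; torch, F (torch.nn.functional), and
# Tuple are assumed imported. Gate weights are stacked in the order
# (input, forget, cell, output) along dim 0, matching gates.chunk(4, 1).
import torch

batch, input_size, hidden_size = 8, 32, 64
x = torch.randn(batch, input_size)
hx = torch.zeros(batch, hidden_size)
cx = torch.zeros(batch, hidden_size)
w_ih = torch.randn(4 * hidden_size, input_size)
w_hh = torch.randn(4 * hidden_size, hidden_size)
w_ip = torch.randn(hidden_size)  # peephole weights are per-unit vectors
w_fp = torch.randn(hidden_size)
w_op = torch.randn(hidden_size)

hy, cy = PeepholeLSTMCell(x, (hx, cx), w_ih, w_hh, w_ip, w_fp, w_op)
assert hy.shape == (batch, hidden_size) and cy.shape == (batch, hidden_size)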
def forward(self, xt, fc_feats, att_feats, p_att_feats, state):
    # The p_att_feats here is already projected
    att_size = att_feats.numel() // att_feats.size(0) // self.att_feat_size
    att = p_att_feats.view(-1, att_size, self.att_hid_size)

    att_h = self.h2att(state[0][-1])             # batch * att_hid_size
    att_h = att_h.unsqueeze(1).expand_as(att)    # batch * att_size * att_hid_size
    dot = att + att_h                            # batch * att_size * att_hid_size
    dot = F.tanh(dot)                            # batch * att_size * att_hid_size
    dot = dot.view(-1, self.att_hid_size)        # (batch * att_size) * att_hid_size
    dot = self.alpha_net(dot)                    # (batch * att_size) * 1
    dot = dot.view(-1, att_size)                 # batch * att_size

    weight = F.softmax(dot)                      # batch * att_size
    att_feats_ = att_feats.view(-1, att_size, self.att_feat_size)    # batch * att_size * att_feat_size
    att_res = torch.bmm(weight.unsqueeze(1), att_feats_).squeeze(1)  # batch * att_feat_size

    all_input_sums = self.i2h(xt) + self.h2h(state[0][-1])
    sigmoid_chunk = all_input_sums.narrow(1, 0, 3 * self.rnn_size)
    sigmoid_chunk = F.sigmoid(sigmoid_chunk)
    in_gate = sigmoid_chunk.narrow(1, 0, self.rnn_size)
    forget_gate = sigmoid_chunk.narrow(1, self.rnn_size, self.rnn_size)
    out_gate = sigmoid_chunk.narrow(1, self.rnn_size * 2, self.rnn_size)

    in_transform = all_input_sums.narrow(1, 3 * self.rnn_size, 2 * self.rnn_size) + \
        self.a2c(att_res)
    in_transform = torch.max(
        in_transform.narrow(1, 0, self.rnn_size),
        in_transform.narrow(1, self.rnn_size, self.rnn_size))
    next_c = forget_gate * state[1][-1] + in_gate * in_transform
    next_h = out_gate * F.tanh(next_c)

    output = self.dropout(next_h)
    state = (next_h.unsqueeze(0), next_c.unsqueeze(0))
    return output, state
def forward(self, x):
    embed = self.embed(x)

    # CNN branch
    cnn_x = embed
    cnn_x = torch.transpose(cnn_x, 0, 1)
    cnn_x = cnn_x.unsqueeze(1)
    cnn_x = [F.relu(conv(cnn_x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...] * len(Ks)
    cnn_x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in cnn_x]    # [(N, Co), ...] * len(Ks)
    cnn_x = torch.cat(cnn_x, 1)
    cnn_x = self.dropout(cnn_x)

    # LSTM branch
    lstm_x = embed.view(len(x), embed.size(1), -1)
    lstm_out, self.hidden = self.lstm(lstm_x, self.hidden)
    lstm_out = torch.transpose(lstm_out, 0, 1)
    lstm_out = torch.transpose(lstm_out, 1, 2)
    lstm_out = F.max_pool1d(lstm_out, lstm_out.size(2)).squeeze(2)

    # concatenate the CNN and LSTM features
    cnn_x = torch.transpose(cnn_x, 0, 1)
    lstm_out = torch.transpose(lstm_out, 0, 1)
    cnn_lstm_out = torch.cat((cnn_x, lstm_out), 0)
    cnn_lstm_out = torch.transpose(cnn_lstm_out, 0, 1)

    # linear layers
    cnn_lstm_out = self.hidden2label1(F.tanh(cnn_lstm_out))
    cnn_lstm_out = self.hidden2label2(F.tanh(cnn_lstm_out))

    # output
    logit = cnn_lstm_out
    return logit
def norm_flow(self, params, z, v, logposterior):
    h = F.tanh(params[0][0](z))
    mew_ = params[0][1](h)
    sig_ = F.sigmoid(params[0][2](h) + 5.)  # [PB,Z]

    z_reshaped = z.view(self.P, self.B, self.z_size)
    gradients = torch.autograd.grad(outputs=logposterior(z_reshaped), inputs=z_reshaped,
                                    grad_outputs=self.grad_outputs,
                                    create_graph=True, retain_graph=True, only_inputs=True)[0]
    gradients = gradients.detach()
    gradients = gradients.view(-1, self.z_size)

    v = v * sig_ + mew_ * gradients
    logdet = torch.sum(torch.log(sig_), 1)

    h = F.tanh(params[1][0](v))
    mew_ = params[1][1](h)
    sig_ = F.sigmoid(params[1][2](h) + 5.)  # [PB,Z]

    z = z * sig_ + mew_ * v
    logdet2 = torch.sum(torch.log(sig_), 1)  # [PB]

    logdet = logdet + logdet2
    # [PB,Z], [PB]
    return z, v, logdet
def forward(self, h_out, fake_region, conv_feat, conv_feat_embed):
    # View into three dimensions
    att_size = conv_feat.numel() // conv_feat.size(0) // self.rnn_size
    conv_feat = conv_feat.view(-1, att_size, self.rnn_size)
    conv_feat_embed = conv_feat_embed.view(-1, att_size, self.att_hid_size)

    # view neighbor from batch_size * neighbor_num x rnn_size
    # to batch_size x rnn_size * neighbor_num
    fake_region = self.fr_linear(fake_region)
    fake_region_embed = self.fr_embed(fake_region)

    h_out_linear = self.ho_linear(h_out)
    h_out_embed = self.ho_embed(h_out_linear)

    txt_replicate = h_out_embed.unsqueeze(1).expand(
        h_out_embed.size(0), att_size + 1, h_out_embed.size(1))

    img_all = torch.cat([fake_region.view(-1, 1, self.input_encoding_size), conv_feat], 1)
    img_all_embed = torch.cat(
        [fake_region_embed.view(-1, 1, self.input_encoding_size), conv_feat_embed], 1)

    hA = F.tanh(img_all_embed + txt_replicate)
    hA = F.dropout(hA, self.drop_prob_lm, self.training)

    hAflat = self.alpha_net(hA.view(-1, self.att_hid_size))
    PI = F.softmax(hAflat.view(-1, att_size + 1))

    visAtt = torch.bmm(PI.unsqueeze(1), img_all)
    visAttdim = visAtt.squeeze(1)

    atten_out = visAttdim + h_out_linear

    h = F.tanh(self.att2h(atten_out))
    h = F.dropout(h, self.drop_prob_lm, self.training)
    return h
def forward(self, inputs):
    x, u = inputs
    x = self.bn0(x)
    x = F.tanh(self.linear1(x))
    x = F.tanh(self.linear2(x))

    V = self.V(x)
    mu = F.tanh(self.mu(x))

    Q = None
    if u is not None:
        num_outputs = mu.size(1)
        L = self.L(x).view(-1, num_outputs, num_outputs)
        # keep the strictly lower triangle and exponentiate the diagonal
        L = L * self.tril_mask.expand_as(L) + torch.exp(L) * self.diag_mask.expand_as(L)
        P = torch.bmm(L, L.transpose(2, 1))

        u_mu = (u - mu).unsqueeze(2)
        A = -0.5 * torch.bmm(torch.bmm(u_mu.transpose(2, 1), P), u_mu)[:, :, 0]

        Q = A + V

    return mu, Q, V
def norm_flow(self, params, z, v):
    h = F.tanh(params[0][0](z))
    mew_ = params[0][1](h)
    sig_ = F.sigmoid(params[0][2](h) + 5.)  # [PB,Z]

    v = v * sig_ + mew_
    logdet = torch.sum(torch.log(sig_), 1)

    h = F.tanh(params[1][0](v))
    mew_ = params[1][1](h)
    sig_ = F.sigmoid(params[1][2](h) + 5.)  # [PB,Z]

    z = z * sig_ + mew_
    logdet2 = torch.sum(torch.log(sig_), 1)  # [PB]

    logdet = logdet + logdet2
    # [PB,Z], [PB]
    return z, v, logdet
def forward(self, inputs):
    x = inputs
    x = self.bn0(x)
    x = F.tanh(self.linear1(x))
    x = F.tanh(self.linear2(x))
    mu = F.tanh(self.mu(x))
    return mu
def forward(self, xt, img_fc, state):
    hs = []
    cs = []
    for L in range(self.num_layers):
        # c, h from previous timesteps
        prev_h = state[0][L]
        prev_c = state[1][L]
        # the input to this layer
        if L == 0:
            x = xt
            i2h = self.w2h(x) + self.v2h(img_fc)
        else:
            x = hs[-1]
            x = F.dropout(x, self.drop_prob_lm, self.training)
            i2h = self.i2h[L - 1](x)

        all_input_sums = i2h + self.h2h[L](prev_h)

        sigmoid_chunk = all_input_sums.narrow(1, 0, 3 * self.rnn_size)
        sigmoid_chunk = F.sigmoid(sigmoid_chunk)
        # decode the gates
        in_gate = sigmoid_chunk.narrow(1, 0, self.rnn_size)
        forget_gate = sigmoid_chunk.narrow(1, self.rnn_size, self.rnn_size)
        out_gate = sigmoid_chunk.narrow(1, self.rnn_size * 2, self.rnn_size)
        # decode the write inputs
        if not self.use_maxout:
            in_transform = F.tanh(all_input_sums.narrow(1, 3 * self.rnn_size, self.rnn_size))
        else:
            in_transform = all_input_sums.narrow(1, 3 * self.rnn_size, 2 * self.rnn_size)
            in_transform = torch.max(
                in_transform.narrow(1, 0, self.rnn_size),
                in_transform.narrow(1, self.rnn_size, self.rnn_size))
        # perform the LSTM update
        next_c = forget_gate * prev_c + in_gate * in_transform
        # gated cells form the output
        tanh_next_c = F.tanh(next_c)
        next_h = out_gate * tanh_next_c

        if L == self.num_layers - 1:
            if L == 0:
                i2h = self.r_w2h(x) + self.r_v2h(img_fc)
            else:
                i2h = self.r_i2h(x)
            n5 = i2h + self.r_h2h(prev_h)
            fake_region = F.sigmoid(n5) * tanh_next_c

        cs.append(next_c)
        hs.append(next_h)

    # set up the decoder
    top_h = hs[-1]
    top_h = F.dropout(top_h, self.drop_prob_lm, self.training)
    fake_region = F.dropout(fake_region, self.drop_prob_lm, self.training)

    state = (torch.cat([_.unsqueeze(0) for _ in hs], 0),
             torch.cat([_.unsqueeze(0) for _ in cs], 0))
    return top_h, fake_region, state
def forward(self, inputs, actions):
    x = inputs
    x = self.bn0(x)
    x = F.tanh(self.linear1(x))

    a = F.tanh(self.linear_action(actions))
    x = torch.cat((x, a), 1)
    x = F.tanh(self.linear2(x))

    V = self.V(x)
    return V
def forward(self, input, cell):
    hx, cx = cell
    input = self.i2h_bn(self.i2h(input)) + self.h2h_bn(self.h2h(hx))
    gates = F.sigmoid(input[:, :3 * self.hidden_size])
    in_gate = gates[:, :self.hidden_size]
    forget_gate = gates[:, self.hidden_size:2 * self.hidden_size]
    out_gate = gates[:, 2 * self.hidden_size:3 * self.hidden_size]
    input = F.tanh(input[:, 3 * self.hidden_size:4 * self.hidden_size])
    cx = (forget_gate * cx) + (in_gate * input)
    hx = out_gate * F.tanh(self.cx_bn(cx))
    return hx, cx
def LSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None):
    hx, cx = hidden
    gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh)

    ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)

    ingate = F.sigmoid(ingate)
    forgetgate = F.sigmoid(forgetgate)
    cellgate = F.tanh(cellgate)
    outgate = F.sigmoid(outgate)

    cy = (forgetgate * cx) + (ingate * cellgate)
    hy = outgate * F.tanh(cy)

    return hy, cy
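# A sanity-check sketch for LSTMCell above: with weights copied from
# torch.nn.LSTMCell it should reproduce the built-in cell, since PyTorch
# stacks the gates in the order i, f, g, o, matching gates.chunk(4, 1) here.
# The sizes are illustrative assumptions; F is assumed imported.
import torch
import torch.nn as nn

batch, input_size, hidden_size = 4, 10, 20
cell = nn.LSTMCell(input_size, hidden_size)
x = torch.randn(batch, input_size)
hx = torch.randn(batch, hidden_size)
cx = torch.randn(batch, hidden_size)

hy, cy = LSTMCell(x, (hx, cx),
                  cell.weight_ih, cell.weight_hh,
                  cell.bias_ih, cell.bias_hh)
hy_ref, cy_ref = cell(x, (hx, cx))
assert torch.allclose(hy, hy_ref, atol=1e-6)
assert torch.allclose(cy, cy_ref, atol=1e-6)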
def _get_lstm_features(self, names, lengths):
    self.hidden = self.init_hidden(names.size(-1))
    embeds = self.char_embeds(names)  # Figure 4
    packed_input = pack_padded_sequence(embeds, lengths)  # Figure 5
    packed_output, (ht, ct) = self.lstm(packed_input, self.hidden)  # Figure 6
    lstm_out, _ = pad_packed_sequence(packed_output)  # Figure 7
    lstm_out = torch.transpose(lstm_out, 0, 1)
    lstm_out = torch.transpose(lstm_out, 1, 2)
    lstm_out = F.tanh(lstm_out)  # Figure 8
    lstm_out, indices = F.max_pool1d(lstm_out, lstm_out.size(2), return_indices=True)  # Figure 9
    lstm_out = lstm_out.squeeze(2)  # adjust the dimensions to match the expected input format
    lstm_out = F.tanh(lstm_out)
    lstm_feats = self.fully_connected_layer(lstm_out)
    output = self.softmax(lstm_feats)  # Figure 10
    return output
def forward(self, x):
    embed = self.embed(x)
    embed = self.dropout_embed(embed)
    x = embed.view(len(x), embed.size(1), -1)

    # lstm
    lstm_out, self.hidden = self.lstm(x, self.hidden)
    lstm_out = torch.transpose(lstm_out, 0, 1)
    lstm_out = torch.transpose(lstm_out, 1, 2)

    # pooling
    lstm_out = F.tanh(lstm_out)
    lstm_out = F.max_pool1d(lstm_out, lstm_out.size(2)).squeeze(2)
    lstm_out = F.tanh(lstm_out)

    # linear
    logit = self.hidden2label(lstm_out)
    return logit
def score(self, hidden, encoder_output):
    if self.method == 'dot':
        # hidden is 1 x 256, encoder_output is 22 x 256
        encoder_output = torch.transpose(encoder_output, 0, 1)
        # encoder_output is now 256 x 22
        energy = torch.matmul(hidden, encoder_output)
        return energy
    elif self.method == 'general':
        # hidden is 1 x 256, encoder_output is 256 x 22
        hidden = hidden.view(1, -1)
        a = self.attn(encoder_output)
        a = torch.transpose(a, 0, 1)
        energy = torch.matmul(hidden, a)
        return energy
    elif self.method == 'concat':
        len_encoder_output = encoder_output.size()[1]
        # hidden is 1 x 256, encoder_output is 256 x 22
        hidden = torch.transpose(hidden, 0, 1)
        # hidden is 256 x 1; tile it across the source length
        hidden = hidden.repeat(1, len_encoder_output)
        # hidden is 256 x 22
        concat = torch.cat((hidden, encoder_output), dim=0)
        # concat is 512 x 22; self.attn(concat) -> 256 x 22
        energy = torch.matmul(self.v, F.tanh(self.attn(concat)))
        return energy
def step(self, x, h_tm1, src_encodings, src_encodings_att_linear, src_token_mask=None,
         return_att_weight=False):
    """Perform a single time-step of computation in the decoder LSTM

    Args:
        x: variable of shape (batch_size, hidden_size), input
        h_tm1: Tuple[Variable(batch_size, hidden_size), Variable(batch_size, hidden_size)],
            previous hidden and cell states
        src_encodings: variable of shape (batch_size, src_sent_len, hidden_size * 2),
            encodings of source utterances
        src_encodings_att_linear: linearly transformed source encodings
        src_token_mask: mask over source tokens (Note: unused entries are masked to **one**)
        return_att_weight: return attention weights

    Returns:
        The new LSTM hidden and cell states, the attention vector, and
        optionally the attention weights
    """
    # h_t: (batch_size, hidden_size)
    h_t, cell_t = self.decoder_lstm(x, h_tm1)

    ctx_t, alpha_t = nn_utils.dot_prod_attention(h_t, src_encodings, src_encodings_att_linear,
                                                 mask=src_token_mask)

    att_t = F.tanh(self.att_vec_linear(torch.cat([h_t, ctx_t], 1)))  # Eq. (5)
    att_t = self.dropout(att_t)

    if return_att_weight:
        return (h_t, cell_t), att_t, alpha_t
    else:
        return (h_t, cell_t), att_t
def forward(self, x):
    """
    :param x: tensor with shape [batch_size, max_seq_len, max_word_len, char_embed_size]

    :return: tensor with shape [batch_size, max_seq_len, depth_sum]

    Applies a multi-kernel 1d-conv layer along every word in the input,
    with max-over-time pooling, to emit a fixed-size output.
    """
    input_size = x.size()
    input_size_len = len(input_size)
    assert input_size_len == 4, \
        'Wrong input rank, must be equal to 4, but {} found'.format(input_size_len)

    [batch_size, seq_len, _, embed_size] = input_size
    assert embed_size == self.params.char_embed_size, \
        'Wrong embedding size, must be equal to {}, but {} found'.format(
            self.params.char_embed_size, embed_size)

    # fold the batch and sequence dims together and put channels first for conv1d
    x = x.view(-1, self.params.max_word_len, self.params.char_embed_size).transpose(1, 2).contiguous()

    xs = [F.tanh(F.conv1d(x, kernel, bias=self.biases[i])) for i, kernel in enumerate(self.kernels)]
    xs = [x.max(2)[0] for x in xs]  # max-over-time pooling

    x = t.cat(xs, 1)
    x = x.view(batch_size, seq_len, -1)

    return x
def readout(h, h2):
    # pair up the node keys and concatenate the two hidden states per node
    catted_reads = map(lambda x: torch.cat([h[x[0]], h2[x[1]]], 1), zip(h.keys(), h2.keys()))
    activated_reads = map(lambda x: F.selu(R(x)), catted_reads)
    readout = Variable(torch.zeros(1, 128))
    for read in activated_reads:
        readout = readout + read
    return F.tanh(readout)
def forward(self, x):
    x = F.leaky_relu(self.fc1(x), 0.2, inplace=True)
    x = F.leaky_relu(self.fc11(x), 0.2, inplace=True)
    x = F.leaky_relu(self.fc2(x), 0.2, inplace=True)
    x = F.leaky_relu(self.fc3(x), 0.2, inplace=True)
    x = F.tanh(self.out(x))
    return x
def init_decoder_state(self, enc_last_state, enc_last_cell):
    """Compute the initial decoder hidden state and cell state"""
    h_0 = self.decoder_cell_init(enc_last_cell)
    h_0 = F.tanh(h_0)
    return h_0, Variable(self.new_tensor(h_0.size()).zero_())
def forward(self, s_t_hat, h, enc_padding_mask, coverage):
    b, t_k, n = list(h.size())
    h = h.view(-1, n)  # B * t_k x 2*hidden_dim
    encoder_feature = self.W_h(h)

    dec_fea = self.decode_proj(s_t_hat)  # B x 2*hidden_dim
    dec_fea_expanded = dec_fea.unsqueeze(1).expand(b, t_k, n).contiguous()  # B x t_k x 2*hidden_dim
    dec_fea_expanded = dec_fea_expanded.view(-1, n)  # B * t_k x 2*hidden_dim

    att_features = encoder_feature + dec_fea_expanded  # B * t_k x 2*hidden_dim
    if config.is_coverage:
        coverage_input = coverage.view(-1, 1)  # B * t_k x 1
        coverage_feature = self.W_c(coverage_input)  # B * t_k x 2*hidden_dim
        att_features = att_features + coverage_feature

    e = F.tanh(att_features)  # B * t_k x 2*hidden_dim
    scores = self.v(e)  # B * t_k x 1
    scores = scores.view(-1, t_k)  # B x t_k

    attn_dist_ = F.softmax(scores, dim=1) * enc_padding_mask  # B x t_k
    normalization_factor = attn_dist_.sum(1, keepdim=True)
    attn_dist = attn_dist_ / normalization_factor

    attn_dist = attn_dist.unsqueeze(1)  # B x 1 x t_k
    h = h.view(-1, t_k, n)  # B x t_k x 2*hidden_dim
    c_t = torch.bmm(attn_dist, h)  # B x 1 x n
    c_t = c_t.view(-1, config.hidden_dim * 2)  # B x 2*hidden_dim

    attn_dist = attn_dist.view(-1, t_k)  # B x t_k

    if config.is_coverage:
        coverage = coverage.view(-1, t_k)
        coverage = coverage + attn_dist

    return c_t, attn_dist, coverage
def forward(self, x):
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc11(x))
    x = F.relu(self.fc2(x))
    x = F.relu(self.fc3(x))
    x = F.tanh(self.out(x))
    return x
def forward(self, x):
    x = self.fc1(x).view(-1, self.channels, self.rows, self.rows)
    x = F.relu(self.batch_norm1(x))
    x = F.relu(self.batch_norm2(self.conv1(x)))
    x = F.relu(self.batch_norm3(self.conv2(x)))
    x = F.relu(self.batch_norm4(self.conv3(x)))
    return F.tanh(self.conv4(x))
def forward(self, input_seq, last_hidden, encoder_outputs):
    # Note: we run this one step at a time
    # Get the embedding of the current input word (last output word)
    embedded = self.embedding(input_seq)
    embedded = self.embedding_dropout(embedded)  # [1, 64, 512]
    if embedded.size(0) != 1:
        raise ValueError('Decoder input sequence length should be 1')

    # Get current hidden state from input word and last hidden state
    rnn_output, hidden = self.gru(embedded, last_hidden)

    # Calculate attention from current RNN state and all encoder outputs;
    # apply to encoder outputs to get weighted average
    attn_weights = self.attn(rnn_output, encoder_outputs)  # [64, 1, 14]
    # encoder_outputs: [14, 64, 512]
    context = attn_weights.bmm(encoder_outputs.transpose(0, 1))  # [64, 1, 512]

    # Attentional vector using the RNN hidden state and context vector
    # concatenated together (Luong eq. 5)
    rnn_output = rnn_output.squeeze(0)  # [64, 512]
    context = context.squeeze(1)  # [64, 512]
    concat_input = torch.cat((rnn_output, context), 1)  # [64, 1024]
    concat_output = F.tanh(self.concat(concat_input))  # [64, 512]

    # Finally predict next token (Luong eq. 6)
    output = self.out(concat_output)  # [64, output_size]
    output = F.softmax(output, dim=1)

    # Return final output, hidden state, and attention weights (for visualization)
    return output, hidden, attn_weights
def forward(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs,
            char_seq_lengths, char_seq_recover):
    """
    input:
        word_inputs: (batch_size, sent_len)
        word_seq_lengths: list of batch_size, (batch_size, 1)
        char_inputs: (batch_size * sent_len, word_length)
        char_seq_lengths: list of whole batch_size for char, (batch_size * sent_len, 1)
        char_seq_recover: variable which records the char order information,
            used to recover char order
    output:
        Variable(batch_size, sent_len, hidden_dim)
    """
    word_represent = self.wordrep(word_inputs, feature_inputs, word_seq_lengths,
                                  char_inputs, char_seq_lengths, char_seq_recover)
    # word_represent: (batch_size, seq_len, embed_size)
    if self.word_feature_extractor == "CNN":
        word_in = F.tanh(self.word2cnn(word_represent)).transpose(2, 1).contiguous()
        for idx in range(self.cnn_layer):
            if idx == 0:
                cnn_feature = F.relu(self.cnn_list[idx](word_in))
            else:
                cnn_feature = F.relu(self.cnn_list[idx](cnn_feature))
            cnn_feature = self.cnn_drop_list[idx](cnn_feature)
            cnn_feature = self.cnn_batchnorm_list[idx](cnn_feature)
        feature_out = cnn_feature.transpose(2, 1).contiguous()
    else:
        packed_words = pack_padded_sequence(word_represent, word_seq_lengths.cpu().numpy(), True)
        hidden = None
        lstm_out, hidden = self.lstm(packed_words, hidden)
        lstm_out, _ = pad_packed_sequence(lstm_out)
        # lstm_out: (seq_len, batch_size, hidden_size)
        feature_out = self.droplstm(lstm_out.transpose(1, 0))
    # feature_out: (batch_size, seq_len, hidden_size)
    outputs = self.hidden2tag(feature_out)
    return outputs
def forward(self, xt, fc_feats, att_feats, state):
    att_size = att_feats.numel() // att_feats.size(0) // self.att_feat_size
    att = att_feats.view(-1, self.att_feat_size)
    if self.att_hid_size > 0:
        att = self.ctx2att(att)                          # (batch * att_size) * att_hid_size
        att = att.view(-1, att_size, self.att_hid_size)  # batch * att_size * att_hid_size
        att_h = self.h2att(state[0][-1])                 # batch * att_hid_size
        att_h = att_h.unsqueeze(1).expand_as(att)        # batch * att_size * att_hid_size
        dot = att + att_h                                # batch * att_size * att_hid_size
        dot = F.tanh(dot)                                # batch * att_size * att_hid_size
        dot = dot.view(-1, self.att_hid_size)            # (batch * att_size) * att_hid_size
        dot = self.alpha_net(dot)                        # (batch * att_size) * 1
        dot = dot.view(-1, att_size)                     # batch * att_size
    else:
        att = self.ctx2att(att)                          # (batch * att_size) * 1
        att = att.view(-1, att_size)                     # batch * att_size
        att_h = self.h2att(state[0][-1])                 # batch * 1
        att_h = att_h.expand_as(att)                     # batch * att_size
        dot = att_h + att                                # batch * att_size

    weight = F.softmax(dot)
    att_feats_ = att_feats.view(-1, att_size, self.att_feat_size)    # batch * att_size * att_feat_size
    att_res = torch.bmm(weight.unsqueeze(1), att_feats_).squeeze(1)  # batch * att_feat_size

    output, state = self.rnn(torch.cat([xt, att_res], 1).unsqueeze(0), state)
    return output.squeeze(0), state
def baseline(self, samples, enc_states):
    # compute baseline, which is an MLP
    # (sample_size) FIXME: reward is log-likelihood, shall we use activation here?
    b_x = self.b_x_l2(F.tanh(self.b_x_l1(enc_states.detach()))).view(-1)
    return b_x + self.b
def forward(self, output, context):
    batch_size = output.size(0)
    hidden_size = output.size(2)
    input_size = context.size(1)

    # (batch, out_len, dim) * (batch, in_len, dim) -> (batch, out_len, in_len)
    attn = torch.bmm(output, context.transpose(1, 2))
    mask = torch.eq(attn, 0).data.byte()
    attn.data.masked_fill_(mask, -float('inf'))
    attn = F.softmax(attn.view(-1, input_size), dim=1).view(batch_size, -1, input_size)

    # (batch, out_len, in_len) * (batch, in_len, dim) -> (batch, out_len, dim)
    mix = torch.bmm(attn, context)

    # concat -> (batch, out_len, 2*dim)
    combined = torch.cat((mix, output), dim=2)

    # output -> (batch, out_len, dim)
    output = F.tanh(self.linear_out(combined.view(-1, 2 * hidden_size))).view(
        batch_size, -1, hidden_size)

    if not output.is_contiguous():
        output = output.contiguous()

    return output, attn
def forward(self, x):
    x = self.embed(x)
    x = self.dropout(x)
    bilstm_out, self.hidden = self.bilstm(x, self.hidden)

    bilstm_out = torch.transpose(bilstm_out, 0, 1)
    bilstm_out = torch.transpose(bilstm_out, 1, 2)
    bilstm_out = F.max_pool1d(bilstm_out, bilstm_out.size(2))
    bilstm_out = bilstm_out.squeeze(2)

    hidden2label = self.hidden2label1(F.tanh(bilstm_out))
    gate_layer = F.sigmoid(self.gate_layer(bilstm_out))

    # calculate highway layer values
    gate_hidden_layer = torch.mul(hidden2label, gate_layer)
    # using hidden2label instead of bilstm_out below would also run,
    # but it would not match the highway-network formula
    gate_input = torch.mul((1 - gate_layer), bilstm_out)
    highway_output = torch.add(gate_hidden_layer, gate_input)

    logit = self.logit_layer(highway_output)
    return logit
def _select_function(h, function_id):
    h = torch.stack([F.tanh(h), F.relu(h), F.sigmoid(h), h], dim=0)
    h = h[function_id]
    return h
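# A usage sketch for _select_function above: function_id indexes the stack
# of candidate activations (0 = tanh, 1 = relu, 2 = sigmoid, 3 = identity).
# The tensor sizes are illustrative; torch and F are assumed imported.
import torch

h = torch.randn(4, 16)
assert torch.equal(_select_function(h, 1), torch.relu(h))  # id 1 -> ReLU
assert torch.equal(_select_function(h, 3), h)              # id 3 -> identity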
def _nas_cell(self, sample_arc, x, prev_s, input_mask, layer_mask):
    """Multi-layer LSTM.

    Args:
      sample_arc: [num_layers * 2], sequence of tokens representing architecture.
      x: [batch_size, num_steps, hidden_size].
      prev_s: [batch_size, hidden_size].
      w_prev: [2 * hidden_size, 2 * hidden_size].
      w_skip: [None, [hidden_size, 2 * hidden_size] * (num_layers-1)].
      input_mask: `[batch_size, hidden_size]`.
      layer_mask: `[batch_size, hidden_size]`.
      params: hyper-params object.

    Returns:
      next_s: [batch_size, hidden_size].
      all_s: [[batch_size, num_steps, hidden_size] * num_layers].
    """
    num_layers = len(sample_arc) // 2

    # extract the relevant variables, so that you only do L2-reg on them.
    # u_skip = []
    # start_idx = 0
    # for layer_id in range(1, num_layers):
    #     prev_idx = sample_arc[start_idx]
    #     func_idx = sample_arc[start_idx + 1]
    #     u_skip.append(self.w_combined[prev_idx][layer_id][func_idx])
    #     start_idx += 2
    # w_skip = u_skip
    # var_s = [self.w_prev] + w_skip[1:]

    def _select_function(h, function_id):
        h = torch.stack([F.tanh(h), F.relu(h), F.sigmoid(h), h], dim=0)
        h = h[function_id]
        return h

    # important change: first input uses a tanh()
    if layer_mask is not None:
        assert input_mask is not None
        ht = self.w_prev(torch.cat([x * input_mask, prev_s * layer_mask], dim=1))
    else:
        ht = self.w_prev(torch.cat([x, prev_s], dim=1))
    h, t = torch.split(ht, self.args.shared_hid, dim=1)

    h = F.tanh(h)
    t = F.sigmoid(t)
    s = prev_s + t * (h - prev_s)
    layers = [s]

    start_idx = 0
    for layer_id in range(1, num_layers):
        prev_idx = sample_arc[start_idx].item()
        func_idx = sample_arc[start_idx + 1].item()
        prev_s = torch.stack(layers, dim=0)[prev_idx]
        if layer_mask is not None:
            ht = self.w_combined[prev_idx][layer_id][func_idx](prev_s * layer_mask)
        else:
            ht = self.w_combined[prev_idx][layer_id][func_idx](prev_s)
        h, t = torch.split(ht, self.args.shared_hid, dim=1)

        h = _select_function(h, func_idx)
        t = F.sigmoid(t)
        s = prev_s + t * (h - prev_s)
        layers.append(s)
        start_idx += 2

    # average the outputs of all layers after the first
    t_layers = torch.stack(layers[1:])
    next_s = torch.sum(t_layers, dim=0) / num_layers
    return next_s
def forward(self, state):
    """Build an actor (policy) network that maps states -> actions."""
    x = F.relu(self.fc1(self.bn1(state)))
    x = F.relu(self.fc2(self.bn2(x)))
    return F.tanh(self.fc3(self.bn3(x)))
def test_tom(opt, test_loader, model, board):
    model.cuda()
    model.eval()

    base_name = os.path.basename(opt.checkpoint)
    # save_dir = os.path.join(opt.result_dir, base_name, opt.datamode)
    save_dir = os.path.join(opt.result_dir, opt.name, opt.datamode)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    try_on_dir = os.path.join(save_dir, 'try-on')
    if not os.path.exists(try_on_dir):
        os.makedirs(try_on_dir)
    p_rendered_dir = os.path.join(save_dir, 'p_rendered')
    if not os.path.exists(p_rendered_dir):
        os.makedirs(p_rendered_dir)
    m_composite_dir = os.path.join(save_dir, 'm_composite')
    if not os.path.exists(m_composite_dir):
        os.makedirs(m_composite_dir)
    im_pose_dir = os.path.join(save_dir, 'im_pose')
    if not os.path.exists(im_pose_dir):
        os.makedirs(im_pose_dir)
    shape_dir = os.path.join(save_dir, 'shape')
    if not os.path.exists(shape_dir):
        os.makedirs(shape_dir)
    im_h_dir = os.path.join(save_dir, 'im_h')
    if not os.path.exists(im_h_dir):
        os.makedirs(im_h_dir)

    # for test data
    print('Dataset size: %05d!' % (len(test_loader.dataset)), flush=True)
    for step, inputs in enumerate(test_loader.data_loader):
        iter_start_time = time.time()

        im_names = inputs['im_name']
        im = inputs['image'].cuda()
        im_pose = inputs['pose_image']
        im_h = inputs['head']
        shape = inputs['shape']
        agnostic = inputs['agnostic'].cuda()
        c = inputs['cloth'].cuda()
        cm = inputs['cloth_mask'].cuda()

        # outputs = model(torch.cat([agnostic, c], 1))  # CP-VTON
        outputs = model(torch.cat([agnostic, c, cm], 1))  # CP-VTON+
        p_rendered, m_composite = torch.split(outputs, 3, 1)
        p_rendered = F.tanh(p_rendered)
        m_composite = F.sigmoid(m_composite)
        p_tryon = c * m_composite + p_rendered * (1 - m_composite)

        visuals = [[im_h, shape, im_pose],
                   [c, 2 * cm - 1, m_composite],
                   [p_rendered, p_tryon, im]]

        save_images(p_tryon, im_names, try_on_dir)
        save_images(im_h, im_names, im_h_dir)
        save_images(shape, im_names, shape_dir)
        save_images(im_pose, im_names, im_pose_dir)
        save_images(m_composite, im_names, m_composite_dir)
        save_images(p_rendered, im_names, p_rendered_dir)

        # For test data
        if (step + 1) % opt.display_count == 0:
            board_add_images(board, 'combine', visuals, step + 1)
            t = time.time() - iter_start_time
            print('step: %8d, time: %.3f' % (step + 1, t), flush=True)
def forward(self, original_value, to_update_value):
    x = torch.cat((original_value, to_update_value), dim=-1)
    update_value = F.tanh(self.update_value(x))
    update_gate = F.sigmoid(self.update_gate(x))
    return original_value * (1 - update_gate) + update_value * update_gate
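# A minimal sketch of the module the forward above could belong to, under
# the assumption that update_value and update_gate are Linear layers mapping
# the concatenated pair back to the value dimension. The class name and
# sizes are illustrative, inferred from the forward, not from the original.
import torch
import torch.nn as nn
import torch.nn.functional as F

class GatedUpdate(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.update_value = nn.Linear(2 * dim, dim)  # assumed layer shapes
        self.update_gate = nn.Linear(2 * dim, dim)

    def forward(self, original_value, to_update_value):
        x = torch.cat((original_value, to_update_value), dim=-1)
        update_value = F.tanh(self.update_value(x))
        update_gate = F.sigmoid(self.update_gate(x))
        # convex combination: the gate chooses between keeping the old
        # value and writing the tanh candidate
        return original_value * (1 - update_gate) + update_value * update_gate

old = torch.randn(2, 8)
new = torch.randn(2, 8)
assert GatedUpdate(8)(old, new).shape == old.shape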
def train_tom(opt, train_loader, model, board):
    model  # .cuda()
    model.train()

    # criterion
    criterionL1 = nn.L1Loss()
    criterionVGG = VGGLoss()
    criterionMask = nn.L1Loss()

    # optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr, betas=(0.5, 0.999))
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda step: 1.0 - max(0, step - opt.keep_step) / float(opt.decay_step + 1))

    for step in range(opt.keep_step + opt.decay_step):
        iter_start_time = time.time()
        inputs = train_loader.next_batch()

        im = inputs['image']  # .cuda()
        im_pose = inputs['pose_image']
        im_h = inputs['head']
        shape = inputs['shape']

        agnostic = inputs['agnostic']  # .cuda()
        c = inputs['cloth']  # .cuda()
        cm = inputs['cloth_mask']  # .cuda()
        pcm = inputs['parse_cloth_mask']  # .cuda()

        # outputs = model(torch.cat([agnostic, c], 1))  # CP-VTON
        outputs = model(torch.cat([agnostic, c, cm], 1))  # CP-VTON+
        p_rendered, m_composite = torch.split(outputs, 3, 1)
        p_rendered = F.tanh(p_rendered)
        m_composite = F.sigmoid(m_composite)
        p_tryon = c * m_composite + p_rendered * (1 - m_composite)

        # visuals = [[im_h, shape, im_pose],
        #            [c, cm * 2 - 1, m_composite * 2 - 1],
        #            [p_rendered, p_tryon, im]]  # CP-VTON
        visuals = [[im_h, shape, im_pose],
                   [c, pcm * 2 - 1, m_composite * 2 - 1],
                   [p_rendered, p_tryon, im]]  # CP-VTON+

        loss_l1 = criterionL1(p_tryon, im)
        loss_vgg = criterionVGG(p_tryon, im)
        # loss_mask = criterionMask(m_composite, cm)  # CP-VTON
        loss_mask = criterionMask(m_composite, pcm)  # CP-VTON+
        loss = loss_l1 + loss_vgg + loss_mask
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (step + 1) % opt.display_count == 0:
            board_add_images(board, 'combine', visuals, step + 1)
            board.add_scalar('metric', loss.item(), step + 1)
            board.add_scalar('L1', loss_l1.item(), step + 1)
            board.add_scalar('VGG', loss_vgg.item(), step + 1)
            board.add_scalar('MaskL1', loss_mask.item(), step + 1)
            t = time.time() - iter_start_time
            print('step: %8d, time: %.3f, loss: %.4f, l1: %.4f, vgg: %.4f, mask: %.4f'
                  % (step + 1, t, loss.item(), loss_l1.item(), loss_vgg.item(), loss_mask.item()),
                  flush=True)

        if (step + 1) % opt.save_count == 0:
            save_checkpoint(
                model,
                os.path.join(opt.checkpoint_dir, opt.name, 'step_%06d.pth' % (step + 1)))
def forward(self, inputs, triples, lengths, elmo_embedding, id2_ids_batch):
    if self.args.pretrain_model_type == 'elmo':
        elmo_inputs = torch.Tensor().cuda()
        for i in range(len(inputs)):
            elmo_input = torch.from_numpy(
                elmo_embedding[' '.join(map(str, inputs[i].cpu().numpy()))].value
            ).type(torch.cuda.FloatTensor)
            try:
                elmo_inputs = torch.cat((elmo_inputs, elmo_input.unsqueeze(dim=0)))
            except Exception:
                elmo_inputs = torch.cat((elmo_inputs, elmo_input.unsqueeze(dim=0)[:, :128, :]), dim=0)
        inputs = elmo_inputs
    else:
        inputs = self.embedding(inputs)

    # Introducing external knowledge in different ways.
    t = torch.zeros(inputs.size(0), self.seq_length,
                    self.input_dim + self.triples_embedding_dim).cuda()
    if self.args.concat_mode == "graph_attention":
        for i in range(len(inputs)):
            b = torch.full([self.seq_length, self.triples_number], -1, dtype=torch.long).cuda()
            bb = torch.zeros(self.seq_length, self.triples_embedding_dim).cuda()
            if torch.equal(id2_ids_batch[i], b):
                # no triples for this sentence: pad with zeros
                t[i] = torch.cat((inputs[i], bb), dim=-1)
            else:
                for k in range(len(id2_ids_batch[i])):
                    c = torch.full([self.triples_number], -1, dtype=torch.long).cuda()
                    cc = torch.zeros(self.triples_embedding_dim).cuda()
                    if torch.equal(id2_ids_batch[i][k], c):
                        t[i][k] = torch.cat((inputs[i][k], cc), dim=-1)
                    else:
                        list1 = torch.Tensor().cuda()
                        list2 = torch.Tensor().cuda()
                        head_id, tail_id, relation_id = torch.chunk(triples[i][k], 3, dim=1)
                        t2 = self.embeddings_entity(head_id).cuda()
                        t21 = self.embeddings_entity(tail_id).cuda()
                        t22 = self.embeddings_relation(relation_id).cuda()
                        head_tail = torch.cat((t2, t21), dim=2)
                        list1 = torch.cat((list1, head_tail), dim=0)
                        list2 = torch.cat((list2, t22), dim=0)
                        head_tail_transformed = self.entity_transformed(list1)
                        head_tail_transformed_final = F.tanh(head_tail_transformed)
                        relation_transformed1 = F.tanh(list2)
                        e_weight = (head_tail_transformed_final * relation_transformed1).sum(dim=2)
                        alpha_weight = F.softmax(e_weight, dim=0)
                        graph_embed = (alpha_weight.unsqueeze(1) * head_tail).sum(dim=0)
                        aa = torch.cat((inputs[i][k], graph_embed.squeeze(0)))
                        t[i][k] = aa
    else:
        for i in range(len(inputs)):
            fused = {}
            b = torch.full([self.seq_length, self.triples_number], -1, dtype=torch.long).cuda()
            bb = torch.zeros(self.seq_length, self.triples_embedding_dim).cuda()
            if torch.equal(id2_ids_batch[i], b):
                t[i] = torch.cat((inputs[i], bb), dim=-1)
            else:
                for k in range(len(id2_ids_batch[i])):
                    a = 0
                    input = torch.Tensor().cuda()
                    c = torch.full([self.triples_number], -1, dtype=torch.long).cuda()
                    cc = torch.zeros(self.triples_embedding_dim).cuda()
                    if torch.equal(id2_ids_batch[i][k], c):
                        t[i][k] = torch.cat((inputs[i][k], cc), dim=-1)
                    else:
                        for j in range(len(id2_ids_batch[i][k])):
                            if id2_ids_batch[i][k][j].cpu().numpy() == 1:
                                inputs_triples = torch.cat(
                                    (inputs[i][k], self.embeddings_entity(triples[i][k][j][1])))
                            elif id2_ids_batch[i][k][j].cpu().numpy() == 2:
                                inputs_triples = torch.cat(
                                    (inputs[i][k], self.embeddings_entity(triples[i][k][j][0])))
                            else:
                                continue
                            if a == 0:
                                a = a + 1
                                input = torch.cat((inputs_triples, input))
                            else:
                                a = a + 1
                                input = input + inputs_triples
                        if a != 0:
                            input = input / a
                            fused[k] = input
                for k in fused:
                    t[i][k] = fused[k]

    # 1. input
    embedded_input = self.dropout_on_input_to_LSTM(t)
    (sorted_input, sorted_lengths, input_unsort_indices, _) = sort_batch_by_length(embedded_input, lengths)
    packed_input = pack_padded_sequence(sorted_input, sorted_lengths.data.tolist(), batch_first=True)
    packed_sorted_output, _ = self.rnn(packed_input)
    sorted_output, _ = pad_packed_sequence(packed_sorted_output, batch_first=True)
    output = sorted_output[input_unsort_indices]

    # 2. use attention
    if self.args.attention_layer == 'att':
        attention_logits = self.attention_weights(output).squeeze(-1)
        mask_attention_logits = (attention_logits != 0).type(
            torch.cuda.FloatTensor if inputs.is_cuda else torch.FloatTensor)
        softmax_attention_logits = last_dim_softmax(attention_logits, mask_attention_logits)
        softmax_attention_logits0 = softmax_attention_logits.unsqueeze(dim=1)
        input_encoding = torch.bmm(softmax_attention_logits0, output)
        input_encoding0 = input_encoding.squeeze(dim=1)
    else:
        input_encoding = torch.Tensor().cuda()
        querys = self.query_embedding(torch.arange(0, self.args.num_classes, 1).cuda())
        attention_weights = torch.Tensor(self.args.num_classes, len(output), len(output[0])).cuda()
        for i in range(self.args.num_classes):
            attention_logits = self.proquery_weights_mp(output)
            attention_logits = torch.bmm(
                attention_logits,
                querys[i].unsqueeze(dim=1).repeat(len(output), 1, 1)).squeeze(dim=-1)
            mask_attention_logits = (attention_logits != 0).type(
                torch.cuda.FloatTensor if inputs.is_cuda else torch.FloatTensor)
            softmax_attention_logits = last_dim_softmax(attention_logits, mask_attention_logits)
            input_encoding_part = torch.bmm(softmax_attention_logits.unsqueeze(dim=1), output)
            input_encoding = torch.cat((input_encoding, input_encoding_part.squeeze(dim=1)), dim=-1)
            attention_weights[i] = softmax_attention_logits

    # 3. run linear layer
    if self.args.attention_layer == 'att':
        input_encodings = self.dropout_on_input_to_linear_layer(input_encoding0)
        unattized_output = self.output_projection(input_encodings)
        output_distribution = F.log_softmax(unattized_output, dim=-1)
        return output_distribution, softmax_attention_logits.squeeze(dim=1)
    else:
        input_encodings = self.dropout_on_input_to_linear_layer(input_encoding)
        unattized_output = self.multi_output_projection(input_encodings)
        output_distribution = F.log_softmax(unattized_output, dim=-1)
        cos = torch.nn.CosineSimilarity(dim=0, eps=1e-16)
        attention_loss = abs(cos(querys[0], querys[1])) + abs(cos(querys[1], querys[2])) \
            + abs(cos(querys[0], querys[2]))
        return output_distribution, attention_weights, attention_loss
def forward(self, x):
    x = x.to(self.device)
    x = F.relu(self.fc1(x))
    x = F.tanh(self.fc2(x))  # [-1, 1]
    return x.cpu().data
def forward(self, input):
    return F.tanh(self.fc(input)) * F.sigmoid(self.gate_fc(input))
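# A usage sketch for the gated activation above (the tanh * sigmoid gating
# familiar from PixelCNN/WaveNet-style blocks). fc and gate_fc are assumed
# to be Linear layers of equal output size; the class name and sizes are
# illustrative, not from the original.
import torch
import torch.nn as nn
import torch.nn.functional as F

class GatedTanh(nn.Module):
    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.fc = nn.Linear(in_dim, out_dim)       # content path
        self.gate_fc = nn.Linear(in_dim, out_dim)  # gate path

    def forward(self, input):
        return F.tanh(self.fc(input)) * F.sigmoid(self.gate_fc(input))

y = GatedTanh(16, 8)(torch.randn(4, 16))
assert y.shape == (4, 8)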
def first_pooler(x, w, b, train, dropout_prob):
    # BERT-style pooler: take the hidden state of the first token
    # (train and dropout_prob are accepted for interface parity but unused)
    x = x[:, 0]
    x = F.linear(x, w, b)
    x = F.tanh(x)
    return x
def forward(self, input, mapping_layers=[]):
    seg = input
    ret_acts = {}

    x = F.interpolate(seg, size=(self.sh, self.sw))
    x = self.fc(x)
    x = self.fc_norm(x)
    if 'fc' in mapping_layers:
        ret_acts['fc'] = x

    x = self.head_0(x, seg)
    if 'head_0' in mapping_layers:
        ret_acts['head_0'] = x

    x = self.up(x)
    x = self.G_middle_0(x, seg)
    if 'G_middle_0' in mapping_layers:
        ret_acts['G_middle_0'] = x

    if self.opt.num_upsampling_layers == 'more' or \
       self.opt.num_upsampling_layers == 'most':
        x = self.up(x)

    x = self.G_middle_1(x, seg)
    if 'G_middle_1' in mapping_layers:
        ret_acts['G_middle_1'] = x

    x = self.up(x)
    x = self.up_0(x, seg)
    if 'up_0' in mapping_layers:
        ret_acts['up_0'] = x
    x = self.up(x)
    x = self.up_1(x, seg)
    if 'up_1' in mapping_layers:
        ret_acts['up_1'] = x
    x = self.up(x)
    x = self.up_2(x, seg)
    if 'up_2' in mapping_layers:
        ret_acts['up_2'] = x
    x = self.up(x)
    x = self.up_3(x, seg)
    if 'up_3' in mapping_layers:
        ret_acts['up_3'] = x

    if self.opt.num_upsampling_layers == 'most':
        x = self.up(x)
        x = self.up_4(x, seg)
        if 'up_4' in mapping_layers:
            ret_acts['up_4'] = x

    x = self.conv_img(F.leaky_relu(x, 2e-1))
    x = F.tanh(x)

    if len(mapping_layers) == 0:
        return x
    else:
        return x, ret_acts
def forward(self, X):
    h = F.tanh(self.linear1(X))
    h = self.dropout(h)
    h = F.tanh(self.linear2(h))
    h = self.dropout(h)
    return F.sigmoid(self.linear3(h))
def forward(self, input_seq, last_hidden, encoder_outputs):
    '''
    :param input_seq: (B,)
    :param last_hidden: a tuple of two elems; (num_layers, batch, hidden_size)
    :param encoder_outputs: (seq_len, batch, hidden_size * num_directions); num_dir = 1
    :return:
    '''
    # Note: we run this one step at a time
    # Get the embedding of the current input word (last output word)
    max_len = encoder_outputs.size(0)
    batch_size = input_seq.size(0)
    encoder_outputs = encoder_outputs.transpose(0, 1)  # shape (B, max_len, H*num_dir)
    word_embedded = self.embedding(input_seq)  # S=1 x B x N; the leading 1 is not there yet, needs unsqueeze()
    word_embedded = self.embedding_dropout(word_embedded)

    # ATTENTION CALCULATION; last_hidden is (h_n, c_n)
    s_t = last_hidden[0][-1].unsqueeze(0)  # shape (1, B, H)
    H = s_t.repeat(max_len, 1, 1).transpose(0, 1)  # shape (B, max_len, H)

    energy = F.tanh(self.W1(torch.cat([H, encoder_outputs], 2)))  # (B, max_len, H)
    energy = energy.transpose(2, 1)  # (B, H, max_len)

    v = self.v.repeat(encoder_outputs.data.shape[0], 1).unsqueeze(1)  # [B, 1, H]
    p_ptr = torch.bmm(v, energy)  # [B, 1, T]

    a = F.softmax(p_ptr, dim=-1)
    context = a.bmm(encoder_outputs)  # [B, 1, T] * [B, T, H] ---> [B, 1, H]

    # Combine embedded input word and attended context, run through RNN
    # (1, B, 2*H)
    # TODO: for the case of B = 1
    rnn_input = torch.cat((word_embedded, context.squeeze(1)), 1).unsqueeze(0)

    '''
    Inputs: input, (h_0, c_0)
        - input (seq_len, batch, input_size): tensor containing the features of the
          input sequence. The input can also be a packed variable-length sequence.
          See torch.nn.utils.rnn.pack_padded_sequence for details.
        - h_0 (num_layers * num_directions, batch, hidden_size): tensor containing
          the initial hidden state for each element in the batch.
        - c_0 (num_layers * num_directions, batch, hidden_size): tensor containing
          the initial cell state for each element in the batch.
          If (h_0, c_0) is not provided, both h_0 and c_0 default to zero.

    Outputs: output, (h_n, c_n)
        - output (seq_len, batch, hidden_size * num_directions): tensor containing
          the output features (h_t) from the last layer of the RNN, for each t.
          If a torch.nn.utils.rnn.PackedSequence has been given as the input,
          the output will also be a packed sequence.
        - h_n (num_layers * num_directions, batch, hidden_size): tensor containing
          the hidden state for t = seq_len.
        - c_n (num_layers * num_directions, batch, hidden_size): tensor containing
          the cell state for t = seq_len.
    '''
    # output shape (1, B, H); why is there no squeeze here??
    output, hidden = self.lstm(rnn_input, last_hidden)

    p_vocab = self.U(output)  # (1, B, out_size)
    gate = F.sigmoid(self.W(hidden[0][-1]))  # (B, 1)

    # returns: (B, 1, T), (1, B, out_size), (B, 1), and a tuple of two elems,
    # each (num_layers, batch, hidden_size)
    return p_ptr, p_vocab, gate, hidden
def forward(self, x):
    x = self.nonlin(self.fc1(x))
    x = self.nonlin(self.fc2(x))
    x = self.fc3(x)
    return F.tanh(x)
def vectorize_question(args, batch, model, vocab_map, embeddings, padding_id):
    if args.model == 'lstm':
        lstm = model
    else:
        cnn = model

    titles, bodies, triples = batch
    title_length, title_num_questions = titles.shape
    body_length, body_num_questions = bodies.shape
    title_embeddings, body_embeddings = corpus.get_embeddings(titles, bodies, vocab_map, embeddings)

    # title
    if args.model == 'lstm':
        if args.cuda:
            title_inputs = [autograd.Variable(torch.FloatTensor(title_embeddings).cuda())]
            title_inputs = torch.cat(title_inputs).view(title_length, title_num_questions, -1)
            title_hidden = (autograd.Variable(torch.zeros(1, title_num_questions, args.hidden_size).cuda()),
                            autograd.Variable(torch.zeros((1, title_num_questions, args.hidden_size)).cuda()))
        else:
            title_inputs = [autograd.Variable(torch.FloatTensor(title_embeddings))]
            title_inputs = torch.cat(title_inputs).view(title_length, title_num_questions, -1)
            title_hidden = (autograd.Variable(torch.zeros(1, title_num_questions, args.hidden_size)),
                            autograd.Variable(torch.zeros((1, title_num_questions, args.hidden_size))))
    else:
        if args.cuda:
            title_inputs = [autograd.Variable(torch.FloatTensor(title_embeddings).cuda())]
        else:
            title_inputs = [autograd.Variable(torch.FloatTensor(title_embeddings))]
        title_inputs = torch.cat(title_inputs).transpose(0, 1).transpose(1, 2)

    if args.model == 'lstm':
        title_out, title_hidden = lstm(title_inputs, title_hidden)
    else:
        title_out = cnn(title_inputs)
        title_out = F.tanh(title_out)
        title_out = title_out.transpose(1, 2).transpose(0, 1)

    # average all words of each question from title_out
    # title_out: (max sequence length) x (batch size) x (hidden size)
    average_title_out = average_questions(title_out, titles, padding_id)

    # body
    if args.model == 'lstm':
        if args.cuda:
            body_inputs = [autograd.Variable(torch.FloatTensor(body_embeddings).cuda())]
            body_inputs = torch.cat(body_inputs).view(body_length, body_num_questions, -1)
            body_hidden = (autograd.Variable(torch.zeros(1, body_num_questions, args.hidden_size).cuda()),
                           autograd.Variable(torch.zeros((1, body_num_questions, args.hidden_size)).cuda()))
        else:
            body_inputs = [autograd.Variable(torch.FloatTensor(body_embeddings))]
            body_inputs = torch.cat(body_inputs).view(body_length, body_num_questions, -1)
            body_hidden = (autograd.Variable(torch.zeros(1, body_num_questions, args.hidden_size)),
                           autograd.Variable(torch.zeros((1, body_num_questions, args.hidden_size))))
    else:
        if args.cuda:
            body_inputs = [autograd.Variable(torch.FloatTensor(body_embeddings).cuda())]
        else:
            body_inputs = [autograd.Variable(torch.FloatTensor(body_embeddings))]
        body_inputs = torch.cat(body_inputs).transpose(0, 1).transpose(1, 2)

    if args.model == 'lstm':
        body_out, body_hidden = lstm(body_inputs, body_hidden)
    else:
        body_out = cnn(body_inputs)
        body_out = F.tanh(body_out)
        body_out = body_out.transpose(1, 2).transpose(0, 1)

    average_body_out = average_questions(body_out, bodies, padding_id)

    # average the body and title representations of the questions
    hidden = (average_title_out + average_body_out) * 0.5
    return hidden
def forward(self, input):
    out = F.leaky_relu(self.fc1(input), LEAK)
    out = F.leaky_relu(self.fc2(out), LEAK)
    out = F.leaky_relu(self.fc3(out), LEAK)
    out = F.tanh(self.fc4(out))
    return out
def forward(self, input, context):
    """
    input (FloatTensor): batch x tgt_len x dim: decoder's rnn's output.
    context (FloatTensor): batch x src_len x dim: src hidden states
    """
    if isinstance(context, tuple):
        context, tree_context = context

    # one step input
    if input.dim() == 2:
        one_step = True
        input = input.unsqueeze(1)
    else:
        one_step = False

    batch, sourceL, dim = context.size()
    batch_, targetL, dim_ = input.size()
    aeq(batch, batch_)
    aeq(dim, dim_)
    aeq(self.dim, dim)

    # compute attention scores, as in Luong et al.
    align = self.score(input, context)

    if self.mask is not None:
        mask_ = self.mask[:, None, :]
        align.data.masked_fill_(mask_, -math.inf)

    # Softmax to normalize attention weights
    align_vectors = F.softmax(align, dim=-1)

    # each context vector c_t is the weighted average over all the source hidden states
    c = torch.bmm(align_vectors, context)

    if self.multi_key:
        if self.share_attn:
            # sharing attention weight
            sc = torch.bmm(align_vectors, tree_context)
        else:
            # computing attention scores for syntax
            tree_align = self.score(input, tree_context, True)
            if self.mask is not None:
                tree_align.data.masked_fill_(self.mask[:, None, :], -math.inf)
            tree_align_vectors = F.softmax(tree_align, dim=-1)
            sc = torch.bmm(tree_align_vectors, tree_context)

        z = F.sigmoid(self.gate(input))  # batch x tgt_len x dim
        self.z = z  # for visualization
        sc = sc * z
        concat_c = torch.cat([c, input, sc], 2).view(batch * targetL, dim * 3)
    else:
        concat_c = torch.cat([c, input], 2).view(batch * targetL, dim * 2)

    attn_h = self.linear_out(concat_c).view(batch, targetL, dim)
    attn_h = F.tanh(attn_h)

    if one_step:
        attn_h = attn_h.squeeze(1)
        align_vectors = align_vectors.squeeze(1)

        # Check output sizes
        batch_, dim_ = attn_h.size()
        aeq(batch, batch_)
        aeq(dim, dim_)
        batch_, sourceL_ = align_vectors.size()
        aeq(batch, batch_)
        aeq(sourceL, sourceL_)
    else:
        attn_h = attn_h.transpose(0, 1).contiguous()
        align_vectors = align_vectors.transpose(0, 1).contiguous()

        # Check output sizes
        targetL_, batch_, dim_ = attn_h.size()
        aeq(targetL, targetL_)
        aeq(batch, batch_)
        aeq(dim, dim_)
        targetL_, batch_, sourceL_ = align_vectors.size()
        aeq(targetL, targetL_)
        aeq(batch, batch_)
        aeq(sourceL, sourceL_)

    return attn_h, align_vectors
def forward(self, lefts, rights, tracking=None):
    batch_size = len(lefts)
    ret = torch.cat(lefts, 0) + F.tanh(torch.cat(rights, 0))
    return torch.chunk(ret, batch_size, 0)
def forward(self, state):
    x = F.relu(self.bn1(self.fc1(state)))
    x = F.relu(self.fc2(x))
    return F.tanh(self.fc3(x))
def forward(self, x):
    s = F.sigmoid(x)
    t = F.tanh(x)
    result = t + s
    return result
def forward(self, x, fusions):
    r_f = torch.cat([x, fusions], 2)
    r = F.tanh(self.linear_r(r_f))
    g = F.sigmoid(self.linear_g(r_f))
    o = g * r + (1 - g) * x
    return o
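# A minimal sketch of the fusion unit the forward above implements:
# linear_r produces a fused candidate, linear_g a gate that interpolates
# between the candidate and the original input x. The class name and
# dimensions are illustrative assumptions, not from the original.
import torch
import torch.nn as nn
import torch.nn.functional as F

class FusionUnit(nn.Module):
    def __init__(self, dim, fusion_dim):
        super().__init__()
        self.linear_r = nn.Linear(dim + fusion_dim, dim)  # fused candidate
        self.linear_g = nn.Linear(dim + fusion_dim, dim)  # gate

    def forward(self, x, fusions):
        r_f = torch.cat([x, fusions], 2)
        r = F.tanh(self.linear_r(r_f))
        g = F.sigmoid(self.linear_g(r_f))
        return g * r + (1 - g) * x

x = torch.randn(2, 5, 32)        # (batch, seq, dim)
fusions = torch.randn(2, 5, 32)
assert FusionUnit(32, 32)(x, fusions).shape == x.shape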
def forward(self, state):
    """Build an actor (policy) network that maps states -> actions."""
    x = state
    for i_f, f in enumerate(self.hidden):
        x = F.relu(f(x)) if i_f < len(self.hidden) - 1 else f(x)
    return F.tanh(x)
def forward(self, x):
    x = F.tanh(self.affine1(x))
    x = F.tanh(self.affine2(x))
    state_values = self.value_head(x)
    return state_values
def forward_glimpse_clouds(self, final_fm, pose_fm):
    # Size of the feature maps
    B, D, T, W, H = final_fm.size()

    # For storing attention weights of the workers
    self.list_attention_worker = [[] for _ in range(self.nb_glimpses)]

    # List of attention points
    list_v = []
    list_attention_points_glimpses = []

    # Init the hidden state of the zoomer
    h = torch.zeros(1, B, self.rnn_zoomer_size)
    h = h.cuda() if CUDA else h

    # Init the hidden state of the workers
    list_r = [torch.zeros(1, B, int(D / 4.)) for _ in range(self.nb_workers)]
    list_r = [x.cuda() if CUDA else x for x in list_r]

    # Loop over time
    list_logits = []
    for t in range(T):
        # Extract the feature maps and the pose features
        final_fm_t, pose_fm_t = final_fm[:, :, t], pose_fm[:, :, t]  # (B, 2048, 7, 7) - (B, 1024, 14, 14)
        c = self.avgpool_14x14(pose_fm_t).view(B, int(D / 2.))  # (B, 1024)

        # Hidden state of the workers
        r_all_workers = list_r[0]
        for r_w in list_r[1:]:
            r_all_workers = r_all_workers + r_w
        r_all_workers = r_all_workers.transpose(0, 1)  # (B, 1, D/4)

        # Loop over the glimpses
        for g in range(self.nb_glimpses):
            # Input of the RNN zoomer
            input_loc_params = torch.cat([c, h.view(B, int(D / 4.))], 1)  # (B, 1536)

            # Estimate (x, y, scale_x, scale_y) of the glimpse
            loc = self.mlp_glimpse_location(input_loc_params)  # (B, 4)
            loc_xy = F.tanh(loc[:, :2])  # to make sure it is between -1 and 1
            loc_zooms = F.sigmoid(loc[:, 2:] + 3.)  # between 0 and 1; +3 starts with a zoom ~ 1

            # Extract the corresponding feature map with a Spatial Transformer
            Z = zoom_ST(final_fm_t, loc_xy, loc_zooms, W, H, CUDA)  # (B, 2048, 7, 7)

            # Get the visual and location features and finally append
            z = self.avgpool_7x7(Z).view(B, D)  # (B, 2048)
            v = z * self.mlp_embedding_location(loc)  # (B, 2048)

            # Store glimpse features and attention points
            list_v.append(v)
            list_attention_points_glimpses.append(torch.cat([loc_xy, loc_zooms], 1))

            # Update the zoomer
            _, h = self.rnn_zoomer(torch.cat([v.view(B, 1, D), r_all_workers], 2), h)

        # Compute the similarity matrix
        all_v = torch.stack(list_v, 1).view(B, t + 1, self.nb_glimpses, D)  # (B, t, C, D)
        similarity_matrix = self.compute_similarity_matrix(all_v)

        # Create the input for each worker and distribute the features over the workers
        list_v_tild = []
        for w in range(self.nb_workers):
            # Get the input for the worker
            input_worker = self.get_worker_input(similarity_matrix, all_v, w, t)
            list_v_tild.append(input_worker)

            # Catch the worker and its previous hidden state
            rnn, hidden = self.list_worker[w], list_r[w]

            # Run the rnn
            out, hidden = rnn(input_worker.unsqueeze(1), hidden)

            # Update the list of hidden states
            list_r[w] = hidden

            # And finally classify
            fc = self.list_fc[w]
            logits = fc(out.view(B, int(D / 4.)))
            list_logits.append(logits)

    # Stack
    all_logits = torch.stack(list_logits, 1)  # (B, T, 60)

    # Average the logits
    logits = torch.mean(all_logits, 1)  # (B, 60)

    # Stack attention points
    attention_points_glimpses = torch.stack(list_attention_points_glimpses, 1).view(
        B, T, self.nb_glimpses, 4)

    return logits, attention_points_glimpses
def score(self, query, key):
    input = tr.cat([query, key], dim=-1)
    return self.linear2(F.tanh(self.linear1(input)))
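# A usage sketch for the score above (a two-layer additive scorer over a
# [query; key] concatenation). The class name and layer sizes are
# illustrative assumptions; `tr` is assumed to alias torch, as in the
# original snippet.
import torch as tr
import torch.nn as nn
import torch.nn.functional as F

class AdditiveScorer(nn.Module):
    def __init__(self, dim, hidden):
        super().__init__()
        self.linear1 = nn.Linear(2 * dim, hidden)
        self.linear2 = nn.Linear(hidden, 1)

    def score(self, query, key):
        input = tr.cat([query, key], dim=-1)
        return self.linear2(F.tanh(self.linear1(input)))

s = AdditiveScorer(64, 32)
scores = s.score(tr.randn(8, 64), tr.randn(8, 64))
assert scores.shape == (8, 1)  # one scalar score per query-key pair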
def forward(self, xes, hidden, attn_params):
    """Compute attention over attn_params given input and hidden states.

    :param xes:         input state. will be combined with applied attention.
    :param hidden:      hidden state from model. will be used to select states
                        to attend to in from the attn_params.
    :param attn_params: tuple of encoder output states and a mask showing which
                        input indices are nonzero.

    :returns: output, attn_weights
              output is a new state of same size as input state `xes`.
              attn_weights are the weights given to each state in the
              encoder outputs.
    """
    if self.attention == 'none':
        # do nothing, no attention
        return xes, None

    if type(hidden) == tuple:
        # for lstms use the "hidden" state not the cell state
        hidden = hidden[0]
    last_hidden = hidden[-1]  # select hidden state from last RNN layer

    enc_out, attn_mask = attn_params
    bsz, seqlen, hszXnumdir = enc_out.size()
    numlayersXnumdir = last_hidden.size(1)

    if self.attention == 'local':
        # local attention weights aren't based on encoder states
        h_merged = torch.cat((xes.squeeze(1), last_hidden), 1)
        attn_weights = F.softmax(self.attn(h_merged), dim=1)

        # adjust state sizes to the fixed window size
        if seqlen > self.max_length:
            offset = seqlen - self.max_length
            enc_out = enc_out.narrow(1, offset, self.max_length)
            seqlen = self.max_length
        if attn_weights.size(1) > seqlen:
            attn_weights = attn_weights.narrow(1, 0, seqlen)
    else:
        hid = last_hidden.unsqueeze(1)
        if self.attention == 'concat':
            # concat hidden state and encoder outputs
            hid = hid.expand(bsz, seqlen, numlayersXnumdir)
            h_merged = torch.cat((enc_out, hid), 2)
            # then do linear combination of them with activation
            active = F.tanh(self.attn(h_merged))
            attn_w_premask = self.attn_v(active).squeeze(2)
        elif self.attention == 'dot':
            # dot product between hidden and encoder outputs
            if numlayersXnumdir != hszXnumdir:
                # enc_out has two directions, so double hid
                hid = torch.cat([hid, hid], 2)
            enc_t = enc_out.transpose(1, 2)
            attn_w_premask = torch.bmm(hid, enc_t).squeeze(1)
        elif self.attention == 'general':
            # before doing dot product, transform hidden state with linear
            # same as dot if linear is identity
            hid = self.attn(hid)
            enc_t = enc_out.transpose(1, 2)
            attn_w_premask = torch.bmm(hid, enc_t).squeeze(1)

        # calculate activation scores, apply mask if needed
        if attn_mask is not None:
            # remove activation from NULL symbols
            attn_w_premask.masked_fill_((1 - attn_mask), -NEAR_INF)
        attn_weights = F.softmax(attn_w_premask, dim=1)

    # apply the attention weights to the encoder states
    attn_applied = torch.bmm(attn_weights.unsqueeze(1), enc_out)

    # concatenate the input and encoder states
    merged = torch.cat((xes.squeeze(1), attn_applied.squeeze(1)), 1)

    # combine them with a linear layer and tanh activation
    output = torch.tanh(self.attn_combine(merged).unsqueeze(1))

    return output, attn_weights
def forward(self, rep1, len1, mask1, rep2, len2):
    # Compute context vectors using attention.
    def context_vector(h_t):
        WhH = torch.matmul(h_t, self.Wh)

        # Use mask to ignore the outputs of the padding part in premise
        shape = WhH.size()
        WhH = WhH.view(shape[0], 1, shape[1])
        WhH = WhH.expand(shape[0], max_seq_len, shape[1])

        M1 = mask1.type(self.float_type)
        shape = M1.size()
        M = M1.view(shape[0], shape[1], 1).type(self.float_type)
        M = M.expand(shape[0], shape[1], self.lstm_size)
        WhH = WhH * M

        M = torch.tanh(WyY + WhH)
        aW = self.aW.view(1, 1, -1)
        aW = aW.expand(batch_size, max_seq_len, aW.size()[2])

        # Compute batch dot: the first step of a softmax
        batch_dot = M * aW
        batch_dot = torch.sum(batch_dot, 2)

        # Avoid overflow
        max_by_column, _ = torch.max(batch_dot, 1)
        max_by_column = max_by_column.view(-1, 1)
        max_by_column = max_by_column.expand(max_by_column.size()[0], max_seq_len)
        batch_dot = torch.exp(batch_dot - max_by_column) * M1

        # Partition function and attention:
        # the second step of a softmax, use mask to ignore the padding
        partition = torch.sum(batch_dot, 1)
        partition = partition.view(-1, 1)
        partition = partition.expand(partition.size()[0], max_seq_len)
        attention = batch_dot / partition

        # compute context vector
        shape = attention.size()
        attention = attention.view(shape[0], shape[1], 1)
        attention = attention.expand(shape[0], shape[1], self.lstm_size)
        cv_t = outputs_1 * attention
        cv_t = torch.sum(cv_t, 1)
        return cv_t

    # ################# Forward Propagation code ###################
    # Run the two LSTMs, compute the context vectors, compute the final
    # representation of the sentence pair, and run it through the fully
    # connected layer, then through the softmax layer.

    rep = torch.cat((rep1, rep2), 0)

    # Representation for input sentences
    batch_size = rep1.size()[0]
    sents = self.embedding(rep)
    (sents_premise, sents_hypothesis) = torch.split(sents, batch_size)

    # (sequence length * batch size * feature size)
    sents_premise = sents_premise.transpose(1, 0)
    sents_hypothesis = sents_hypothesis.transpose(1, 0)

    # Initialize hidden states and cell states
    (hx, cx) = self.init_hidden(batch_size)
    hx = hx.view(batch_size, -1)
    cx = cx.view(batch_size, -1)
    hidden = (hx, cx)

    # Output of LSTM: sequence (length x mini batch x lstm size)
    outp = []
    hidden_states = []
    for inp in range(sents_premise.size(0)):
        hidden = self.lstm1(sents_premise[inp], hidden)
        outp += [hidden[0]]
        hidden_states += [hidden[1]]
    outp = torch.stack(outp).transpose(0, 1)
    len1 = (len1 - 1).view(-1, 1, 1).expand(outp.size(0), 1, outp.size(2))
    out = torch.gather(outp, 1, len1).transpose(1, 0)

    hidden_states = torch.stack(hidden_states).transpose(0, 1)
    hidden_state = torch.gather(hidden_states, 1, len1).transpose(1, 0)

    lstm_outs, hidden_hypothesis = self.lstm2(sents_hypothesis, (out, hidden_state))
    lstm_outs = lstm_outs.transpose(0, 1)
    len2 = (len2 - 1).view(-1, 1, 1).expand(lstm_outs.size(0), 1, lstm_outs.size(2))
    lstm_out = torch.gather(lstm_outs, 1, len2)
    lstm_out = lstm_out.view(lstm_out.size(0), -1)

    #############################################
    outputs_1 = lstm_outs
    max_seq_len = rep1.size()[1]
    WyY = torch.matmul(outputs_1, self.Wy)
    context_vec = context_vector(lstm_out)
    final = torch.tanh(torch.matmul(context_vec, self.Wp) + torch.matmul(lstm_out, self.Wh))
    #############################################

    # Concatenate premise and hypothesis representations
    final = F.dropout(final, p=self.drop_out)

    # Output of fully connected layers
    fc_out = F.dropout(F.tanh(self.linear1(lstm_out)), p=self.drop_out)

    # Output of Softmax
    fc_out = self.linear2(fc_out)
    return F.log_softmax(fc_out, dim=1)
def forward(self, input, z=None):
    seg = input

    if self.opt.use_vae:
        # we sample z from unit normal and reshape the tensor
        if z is None:
            z = torch.randn(input.size(0), self.opt.z_dim,
                            dtype=torch.float32, device=input.get_device())
        x = self.fc(z)
        x = x.view(-1, 16 * self.opt.ngf, self.sh, self.sw)
    else:
        # we downsample segmap and run convolution
        x = F.interpolate(seg, size=(self.sh, self.sw))
        x = self.fc(x)

    # encode segmentation labels
    seg1 = self.labelenc1(seg)    # 256
    seg2 = self.labelenc2(seg1)   # 128
    seg3 = self.labelenc3(seg2)   # 64
    seg4 = self.labelenc4(seg3)   # 32
    seg5 = self.labelenc5(seg4)   # 16
    seg6 = self.labelenc6(seg5)   # 8
    if self.num_upsampling_layers == 'more':
        seg7 = self.labelenc7(seg6)
        segout1 = seg7
        segout2 = self.up(segout1) + self.labellat1(seg6)
        segout2 = self.labeldec1(segout2)
        segout3 = self.up(segout2) + self.labellat2(seg5)
        segout3 = self.labeldec2(segout3)
        segout4 = self.up(segout3) + self.labellat3(seg4)
        segout4 = self.labeldec3(segout4)
        segout5 = self.up(segout4) + self.labellat4(seg3)
        segout5 = self.labeldec4(segout5)
        segout6 = self.up(segout5) + self.labellat5(seg2)
        segout6 = self.labeldec5(segout6)
        segout7 = self.up(segout6) + self.labellat6(seg1)
        segout7 = self.labeldec6(segout7)
    else:
        segout1 = seg6
        segout2 = self.up(segout1) + self.labellat1(seg5)
        segout2 = self.labeldec1(segout2)
        segout3 = self.up(segout2) + self.labellat2(seg4)
        segout3 = self.labeldec2(segout3)
        segout4 = self.up(segout3) + self.labellat3(seg3)
        segout4 = self.labeldec3(segout4)
        segout5 = self.up(segout4) + self.labellat4(seg2)
        segout5 = self.labeldec4(segout5)
        segout6 = self.up(segout5) + self.labellat5(seg1)
        segout6 = self.labeldec5(segout6)

    x = self.head_0(x, torch.cat((F.interpolate(seg, size=x.size()[2:], mode='nearest'), segout1), dim=1))  # 8

    x = self.up(x)
    x = self.G_middle_0(x, torch.cat((F.interpolate(seg, size=x.size()[2:], mode='nearest'), segout2), dim=1))  # 16

    if self.num_upsampling_layers == 'more':
        x = self.up(x)
        x = self.G_middle_1(x, torch.cat((F.interpolate(seg, size=x.size()[2:], mode='nearest'), segout3), dim=1))
    else:
        x = self.G_middle_1(x, torch.cat((F.interpolate(seg, size=x.size()[2:], mode='nearest'), segout2), dim=1))  # 16

    x = self.up(x)
    if self.num_upsampling_layers == 'more':
        x = self.up_0(x, torch.cat((F.interpolate(seg, size=x.size()[2:], mode='nearest'), segout4), dim=1))  # 32
    else:
        x = self.up_0(x, torch.cat((F.interpolate(seg, size=x.size()[2:], mode='nearest'), segout3), dim=1))  # 32

    x = self.up(x)
    if self.num_upsampling_layers == 'more':
        x = self.up_1(x, torch.cat((F.interpolate(seg, size=x.size()[2:], mode='nearest'), segout5), dim=1))  # 64
    else:
        x = self.up_1(x, torch.cat((F.interpolate(seg, size=x.size()[2:], mode='nearest'), segout4), dim=1))  # 64

    x = self.up(x)
    if self.num_upsampling_layers == 'more':
        x = self.up_2(x, torch.cat((F.interpolate(seg, size=x.size()[2:], mode='nearest'), segout6), dim=1))  # 128
    else:
        x = self.up_2(x, torch.cat((F.interpolate(seg, size=x.size()[2:], mode='nearest'), segout5), dim=1))  # 128

    x = self.up(x)
    if self.num_upsampling_layers == 'more':
        x = self.up_3(x, torch.cat((F.interpolate(seg, size=x.size()[2:], mode='nearest'), segout7), dim=1))  # 256
    else:
        x = self.up_3(x, torch.cat((F.interpolate(seg, size=x.size()[2:], mode='nearest'), segout6), dim=1))  # 256

    x = self.conv_img(F.leaky_relu(x, 2e-1))
    x = F.tanh(x)

    return x
def forward(self, x):
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    out = F.tanh(self.out(x))
    return out
def forward(self, low, high):
    low = F.leaky_relu(self.low_conv1(low), negative_slope=0.05)
    low = F.leaky_relu(self.low_conv2(low), negative_slope=0.05)
    low = self.low_block1(low)
    low = F.leaky_relu(self.low_down1(low), negative_slope=0.05)
    low = self.low_channel_wise(low)
    # low = self.low_spatial_wise(low)
    low = self.low_block2(low)
    low = F.leaky_relu(self.low_down2(low), negative_slope=0.05)
    low = self.low_channel_wise2(low)
    low = self.low_block3(low)
    low = F.leaky_relu(self.low_down3(low), negative_slope=0.05)
    low = self.low_channel_wise3(low)
    low = self.low_block4(low)
    low = F.leaky_relu(self.low_down4(low), negative_slope=0.05)
    low = self.low_channel_wise4(low)

    high = F.leaky_relu(self.high_conv1(high), negative_slope=0.05)
    high = F.leaky_relu(self.high_conv2(high), negative_slope=0.05)
    high = self.high_block1(high)
    high = F.leaky_relu(self.high_down1(high), negative_slope=0.05)
    high = self.high_channel_wise(high)
    # high = self.high_spatial_wise(high)
    high = self.high_block2(high)
    high = F.leaky_relu(self.high_down2(high), negative_slope=0.05)
    high = self.high_channel_wise2(high)
    high = self.high_block3(high)
    high = F.leaky_relu(self.high_down3(high), negative_slope=0.05)
    high = self.high_channel_wise3(high)
    high = self.high_block4(high)
    high = F.leaky_relu(self.high_down4(high), negative_slope=0.05)
    high = self.high_channel_wise4(high)

    lstm_input = torch.cat([low, high], 1)
    fuse = self.fuse(lstm_input)

    # run a convolutional LSTM over the fused features for 10 steps
    h = torch.zeros(low.size(0), 32, low.size(2), low.size(3)).type(torch.cuda.FloatTensor)
    c = torch.zeros(low.size(0), 32, low.size(2), low.size(3)).type(torch.cuda.FloatTensor)
    lstm_seq = []
    for _ in range(10):
        z = torch.cat([lstm_input, h], 1)
        i = self.conv_i(z)
        f = self.conv_f(z)
        g = self.conv_g(z)
        o = self.conv_o(z)
        c = f * c + i * g
        h = o * F.tanh(c)
        output_lstm = self.conv_lstm_output(h)
        lstm_seq.append(output_lstm)

    final = fuse + lstm_seq[len(lstm_seq) - 1]
    # return final, lstm_seq[len(lstm_seq) - 1]
    return final