def forward(self, x, y, z):
    """Classify ``x`` with two conv/pool stages followed by two linear layers.

    Args:
        x: Input batch fed to ``self.conv1``.
        y: Accepted for interface compatibility; not used here.
        z: Accepted for interface compatibility; not used here.

    Returns:
        A 3-tuple of log-probability tensors. All three are computed from
        the same logits, so they hold identical values.
    """
    x = F.relu(F.max_pool2d(self.conv1(x), 2))
    x = F.relu(F.max_pool2d(self.conv2(x), 2))
    # Flatten every sample to the 1600 features expected by fc1.
    x = x.view(-1, 1600)
    x = F.relu(self.fc1(x))
    x = F.dropout(x, training=self.training)
    x = self.fc2(x)
    # The class axis is named explicitly: relying on the implicit dimension
    # is deprecated and emits a warning on every call.
    return (
        F.log_softmax(x, dim=1),
        F.log_softmax(x, dim=1),
        F.log_softmax(x, dim=1),
    )
def _decode_step(self, input_list, state_list, k=1, feed_all_timesteps=False,
                 remove_unknown=False, get_attention=False):
    """Run one decoding step and return the ``k`` best next tokens.

    Args:
        input_list: One token sequence per hypothesis.
        state_list: One decoder state per hypothesis, merged through
            ``State().from_list``.
        k: Number of candidates kept per hypothesis.
        feed_all_timesteps: Feed the complete sequences instead of only the
            last token of each one.
        remove_unknown: Set the ``UNK`` logit to ``-inf`` before normalising.
        get_attention: Forwarded to ``self.decode``.

    Returns:
        ``(words, logprobs, new_states_list)`` where ``new_states_list`` has
        one entry per element of ``input_list``.
    """
    if self.decoder.batch_first:
        view_shape, time_dim = (-1, 1), 1
    else:
        view_shape, time_dim = (1, -1), 0
    device = next(self.decoder.parameters()).device

    if feed_all_timesteps:
        # Whole sequences are needed (e.g. non-recurrent decoders).
        sequences = [torch.tensor(seq, device=device, dtype=torch.long)
                     for seq in input_list]
        inputs = batch_sequences(
            sequences, device=device,
            batch_first=self.decoder.batch_first)[0]
    else:
        # Recurrent decoders only need the most recent token.
        inputs = torch.stack([seq[-1] for seq in input_list]).view(*view_shape)

    states = State().from_list(state_list)
    logits, new_states = self.decode(inputs, states,
                                     get_attention=get_attention)

    # Keep only the prediction for the last time step.
    logits = logits.select(time_dim, -1).contiguous()
    if remove_unknown:
        logits[:, UNK].fill_(-float('inf'))

    logprobs, words = log_softmax(logits, dim=1).topk(k, 1)
    new_states_list = [new_states[pos] for pos in range(len(input_list))]
    return words, logprobs, new_states_list
def inference(self, unary, num_iter=5):
    """Iteratively refine ``unary`` with messages from ``self.kernel``.

    Args:
        unary: Unary scores; normalised over dimension 1.
        num_iter: Number of message-passing iterations.

    Returns:
        The prediction after the last iteration (``lg_unary`` based start
        value when ``num_iter`` is 0).
    """
    if self.conf['logsoftmax']:
        lg_unary = nnfun.log_softmax(unary, dim=1, _stacklevel=5)
        # The literal ``False`` disables this branch; it is kept as-is.
        if self.conf['softmax'] and False:
            prediction = exp_and_normalize(lg_unary, dim=1)
        else:
            prediction = lg_unary
    else:
        lg_unary = torch.log(unary)
        prediction = exp_and_normalize(lg_unary, dim=1)

    last_step = num_iter - 1
    for step in range(num_iter):
        message = self.kernel.compute(prediction)

        if self.comp is not None:
            message = message + self.comp(message)

        if self.weight is None:
            prediction = lg_unary + message
        else:
            unary_term = (self.unary_weight - self.weight) * lg_unary
            prediction = unary_term + self.weight * message

        # Renormalise after every step except the last one, unless a final
        # softmax was requested.
        if (step != last_step or self.final_softmax) and self.conf['softmax']:
            prediction = exp_and_normalize(prediction, dim=1)

    return prediction
def forward(self, x):
    """Return class log-probabilities for the input batch ``x``.

    Two conv -> ReLU -> 2x2 max-pool stages are followed by two linear
    layers; the result is normalised over the last dimension.
    """
    features = x
    for conv in (self.conv1, self.conv2):
        features = F.max_pool2d(F.relu(conv(features)), 2)

    # Flatten each sample to the 64 * 7 * 7 features that fc1 consumes.
    flat = features.view(-1, 64 * 7 * 7)
    hidden = F.relu(self.fc1(flat))
    logits = self.fc2(hidden)
    return F.log_softmax(logits, dim=-1)
def forward(self, x):
    """Return class log-probabilities for the input batch ``x``.

    The input goes through two conv/pool/ReLU stages (the second one with
    ``self.conv2_drop``), is flattened to 320 features and classified by
    two linear layers.
    """
    x = F.relu(F.max_pool2d(self.conv1(x), 2))
    x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
    x = x.view(-1, 320)
    x = F.relu(self.fc1(x))
    x = F.dropout(x, training=self.training)
    # Name the class axis explicitly; the implicit form is deprecated.
    return F.log_softmax(self.fc2(x), dim=1)
def f_next(self, ctx_dict, y, h):
    """Compute one decoder step with attention over text and image contexts.

    Args:
        ctx_dict: Mapping with ``'txt'`` and ``'image'`` entries; each entry
            is unpacked as extra positional arguments of its attention.
        y: Input to the first decoder cell.
        h: Previous hidden state.

    Returns:
        ``(log_p, h2)``. Note that ``log_p`` is the *negated* log-softmax of
        the output scores, and ``h2`` is the new hidden state.
    """
    # First decoder cell.
    h1 = self.dec0(y, h)
    query = h1.unsqueeze(0)

    # Attend to each modality with the same query.
    _, txt_z_t = self.txt_att(query, *ctx_dict['txt'])
    _, img_z_t = self.img_att(query, *ctx_dict['image'])

    # Fuse both contexts and run the second decoder cell.
    h2 = self.dec1(self.fusion(txt_z_t, img_z_t), h1)

    # Intermediate projection before the output layer.
    logit = self.hid2out(h2)
    if self.dropout_out > 0:
        logit = self.do_out(logit)

    scores = self.out2prob(logit)
    log_p = -F.log_softmax(scores, dim=-1)
    return log_p, h2
def masked_cross_entropy(logits, target, length):
    """Cross-entropy averaged over the valid (unpadded) positions only.

    Args:
        logits: FloatTensor of size (batch, max_len, num_classes) holding
            the unnormalized score of each class.
        target: LongTensor of size (batch, max_len) holding the index of
            the true class at each step.
        length: Sequence of ints of size (batch,) with the valid length of
            each sample; it is converted to a LongTensor here.

    Returns:
        loss: The summed loss over valid positions divided by the total
            number of valid positions.
    """
    # Keep the lengths on the same device as the logits instead of forcing
    # CUDA, so the function also works on CPU.
    length = torch.LongTensor(length).to(logits.device)

    # logits_flat: (batch * max_len, num_classes)
    logits_flat = logits.view(-1, logits.size(-1))
    # log_probs_flat: (batch * max_len, num_classes)
    log_probs_flat = functional.log_softmax(logits_flat, dim=1)
    # target_flat: (batch * max_len, 1)
    target_flat = target.view(-1, 1)
    # losses_flat: (batch * max_len, 1)
    losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat)
    # losses: (batch, max_len)
    losses = losses_flat.view(*target.size())
    # mask: (batch, max_len)
    mask = sequence_mask(sequence_length=length, max_len=target.size(1))
    losses = losses * mask.float()
    loss = losses.sum() / length.float().sum()
    return loss
def forward(self, **sentence):
    """Tag every token of one sentence.

    Args:
        **sentence: Keyword arguments; only ``input_words`` is read.

    Returns:
        Log-probabilities over tags, one row per input token. The LSTM's
        final state is stored on ``self.hidden`` as a side effect.
    """
    input_words = sentence['input_words']
    n_tokens = len(input_words)
    embeds = self.word_embeddings(input_words)
    # The sentence is fed as a batch of size one.
    lstm_out, self.hidden = self.lstm(embeds.view(n_tokens, 1, -1))
    tag_space = self.hidden2tag(lstm_out.view(n_tokens, -1))
    # Normalise over the tag axis; the implicit-dim form is deprecated.
    tag_scores = F.log_softmax(tag_space, dim=1)
    return tag_scores
def forward(self, x):
    """Run ``x`` through the stack in ``self.linears`` and return log-probs.

    Every layer except the last is followed by ReLU and dropout; the last
    layer's output is normalised with ``log_softmax``.
    """
    # ``training`` must be passed by keyword: the second positional
    # argument of F.dropout is the drop probability ``p``, so passing
    # ``self.training`` positionally zeroed every activation in training
    # mode (p == 1) and never enabled dropout's train/eval switch.
    y = F.dropout(F.relu(self.linears[0](x)), training=self.training)
    for layer in self.linears[1:-1]:
        y = F.relu(layer(y))
        y = F.dropout(y, training=self.training)
    # dim=-1 matches the previous implicit choice for 1-D and 2-D inputs.
    y = F.log_softmax(self.linears[-1](y), dim=-1)
    return y
def forward(self, input, hidden, encoder_outputs):
    '''
    Decode one step with attention over the encoder outputs.

    input: batch, 1
    hidden: 1, batch, hidden
    encoder_outputs: length, hidden

    Returns (output, hidden, attn_weights): log-probabilities over the
    output vocabulary, the updated GRU state and the attention weights.

    NOTE(review): encoder_outputs is given a leading dimension of size 1
    before torch.bmm, so this appears to require batch == 1 -- confirm.
    '''
    embedded = self.embedding(input)  # batch, 1, hidden
    embedded = self.dropout(embedded)
    embedded = embedded.squeeze(1)  # batch, hidden

    # Attention weights are normalised over the encoder positions.
    attn_weights = F.softmax(
        self.attn(torch.cat((embedded, hidden[0]), 1)), dim=1)  # batch, max_length
    encoder_outputs = encoder_outputs.unsqueeze(0)  # 1, max_length, hidden
    attn_applied = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)  # batch, 1, hidden

    output = torch.cat((embedded, attn_applied.squeeze(1)), 1)  # batch, 2xhidden
    output = self.attn_combine(output).unsqueeze(0)  # 1, batch, hidden

    for _ in range(self.n_layers):
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

    # Explicit dims replace the deprecated implicit-dimension calls.
    output = F.log_softmax(self.out(output.squeeze(0)), dim=1)
    return output, hidden, attn_weights
def train_a2c(net, mb_obs, mb_rewards, mb_actions, mb_values, optimizer,
              tb_tracker, step_idx, device="cpu"):
    """Perform one A2C optimisation step on a mini-batch and log its stats.

    The total loss is the policy loss plus the value loss scaled by
    ``VALUE_LOSS_COEF`` plus the entropy term scaled by ``ENTROPY_BETA``.
    Gradients are clipped to ``CLIP_GRAD`` before the optimiser step.

    Returns:
        The observations as a float tensor on ``device``.
    """
    optimizer.zero_grad()

    def as_float(values):
        return torch.FloatTensor(values).to(device)

    mb_adv = mb_rewards - mb_values
    adv_v = as_float(mb_adv)
    obs_v = as_float(mb_obs)
    rewards_v = as_float(mb_rewards)
    actions_t = torch.LongTensor(mb_actions).to(device)

    logits_v, values_v = net(obs_v)
    log_prob_v = F.log_softmax(logits_v, dim=1)

    # Policy gradient term: advantage-weighted log-prob of the taken action.
    taken_log_prob_v = log_prob_v[range(len(mb_actions)), actions_t]
    loss_policy_v = -(adv_v * taken_log_prob_v).mean()

    loss_value_v = F.mse_loss(values_v.squeeze(-1), rewards_v)

    # sum(p * log p) is the negative entropy, so minimising it raises entropy.
    prob_v = F.softmax(logits_v, dim=1)
    entropy_loss_v = (prob_v * log_prob_v).sum(dim=1).mean()

    loss_v = (ENTROPY_BETA * entropy_loss_v
              + VALUE_LOSS_COEF * loss_value_v
              + loss_policy_v)
    loss_v.backward()
    nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
    optimizer.step()

    tracked = (
        ("advantage", mb_adv),
        ("values", values_v),
        ("batch_rewards", rewards_v),
        ("loss_entropy", entropy_loss_v),
        ("loss_policy", loss_policy_v),
        ("loss_value", loss_value_v),
        ("loss_total", loss_v),
    )
    for tag, value in tracked:
        tb_tracker.track(tag, value, step_idx)
    return obs_v
def sample_beam(self, fc_feats, att_feats, opt=None):
    """Decode one caption per image with beam search.

    Args:
        fc_feats: Image features, one row per image.
        att_feats: Not used by this method.
        opt: Optional dict of options; ``beam_size`` (default 10) is read
            here and the dict is forwarded to ``self.beam_search``.

    Returns:
        ``(seq, seqLogprobs)`` transposed to batch-first, holding the best
        beam and its per-token log-probabilities for every image. All beams
        are kept in ``self.done_beams``.
    """
    # A fresh dict per call: a shared ``{}`` default would leak any mutation
    # made downstream into later calls.
    if opt is None:
        opt = {}
    beam_size = opt.get('beam_size', 10)
    batch_size = fc_feats.size(0)

    assert beam_size <= self.vocab_size + 1, 'lets assume this for now, otherwise this corner case causes a few headaches down the road. can be dealt with in future if needed'
    seq = torch.LongTensor(self.seq_length, batch_size).zero_()
    seqLogprobs = torch.FloatTensor(self.seq_length, batch_size)

    # Every image is processed independently, for simplicity.
    self.done_beams = [[] for _ in range(batch_size)]
    for k in range(batch_size):
        state = self.init_hidden(beam_size)
        # Step 0 feeds the image, step 1 feeds the <bos> token (index 0).
        for t in range(2):
            if t == 0:
                xt = self.img_embed(fc_feats[k:k+1]).expand(beam_size, self.input_encoding_size)
            elif t == 1:
                it = fc_feats.data.new(beam_size).long().zero_()
                xt = self.embed(Variable(it, requires_grad=False))

            output, state = self.core(xt, state)
            # Explicit class axis; the implicit-dim form is deprecated.
            logprobs = F.log_softmax(self.logit(output), dim=1)

        self.done_beams[k] = self.beam_search(state, logprobs, opt=opt)
        # The first beam has the highest cumulative score.
        seq[:, k] = self.done_beams[k][0]['seq']
        seqLogprobs[:, k] = self.done_beams[k][0]['logps']
    # Return the samples and their log likelihoods.
    return seq.transpose(0, 1), seqLogprobs.transpose(0, 1)
def forward(self, sentence):
    """Return per-token tag log-probabilities for one sentence.

    ``sentence`` holds word indices. ``self.hidden`` is used as the initial
    LSTM state and is overwritten with the final one.
    """
    n_tokens = len(sentence)
    # Feed the sentence as a sequence with batch size one.
    lstm_input = self.word_embeddings(sentence).view(n_tokens, 1, -1)
    lstm_out, self.hidden = self.lstm(lstm_input, self.hidden)
    tag_space = self.hidden2tag(lstm_out.view(n_tokens, -1))
    return F.log_softmax(tag_space, dim=1)
def forward(self, input):
    """Classify flattened 784-feature inputs.

    Args:
        input: A ``(x, slope)`` pair; ``slope`` is handed to ``self.act``
            together with the first layer's output.

    Returns:
        Log-probabilities normalised over dimension 1.
    """
    features, slope = input
    flat = features.view(-1, 784)
    hidden = self.act((self.fc1(flat), slope))
    logits = self.fc2(hidden)
    return F.log_softmax(logits, dim=1)
def cross_entropy2d(input, target, weight=None, size_average=True):
    """
    Function to compute pixelwise cross-entropy for 2D image. This is the segmentation loss.
    Pixels whose label is negative are ignored.
    Args:
        input: input tensor of shape (minibatch x num_channels x h x w)
        target: 2D label map of shape (minibatch x h x w)
        weight (optional): tensor of size 'C' specifying the weights to be given to each class
        size_average (optional): boolean value indicating whether the NLL loss has to be normalized
            by the number of valid (non-negative label) pixels
    Returns:
        The summed loss, divided by the number of valid pixels when
        size_average is true.
    """
    # input: (n, c, h, w), target: (n, h, w)
    n, c, h, w = input.size()
    # log_p: (n, c, h, w), normalised over the channel axis
    log_p = F.log_softmax(input, dim=1)
    # log_p: (n*h*w, c)
    log_p = log_p.transpose(1, 2).transpose(2, 3).contiguous().view(-1, c)

    # Select valid pixels with a flat row mask. The previous code indexed
    # the already-flattened tensor with a 4-D mask, which always raised and
    # was silently swallowed, so negative labels were never filtered out.
    mask = target >= 0
    log_p = log_p[mask.reshape(-1)]
    # target: (num_valid,) -- boolean indexing already yields a 1-D tensor,
    # so no squeeze is needed (it would break the single-pixel case).
    target = target[mask]

    loss = F.nll_loss(log_p, target, weight=weight, reduction='sum')
    if size_average:
        loss = loss / mask.sum()
    return loss
def action_logprobs(self, x):
    """Return log-probabilities of the scores produced by calling ``self``.

    The scores are normalised over dimension 1.
    """
    scores = self(x)
    return F.log_softmax(scores, dim=1)
def predict(self, inputs):
    """Return the index of the highest-scoring class for each input row."""
    outputs = self.nets.classifier(inputs)
    log_probs = F.log_softmax(outputs, dim=1)
    _, predicted = torch.max(log_probs.data, 1)
    return predicted
def forward(self, sentences, sentences_len, hidden):
    """Classify padded sentences from the RNN output at their last token.

    Args:
        sentences: Padded token indices, batch first.
        sentences_len: Tensor with the true length of every sentence.
        hidden: Initial state passed to ``self.rnn_feature``.
            NOTE(review): it is not reordered together with the sentences --
            confirm callers pass a state that does not depend on row order.

    Returns:
        Log-probabilities over classes, one row per sentence, in the
        original (unsorted) order.
    """
    sentences_len = sentences_len.cpu().data.numpy()
    # Sort by decreasing length for packing; ``ridx`` undoes the sort.
    idx = np.argsort(sentences_len).tolist()[::-1]
    ridx = np.argsort(idx).tolist()
    sentences = sentences[idx, :]
    sentences_len = sentences_len[idx, ]

    embedding = self.embedding(sentences)
    # Functional dropout tied to the module's mode. A freshly constructed
    # nn.Dropout is always in training mode, so it also dropped activations
    # during evaluation.
    embedding = F.dropout(embedding, p=0.1, training=self.training)

    packed_embedding = pack_padded_sequence(embedding, sentences_len, batch_first=True)
    packed_rnn_feature, hidden = self.rnn_feature(packed_embedding, hidden)
    sentence_feature, _ = pad_packed_sequence(packed_rnn_feature, batch_first=True)

    # Gather the output at position ``len - 1`` of every sentence.
    idx = Variable(LongTensor(sentences_len - 1))
    idx = idx.view(-1, 1).expand(sentence_feature.size(0), sentence_feature.size(2)).unsqueeze(1)
    if sentence_feature.is_cuda:
        idx = idx.cuda()
    # Only the time axis is squeezed so that a batch of one stays 2-D.
    sentence_feature = sentence_feature.gather(1, idx).squeeze(1)
    sentence_feature = sentence_feature[ridx, :]

    logits = self.classifier(sentence_feature)
    # Normalise over classes (dim 1); dim 0 normalised across the batch.
    pred = F.log_softmax(logits, dim=1)
    return pred
def calc_loss(batch, net, tgt_net, gamma, device="cpu", save_prefix=None):
    """Compute the distributional (categorical) DQN loss for one batch.

    Args:
        batch: Transitions accepted by ``common.unpack_batch``.
        net: Network being trained; returns per-action atom logits.
        tgt_net: Target network providing ``both`` and ``apply_softmax``.
        gamma: Discount factor used by the distribution projection.
        device: Device the tensors are moved to.
        save_prefix: When given, transition images are saved with this
            prefix via ``save_transition_images``.

    Returns:
        The cross-entropy between the projected target distribution and
        the predicted one, averaged over the batch.
    """
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    next_states_v = torch.tensor(next_states).to(device)

    # next state distribution
    next_distr_v, next_qvals_v = tgt_net.both(next_states_v)
    next_actions = next_qvals_v.max(1)[1].data.cpu().numpy()
    next_distr = tgt_net.apply_softmax(next_distr_v).data.cpu().numpy()

    next_best_distr = next_distr[range(batch_size), next_actions]
    # ``np.bool`` was only an alias of the builtin and has been removed
    # from NumPy; the builtin works on every version.
    dones = dones.astype(bool)

    # project our distribution using Bellman update
    proj_distr = common.distr_projection(next_best_distr, rewards, dones, Vmin, Vmax, N_ATOMS, gamma)

    # calculate net output
    distr_v = net(states_v)
    state_action_values = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(state_action_values, dim=1)
    proj_distr_v = torch.tensor(proj_distr).to(device)

    if save_prefix is not None:
        pred = F.softmax(state_action_values, dim=1).data.cpu().numpy()
        save_transition_images(batch_size, pred, proj_distr, next_best_distr, dones, rewards, save_prefix)

    loss_v = -state_log_sm_v * proj_distr_v
    return loss_v.sum(dim=1).mean()
def train(epoch, model):
    """Train ``model`` for one epoch on source data with an MMD penalty.

    The learning rate is annealed from the module-level ``lr`` according to
    ``epoch``; ``model.cls_fc`` uses the full rate and ``model.sharedNet``
    a tenth of it. The loss is the source classification loss plus the MMD
    loss weighted by a factor that grows with ``epoch``.

    NOTE(review): the loop runs over ``range(1, len_source_loader)``, i.e.
    one batch fewer than the source loader provides -- confirm intended.
    """
    LEARNING_RATE = lr / math.pow((1 + 10 * (epoch - 1) / epochs), 0.75)
    print('learning rate{: .4f}'.format(LEARNING_RATE) )
    optimizer = torch.optim.SGD([
        {'params': model.sharedNet.parameters()},
        {'params': model.cls_fc.parameters(), 'lr': LEARNING_RATE},
    ], lr=LEARNING_RATE / 10, momentum=momentum, weight_decay=l2_decay)

    model.train()

    iter_source = iter(source_loader)
    iter_target = iter(target_train_loader)
    num_iter = len_source_loader
    for i in range(1, num_iter):
        # The builtin next() works on every iterator; the ``.next()``
        # method is a Python 2 leftover that newer loaders do not provide.
        data_source, label_source = next(iter_source)
        data_target, _ = next(iter_target)
        # Restart the target loader once it has been consumed.
        if i % len_target_loader == 0:
            iter_target = iter(target_train_loader)
        if cuda:
            data_source, label_source = data_source.cuda(), label_source.cuda()
            data_target = data_target.cuda()
        data_source, label_source = Variable(data_source), Variable(label_source)
        data_target = Variable(data_target)

        optimizer.zero_grad()
        label_source_pred, loss_mmd = model(data_source, data_target)
        loss_cls = F.nll_loss(F.log_softmax(label_source_pred, dim=1), label_source)
        gamma = 2 / (1 + math.exp(-10 * (epoch) / epochs)) - 1
        loss = loss_cls + gamma * loss_mmd
        loss.backward()
        optimizer.step()
        if i % log_interval == 0:
            # .item() replaces ``.data[0]``, which fails on 0-dim tensors.
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tsoft_Loss: {:.6f}\tmmd_Loss: {:.6f}'.format(
                epoch, i * len(data_source), len_source_dataset,
                100. * i / len_source_loader, loss.item(), loss_cls.item(), loss_mmd.item()))
def forward(self, sentence):
    """Score every token of ``sentence`` against the tag set.

    The LSTM starts from ``self.hidden`` and its final state is written
    back to ``self.hidden``. Returns log-probabilities over dimension 1.
    """
    length = len(sentence)
    embedded = self.word_embeddings(sentence)
    # Sequence-first layout with a batch dimension of one.
    outputs, self.hidden = self.lstm(embedded.view(length, 1, -1), self.hidden)
    tag_logits = self.hidden2tag(outputs.view(length, -1))
    return F.log_softmax(tag_logits, dim=1)
def forward(self, inputs, hidden):
    """Decode the sum of the encoded inputs into log-probabilities.

    Args:
        inputs: Iterable of items accepted by ``self.encoder``; their
            encodings are accumulated into a vector of size ``self.d``.
        hidden: Accepted for interface compatibility; not used here.

    Returns:
        ``log_softmax`` of the decoder output over its last dimension.
    """
    sum_emb = torch.zeros(self.d)
    for item in inputs:
        sum_emb.add_(self.encoder(item))
    decoded = self.decoder(sum_emb)
    # dim=-1 equals the previous implicit choice for 1-D and 2-D outputs
    # and avoids the deprecation warning.
    return F.log_softmax(decoded, dim=-1)
def forward(self, x):
    """Three-layer classifier with dropout (p = 0.8) after each hidden layer.

    Returns log-probabilities over the last dimension of the output.
    """
    x = F.relu(self.linear1(x))
    # F.dropout defaults to training=True, so without forwarding the
    # module's mode, units were also dropped during evaluation.
    x = F.dropout(x, 0.8, training=self.training)
    x = F.relu(self.linear2(x))
    x = F.dropout(x, 0.8, training=self.training)
    # dim=-1 equals the previous implicit choice for 1-D and 2-D inputs.
    x = F.log_softmax(self.linear3(x), dim=-1)
    return x
def forward(self, screen, variables):
    """Pick an action and, in training mode, record actor/critic outputs.

    In evaluation mode the greedy action is returned with ``None`` as the
    value. In training mode a uniformly random action is taken with
    probability 0.1, otherwise the greedy one; the log-probability of the
    chosen action and the critic value are appended to ``self.outputs``.

    Returns:
        ``(action, value)``.
    """
    scores, features = super(AdvantageActorCriticNoisy, self).forward(screen, variables)

    if not self.training:
        greedy = scores.max(1, keepdim=True)[1]
        return greedy, None

    if random.random() < 0.1:
        # Exploration: one random action index per batch row.
        action = torch.LongTensor(scores.size(0), 1).random_(0, scores.size(1))
        action = Variable(action)
        if USE_CUDA:
            action = action.cuda()
    else:
        action = scores.max(1, keepdim=True)[1]

    # Critic: value prediction from the features plus the game variables.
    value = F.relu(self.value1(features))
    value = self.value2(torch.cat([value, variables], 1))

    # Store what is needed for the backward pass.
    log_probs = F.log_softmax(scores, dim=1)
    self.outputs.append(ModelOutput(log_probs.gather(-1, action), value))
    return action, value
def forward(self, x):
    """Return class log-probabilities (normalised over dimension 1).

    Two conv -> 2x2 max-pool -> ReLU stages, a flatten to 320 features and
    two linear layers.
    """
    features = x
    for conv in (self.conv1, self.conv2):
        features = F.relu(F.max_pool2d(conv(features), 2))
    flat = features.view(-1, 320)
    logits = self.fc2(F.relu(self.fc1(flat)))
    return F.log_softmax(logits, dim=1)
def forward(self, x, word):
    """Tag a sentence from word embeddings plus character-level features.

    Args:
        x: Word indices for ``self.word_embedding``.
        word: Iterable of words; each word's letters are lower-cased and
            mapped through ``character_to_idx`` before ``self.char_lstm``.

    Returns:
        Log-probabilities over dimension 1, one row per word.

    NOTE(review): the character features go through ``.data``, which
    detaches them from the graph, so no gradient appears to reach
    ``self.char_lstm`` from here -- confirm this is intended.
    NOTE(review): CUDA is used whenever it is available, independent of
    where the module's parameters live -- confirm.
    """
    use_cuda = torch.cuda.is_available()

    char = torch.FloatTensor()
    for each in word:
        char_ids = [character_to_idx[letter.lower()] for letter in each]
        # Batch of one word for the character LSTM.
        char_input = Variable(torch.LongTensor(char_ids).unsqueeze(0))
        if use_cuda:
            char_input = char_input.cuda()
        tempchar = self.char_lstm(char_input).squeeze(0)
        char = torch.cat((char, tempchar.cpu().data), 0)

    if use_cuda:
        char = char.cuda()
    char = Variable(char)

    x = self.word_embedding(x)
    x = torch.cat((x, char), 1)
    x = x.unsqueeze(0)
    x, _ = self.lstm(x)
    x = x.squeeze(0)
    x = self.linear1(x)
    # Explicit class axis; the implicit-dim form is deprecated.
    y = F.log_softmax(x, dim=1)
    return y
def forward(self, fc_feats, att_feats, seq):
    """Teacher-forced caption decoding with optional scheduled sampling.

    Args:
        fc_feats: Image features fed at step 0.
        att_feats: Not used by this method.
        seq: Ground-truth token indices, batch first.

    Returns:
        Log-probabilities for every step after the image step, stacked
        along dimension 1.
    """
    batch_size = fc_feats.size(0)
    state = self.init_hidden(batch_size)
    outputs = []

    for i in range(seq.size(1)):
        if i == 0:
            xt = self.img_embed(fc_feats)
        else:
            if self.training and i >= 2 and self.ss_prob > 0.0:
                # Scheduled sampling: with probability ss_prob replace the
                # ground-truth token by a sample from the previous output.
                sample_prob = fc_feats.data.new(batch_size).uniform_(0, 1)
                sample_mask = sample_prob < self.ss_prob
                if sample_mask.sum() == 0:
                    it = seq[:, i-1].clone()
                else:
                    sample_ind = sample_mask.nonzero().view(-1)
                    it = seq[:, i-1].data.clone()
                    # Previous step's distribution (outputs hold log-probs).
                    prob_prev = torch.exp(outputs[-1].data)
                    it.index_copy_(0, sample_ind, torch.multinomial(prob_prev, 1).view(-1).index_select(0, sample_ind))
                    it = Variable(it, requires_grad=False)
            else:
                it = seq[:, i-1].clone()
            # Stop once every sequence has ended (all tokens are 0).
            if i >= 2 and seq[:, i-1].data.sum() == 0:
                break
            xt = self.embed(it)

        output, state = self.core(xt, state)
        # Explicit class axis; the implicit-dim form is deprecated.
        output = F.log_softmax(self.logit(output), dim=1)
        outputs.append(output)

    return torch.cat([_.unsqueeze(1) for _ in outputs[1:]], 1).contiguous()
def routine(self, inputs, targets, criterion=nn.CrossEntropyLoss(reduce=False)):
    '''
    Compute the classifier loss and accuracy over the labeled samples.

    Samples whose target is -1 are treated as unlabeled and excluded from
    both the loss and the accuracy.

    Args:
        criterion: Classifier criterion.

    '''
    outputs = self.nets.classifier(inputs)
    log_probs = F.log_softmax(outputs, dim=1)
    _, predicted = torch.max(log_probs.data, 1)

    unlabeled = targets.eq(-1).long()
    labeled = 1. - unlabeled.float()

    # Unlabeled targets are mapped to class 0 so the criterion accepts
    # them; their losses are masked out right after.
    masked_targets = (1 - unlabeled) * targets
    losses = criterion(outputs, masked_targets)
    loss = (losses * labeled).sum() / labeled.sum()

    if labeled.sum() > 0:
        hits = labeled * predicted.eq(targets.data).float()
        self.results.accuracy = 100. * hits.cpu().sum() / labeled.cpu().sum()

    self.losses.classifier = loss
    self.results.perc_labeled = labeled.mean()
def action_probs(self, x):
    """Return action probabilities for the scores produced by ``self``.

    The scores are normalised over dimension 1, the same axis that
    ``action_logprobs`` uses.
    """
    x = self(x)
    # The unused log_softmax call was dropped, and the axis is explicit
    # because the implicit-dim form is deprecated.
    probs = F.softmax(x, dim=1)
    return probs
def forward(self, x):
    """Return class log-probabilities for the input batch ``x``.

    Two conv -> ``self.mp`` -> ReLU stages are followed by a flatten and a
    single linear layer.
    """
    in_size = x.size(0)
    x = F.relu(self.mp(self.conv1(x)))
    x = F.relu(self.mp(self.conv2(x)))
    x = x.view(in_size, -1)  # flatten to (batch, features)
    x = self.fc(x)
    # Explicit class axis; the implicit-dim form is deprecated.
    return F.log_softmax(x, dim=1)
def forward(self, x, adj):
    """Apply the attention heads and the output attention layer.

    Args:
        x: Node features.
        adj: Adjacency information passed to every attention layer.

    Returns:
        Log-probabilities normalised over dimension 1.
    """
    def drop(tensor):
        return F.dropout(tensor, self.dropout, training=self.training)

    hidden = drop(x)
    # Concatenate the outputs of all heads along the feature axis.
    head_outputs = [head(hidden, adj) for head in self.attentions]
    hidden = drop(torch.cat(head_outputs, dim=1))
    scores = F.elu(self.out_att(hidden, adj))
    return F.log_softmax(scores, dim=1)
def beam_search(self, src_sent: List[str], beam_size: int = 5,
                max_decoding_time_step: int = 70) -> List[Hypothesis]:
    """ Translate one source sentence with beam search.

    @param src_sent (List[str]): a single source sentence (words)
    @param beam_size (int): beam size
    @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
    @returns hypotheses (List[Hypothesis]): completed hypotheses sorted by
        decreasing score; each one has two fields:
            value: List[str]: the decoded target sentence, as a list of words
            score: float: the accumulated log-probability of the sentence
    """
    src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)
    src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)])
    src_encodings_att_linear = self.att_projection(src_encodings)

    h_tm1 = dec_init_vec
    att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

    eos_id = self.vocab.tgt['</s>']

    hypotheses = [['<s>']]
    hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device)
    completed_hypotheses = []

    t = 0
    while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
        t += 1
        hyp_num = len(hypotheses)

        # Replicate the encoder outputs once per live hypothesis.
        exp_src_encodings = src_encodings.expand(
            hyp_num, src_encodings.size(1), src_encodings.size(2))
        exp_src_encodings_att_linear = src_encodings_att_linear.expand(
            hyp_num, src_encodings_att_linear.size(1), src_encodings_att_linear.size(2))

        # The last word of every hypothesis is the next decoder input.
        last_word_ids = [self.vocab.tgt[hyp[-1]] for hyp in hypotheses]
        y_tm1 = torch.tensor(last_word_ids, dtype=torch.long, device=self.device)
        y_t_embed = self.model_embeddings.target(y_tm1)
        x = torch.cat([y_t_embed, att_tm1], dim=-1)

        (h_t, cell_t), att_t, _ = self.step(
            x, h_tm1, exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None)

        # log probabilities over target words
        log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1)

        # Score every (hypothesis, word) continuation and keep the best ones.
        live_hyp_num = beam_size - len(completed_hypotheses)
        expanded_scores = hyp_scores.unsqueeze(1).expand_as(log_p_t)
        continuing_scores = (expanded_scores + log_p_t).view(-1)
        top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(continuing_scores, k=live_hyp_num)

        # Flat position -> (hypothesis index, word index).
        prev_hyp_ids = top_cand_hyp_pos // len(self.vocab.tgt)
        hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt)

        new_hypotheses = []
        live_hyp_ids = []
        new_hyp_scores = []

        candidates = zip(prev_hyp_ids.tolist(), hyp_word_ids.tolist(),
                         top_cand_hyp_scores.tolist())
        for prev_hyp_id, hyp_word_id, cand_score in candidates:
            hyp_word = self.vocab.tgt.id2word[hyp_word_id]
            new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
            if hyp_word == '</s>':
                # Finished: strip the <s> and </s> markers.
                completed_hypotheses.append(
                    Hypothesis(value=new_hyp_sent[1:-1], score=cand_score))
            else:
                new_hypotheses.append(new_hyp_sent)
                live_hyp_ids.append(prev_hyp_id)
                new_hyp_scores.append(cand_score)

        if len(completed_hypotheses) == beam_size:
            break

        # Carry over the decoder state of the surviving hypotheses only.
        live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device)
        h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
        att_tm1 = att_t[live_hyp_ids]

        hypotheses = new_hypotheses
        hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device)

    if not completed_hypotheses:
        # Nothing finished in time: fall back to the first live hypothesis.
        completed_hypotheses.append(
            Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item()))

    completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)
    return completed_hypotheses
def train(self, epoch):
    """Train ``self.model`` for one pass over the self-supervised data.

    For tasks after the first (``tasknum > 0``) every current sample is
    paired with a replay batch and both are concatenated before the forward
    pass. The loss is selected by ``self.args.loss`` and
    ``self.args.prev_new``; the rotation head is read from output columns
    1000:1004.

    NOTE(review): the source of this function had lost its indentation.
    The ``CI`` and ``uniform_penalty`` terms are placed at loop-body level
    (applied to every loss variant) -- confirm against the original file.
    NOTE(review): ``exemplar_iterator`` is assigned twice, so the loader
    built from ``ExemplarLoader`` is discarded, and the loop iterates over
    the ``SelfSupervisedLoader`` dataset object rather than a DataLoader.
    This is kept as found -- confirm the intended data flow.
    """
    self.model.train()
    print("Epochs %d" % epoch)

    tasknum = self.train_data_iterator.dataset.t
    start = 0
    end = self.train_data_iterator.dataset.end
    mid = end - self.args.step_size

    kwargs = {'num_workers': 32, 'pin_memory': True}
    exemplar_dataset_loaders = ExemplarLoader(self.train_data_iterator.dataset)
    exemplar_iterator = torch.utils.data.DataLoader(
        exemplar_dataset_loaders,
        batch_size=self.args.replay_batch_size,
        shuffle=True, drop_last=True, **kwargs)

    selfsupervised_dataset_loaders = SelfSupervisedLoader(self.train_data_iterator.dataset)
    exemplar_iterator = torch.utils.data.DataLoader(
        selfsupervised_dataset_loaders,
        batch_size=self.args.batch_size,
        shuffle=True, drop_last=True, **kwargs)

    if tasknum > 0:
        iterator = zip(selfsupervised_dataset_loaders, exemplar_iterator)
    else:
        iterator = selfsupervised_dataset_loaders

    for samples in tqdm(iterator):
        if tasknum > 0:
            curr, prev = samples

            data, target, target_rot = curr
            # Fixed: these lines referenced the undefined names
            # ``rot_target`` / ``rot_target_r`` and raised NameError.
            data, target, target_rot = data.cuda(), target.cuda(), target_rot.cuda()
            batch_size = data.shape[0]

            data_r, target_r, target_rot_r = prev
            data_r, target_r, target_rot_r = data_r.cuda(), target_r.cuda(), target_rot_r.cuda()
            replay_size = data_r.shape[0]

            data = torch.cat((data, data_r))
            target = torch.cat((target, target_r))
            target_rot = torch.cat((target_rot, target_rot_r))
        else:
            data, target, target_rot = samples
            data, target, target_rot = data.cuda(), target.cuda(), target_rot.cuda()
            batch_size = data.shape[0]

        data = data.view(-1, 3, 224, 224)
        target = target.view(data.size(0), -1)
        target_rot = target_rot.view(data.size(0), -1)

        # One-hot targets for the class head and the 4-way rotation head.
        y_onehot = torch.FloatTensor(len(target), self.dataset.classes).cuda()
        y_onehot.zero_()
        y_onehot.scatter_(1, target, 1)

        y_onehot_rot = torch.FloatTensor(len(target_rot), 4).cuda()
        y_onehot_rot.zero_()
        y_onehot_rot.scatter_(1, target_rot, 1)

        uniform = torch.ones_like(y_onehot)

        output = self.model(data)
        output_log = F.log_softmax(output[:batch_size, start:end], dim=1)
        output_rot_log = F.log_softmax(output[:batch_size, 1000:1004], dim=1)

        if self.args.loss == 'GCE':
            loss_CE = self.gce(output[:batch_size, start:end], target % (end - start))
        else:
            if self.args.prev_new:
                # Current-task classes and rotation head on current samples.
                curr = output[:batch_size, mid:end]
                curr_log = F.log_softmax(curr, dim=1)
                loss_CE_curr = F.kl_div(curr_log, y_onehot[:batch_size, mid:end], reduction='sum')

                curr_rot = output[:batch_size, 1000:1004]
                curr_rot_log = F.log_softmax(curr_rot, dim=1)
                loss_rot_CE_curr = F.kl_div(curr_rot_log, y_onehot_rot[:batch_size], reduction='sum')
                loss_CE_curr += loss_rot_CE_curr

                if tasknum > 0:
                    # Previous-task classes on the replay samples.
                    prev = output[batch_size:batch_size + replay_size, start:mid]
                    prev_log = F.log_softmax(prev, dim=1)
                    loss_CE_prev = F.kl_div(
                        prev_log,
                        y_onehot[batch_size:batch_size + replay_size, start:mid],
                        reduction='sum')
                    loss_CE = (loss_CE_curr + loss_CE_prev) / (batch_size + replay_size)
                else:
                    loss_CE = loss_CE_curr / batch_size
            else:
                loss_CE = F.kl_div(output_log, y_onehot[:batch_size, start:end], reduction='batchmean')
                loss_rot_CE = F.kl_div(output_rot_log, y_onehot_rot[:batch_size], reduction='batchmean')
                loss_CE += loss_rot_CE

        if self.args.CI:
            # Pull current predictions towards the uniform distribution.
            loss_CE += F.kl_div(
                output_log,
                uniform[:batch_size, start:end] / (end - start),
                reduction='batchmean') * self.args.beta

        if tasknum > 0 and self.args.uniform_penalty:
            # Same uniform penalty, applied to the replay samples.
            prev_uni = output[batch_size:batch_size + replay_size, start:end]
            prev_uni_log = F.log_softmax(prev_uni, dim=1)
            loss_uni_prev = F.kl_div(
                prev_uni_log,
                uniform[:replay_size, start:end] / (end - start),
                reduction='batchmean') * self.args.beta
            loss_CE += loss_uni_prev

        self.optimizer.zero_grad()
        (loss_CE).backward()
        self.optimizer.step()
def forward(self, x):
    """Two-layer classifier with a sigmoid hidden activation.

    Returns log-probabilities normalised over dimension 1.
    """
    hidden = torch.sigmoid(self.fc1(x))
    logits = self.fc2(hidden)
    return f.log_softmax(logits, dim=1)
def forward(self, x):
    """Return ``log_softmax`` over dimension 1 of ``self.model``'s output."""
    logits = self.model(x)
    return F.log_softmax(logits, dim=1)
def forward(self, x):
    """Classify the features extracted by ``self.feat``.

    Returns:
        ``(log_probs, trans, trans_feat)``: log-probabilities normalised
        over dimension 1, plus the two extra values returned by
        ``self.feat``, passed through unchanged.
    """
    features, trans, trans_feat = self.feat(x)
    hidden = F.relu(self.bn1(self.fc1(features)))
    # Dropout is applied before the second batch norm.
    hidden = F.relu(self.bn2(self.dropout(self.fc2(hidden))))
    logits = self.fc3(hidden)
    return F.log_softmax(logits, dim=1), trans, trans_feat
def train(rank, args, shared_model):
    """Train the planner/controller navigation model and evaluate it each epoch.

    Each epoch first runs supervised training over ``train_loader`` (the
    gradients of a local model copy are handed to ``shared_model`` and the
    optimizer steps on ``shared_model``'s parameters), then rolls the model
    out in the evaluation environments from three start offsets (10, 30, 50)
    and finally writes a checkpoint when the tracked metric improved.

    Args:
        rank: Worker index. Selects the GPU entry
            ``args.gpus[rank % len(args.gpus)]`` and names the JSON log file.
        args: Options namespace. Fields read here include ``gpus``,
            ``model_type``, ``vocab_json``, ``learning_rate``, ``train_h5``,
            ``data_json``, ``batch_size``, ``max_threads_per_gpu``,
            ``to_cache``, ``eval_split``, ``target_obj_conn_map_dir``,
            ``map_resolution``, ``log_dir``, ``max_epochs``,
            ``max_episode_length``, ``eval_every``, ``to_log`` and
            ``checkpoint_dir``. ``args.output_log_path`` is written.
        shared_model: Model whose parameters are optimized; its state dict is
            copied into the local model before every batch.

    NOTE(review): ``lossFn`` is created but never used in this function.
    """
    # The device passed to CUDA is the *position* of this worker's GPU inside
    # args.gpus, not the GPU id stored there.
    torch.cuda.set_device(args.gpus.index(args.gpus[rank % len(args.gpus)]))

    if args.model_type == 'pacman':
        model_kwargs = {'question_vocab': load_vocab(args.vocab_json)}
        model = NavPlannerControllerModel(**model_kwargs)
    else:
        # Only the 'pacman' model type is supported; anything else terminates
        # the process.
        exit()

    lossFn = torch.nn.CrossEntropyLoss().cuda()

    # Optimize the shared model's trainable parameters only.
    optim = torch.optim.Adam(
        filter(lambda p: p.requires_grad, shared_model.parameters()),
        lr=args.learning_rate)

    train_loader_kwargs = {
        'questions_h5': args.train_h5,
        'data_json': args.data_json,
        'vocab': args.vocab_json,
        'batch_size': args.batch_size,
        'input_type': args.model_type,
        'num_frames': 5,
        'split': 'train',
        'max_threads_per_gpu': args.max_threads_per_gpu,
        'gpu_id': args.gpus[rank % len(args.gpus)],
        'to_cache': args.to_cache
    }

    # Evaluation always runs one episode at a time and never caches.
    eval_loader_kwargs = {
        'questions_h5': getattr(args, args.eval_split + '_h5'),
        'data_json': args.data_json,
        'vocab': args.vocab_json,
        'target_obj_conn_map_dir': args.target_obj_conn_map_dir,
        'map_resolution': args.map_resolution,
        'batch_size': 1,
        'input_type': args.model_type,
        'num_frames': 5,
        'split': args.eval_split,
        'max_threads_per_gpu': args.max_threads_per_gpu,
        'gpu_id': args.gpus[rank % len(args.gpus)],
        'to_cache': False
    }

    args.output_log_path = os.path.join(args.log_dir,
                                        'train_' + str(rank) + '.json')

    if 'pacman' in args.model_type:
        # Metric names are suffixed with the start offset (10 / 30 / 50) used
        # in the evaluation loop below.
        metrics = NavMetric(
            info={'split': args.eval_split,
                  'thread': rank},
            metric_names=[
                'd_0_10', 'd_0_30', 'd_0_50', 'd_T_10', 'd_T_30', 'd_T_50',
                'd_D_10', 'd_D_30', 'd_D_50', 'd_min_10', 'd_min_30',
                'd_min_50', 'r_T_10', 'r_T_30', 'r_T_50', 'r_e_10', 'r_e_30',
                'r_e_50', 'stop_10', 'stop_30', 'stop_50', 'ep_len_10',
                'ep_len_30', 'ep_len_50'
            ],
            log_json=args.output_log_path)
    else:
        metrics = NavMetric(
            info={'split': 'train',
                  'thread': rank},
            metric_names=['loss'],
            log_json=args.output_log_path)

    train_loader = EqaDataLoader(**train_loader_kwargs)
    eval_loader = EqaDataLoader(**eval_loader_kwargs)

    print('train_loader has %d samples' % len(train_loader.dataset))

    # t counts processed training batches plus evaluated start offsets.
    t, epoch, best_eval_acc = 0, 0, 0

    while epoch < int(args.max_epochs):

        # ------------------------------------------------------------------
        # Training pass
        # ------------------------------------------------------------------
        if 'pacman' in args.model_type:

            planner_lossFn = MaskedNLLCriterion().cuda()
            controller_lossFn = MaskedNLLCriterion().cuda()

            done = False
            model.train()
            all_envs_loaded = train_loader.dataset._check_if_all_envs_loaded()

            # Repeat until the dataset reports no environments left to load.
            while done == False:

                for batch in train_loader:

                    t += 1

                    # Start every step from the current shared weights.
                    model.load_state_dict(shared_model.state_dict())
                    model.train()
                    model.cuda()

                    idx, questions, _, planner_img_feats, planner_actions_in, \
                        planner_actions_out, planner_action_lengths, planner_masks, \
                        controller_img_feats, controller_actions_in, planner_hidden_idx, \
                        controller_outs, controller_action_lengths, controller_masks = batch

                    questions_var = Variable(questions.cuda())

                    planner_img_feats_var = Variable(planner_img_feats.cuda())
                    planner_actions_in_var = Variable(
                        planner_actions_in.cuda())
                    planner_actions_out_var = Variable(
                        planner_actions_out.cuda())
                    planner_action_lengths = planner_action_lengths.cuda()
                    planner_masks_var = Variable(planner_masks.cuda())

                    controller_img_feats_var = Variable(
                        controller_img_feats.cuda())
                    controller_actions_in_var = Variable(
                        controller_actions_in.cuda())
                    planner_hidden_idx_var = Variable(
                        planner_hidden_idx.cuda())
                    controller_outs_var = Variable(controller_outs.cuda())
                    controller_action_lengths = controller_action_lengths.cuda(
                    )
                    controller_masks_var = Variable(controller_masks.cuda())

                    # Order the batch by planner sequence length (longest
                    # first) and apply the same permutation to every tensor.
                    # Presumably needed for packed RNN input — TODO confirm.
                    planner_action_lengths, perm_idx = planner_action_lengths.sort(
                        0, descending=True)

                    questions_var = questions_var[perm_idx]

                    planner_img_feats_var = planner_img_feats_var[perm_idx]
                    planner_actions_in_var = planner_actions_in_var[perm_idx]
                    planner_actions_out_var = planner_actions_out_var[perm_idx]
                    planner_masks_var = planner_masks_var[perm_idx]

                    controller_img_feats_var = controller_img_feats_var[
                        perm_idx]
                    controller_actions_in_var = controller_actions_in_var[
                        perm_idx]
                    controller_outs_var = controller_outs_var[perm_idx]
                    planner_hidden_idx_var = planner_hidden_idx_var[perm_idx]
                    controller_action_lengths = controller_action_lengths[
                        perm_idx]
                    controller_masks_var = controller_masks_var[perm_idx]

                    planner_scores, controller_scores, planner_hidden = model(
                        questions_var, planner_img_feats_var,
                        planner_actions_in_var,
                        planner_action_lengths.cpu().numpy(),
                        planner_hidden_idx_var, controller_img_feats_var,
                        controller_actions_in_var, controller_action_lengths)

                    planner_logprob = F.log_softmax(planner_scores, dim=1)
                    controller_logprob = F.log_softmax(
                        controller_scores, dim=1)

                    # Targets and masks are cropped to the longest sequence in
                    # the batch and flattened to one column.
                    planner_loss = planner_lossFn(
                        planner_logprob,
                        planner_actions_out_var[:, :planner_action_lengths.max(
                        )].contiguous().view(-1, 1),
                        planner_masks_var[:, :planner_action_lengths.max()]
                        .contiguous().view(-1, 1))

                    controller_loss = controller_lossFn(
                        controller_logprob,
                        controller_outs_var[:, :controller_action_lengths.max(
                        )].contiguous().view(-1, 1),
                        controller_masks_var[:, :controller_action_lengths.max(
                        )].contiguous().view(-1, 1))

                    # zero grad
                    optim.zero_grad()

                    # update metrics
                    # metrics.update(
                    #     [planner_loss.data[0], controller_loss.data[0]])

                    # backprop and update
                    (planner_loss + controller_loss).backward()

                    # The local model is moved to CPU before its gradients are
                    # handed to shared_model (assumes shared_model lives on
                    # CPU — TODO confirm); it is moved back to the GPU at the
                    # top of the next iteration.
                    ensure_shared_grads(model.cpu(), shared_model)
                    optim.step()

                    # if t % args.print_every == 0:
                    #     print(metrics.get_stat_string())
                    #     if args.to_log == 1:
                    #         metrics.dump_log()

                print('[CHECK][Cache:%d][Total:%d]' %
                      (len(train_loader.dataset.img_data_cache),
                       len(train_loader.dataset.env_list)))

                if all_envs_loaded == False:
                    # Load the next set of environments; stop once none remain.
                    train_loader.dataset._load_envs(in_order=True)
                    if len(train_loader.dataset.pruned_env_set) == 0:
                        done = True
                        if args.to_cache == False:
                            # Without caching, restart from the first
                            # environment for the next epoch.
                            train_loader.dataset._load_envs(
                                start_idx=0, in_order=True)
                else:
                    done = True

        # ------------------------------------------------------------------
        # Evaluation pass
        # ------------------------------------------------------------------
        # (episode index, start offset) pairs that could not be evaluated.
        invalids = []

        done = False
        model.eval()

        while done == False:

            for batch in tqdm(eval_loader):
                if batch is None:
                    continue

                model.load_state_dict(shared_model.state_dict())
                model.cuda()

                idx, question, answer, actions, action_length = batch
                metrics_slug = {}

                h3d = eval_loader.dataset.episode_house

                # evaluate at multiple initializations
                for i in [10, 30, 50]:

                    t += 1

                    # Skip offsets larger than the episode's action length.
                    if i > action_length[0]:
                        invalids.append([idx[0], i])
                        continue

                    question_var = Variable(question.cuda())

                    controller_step = False
                    planner_hidden = model.planner_nav_rnn.init_hidden(1)

                    # forward through planner till spawn
                    planner_actions_in, planner_img_feats, controller_step, controller_action_in, controller_img_feat, init_pos = eval_loader.dataset.get_hierarchical_features_till_spawn(
                        actions[0, :action_length[0] + 1].numpy(), i)

                    planner_actions_in_var = Variable(
                        planner_actions_in.cuda())
                    planner_img_feats_var = Variable(
                        planner_img_feats.cuda())

                    # Feed the pre-spawn steps through the planner one at a
                    # time to build up its hidden state.
                    for step in range(planner_actions_in.size(0)):

                        planner_scores, planner_hidden = model.planner_step(
                            question_var, planner_img_feats_var[step].view(
                                1, 1, 3200), planner_actions_in_var[step].view(
                                    1, 1), planner_hidden)

                    if controller_step == True:

                        controller_img_feat_var = Variable(
                            controller_img_feat.cuda())
                        controller_action_in_var = Variable(
                            torch.LongTensor(1, 1).fill_(
                                int(controller_action_in)).cuda())

                        controller_scores = model.controller_step(
                            controller_img_feat_var.view(1, 1, 3200),
                            controller_action_in_var.view(1, 1),
                            planner_hidden[0])

                        prob = F.softmax(controller_scores, dim=1)
                        controller_action = int(
                            prob.max(1)[1].data.cpu().numpy()[0])

                        # Controller output 1 keeps the controller in charge
                        # for the next step.
                        if controller_action == 1:
                            controller_step = True
                        else:
                            controller_step = False

                        action = int(controller_action_in)
                        action_in = torch.LongTensor(
                            1, 1).fill_(action + 1).cuda()

                    else:

                        prob = F.softmax(planner_scores, dim=1)
                        action = int(prob.max(1)[1].data.cpu().numpy()[0])

                        action_in = torch.LongTensor(
                            1, 1).fill_(action + 1).cuda()

                    # init_pos index 1 is not used for the reset.
                    h3d.env.reset(
                        x=init_pos[0], y=init_pos[2], yaw=init_pos[3])

                    init_dist_to_target = h3d.get_dist_to_target(
                        h3d.env.cam.pos)
                    if init_dist_to_target < 0:  # unreachable
                        invalids.append([idx[0], i])
                        continue

                    episode_length = 0
                    episode_done = True
                    controller_action_counter = 0

                    dists_to_target, pos_queue, pred_actions = [
                        init_dist_to_target
                    ], [init_pos], []
                    planner_actions, controller_actions = [], []

                    # Action 3 is recorded as "stop" in the stats below; the
                    # rollout only starts when the first action is not 3.
                    if action != 3:

                        # take the first step
                        img, _, _ = h3d.step(action)
                        img = torch.from_numpy(img.transpose(
                            2, 0, 1)).float() / 255.0
                        img_feat_var = eval_loader.dataset.cnn(
                            Variable(img.view(1, 3, 224, 224).cuda())).view(
                                1, 1, 3200)

                        for step in range(args.max_episode_length):

                            episode_length += 1

                            # The planner picks a new action only when the
                            # controller is not continuing the previous one.
                            if controller_step == False:
                                planner_scores, planner_hidden = model.planner_step(
                                    question_var, img_feat_var,
                                    Variable(action_in), planner_hidden)

                                prob = F.softmax(planner_scores, dim=1)
                                action = int(
                                    prob.max(1)[1].data.cpu().numpy()[0])
                                planner_actions.append(action)

                            pred_actions.append(action)
                            img, _, episode_done = h3d.step(action)

                            episode_done = episode_done or episode_length >= args.max_episode_length

                            img = torch.from_numpy(img.transpose(
                                2, 0, 1)).float() / 255.0
                            img_feat_var = eval_loader.dataset.cnn(
                                Variable(img.view(1, 3, 224, 224)
                                         .cuda())).view(1, 1, 3200)

                            dists_to_target.append(
                                h3d.get_dist_to_target(h3d.env.cam.pos))
                            pos_queue.append([
                                h3d.env.cam.pos.x, h3d.env.cam.pos.y,
                                h3d.env.cam.pos.z, h3d.env.cam.yaw
                            ])

                            if episode_done == True:
                                break

                            # query controller to continue or not
                            controller_action_in = Variable(
                                torch.LongTensor(1, 1).fill_(action).cuda())
                            controller_scores = model.controller_step(
                                img_feat_var, controller_action_in,
                                planner_hidden[0])

                            prob = F.softmax(controller_scores, dim=1)
                            controller_action = int(
                                prob.max(1)[1].data.cpu().numpy()[0])

                            # The controller may repeat an action at most four
                            # times in a row before control returns to the
                            # planner.
                            if controller_action == 1 and controller_action_counter < 4:
                                controller_action_counter += 1
                                controller_step = True
                            else:
                                controller_action_counter = 0
                                controller_step = False
                                controller_action = 0

                            controller_actions.append(controller_action)

                            action_in = torch.LongTensor(
                                1, 1).fill_(action + 1).cuda()

                    # compute stats
                    metrics_slug['d_0_' + str(i)] = dists_to_target[0]
                    metrics_slug['d_T_' + str(i)] = dists_to_target[-1]
                    metrics_slug['d_D_' + str(
                        i)] = dists_to_target[0] - dists_to_target[-1]
                    metrics_slug['d_min_' + str(i)] = np.array(
                        dists_to_target).min()

                    metrics_slug['ep_len_' + str(i)] = episode_length
                    if action == 3:
                        metrics_slug['stop_' + str(i)] = 1
                    else:
                        metrics_slug['stop_' + str(i)] = 0

                    inside_room = []
                    for p in pos_queue:
                        inside_room.append(
                            h3d.is_inside_room(
                                p, eval_loader.dataset.target_room))
                    # r_T: final position is inside the target room;
                    # r_e: any visited position was inside it.
                    if inside_room[-1] == True:
                        metrics_slug['r_T_' + str(i)] = 1
                    else:
                        metrics_slug['r_T_' + str(i)] = 0
                    if any([x == True for x in inside_room]) == True:
                        metrics_slug['r_e_' + str(i)] = 1
                    else:
                        metrics_slug['r_e_' + str(i)] = 0

                # collate and update metrics
                # Metrics missing for this episode are filled from
                # metrics.metrics[...][0] (presumably the current running
                # value — TODO confirm).
                # NOTE(review): the loop variable reuses the name `i` from the
                # offset loop above.
                metrics_list = []
                for i in metrics.metric_names:
                    if i not in metrics_slug:
                        metrics_list.append(metrics.metrics[
                            metrics.metric_names.index(i)][0])
                    else:
                        metrics_list.append(metrics_slug[i])

                # update metrics
                metrics.update(metrics_list)

            # NOTE(review): bare `except` hides every error raised while
            # formatting the stats, including KeyboardInterrupt.
            try:
                print(metrics.get_stat_string(mode=0))
            except:
                pass

            print('epoch', epoch)
            print('invalids', len(invalids))

            eval_loader.dataset._load_envs()
            if len(eval_loader.dataset.pruned_env_set) == 0:
                done = True

        # checkpoint if best val loss
        print("ecoch {}: if {} > best_eval_acc {}".format(epoch, metrics.metrics[8][0], best_eval_acc))
        if metrics.metrics[8][0] > best_eval_acc:  # d_D_50
            best_eval_acc = metrics.metrics[8][0]
            if epoch % args.eval_every == 0 and args.to_log == 1:
                metrics.dump_log()

                model_state = get_state(model)

                # Store every option whose name does not start with '_'.
                aad = dict(args.__dict__)
                ad = {}
                for i in aad:
                    if i[0] != '_':
                        ad[i] = aad[i]

                checkpoint = {'args': ad, 'state': model_state, 'epoch': epoch}

                checkpoint_path = '%s/epoch_%d_d_D_50_%.04f.pt' % (
                    args.checkpoint_dir, epoch, best_eval_acc)
                print('Saving checkpoint to %s' % checkpoint_path)
                torch.save(checkpoint, checkpoint_path)

        print('[best_eval_d_D_50:%.04f]' % best_eval_acc)

        # Rewind the evaluation environments for the next epoch.
        eval_loader.dataset._load_envs(start_idx=0, in_order=True)

        epoch += 1
def forward(self,
            input,
            hidden,
            encoder_output,
            encoder_outputs,
            input_variable,
            attn=False):
    """Run one decoder step, optionally attending over the encoder outputs.

    Args:
        input: Token indices for this step; they are embedded and given a
            leading length-1 dimension.
        hidden: Recurrent state passed to ``self.rnn``. For the LSTM-style
            units ("LSTM", "MyLSTM", "LSTMSqueeze", "ONLSTM") it is indexed
            as ``hidden[0][0]`` for attention, otherwise as ``hidden[0]``.
        encoder_output: Unused; kept for interface compatibility.
        encoder_outputs: Encoder states; indexed along dim 0 per source
            position and transposed to batch-first for ``torch.bmm``.
        input_variable: Source batch; only ``size()[0]`` is read, and only
            when ``attn == 2``.
        attn: ``1`` (or ``True``) selects attention computed by ``self.attn``
            from the embedded input and the hidden state; ``2`` selects
            attention scored per source position with ``self.attn_layer`` and
            ``self.v``; any other value disables attention.

    Returns:
        ``(output, hidden, attn_weights)`` where ``output`` holds the
        log-probabilities (log-softmax over dim 1) and ``attn_weights`` is
        only populated for ``attn == 1`` (it stays ``None`` otherwise,
        including for ``attn == 2``).
    """
    lstm_like_units = ("LSTM", "MyLSTM", "LSTMSqueeze", "ONLSTM")

    output = self.embedding(input).unsqueeze(0)
    output = self.dropout(output)

    attn_weights = None

    if attn == 1:
        if self.recurrent_unit in lstm_like_units:
            query = hidden[0][0]
        else:
            query = hidden[0]

        # The softmax input is 2-D, so the formerly implicit dimension was 1;
        # it is now explicit to avoid the deprecation warning.
        attn_weights = F.softmax(
            self.attn(torch.cat((output[0], query), 1)), dim=1)

        attn_applied = torch.bmm(attn_weights.unsqueeze(1),
                                 encoder_outputs.transpose(0, 1))
        attn_applied = attn_applied.transpose(0, 1)

        output = torch.cat((output[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

    if attn == 2:
        input_length = input_variable.size()[0]

        # One score per (source position, batch element). The batch size,
        # device and dtype come from the step input instead of module-level
        # globals, so a final smaller batch also works.
        u_i = output.new_zeros(len(encoder_outputs), output.size(1))

        if self.recurrent_unit in lstm_like_units:
            query = hidden[0][0].unsqueeze(0)
        else:
            query = hidden[0].unsqueeze(0)

        for i in range(input_length):
            attn_hidden = torch.tanh(
                self.attn_layer(
                    torch.cat((encoder_outputs[i].unsqueeze(0), query,
                               output), 2)))
            u_i_j = torch.bmm(attn_hidden,
                              self.v.unsqueeze(1).unsqueeze(0))
            u_i[i] = u_i_j[0].view(-1)

        # Rows of u_i beyond input_length keep a score of zero and still take
        # part in the softmax, as before.
        a_i = F.softmax(u_i.transpose(0, 1), dim=1)

        attn_applied = torch.bmm(a_i.unsqueeze(1),
                                 encoder_outputs.transpose(0, 1))
        attn_applied = attn_applied.transpose(0, 1)

        output = torch.cat((output[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

    # The same recurrent module is applied n_layers times in sequence.
    for _ in range(self.n_layers):
        output = F.relu(output)
        output, hidden = self.rnn(output, hidden)

    output = F.log_softmax(self.out(output[0]), dim=1)
    return output, hidden, attn_weights
def forward(self, source: List[List[str]],
            target: List[List[str]]) -> torch.Tensor:
    """Compute the total log-likelihood of the target sentences for a batch.

    @param source (List[List[str]]): list of source sentence tokens
    @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`

    @returns scores (Tensor): a single-element tensor holding the
        word-level log-likelihood of the gold target words summed over the
        whole batch; when ``self.charDecoder`` is set, the value returned by
        ``self.charDecoder.train_forward`` is subtracted from it.
    """
    # Compute sentence lengths
    source_lengths = [len(s) for s in source]

    # Word-level target indices are kept for scoring the predictions; the
    # character-level tensors feed the encoder and decoder.
    target_padded = self.vocab.tgt.to_input_tensor(
        target, device=self.device)  # (tgt_len, b)
    source_padded_chars = self.vocab.src.to_input_tensor_char(
        source, device=self.device)  # (src_len, b, w_len)
    target_padded_chars = self.vocab.tgt.to_input_tensor_char(
        target, device=self.device)  # (tgt_len, b, w_len)

    enc_hiddens, dec_init_state = self.encode(source_padded_chars,
                                              source_lengths)
    enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
    combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state,
                                   target_padded_chars)

    P = F.log_softmax(self.target_vocab_projection(combined_outputs),
                      dim=-1)

    # Zero out probabilities for which we have nothing in the target text
    target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

    # Log probability of the true target words; position 0 is skipped because
    # it is only ever an input, never a prediction.
    target_gold_words_log_prob = torch.gather(
        P, index=target_padded[1:].unsqueeze(-1),
        dim=-1).squeeze(-1) * target_masks[1:]
    scores = target_gold_words_log_prob.sum()

    if self.charDecoder is not None:
        max_word_len = target_padded_chars.shape[-1]

        target_chars = target_padded_chars[1:].reshape(-1, max_word_len)
        # Flatten to one decoder state per target word. The hidden size is
        # read from the tensor rather than hard-coded to 256.
        target_outputs = combined_outputs.view(-1,
                                               combined_outputs.shape[-1])

        # Every target word (not only out-of-vocabulary ones) is used to
        # train the character decoder; the same state is passed as both
        # elements of its initial state tuple.
        oovs_losses = self.charDecoder.train_forward(
            target_chars.t().contiguous(),
            (target_outputs.unsqueeze(0), target_outputs.unsqueeze(0)))
        scores = scores - oovs_losses

    return scores
def decode(self, hbatch, lengths, model_lm=None):
    """Beam-search decode and return the best token sequence ending in EOS.

    Only the first batch row is decoded: candidate scores are read from
    ``tmpy[0]`` and the chosen index is converted with ``.item()``.

    Args:
        hbatch: Encoder output; dim 0 is the batch and dim 1 the frames.
        lengths: Valid frame count per batch row; frames at or beyond it are
            masked out for the attention module.
        model_lm: Optional language model. When given, its normalized score
            for the next token (last time step of ``model_lm(prefix)``) is
            added with weight ``self.hp.lm_weight`` once the prefix is
            non-empty.

    Returns:
        The token ids of the top-ranked hypothesis as soon as it ends with
        ``self.hp.eos_id`` (EOS included). An empty list if no top-ranked
        hypothesis ends with EOS within ``self.hp.max_decoder_seq_len`` steps.

    Raises:
        ValueError: If ``self.hp.score_func`` is neither ``'log_softmax'``
            nor ``'softmax'``.
    """
    score_func = self.hp.score_func
    if score_func == 'log_softmax':
        normalize = F.log_softmax
    elif score_func == 'softmax':
        normalize = F.softmax
    else:
        raise ValueError(
            "unsupported score_func: {!r}".format(score_func))

    device = hbatch.device
    batch_size = hbatch.size(0)
    num_frames = hbatch.size(1)
    beam_width = self.hp.beam_width
    hidden_nodes = self.num_decoder_hidden_nodes

    # 1.0 for valid frames, 0.0 for padding.
    e_mask = torch.ones((batch_size, num_frames, 1),
                        device=device,
                        requires_grad=False)
    for i, tmp in enumerate(lengths):
        if tmp < num_frames:
            e_mask[i, tmp:] = 0.0

    # Each hypothesis is (tokens, score, (cell state, hidden state, alpha)).
    token_beam_sel = [([], 0.0,
                       (torch.zeros((batch_size, hidden_nodes),
                                    device=device,
                                    requires_grad=False),
                        torch.zeros((batch_size, hidden_nodes),
                                    device=device,
                                    requires_grad=False),
                        torch.zeros((batch_size, 1, num_frames),
                                    device=device,
                                    requires_grad=False)))]

    results = []
    for seq_step in range(self.hp.max_decoder_seq_len):
        token_beam_all = []

        for cand_seq, cand_seq_score, (c, s, alpha) in token_beam_sel:
            g, alpha = self.att(s, hbatch, alpha, e_mask)

            # generate
            y = self.L_yy(torch.tanh(self.L_gy(g) + self.L_sy(s)))
            y = normalize(y, dim=1)

            # Fuse the language model only once there is a prefix to feed it.
            # Both score functions now share this path; previously the
            # 'softmax' branch never assigned the fused scores to `tmpy`.
            if model_lm is not None and len(cand_seq) > 0:
                lm_input = torch.tensor([cand_seq],
                                        dtype=torch.long,
                                        device=device)
                lm_score = model_lm(lm_input)[:, -1, :]
                tmpy = y + self.hp.lm_weight * normalize(lm_score, dim=1)
            else:
                tmpy = y.clone()

            # Take the beam_width best continuations by repeatedly picking
            # the argmax and masking it out.
            for _ in range(beam_width):
                bestidx = tmpy.data.argmax(1)

                tmpseq = cand_seq.copy()
                tmpseq.append(bestidx.item())

                tmpscore = cand_seq_score + tmpy.data[0][bestidx]
                tmpy.data[0][bestidx] = -10000000000.0

                rec_input = self.L_ys(bestidx) + self.L_ss(s) + self.L_gs(g)
                tmps, tmpc = self._func_lstm(rec_input, c)

                token_beam_all.append((tmpseq, tmpscore, (tmpc, tmps,
                                                          alpha)))

        token_beam_sel = sorted(token_beam_all,
                                key=itemgetter(1),
                                reverse=True)[:beam_width]

        # Stop as soon as the best hypothesis has produced EOS.
        best_seq = token_beam_sel[0][0]
        if best_seq[-1] == self.hp.eos_id:
            results = list(best_seq)
            break

    return results
def decode_v2(self, hbatch, lengths, model_lm=None):
    """Beam-search decode that collects every hypothesis ending in ``</s>``.

    Unlike a search that stops at the first EOS on top of the beam, each
    candidate that predicts ``self.hp.eos_id`` (from the second step on) is
    moved to a result list and the search continues with the remaining
    candidates until ``beam_width`` results exist or
    ``self.hp.max_decoder_seq_len`` steps have been taken. Every step adds a
    constant bonus of 1 to the score of each extended hypothesis.

    Args:
        hbatch: Encoder output; dim 0 is the batch and dim 1 the frames.
        lengths: Valid frame count per batch row; later frames are masked.
        model_lm: Optional language model whose normalized next-token score
            is added with weight ``self.hp.lm_weight`` from the second step
            on.

    Returns:
        Token ids (EOS included) of the highest-scoring finished hypothesis,
        or an empty list when no hypothesis finished.

    Raises:
        ValueError: If ``self.hp.score_func`` is neither ``'log_softmax'``
            nor ``'softmax'``.
    """
    score_func = self.hp.score_func
    if score_func == 'log_softmax':
        normalize = F.log_softmax
    elif score_func == 'softmax':
        normalize = F.softmax
    else:
        raise ValueError(
            "unsupported score_func: {!r}".format(score_func))

    device = hbatch.device
    batch_size = hbatch.shape[0]
    num_frames = hbatch.shape[1]
    beam_width = self.hp.beam_width
    max_len = self.hp.max_decoder_seq_len
    hidden_nodes = self.num_decoder_hidden_nodes

    # 1.0 for valid frames, 0.0 for padding.
    e_mask = torch.ones((batch_size, num_frames, 1),
                        device=device,
                        requires_grad=False)
    for i, tmp in enumerate(lengths):
        if tmp < num_frames:
            e_mask[i, tmp:] = 0.0

    # Live hypotheses, one row per beam slot.
    beam_search = {
        'result': torch.zeros((beam_width, max_len),
                              device=device,
                              dtype=torch.long),
        'length': torch.zeros(beam_width).long(),
        'score': torch.zeros((beam_width), device=device,
                             dtype=torch.float).fill_(0),
        'c': torch.zeros((beam_width, hidden_nodes), device=device),
        's': torch.zeros((beam_width, hidden_nodes), device=device),
        'alpha': torch.zeros((beam_width, max_len, num_frames),
                             device=device)
    }

    # Finished hypotheses; only the first `beam_step` rows are populated.
    beam_results = {
        'score': torch.zeros((beam_width), device=device,
                             dtype=torch.float).fill_(0),
        'result': torch.zeros((beam_width, max_len),
                              device=device,
                              dtype=torch.long),
        'length': torch.zeros(beam_width).long(),
        'alpha': torch.zeros((beam_width, max_len, num_frames),
                             device=device,
                             requires_grad=False)
    }
    beam_step = 0

    for seq_step in range(max_len):
        # Snapshots: the live rows are overwritten below while the old rows
        # are still being read.
        cand_seq = beam_search['result'].clone()
        cand_alpha = beam_search['alpha'].clone()
        cand_score = beam_search['score'].unsqueeze(1)
        c = beam_search['c']
        s = beam_search['s']

        # The attention module receives the previous step's alignment
        # (step 0 reads the all-zero row 0).
        prev_step = max(seq_step - 1, 0)
        g, alpha = self.att(s, hbatch,
                            cand_alpha[:, prev_step, :].unsqueeze(1), e_mask)

        # generate
        y = self.L_yy(torch.tanh(self.L_gy(g) + self.L_sy(s)))
        y = normalize(y, dim=1)

        # Both score functions share the fusion path; previously the
        # 'softmax' branch computed the fused scores but then ranked with a
        # stale `tmpy` from an earlier step.
        if model_lm is not None and seq_step > 0:
            lm_input = cand_seq[:, :seq_step]
            lm_score = model_lm(lm_input)[:, -1, :]
            tmpy = y + self.hp.lm_weight * normalize(lm_score, dim=1)
        else:
            tmpy = y.clone()

        best_scores, best_indices = tmpy.data.topk(beam_width, dim=1)
        scores = cand_score + best_scores + 1  # constant per-token bonus

        if seq_step == 0:
            # All slots start from the same state, so the beam is seeded with
            # the top tokens of the first row. EOS is not checked here.
            beam_search['score'] = scores[0]
            beam_search['result'][:, 0] = best_indices[0]
            beam_search['length'] += 1
            beam_search['alpha'][:, 0, :] = alpha.squeeze(1)

            rec_input = self.L_ys(
                best_indices[0]) + self.L_ss(s) + self.L_gs(g)
            tmps, tmpc = self._func_lstm(rec_input, c)
            beam_search['s'] = tmps
            beam_search['c'] = tmpc
            continue

        # Rank up to 2 * beam_width continuations: at most beam_width of them
        # can be EOS (one per row), which leaves enough to refill the beam.
        # The cap keeps beam_width == 1 from asking for more candidates than
        # exist.
        flat_scores = scores.reshape(-1)
        k_scores, k_ix = flat_scores.topk(
            min(beam_width * 2, flat_scores.numel()))
        cand_idx = k_ix // beam_width  # source beam slot
        cand_ids = k_ix % beam_width  # rank within that slot's top-k

        tmp_s = torch.zeros((beam_width, hidden_nodes), device=device)
        tmp_c = torch.zeros((beam_width, hidden_nodes), device=device)
        tmp_bestidx = torch.zeros((beam_width),
                                  dtype=torch.long,
                                  device=device)
        tmp_g = torch.zeros((beam_width, hidden_nodes * 2),
                            dtype=torch.float,
                            device=device)

        num_cand = 0
        i_cand = 0
        while num_cand < beam_width:
            src = cand_idx[i_cand]
            token = best_indices[src, cand_ids[i_cand]]

            if token == self.hp.eos_id:
                # Finished hypothesis: move it to the result list.
                beam_results['score'][beam_step] = k_scores[i_cand]
                beam_results['result'][beam_step] = cand_seq[src]
                beam_results['result'][beam_step][seq_step] = token
                beam_results['length'][beam_step] = seq_step + 1
                beam_results['alpha'][beam_step] = cand_alpha[src, :, :]
                beam_results['alpha'][beam_step][seq_step] = alpha[
                    src].squeeze(0)
                beam_step += 1
                i_cand += 1
            else:
                beam_search['score'][num_cand] = k_scores[i_cand]
                beam_search['result'][num_cand] = cand_seq[src]
                beam_search['result'][num_cand][seq_step] = token
                beam_search['length'][num_cand] += 1
                tmp_bestidx[num_cand] = token
                beam_search['alpha'][num_cand] = cand_alpha[src, :, :]
                beam_search['alpha'][num_cand][seq_step] = alpha[
                    src].squeeze(0)
                tmp_s[num_cand] = s[src]
                tmp_c[num_cand] = c[src]
                tmp_g[num_cand] = g[src]
                i_cand += 1
                num_cand += 1

            if beam_step >= beam_width:
                break

        # Enough finished hypotheses: no need to advance the decoder state.
        if beam_step >= beam_width:
            break

        rec_input = self.L_ys(tmp_bestidx) + self.L_ss(tmp_s) + self.L_gs(
            tmp_g)
        tmps, tmpc = self._func_lstm(rec_input, tmp_c)
        beam_search['s'] = tmps
        beam_search['c'] = tmpc

    if beam_step == 0:
        return []

    # Rank only the populated result rows; unfilled rows keep a score of 0
    # and would otherwise beat any finished hypothesis with a negative score.
    best_idx = beam_results['score'][:beam_step].argmax()
    length = beam_results['length'][best_idx]
    results = beam_results['result'][best_idx][:length].cpu().tolist()
    return results
def forward(self, x):
    """Project ``x`` with ``self.proj`` and return log-softmax over the last dim."""
    projected = self.proj(x)
    return F.log_softmax(projected, dim=-1)
def forward(self, data):
    """Apply ``self.conv1`` to ``data.x`` / ``data.edge_index``.

    Returns the log-softmax over dim 1 of the convolution output.
    """
    logits = self.conv1(data.x, data.edge_index)
    return F.log_softmax(logits, dim=1)
def regr_fcn(logits, multi_label=False):
    """Turn raw logits into scores.

    With ``multi_label`` the element-wise sigmoid is returned; otherwise the
    log-softmax over dim 1.
    """
    if not multi_label:
        return F.log_softmax(logits, 1)
    return torch.sigmoid(logits)
def forward(self, x):
    """Flatten ``x`` to rows of ``iSize * iSize`` values and classify them.

    The flattened input goes through ``self.fc1``, a ReLU and ``self.fc2``;
    the result is the log-softmax over dim 1.
    """
    flat = x.view(-1, self.iSize * self.iSize)
    hidden = torch.relu(self.fc1(flat))
    logits = self.fc2(hidden)
    return F.log_softmax(logits, dim=1)
def forward(self, x, edge_index):
    """Two graph convolutions with ReLU and dropout in between.

    Dropout is only active when ``self.training`` is true. Returns the
    log-softmax over dim 1 of the second convolution's output.
    """
    hidden = self.conv1(x, edge_index)
    hidden = F.relu(hidden)
    hidden = F.dropout(hidden, training=self.training)
    logits = self.conv2(hidden, edge_index)
    return F.log_softmax(logits, dim=1)
def decoder(self, z):
    """Decode ``z`` into log-probabilities (log-softmax over dim 1).

    ``z`` passes through ``self.fc4`` and ``self.fc5``, each followed by a
    tanh, and then through ``self.fc6``.
    """
    hidden = F.tanh(self.fc4(z))
    hidden = F.tanh(self.fc5(hidden))
    logits = self.fc6(hidden)
    return F.log_softmax(logits, dim=1)
def main():
    """Fine-tune a 3-way RoBERTa classifier on a k-shot SciTail training set.

    The script parses command-line arguments, sets up the device and seeds,
    loads SciTail train/dev/test examples from hard-coded paths, restores a
    3-class checkpoint from a hard-coded path and, when ``--do_train`` is
    given, trains while evaluating on dev (and conditionally on test) at the
    end of every pass over the training loader.

    The model predicts three classes while SciTail has two labels; at
    evaluation time predicted class 0 is kept as label 0 and every other
    class is mapped to label 1.

    Raises:
        ValueError: if ``gradient_accumulation_steps`` < 1, if neither
            ``--do_train`` nor ``--do_eval`` is set, or if the task name is
            unknown.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    # The original help text was copy-pasted from --seed; describe the real
    # purpose of the flag instead.
    parser.add_argument('--kshot',
                        type=int,
                        default=5,
                        help="Number of shots (k) used when sampling the k-shot training set.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=1e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help="Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    processors = {"rte": RteProcessor}
    output_modes = {"rte": "classification"}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    # train_examples = processor.get_RTE_as_train_k_shot('/export/home/Dataset/glue_data/RTE/train.tsv', args.kshot) #train_pu_half_v1.txt
    # dev_examples = processor.get_RTE_as_dev('/export/home/Dataset/glue_data/RTE/dev.tsv')
    # test_examples = processor.get_RTE_as_test('/export/home/Dataset/RTE/test_RTE_1235.txt')
    scitail_path = '/export/home/Dataset/SciTailV1/tsv_format/'
    train_examples = processor.get_SciTail_as_train_k_shot(
        scitail_path + 'scitail_1.0_train.tsv',
        args.kshot)  #train_pu_half_v1.txt
    dev_examples, test_examples = processor.get_SciTail_dev_and_test(
        scitail_path + 'scitail_1.0_dev.tsv',
        scitail_path + 'scitail_1.0_test.tsv')

    label_list = ["entails", "neutral"]
    num_labels = len(label_list)
    print('num_labels:', num_labels, 'training size:', len(train_examples),
          'dev size:', len(dev_examples), 'test size:', len(test_examples))

    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )

    # The classifier head has 3 outputs even though SciTail has 2 labels; the
    # checkpoint loaded below must match that 3-way head.
    model = RobertaForSequenceClassification(3)
    tokenizer = RobertaTokenizer.from_pretrained(
        pretrain_model_dir, do_lower_case=args.do_lower_case)
    model.load_state_dict(
        torch.load(
            '/export/home/Dataset/BERT_pretrained_mine/MNLI_pretrained/_acc_0.9040886899918633.pt'
        ))
    model.to(device)

    # No weight decay for biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    max_test_acc = 0.0
    max_dev_acc = 0.0
    if args.do_train:

        def _featurize(examples):
            """Convert examples to features with the RoBERTa-style settings."""
            return convert_examples_to_features(
                examples,
                label_list,
                args.max_seq_length,
                tokenizer,
                output_mode,
                cls_token_at_end=False,  # xlnet has a cls token at the end
                cls_token=tokenizer.cls_token,
                cls_token_segment_id=0,  # 2 for xlnet
                sep_token=tokenizer.sep_token,
                # roberta uses an extra separator b/w pairs of sentences, cf.
                # github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                sep_token_extra=True,
                pad_on_left=False,  # pad on the left for xlnet
                pad_token=tokenizer.convert_tokens_to_ids(
                    [tokenizer.pad_token])[0],
                pad_token_segment_id=0)  # 4 for xlnet

        def _to_dataset(features):
            """Pack ids, mask, segment ids and labels into a TensorDataset."""
            return TensorDataset(
                torch.tensor([f.input_ids for f in features],
                             dtype=torch.long),
                torch.tensor([f.input_mask for f in features],
                             dtype=torch.long),
                torch.tensor([f.segment_ids for f in features],
                             dtype=torch.long),
                torch.tensor([f.label_id for f in features],
                             dtype=torch.long))

        train_features = _featurize(train_examples)

        # load dev set
        dev_data = _to_dataset(_featurize(dev_examples))
        dev_sampler = SequentialSampler(dev_data)
        dev_dataloader = DataLoader(dev_data,
                                    sampler=dev_sampler,
                                    batch_size=args.eval_batch_size)

        # load test set
        eval_data = _to_dataset(_featurize(test_examples))
        eval_sampler = SequentialSampler(eval_data)
        test_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)

        train_data = _to_dataset(train_features)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        iter_co = 0
        final_test_performance = 0.0
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for batch in tqdm(train_dataloader, desc="Iteration"):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                logits = model(input_ids, input_mask)

                prob_matrix = F.log_softmax(logits.view(-1, 3), dim=1)
                # "* 1.0" creates a new tensor, so the indexed assignment
                # below does not write into prob_matrix itself.
                new_prob_matrix = prob_matrix * 1.0
                # For rows whose label is non-zero, replace column 0 with
                # 1.0 minus the original column-0 value.
                changed_places = torch.nonzero(label_ids.view(-1),
                                               as_tuple=False)
                new_prob_matrix[changed_places,
                                0] = 1.0 - prob_matrix[changed_places, 0]

                # Every row is scored against target class 0.
                loss = F.nll_loss(
                    new_prob_matrix,
                    torch.zeros_like(label_ids).to(device).view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                iter_co += 1
                # if iter_co %20==0:
                if iter_co % len(train_dataloader) == 0:
                    # Evaluate once per full pass over the training loader:
                    # dev first, then test only if dev accuracy improved.
                    model.eval()

                    for idd, dev_or_test_dataloader in enumerate(
                        [dev_dataloader, test_dataloader]):

                        if idd == 0:
                            logger.info("***** Running dev *****")
                            logger.info(" Num examples = %d",
                                        len(dev_examples))
                        else:
                            logger.info("***** Running test *****")
                            logger.info(" Num examples = %d",
                                        len(test_examples))

                        logit_batches = []
                        gold_label_ids = []
                        for input_ids, input_mask, segment_ids, label_ids in dev_or_test_dataloader:
                            input_ids = input_ids.to(device)
                            input_mask = input_mask.to(device)
                            segment_ids = segment_ids.to(device)
                            label_ids = label_ids.to(device)
                            gold_label_ids += list(
                                label_ids.detach().cpu().numpy())

                            with torch.no_grad():
                                logits = model(input_ids, input_mask)
                            logit_batches.append(
                                logits.detach().cpu().numpy())

                        preds = np.concatenate(logit_batches, axis=0)

                        pred_probs = softmax(preds, axis=1)
                        pred_label_ids_3way = list(
                            np.argmax(pred_probs, axis=1))
                        # change from 3-way to 2-way: class 0 stays 0, any
                        # other predicted class becomes 1
                        pred_label_ids = [
                            0 if pred_id == 0 else 1
                            for pred_id in pred_label_ids_3way
                        ]

                        assert len(pred_label_ids) == len(gold_label_ids)
                        hit_co = sum(
                            1 for pred_id, gold_id in zip(
                                pred_label_ids, gold_label_ids)
                            if pred_id == gold_id)
                        test_acc = hit_co / len(gold_label_ids)

                        if idd == 0:  # this is dev
                            dev_improved = test_acc > max_dev_acc
                            if dev_improved:
                                max_dev_acc = test_acc
                            print('\ndev acc:', test_acc, ' max_dev_acc:',
                                  max_dev_acc, '\n')
                            if not dev_improved:
                                # skip the test evaluation for this pass
                                break
                        else:  # this is test
                            if test_acc > max_test_acc:
                                max_test_acc = test_acc

                            # Test is only reached when dev improved, so this
                            # holds the test accuracy at the best dev pass.
                            final_test_performance = test_acc
                            print('\ntest acc:', test_acc, ' max_test_acc:',
                                  max_test_acc, '\n')
        print('final_test_performance:', final_test_performance)
def forward(self, x, target):
    """Label-smoothed cross-entropy between logits ``x`` and class ids ``target``.

    The one-hot target is mixed with a uniform distribution using the
    smoothing factor ``self.e``; the per-row losses are averaged.
    """
    num_classes = x.size(1)
    one_hot = torch.zeros_like(x)
    one_hot.scatter_(1, target.unsqueeze(1), 1)
    soft_labels = (1 - self.e) * one_hot + self.e / num_classes
    neg_log_probs = -F.log_softmax(x, dim=1)
    per_sample = (neg_log_probs * soft_labels).sum(dim=1)
    return per_sample.mean()
def log_prob(self, x):
    """Calculate the log prob of all vocab.

    ``x`` is passed through ``self.linear`` (called with ``linear=True``) and
    the log-softmax over the last dimension is returned as float32.
    """
    logits = self.linear(x, linear=True)
    return F.log_softmax(logits, dim=-1, dtype=torch.float32)
# One optimisation step: classification loss on source data, entropy loss on
# target data, a KL term pulling source bottleneck features towards a softmax
# of Gaussian noise, and a distribution alignment loss (DAL).
# Gaussian noise sample, built from a (32, 2048) array.
zn = Variable(Tensor(np.random.normal(0, 1, (32, 2048))))

opt_g.zero_grad()
opt_f.zero_grad()

s_bottleneck = netG(s_imgs)
t_bottleneck = netG(t_imgs)
s_fc2_emb, s_logit = netF(s_bottleneck)
t_fc2_emb, t_logit = netF(t_bottleneck)

s_cls_loss = get_cls_loss(s_logit, s_labels)

# kl-divergence
# Both operands are 2-D here, so dim=1 is what the implicit-dim rule picked;
# making it explicit avoids the deprecation warning.
feat_s_kl = s_bottleneck.view(-1, 2048)
loss_kld_s = F.kl_div(F.log_softmax(feat_s_kl, dim=1), F.softmax(zn, dim=1))

# distribution alignment loss (DAL)
# NOTE(review): feat_t_recon / feat_zn_recon are not defined in this
# fragment; they must come from surrounding code.
loss_dal = CriterionDAL(feat_t_recon, feat_zn_recon)

# assumes t_logit carries class scores on dim 1 (e.g. (batch, classes)) —
# TODO confirm against netF
t_prob = F.softmax(t_logit, dim=1)
t_entropy_loss = get_entropy_loss(t_prob)

# updated loss function
loss = s_cls_loss + t_entropy_loss + args.alpha * loss_kld_s + args.beta * loss_dal
loss.backward()

if (i + 1) % 5 == 0:
    print("cls_loss: %.4f, entropy_loss: %.4f" % (s_cls_loss.item(), t_entropy_loss.item()))

opt_g.step()
def get_cls_loss(pred, gt):
    """Return the mean cross-entropy between logits ``pred`` and labels ``gt``.

    ``F.cross_entropy`` applies log-softmax over the class dimension and then
    the negative log-likelihood loss. The previous ``F.log_softmax(pred)``
    call relied on the deprecated implicit ``dim``, which selects dim 0 for
    3-D inputs although ``F.nll_loss`` expects classes on dim 1.

    Args:
        pred: unnormalised class scores.
        gt: target class indices.

    Returns:
        A scalar loss tensor.
    """
    cls_loss = F.cross_entropy(pred, gt)
    return cls_loss
def forward(self, s_logits, t_logits):
    """Temperature-scaled distillation loss between student and teacher logits.

    Student logits become log-probabilities and teacher logits become
    probabilities (both over dimension 1, after dividing by
    ``self.temperature``). ``self.klloss`` is applied to the pair and the
    result is multiplied by the temperature squared.
    """
    temp = self.temperature
    student_log_probs = F.log_softmax(s_logits / temp, 1)
    teacher_probs = F.softmax(t_logits / temp, 1)
    divergence = self.klloss(student_log_probs, teacher_probs)
    return divergence * temp * temp
def distillation(y, teacher_scores, labels, T, alpha):
    """Combine a softened KL term with the hard-label cross-entropy.

    The KL term compares ``y / T`` (log-softmax) with ``teacher_scores / T``
    (softmax) using ``F.kl_div``'s default reduction and is weighted by
    ``T * T * 2 * alpha``; the cross-entropy on ``labels`` is weighted by
    ``1 - alpha``.
    """
    student_log_probs = F.log_softmax(y / T, dim=1)
    teacher_probs = F.softmax(teacher_scores / T, dim=1)
    soft_term = F.kl_div(student_log_probs, teacher_probs) * (T * T * 2. * alpha)
    hard_term = F.cross_entropy(y, labels) * (1. - alpha)
    return soft_term + hard_term
def forward(self, x, adj):
    """Two graph-convolution layers (``gc1``, ``gc2``) over adjacency ``adj``.

    ReLU and dropout (rate ``self.dropout``) are applied between the layers;
    the log-softmax over dimension 1 of the second layer is returned.
    """
    hidden = F.relu(self.gc1(x, adj))
    # The module's overall training flag has to be passed to the functional
    # dropout explicitly.
    hidden = F.dropout(hidden, self.dropout, training=self.training)
    logits = self.gc2(hidden, adj)
    return F.log_softmax(logits, dim=1)
def recognize_beam(self, encoder_outputs, char_list, args):
    """Beam search, decode one utterance now.

    Args:
        encoder_outputs: T x H
        char_list: list of characters, indexed by token id (used only for
            the progress printout)
        args: object providing ``beam_size``, ``nbest`` and
            ``decode_max_len`` (0 means "use the number of encoder frames")

    Returns:
        nbest_hyps: up to ``args.nbest`` ended hypotheses (dicts with
        ``score``, ``yseq`` and the decoder state), best score first.
    """
    # search params
    beam = args.beam_size
    nbest = args.nbest
    maxlen = args.decode_max_len if args.decode_max_len != 0 else encoder_outputs.size(0)

    # *********Init decoder rnn
    enc = encoder_outputs.unsqueeze(0)
    h_list = [self.zero_state(enc)]
    c_list = [self.zero_state(enc)]
    for _ in range(1, self.num_layers):
        h_list.append(self.zero_state(enc))
        c_list.append(self.zero_state(enc))
    att_c = self.zero_state(enc, H=enc.size(2))

    # prepare sos
    vy = encoder_outputs.new_zeros(1).long()
    hyps = [{
        'score': 0.0,
        'yseq': [self.sos_id],
        'c_prev': c_list,
        'h_prev': h_list,
        'a_prev': att_c,
    }]
    ended_hyps = []

    for i in range(maxlen):
        hyps_best_kept = []
        for hyp in hyps:
            vy[0] = hyp['yseq'][i]
            embedded = self.embedding(vy)

            # step 1. decoder RNN: s_i = RNN(s_i-1, y_i-1, c_i-1)
            rnn_input = torch.cat((embedded, hyp['a_prev']), dim=1)
            h_list[0], c_list[0] = self.rnn[0](
                rnn_input, (hyp['h_prev'][0], hyp['c_prev'][0]))
            for layer in range(1, self.num_layers):
                h_list[layer], c_list[layer] = self.rnn[layer](
                    h_list[layer - 1],
                    (hyp['h_prev'][layer], hyp['c_prev'][layer]))
            rnn_output = h_list[-1]

            # step 2. attention: c_i = AttentionContext(s_i, h)
            # unsqueeze: (N x H) -> (N x 1 x H)
            att_c, att_w = self.attention(rnn_output.unsqueeze(dim=1), enc)
            att_c = att_c.squeeze(dim=1)

            # step 3. concatenate s_i and c_i, and feed the MLP
            mlp_input = torch.cat((rnn_output, att_c), dim=1)
            local_scores = F.log_softmax(self.mlp(mlp_input), dim=1)

            # topk scores
            local_best_scores, local_best_ids = torch.topk(
                local_scores, beam, dim=1)

            for j in range(beam):
                # will be (2 x beam) hyps at most
                hyps_best_kept.append({
                    'h_prev': h_list[:],
                    'c_prev': c_list[:],
                    'a_prev': att_c[:],
                    'score': hyp['score'] + local_best_scores[0, j],
                    'yseq': hyp['yseq'] + [int(local_best_ids[0, j])],
                })

            # keep only the `beam` best candidates seen so far
            hyps_best_kept.sort(key=lambda cand: cand['score'], reverse=True)
            del hyps_best_kept[beam:]
        # end for hyp in hyps
        hyps = hyps_best_kept

        # add eos in the final loop so that there is always an ended hyp
        if i == maxlen - 1:
            for hyp in hyps:
                hyp['yseq'].append(self.eos_id)

        # move ended hypotheses to the final list and drop them from the
        # active ones (the number of active hyps may fall below beam)
        remained_hyps = []
        for hyp in hyps:
            bucket = ended_hyps if hyp['yseq'][-1] == self.eos_id else remained_hyps
            bucket.append(hyp)

        hyps = remained_hyps
        if len(hyps) > 0:
            print('remeined hypothes: ' + str(len(hyps)))
        else:
            print('no hypothesis. Finish decoding.')
            break

        for hyp in hyps:
            print('hypo: ' + ' '.join([char_list[int(x)]
                                       for x in hyp['yseq'][1:]]))
    # end for i in range(maxlen)

    ranked = sorted(ended_hyps, key=lambda cand: cand['score'], reverse=True)
    nbest_hyps = ranked[:min(len(ended_hyps), nbest)]
    return nbest_hyps
train_loader = DataLoader(dataset=tiny_sst, batch_size=5, collate_fn=batcher(device), shuffle=False, num_workers=0) # training loop for epoch in range(epochs): for step, batch in enumerate(train_loader): g = batch.graph n = g.number_of_nodes() h = th.zeros((n, h_size)) c = th.zeros((n, h_size)) logits = model(batch, h, c) logp = F.log_softmax(logits, 1) loss = F.nll_loss(logp, batch.label, reduction='sum') optimizer.zero_grad() loss.backward() optimizer.step() pred = th.argmax(logits, 1) acc = float(th.sum(th.eq(batch.label, pred))) / len(batch.label) print("Epoch {:05d} | Step {:05d} | Loss {:.4f} | Acc {:.4f} |".format( epoch, step, loss.item(), acc)) ############################################################################## # To train the model on full dataset with different settings(CPU/GPU, # etc.), please refer to our repo's # `example <https://github.com/jermainewang/dgl/tree/master/examples/pytorch/tree_lstm>`__. # Besides, we also provide an implementation of the Child-Sum Tree LSTM.
def generate_beam(self, src_enc, src_len, tgt_lang_id, beam_size, length_penalty, early_stopping, max_len=200):
    """Decode one target sentence per source sentence with beam search.

    Args:
        src_enc: encoder output; its first dimension is the batch size.
        src_len: LongTensor(bs) of source lengths.
        tgt_lang_id: language id used to fill the ``langs`` input.
        beam_size: number of hypotheses kept per sentence (>= 1).
        length_penalty: forwarded to ``BeamHypotheses``.
        early_stopping: forwarded to ``BeamHypotheses``.
        max_len: maximum number of generated positions, including the
            leading <EOS> that is used as <BOS>.

    Returns:
        ``(decoded, tgt_len)`` where ``decoded`` is a
        ``(tgt_len.max(), bs)`` tensor padded with ``self.pad_index`` in
        which every sentence starts and ends with ``self.eos_index``, and
        ``tgt_len`` holds the length of each decoded sentence.
    """

    # check inputs
    assert src_enc.size(0) == src_len.size(0)
    assert beam_size >= 1

    # batch size / number of words
    bs = len(src_len)
    n_words = self.n_words

    # expand to beam size the source latent representations / source lengths
    src_enc = src_enc.unsqueeze(1).expand((bs, beam_size) + src_enc.shape[1:]).contiguous().view((bs * beam_size,) + src_enc.shape[1:])
    src_len = src_len.unsqueeze(1).expand(bs, beam_size).contiguous().view(-1)

    # generated sentences (batch with beam current hypotheses)
    generated = src_len.new(max_len, bs * beam_size)  # upcoming output
    generated.fill_(self.pad_index)                   # fill upcoming output with <PAD>
    generated[0].fill_(self.eos_index)                # we use <EOS> for <BOS> everywhere

    # generated hypotheses
    generated_hyps = [BeamHypotheses(beam_size, max_len, length_penalty, early_stopping) for _ in range(bs)]

    # positions
    positions = src_len.new(max_len).long()
    positions = torch.arange(max_len, out=positions).unsqueeze(1).expand_as(generated)

    # language IDs
    langs = positions.clone().fill_(tgt_lang_id)

    # scores for each sentence in the beam; only the first beam of each
    # sentence starts "alive", the others get a very low score so that the
    # first step does not pick the same word beam_size times
    beam_scores = src_enc.new(bs, beam_size).fill_(0)
    beam_scores[:, 1:] = -1e9
    beam_scores = beam_scores.view(-1)

    # current position
    cur_len = 1

    # cache compute states
    cache = {'slen': 0}

    # store cross attention weights
    self.cross_att = defaultdict(list)

    # done sentences
    done = [False for _ in range(bs)]

    while cur_len < max_len:

        # compute word scores
        tensor = self.forward(
            'fwd',
            x=generated[:cur_len],
            lengths=src_len.new(bs * beam_size).fill_(cur_len),
            positions=positions[:cur_len],
            langs=langs[:cur_len],
            causal=True,
            src_enc=src_enc,
            src_len=src_len,
            cache=cache
        )
        assert tensor.size() == (1, bs * beam_size, self.dim)
        tensor = tensor.data[-1, :, :]               # (bs * beam_size, dim)
        scores = self.pred_layer.get_scores(tensor)  # (bs * beam_size, n_words)
        scores = F.log_softmax(scores, dim=-1)       # (bs * beam_size, n_words)
        assert scores.size() == (bs * beam_size, n_words)

        # select next words with scores
        _scores = scores + beam_scores[:, None].expand_as(scores)  # (bs * beam_size, n_words)
        _scores = _scores.view(bs, beam_size * n_words)            # (bs, beam_size * n_words)

        # 2 * beam_size candidates: even if every beam proposes <EOS>, at
        # least beam_size non-<EOS> continuations remain
        next_scores, next_words = torch.topk(_scores, 2 * beam_size, dim=1, largest=True, sorted=True)
        assert next_scores.size() == next_words.size() == (bs, 2 * beam_size)

        # next batch beam content
        # list of (bs * beam_size) tuple(next hypothesis score, next word, current position in the batch)
        next_batch_beam = []

        # for each sentence
        for sent_id in range(bs):

            # if we are done with this sentence
            done[sent_id] = done[sent_id] or generated_hyps[sent_id].is_done(next_scores[sent_id].max().item())
            if done[sent_id]:
                next_batch_beam.extend([(0, self.pad_index, 0)] * beam_size)  # pad the batch
                continue

            # next sentence beam content
            next_sent_beam = []

            # next words for this sentence
            for idx, value in zip(next_words[sent_id], next_scores[sent_id]):

                # get beam and word IDs
                beam_id = idx // n_words
                word_id = idx % n_words

                # end of sentence, or next word
                if word_id == self.eos_index or cur_len + 1 == max_len:
                    generated_hyps[sent_id].add(generated[:cur_len, sent_id * beam_size + beam_id].clone(), value.item())
                else:
                    next_sent_beam.append((value, word_id, sent_id * beam_size + beam_id))

                # the beam for next step is full
                if len(next_sent_beam) == beam_size:
                    break

            # update next beam content
            # The beam must be empty on the last step and full otherwise.
            # (Without the parentheses the conditional expression swallowed
            # the comparison and the check was vacuous before the last step.)
            assert len(next_sent_beam) == (0 if cur_len + 1 == max_len else beam_size)
            if len(next_sent_beam) == 0:
                next_sent_beam = [(0, self.pad_index, 0)] * beam_size  # pad the batch
            next_batch_beam.extend(next_sent_beam)
            assert len(next_batch_beam) == beam_size * (sent_id + 1)

        # sanity check / prepare next batch
        assert len(next_batch_beam) == bs * beam_size
        beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
        beam_words = generated.new([x[1] for x in next_batch_beam])
        beam_idx = src_len.new([x[2] for x in next_batch_beam])

        # re-order batch and internal states
        generated = generated[:, beam_idx]
        generated[cur_len] = beam_words
        for k in cache.keys():
            if k != 'slen':
                cache[k] = (cache[k][0][beam_idx], cache[k][1][beam_idx])

        # update current length
        cur_len = cur_len + 1

        # stop when we are done with each sentence
        if all(done):
            break

    # select the best hypotheses
    tgt_len = src_len.new(bs)
    best = []

    for i, hypotheses in enumerate(generated_hyps):
        best_hyp = max(hypotheses.hyp, key=lambda x: x[0])[1]
        tgt_len[i] = len(best_hyp) + 1  # +1 for the <EOS> symbol
        best.append(best_hyp)

    # generate target batch
    decoded = src_len.new(tgt_len.max().item(), bs).fill_(self.pad_index)
    for i, hypo in enumerate(best):
        decoded[:tgt_len[i] - 1, i] = hypo
        decoded[tgt_len[i] - 1, i] = self.eos_index

    # sanity check: one leading and one trailing <EOS> per sentence
    assert (decoded == self.eos_index).sum() == 2 * bs

    return decoded, tgt_len
def forward(self, x):
    """Apply ``self.fc_layer`` and return log-softmax over the last dimension.

    When ``self.output_shape`` has more than one entry, the layer output is
    first reshaped to ``(batch, *self.output_shape)``.
    """
    out = self.fc_layer(x)
    needs_reshape = len(self.output_shape) > 1
    if needs_reshape:
        batch = out.shape[0]
        out = out.view(batch, *self.output_shape)
    return F.log_softmax(out, dim=-1)
import numpy as np
import torch
import torch.nn.functional as F

# Small experiment: print the gradient of a KL divergence w.r.t. the logits
# `x`, once via F.kl_div and once via a hand-written per-element loss.
x_ = np.random.randn(8).astype(np.float32)
x = torch.tensor(x_, requires_grad=True)
p = F.log_softmax(x, -1)
# Detached copy of the log-probabilities; used as constants further down.
p_ = p.detach()
q = F.log_softmax(torch.randn(8), -1)
print(p)
print(q)
# F.kl_div takes log-probabilities as input (q) and probabilities as target
# (exp(p)); gradients flow into x through the target.
kl_div = F.kl_div(q, torch.exp(p), reduction='sum')
print(kl_div)
kl_div.backward()
print(x.grad)

# Rebuild x from the same values so x.grad starts empty again.
x = torch.tensor(x_, requires_grad=True)
p = F.log_softmax(x, -1)
divided_kl_loss = 0
# Per-element loss in which only the trailing -p[index] factor carries
# gradient; the coefficients use the detached p_ from the first pass.
for index in range(len(x)):
    divided_kl_loss += (q[index] - p_[index]) * torch.exp(
        p_[index]) * -p[index]
divided_kl_loss.backward()
# Printed for comparison with the gradient printed above.
print(x.grad)
x = torch.tensor(x_, requires_grad=True)