def train(epoch):
    t = time.time()
    model.train()
    optimizer.zero_grad()
    output = model(features, adj)
    loss_train = F.nll_loss(output[idx_train], labels[idx_train])
    acc_train = accuracy(output[idx_train], labels[idx_train])
    loss_train.backward()
    optimizer.step()

    if not args.fastmode:
        # Evaluate validation set performance separately;
        # this deactivates dropout during the validation run.
        model.eval()
        output = model(features, adj)

    loss_val = F.nll_loss(output[idx_val], labels[idx_val])
    acc_val = accuracy(output[idx_val], labels[idx_val])
    print('Epoch: {:04d}'.format(epoch + 1),
          'loss_train: {:.4f}'.format(loss_train.item()),
          'acc_train: {:.4f}'.format(acc_train.item()),
          'loss_val: {:.4f}'.format(loss_val.item()),
          'acc_val: {:.4f}'.format(acc_val.item()),
          'time: {:.4f}s'.format(time.time() - t))
    return loss_val.item()
def forward(self, y_pred, y_true):
    # _assert_no_grad is a private helper from older PyTorch versions; it
    # checks that the target does not require gradients.
    torch.nn.modules.loss._assert_no_grad(y_true)
    y_pred_log = torch.log(y_pred)
    start_loss = F.nll_loss(y_pred_log[:, 0, :], y_true[:, 0])
    end_loss = F.nll_loss(y_pred_log[:, 1, :], y_true[:, 1])
    return start_loss + end_loss
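A minimal usage sketch for the span loss above; the shapes are assumptions for illustration, not taken from the original source. y_pred is expected to hold probabilities of shape (batch, 2, seq_len), where slice 0 scores answer starts and slice 1 scores answer ends, and y_true holds the gold (start, end) indices of shape (batch, 2).

import torch

y_pred = torch.softmax(torch.randn(4, 2, 50), dim=-1)  # hypothetical model output
y_true = torch.randint(0, 50, (4, 2))                  # hypothetical gold spans
# loss = criterion(y_pred, y_true)  # scalar: start NLL plus end NLL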
def train(epoch, model):
    # The learning rate of the final fully connected layer is 10x that of the
    # shared layers.
    LEARNING_RATE = lr / math.pow((1 + 10 * (epoch - 1) / epochs), 0.75)
    print("learning rate:", LEARNING_RATE)
    optimizer_fea = torch.optim.SGD([
        {'params': model.sharedNet.parameters()},
        {'params': model.cls_fc.parameters(), 'lr': LEARNING_RATE},
    ], lr=LEARNING_RATE / 10, momentum=momentum, weight_decay=l2_decay)
    optimizer_critic = torch.optim.SGD([
        {'params': model.domain_fc.parameters(), 'lr': LEARNING_RATE}
    ], lr=LEARNING_RATE, momentum=momentum, weight_decay=l2_decay)

    data_source_iter = iter(source_loader)
    data_target_iter = iter(target_train_loader)
    dlabel_src = Variable(torch.ones(batch_size).long().cuda())
    dlabel_tgt = Variable(torch.zeros(batch_size).long().cuda())

    i = 1
    while i <= len_source_loader:
        model.train()
        source_data, source_label = next(data_source_iter)
        if cuda:
            source_data, source_label = source_data.cuda(), source_label.cuda()
        source_data, source_label = Variable(source_data), Variable(source_label)
        clabel_src, dlabel_pred_src = model(source_data)
        label_loss = F.nll_loss(F.log_softmax(clabel_src, dim=1), source_label)
        critic_loss_src = F.nll_loss(F.log_softmax(dlabel_pred_src, dim=1), dlabel_src)
        confusion_loss_src = 0.5 * (
            F.nll_loss(F.log_softmax(dlabel_pred_src, dim=1), dlabel_src)
            + F.nll_loss(F.log_softmax(dlabel_pred_src, dim=1), dlabel_tgt))

        target_data, target_label = next(data_target_iter)
        if i % len_target_loader == 0:
            data_target_iter = iter(target_train_loader)
        if cuda:
            target_data, target_label = target_data.cuda(), target_label.cuda()
        target_data = Variable(target_data)
        clabel_tgt, dlabel_pred_tgt = model(target_data)
        critic_loss_tgt = F.nll_loss(F.log_softmax(dlabel_pred_tgt, dim=1), dlabel_tgt)
        confusion_loss_tgt = 0.5 * (
            F.nll_loss(F.log_softmax(dlabel_pred_tgt, dim=1), dlabel_src)
            + F.nll_loss(F.log_softmax(dlabel_pred_tgt, dim=1), dlabel_tgt))

        confusion_loss_total = (confusion_loss_src + confusion_loss_tgt) / 2
        fea_loss_total = confusion_loss_total + label_loss
        critic_loss_total = (critic_loss_src + critic_loss_tgt) / 2

        optimizer_fea.zero_grad()
        fea_loss_total.backward(retain_graph=True)
        optimizer_fea.step()

        optimizer_fea.zero_grad()
        optimizer_critic.zero_grad()
        critic_loss_total.backward()
        optimizer_critic.step()

        if i % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tconfusion_Loss: {:.6f}\tlabel_Loss: {:.6f}\tdomain_Loss: {:.6f}'.format(
                epoch, i * len(source_data), len_source_dataset,
                100. * i / len_source_loader,
                confusion_loss_total.item(), label_loss.item(), critic_loss_total.item()))
        i = i + 1
def get_loss(cls, start_log_probs, end_log_probs, starts, ends):
    """
    Get the loss, $-\log P(s|p,q)P(e|p,q)$.
    The start and end labels are expected to be in span format,
    so that text[start:end] is the answer.
    """
    # Subtract 1 from the end points to get the exact indices,
    # not 1 after the end.
    loss = nll_loss(start_log_probs, starts) + \
        nll_loss(end_log_probs, ends - 1)
    return loss
def train(self, epoch):
    """
    Train one epoch of this model by iterating through mini batches. An epoch
    ends after one pass through the training set, or if the number of mini
    batches exceeds the parameter "batches_in_epoch".
    """
    self.logger.info("epoch: %s", epoch)
    t0 = time.time()
    self.preEpoch()
    self.logger.info("Learning rate: %s",
                     self.learningRate if self.lr_scheduler is None
                     else self.lr_scheduler.get_lr())
    self.model.train()
    for batch_idx, (batch, target) in enumerate(self.train_loader):
        data = batch["input"]
        if self.model_type in ["resnet9", "cnn"]:
            data = torch.unsqueeze(data, 1)
        data, target = data.to(self.device), target.to(self.device)
        self.optimizer.zero_grad()
        output = self.model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        self.optimizer.step()

        if batch_idx >= self.batches_in_epoch:
            break

    self.postEpoch()
    self.logger.info("training duration: %s", time.time() - t0)
def cross_entropy2d(input, target, weight=None, size_average=True):
    """
    Function to compute pixelwise cross-entropy for a 2D image. This is the
    segmentation loss.
    Args:
        input: input tensor of shape (minibatch x num_channels x h x w)
        target: 2D label map of shape (minibatch x h x w)
        weight (optional): tensor of size 'C' specifying the weights to be
            given to each class
        size_average (optional): boolean value indicating whether the NLL loss
            has to be normalized by the number of pixels in the image
    """
    # input: (n, c, h, w), target: (n, h, w)
    n, c, h, w = input.size()
    # log_p: (n, c, h, w)
    log_p = F.log_softmax(input, dim=1)
    # log_p: (n*h*w, c)
    log_p = log_p.transpose(1, 2).transpose(2, 3).contiguous().view(-1, c)
    try:
        # Keep only the entries whose label is valid (>= 0).
        log_p = log_p[target.view(n, h, w, 1).repeat(1, 1, 1, c) >= 0]
    except Exception:
        print("Exception: ", target.size())
    log_p = log_p.view(-1, c)
    # target: (n*h*w,)
    mask = target >= 0
    target = target[mask]
    target = torch.squeeze(target)
    loss = F.nll_loss(log_p, target, weight=weight, size_average=False)
    if size_average:
        loss /= mask.data.sum()
    return loss
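A quick smoke test for cross_entropy2d; the tensor sizes here are assumptions for illustration, not from the original source: random scores for 3 classes over a 4x4 label map, with all labels valid (the mask logic above treats negative labels as ignored pixels).

import torch

scores = torch.randn(2, 3, 4, 4)          # (n, c, h, w) raw logits, hypothetical sizes
labels = torch.randint(0, 3, (2, 4, 4))   # (n, h, w) class indices, all valid
loss = cross_entropy2d(scores, labels)    # scalar averaged over valid pixels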
def test(self, test_loader=None):
    """
    Test the model using the given loader and return test metrics.
    """
    if test_loader is None:
        test_loader = self.test_loader

    self.model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for batch, target in test_loader:
            data = batch["input"]
            if self.model_type in ["resnet9", "cnn"]:
                data = torch.unsqueeze(data, 1)
            data, target = data.to(self.device), target.to(self.device)
            output = self.model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.max(1, keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.sampler)
    test_accuracy = 100. * correct / len(test_loader.sampler)
    entropy = self.entropy()
    ret = {
        "total_correct": correct,
        "mean_loss": test_loss,
        "mean_accuracy": test_accuracy,
        "entropy": float(entropy)}

    return ret
def train(args, model, device, train_loader, optimizer):
    model.train()
    start_time = time()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            percentage = 100. * batch_idx / len(train_loader)
            cur_length = int((tracker_length * int(percentage)) / 100)
            bar = '=' * cur_length + '>' + '-' * (tracker_length - cur_length)
            sys.stdout.write('\r{}/{} [{}] - loss: {:.4f}'.format(
                batch_idx * len(data), len(train_loader.dataset), bar, loss.item()))
            sys.stdout.flush()

    train_time = time() - start_time
    sys.stdout.write('\r{}/{} [{}] - {:.1f}s {:.1f}us/step - loss: {:.4f}'.format(
        len(train_loader.dataset), len(train_loader.dataset), '=' * tracker_length,
        train_time, (train_time / len(train_loader.dataset)) * 1000000.0, loss.item()))
    sys.stdout.flush()
    return len(train_loader.dataset), train_time, loss.item()
def test(epoch, best_acc):
    slope = get_slope(epoch)
    model.eval()
    test_loss = 0.0
    correct = 0.0
    for data, target in test_loader:
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data, volatile=True), Variable(target)
        output = model((data, slope))
        # Sum up batch loss.
        test_loss += F.nll_loss(output, target, size_average=False).item()
        # Get the index of the max log-probability.
        pred = output.data.max(1, keepdim=True)[1]
        correct += pred.eq(target.data.view_as(pred)).cpu().sum()

    test_loss /= len(test_loader.dataset)
    test_acc = correct / len(test_loader.dataset)
    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, int(correct), len(test_loader.dataset), 100. * test_acc))
    if test_acc >= best_acc:
        torch.save(model.state_dict(),
                   os.path.join('models', '{}.pth'.format(model_name)))
    return test_loss, test_acc
def m_testxxx(epoch):
    # Optionally restore a saved model before evaluating:
    # checkpoint = torch.load('checkpoint-1.pth.tar')
    # model.load_state_dict(checkpoint['state_dict'])
    # optimizer.load_state_dict(checkpoint['optimizer'])
    model.eval()
    test_loss = 0
    correct = 0
    for data, target in test_loader:
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data, volatile=True), Variable(target)
        output = model(data)
        test_loss += F.nll_loss(output, target).item()
        # Get the index of the max log-probability.
        pred = output.data.max(1)[1]
        correct += pred.eq(target.data).cpu().sum()

    # The loss function already averages over batch size.
    test_loss /= len(test_loader)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
def train(model, device, train_loader, optimizer, epoch):
    """Train for one epoch on the training set"""
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to train mode
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)

        # compute output
        output = model(data)
        loss = F.nll_loss(output, target)

        # measure accuracy and record loss
        prec1 = accuracy(output, target, topk=(1,))[0]
        losses.update(loss.item(), data.size(0))
        top1.update(prec1.item(), data.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % args.print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      epoch, batch_idx, len(train_loader), loss=losses, top1=top1))
def get_combined_loss(cls, combined, starts, ends):
    """
    Get the loss, $-\log P(s,e|p,q)$.

    In practice, with:
    1. $\Psi_s(s|p,q)$ the start logits,
    2. $\Psi_e(e|p,q)$ the end logits,
    3. $Z_s = \log\sum_{i}\exp\Psi_s(i|p,q)$, the start partition,
    4. $Z_e = \log\sum_{i}\exp\Psi_e(i|p,q)$, the end partition, and
    5. $Z_c = \log\sum_{i}\sum_{j \ge i}\exp(\Psi_s(i|p,q)+\Psi_e(j|p,q))$,
       the combined partition,

    the default loss is: $Z_s + Z_e - \Psi_s(s|p,q) - \Psi_e(e|p,q)$,
    and the combined loss is: $Z_c - \Psi_s(s|p,q) - \Psi_e(e|p,q)$.

    The combined loss uses a normalization that ignores invalid end points.
    This is not a major difference, and should only slow things down during
    training. This loss is only used to validate and to compare.
    """
    batch_size, num_tokens, _other = combined.size()
    assert num_tokens == _other

    # Mask out end points that come before the start point.
    mask = torch.zeros(batch_size, num_tokens, num_tokens).float()
    for start in range(1, num_tokens):
        mask[:, start, :start] = -1e20
    mask = mask.type_as(combined.data)

    combined = combined + Variable(mask)
    combined = combined.view(batch_size, num_tokens * num_tokens)
    combined = nn.functional.log_softmax(combined, dim=1)
    labels = starts * num_tokens + ends
    return nll_loss(combined, labels)
def train(epoch, model):
    LEARNING_RATE = lr / math.pow((1 + 10 * (epoch - 1) / epochs), 0.75)
    print('learning rate: {:.4f}'.format(LEARNING_RATE))
    optimizer = torch.optim.SGD([
        {'params': model.sharedNet.parameters()},
        {'params': model.cls_fc.parameters(), 'lr': LEARNING_RATE},
    ], lr=LEARNING_RATE / 10, momentum=momentum, weight_decay=l2_decay)

    model.train()
    iter_source = iter(source_loader)
    iter_target = iter(target_train_loader)
    num_iter = len_source_loader
    for i in range(1, num_iter):
        data_source, label_source = next(iter_source)
        data_target, _ = next(iter_target)
        if i % len_target_loader == 0:
            iter_target = iter(target_train_loader)
        if cuda:
            data_source, label_source = data_source.cuda(), label_source.cuda()
            data_target = data_target.cuda()
        data_source, label_source = Variable(data_source), Variable(label_source)
        data_target = Variable(data_target)

        optimizer.zero_grad()
        label_source_pred, loss_mmd = model(data_source, data_target)
        loss_cls = F.nll_loss(F.log_softmax(label_source_pred, dim=1), label_source)
        gamma = 2 / (1 + math.exp(-10 * epoch / epochs)) - 1
        loss = loss_cls + gamma * loss_mmd
        loss.backward()
        optimizer.step()
        if i % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tsoft_Loss: {:.6f}\tmmd_Loss: {:.6f}'.format(
                epoch, i * len(data_source), len_source_dataset,
                100. * i / len_source_loader,
                loss.item(), loss_cls.item(), loss_mmd.item()))
def evaluate():
    should_stop = False
    model.eval()
    for name, loader in [('train', train_loader), ('test', test_loader)]:
        loss = 0
        correct = 0
        for data, target in loader:
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            if isinstance(model, MLP):
                data = data.view(-1, 784)
            data, target = Variable(data, volatile=True), Variable(target)
            output = model(data)
            loss += F.nll_loss(output, target, size_average=False).item()
            # Get the index of the max log-probability.
            pred = output.data.max(1, keepdim=True)[1]
            correct += pred.eq(target.data.view_as(pred)).cpu().sum()

        loss /= len(loader.dataset)
        print('{} -- Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'
              .format(name.ljust(5), loss, correct, len(loader.dataset),
                      100. * correct / len(loader.dataset)))

        if name == 'test':
            scheduler.step(loss)

        should_stop = should_stop or correct == len(loader.dataset)

    return should_stop or optimizer.param_groups[0]['lr'] < args.lr / 1e2
def train(epoch):
    slope = get_slope(epoch)
    print('# Epoch : {} - Slope : {}'.format(epoch, slope))

    model.train()
    train_loss = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        optimizer.zero_grad()
        output = model((data, slope))
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    train_loss /= len(train_loader)
    print('Training Loss : {}'.format(train_loss))
    return train_loss
def test(model, device, test_loader, epoch):
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to evaluate mode
    model.eval()
    for batch_idx, (data, target) in enumerate(test_loader):
        data, target = data.to(device), target.to(device)

        # compute output
        with torch.no_grad():
            output = model(data)
            loss = F.nll_loss(output, target)

        # measure accuracy and record loss
        prec1 = accuracy(output, target, topk=(1,))[0]
        losses.update(loss.item(), data.size(0))
        top1.update(prec1.item(), data.size(0))

        if batch_idx % args.print_freq == 0:
            print('Test: [{0}/{1}]\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      batch_idx, len(test_loader), loss=losses, top1=top1))

    print(' * Prec@1 {top1.avg:.3f}'.format(top1=top1))
    return top1.avg
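The two loops above rely on AverageMeter and accuracy helpers that are not shown. Sketches consistent with how they are called, matching the common ImageNet-example definitions (an assumption, not necessarily the originals):

class AverageMeter(object):
    """Tracks the current value, running sum, count, and average."""
    def __init__(self):
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k."""
    maxk = max(topk)
    batch_size = target.size(0)
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res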
def train(args, epoch, net, trainLoader, optimizer, trainF):
    net.train()
    nProcessed = 0
    nTrain = len(trainLoader.dataset)
    for batch_idx, (data, target) in enumerate(trainLoader):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        optimizer.zero_grad()
        output = net(data)
        loss = F.nll_loss(output, target)
        # make_graph.save('/tmp/t.dot', loss.creator); assert(False)
        loss.backward()
        optimizer.step()
        nProcessed += len(data)
        # Get the index of the max log-probability.
        pred = output.data.max(1)[1]
        incorrect = pred.ne(target.data).cpu().sum()
        err = 100. * incorrect / len(data)
        partialEpoch = epoch + batch_idx / len(trainLoader) - 1
        print('Train Epoch: {:.2f} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tError: {:.6f}'.format(
            partialEpoch, nProcessed, nTrain, 100. * batch_idx / len(trainLoader),
            loss.item(), err))
        trainF.write('{},{},{}\n'.format(partialEpoch, loss.item(), err))
        trainF.flush()
def _test_pytorch(self, model):
    """
    Test pre-trained pytorch model using MNIST Dataset
    :param model: Pre-trained PytorchMNIST model
    :return: tuple(loss, accuracy)
    """
    data_loader = torch.utils.data.DataLoader(
        datasets.MNIST(self.dataDir, train=False, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))])),
        batch_size=BATCH_SIZE, shuffle=True)

    model.eval()
    loss = 0.0
    num_correct = 0.0
    with torch.no_grad():
        for data, target in data_loader:
            data = data.view(-1, 28 * 28)
            output = model(data)
            # sum up batch loss
            loss += F.nll_loss(output, target, reduction='sum').item()
            # get the index of the max log-probability
            pred = output.max(1, keepdim=True)[1]
            num_correct += pred.eq(target.view_as(pred)).sum().item()

    loss /= len(data_loader.dataset)
    accuracy = num_correct / len(data_loader.dataset)
    return (loss, accuracy)
def train(**kwargs):
    opt.parse(kwargs)
    vis = Visualizer(opt.env)

    # step 1: configure the model
    model = getattr(Nets, opt.model)()
    if opt.load_model_path:
        model.load(opt.load_model_path)
    if opt.use_gpu:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model.to(device)

    # step 2: data
    train_data = imageSentiment(opt.train_path, train=True)   # training set
    val_data = imageSentiment(opt.train_path, train=False)    # validation set
    train_dataloader = DataLoader(train_data, batch_size=opt.batch_size,
                                  shuffle=True, num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data, batch_size=opt.batch_size,
                                shuffle=False, num_workers=opt.num_workers)

    # step 3: define the loss function and the optimizer
    # criterion = nn.CrossEntropyLoss()  # cross-entropy loss; with this loss the
    # network should not apply softmax at its final layer
    lr = opt.lr
    # optimizer = Optim.Adam(model.parameters(), lr=lr, weight_decay=opt.weight_decay)
    optimizer = Optim.SGD(model.parameters(), lr=0.001, momentum=0.9, nesterov=True)

    # step 4: statistics (average loss and confusion matrix)
    loss_meter = meter.AverageValueMeter()
    confusion_matrix = meter.ConfusionMeter(7)
    previous_loss = 1e100

    # training
    for i in range(opt.max_epoch):
        loss_meter.reset()
        confusion_matrix.reset()
        total_loss = 0.

        for ii, (label, data) in tqdm(enumerate(train_dataloader),
                                      total=len(train_dataloader)):
            if opt.use_gpu:
                label, data = label.to(device), data.to(device)

            optimizer.zero_grad()
            score = model(data)
            # NB: for multi-class classification with nll_loss / CrossEntropyLoss,
            # the target is the class index; no one-hot conversion is needed.
            loss = F.nll_loss(score, label)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

            # update statistics and visualize
            loss_meter.add(loss.item())
            confusion_matrix.add(score.data, label.data)

            if ii % opt.print_freq == opt.print_freq - 1:
                vis.plot('loss', loss_meter.value()[0])

        vis.plot('mach avgloss', total_loss / len(train_dataloader))
        model.save()

        # compute metrics on the validation set
        val_accuracy = val(model, val_dataloader)
        vis.plot('val_accuracy', val_accuracy)
def compute_test():
    model.eval()
    output = model(features, adj)
    loss_test = F.nll_loss(output[idx_test], labels[idx_test])
    acc_test = accuracy(output[idx_test], labels[idx_test])
    print("Test set results:",
          "loss= {:.4f}".format(loss_test.item()),
          "accuracy= {:.4f}".format(acc_test.item()))
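compute_test (and the graph-network train function earlier) calls an accuracy helper that is not shown. A sketch consistent with its usage on log-probability outputs and integer labels (an assumption, not necessarily the original helper):

def accuracy(output, labels):
    # Index of the max log-probability per node, compared against the gold label.
    preds = output.max(1)[1].type_as(labels)
    correct = preds.eq(labels).double()
    return correct.sum() / len(labels)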
def loss(self, predict, target):
    '''compute loss'''
    return F.nll_loss(
        predict.view(-1, self.trg_vocab_size),
        target.view(-1),
        ignore_index=PAD_IDX)
def part_loss(pred_part, gt_seg_part, gt_seg_object, object_label, valid):
    # Only pixels belonging to the current object contribute to the part loss.
    mask_object = (gt_seg_object == object_label)
    loss = F.nll_loss(pred_part, gt_seg_part * mask_object.long(), reduction='none')
    loss = loss * mask_object.float()
    loss = torch.sum(loss.view(loss.size(0), -1), dim=1)
    nr_pixel = torch.sum(mask_object.view(mask_object.shape[0], -1), dim=1)
    sum_pixel = (nr_pixel * valid).sum()
    loss = (loss * valid.float()).sum() / torch.clamp(sum_pixel, 1).float()
    return loss
def _train_iteration(self):
    self.model.train()
    for batch_idx, (data, target) in enumerate(self.train_loader):
        if self.args.cuda:
            data, target = data.cuda(), target.cuda()
        self.optimizer.zero_grad()
        output = self.model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        self.optimizer.step()
def train(epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
def train_(self, input_img, input_qst, label):
    self.optimizer.zero_grad()
    output = self(input_img, input_qst)
    loss = F.nll_loss(output, label)
    loss.backward()
    self.optimizer.step()
    pred = output.data.max(1)[1]
    correct = pred.eq(label.data).cpu().sum()
    accuracy = correct * 100. / len(label)
    return accuracy
def forward(self, batch):
    context, context_lengths, context_limited = batch.context, batch.context_lengths, batch.context_limited
    question, question_lengths, question_limited = batch.question, batch.question_lengths, batch.question_limited
    answer, answer_lengths, answer_limited = batch.answer, batch.answer_lengths, batch.answer_limited
    oov_to_limited_idx, limited_idx_to_full_idx = batch.oov_to_limited_idx, batch.limited_idx_to_full_idx

    def map_to_full(x):
        return limited_idx_to_full_idx[x]
    self.map_to_full = map_to_full

    context_embedded = self.encoder_embeddings(context)
    question_embedded = self.encoder_embeddings(question)

    context_encoded = self.bilstm_before_coattention(context_embedded, context_lengths)[0]
    question_encoded = self.bilstm_before_coattention(question_embedded, question_lengths)[0]

    context_padding = context.data == self.pad_idx
    question_padding = question.data == self.pad_idx

    coattended_context = self.coattention(context_encoded, question_encoded,
                                          context_padding, question_padding)

    context_summary = torch.cat([coattended_context, context_encoded, context_embedded], -1)
    condensed_context, _ = self.context_bilstm_after_coattention(context_summary, context_lengths)
    self_attended_context = self.self_attentive_encoder_context(condensed_context,
                                                                padding=context_padding)
    final_context, (context_rnn_h, context_rnn_c) = self.bilstm_context(
        self_attended_context[-1], context_lengths)
    context_rnn_state = [self.reshape_rnn_state(x) for x in (context_rnn_h, context_rnn_c)]

    context_indices = context_limited if context_limited is not None else context
    answer_indices = answer_limited if answer_limited is not None else answer

    pad_idx = self.field.decoder_stoi[self.field.pad_token]
    context_padding = context_indices.data == pad_idx
    self.dual_ptr_rnn_decoder.applyMasks(context_padding)

    if self.training:
        answer_padding = answer_indices.data == pad_idx
        answer_embedded = self.decoder_embeddings(answer)
        self_attended_decoded = self.self_attentive_decoder(
            answer_embedded[:, :-1].contiguous(), self_attended_context,
            context_padding=context_padding, answer_padding=answer_padding[:, :-1],
            positional_encodings=True)
        decoder_outputs = self.dual_ptr_rnn_decoder(self_attended_decoded, final_context,
                                                    hidden=context_rnn_state)
        rnn_output, context_attention, context_alignment, vocab_pointer_switch, rnn_state = decoder_outputs

        probs = self.probs(self.out, rnn_output, vocab_pointer_switch,
                           context_attention, context_indices, oov_to_limited_idx)

        probs, targets = mask(answer_indices[:, 1:].contiguous(), probs.contiguous(),
                              pad_idx=pad_idx)
        loss = F.nll_loss(probs.log(), targets)
        return loss, None
    else:
        return None, self.greedy(self_attended_context, final_context,
                                 context_indices, oov_to_limited_idx,
                                 rnn_state=context_rnn_state).data
def evaluate(net, dataloader, num_ens=1):
    """Calculate ensemble accuracy and NLL"""
    accs = []
    nlls = []

    for i, (inputs, labels) in enumerate(dataloader):
        # `async=True` from the original is a syntax error on Python >= 3.7;
        # `non_blocking=True` is the modern equivalent.
        inputs = Variable(inputs.cuda(non_blocking=True))
        labels = Variable(labels.cuda(non_blocking=True))
        outputs = torch.zeros(inputs.shape[0], net.num_classes, num_ens).cuda()
        for j in range(num_ens):
            outputs[:, :, j] = F.log_softmax(net(inputs), dim=1).data

        accs.append(metrics.logit2acc(logmeanexp(outputs, dim=2), labels))
        nlls.append(F.nll_loss(Variable(logmeanexp(outputs, dim=2)), labels,
                               size_average=False).data.cpu().numpy())

    return np.mean(accs), np.sum(nlls)
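evaluate above averages the ensemble members in probability space via a logmeanexp helper that is not shown. A numerically stable sketch (an assumption, not necessarily the original implementation):

import math
import torch

def logmeanexp(x, dim):
    # log(mean(exp(x))) along `dim`, computed as logsumexp minus log(N).
    return torch.logsumexp(x, dim=dim) - math.log(x.size(dim))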
def train(epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = Variable(data), Variable(target)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 10 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
def train_epoch(epoch, args, model, data_loader, optimizer):
    model.train()
    pid = os.getpid()
    for batch_idx, (data, target) in enumerate(data_loader):
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('{}\tTrain Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                pid, epoch, batch_idx * len(data), len(data_loader.dataset),
                100. * batch_idx / len(data_loader), loss.item()))
def fit(epoch, model, data_loader, phase='training', volatile=False, is_cuda=False):
    optimizer = optim.SGD(model.parameters(), lr=Leaning_Rate, momentum=0.5)
    if phase == 'training':
        model.train()
    if phase == 'validation':
        model.eval()
        volatile = True

    running_loss = 0.0
    running_correct = 0
    for batch_idx, (data, target) in enumerate(data_loader):
        # data, target = data.cuda(), target.cuda()
        if phase == 'training':
            optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        running_loss += F.nll_loss(output, target, size_average=False).item()

        preds = output.data.max(dim=1, keepdim=True)[1]
        ground_truth = target.data
        answer = preds.squeeze()
        # Collect per-batch labels and predictions for later analysis.
        gound_truth_list.append(ground_truth.detach().cpu().numpy())
        answer_list.append(answer.detach().cpu().numpy())
        running_correct += preds.eq(target.data.view_as(preds)).cpu().sum()
        if phase == 'training':
            loss.backward()
            optimizer.step()

    loss = running_loss / len(data_loader.dataset)
    accuracy = 100. * running_correct.item() / len(data_loader.dataset)
    print(f'{phase} loss is {loss:{5}.{2}} and {phase} accuracy is '
          f'{running_correct}/{len(data_loader.dataset)}{accuracy:{10}.{4}}')
    return loss, accuracy
for batch_X, batch_y in iterator.get_batches(train_set, shuffle=False):
    net_in = np_to_var(batch_X)
    if cuda:
        net_in = net_in.cuda()
    net_target = np_to_var(batch_y)
    if cuda:
        net_target = net_target.cuda()
    # Remove gradients of last backward pass from all parameters
    optimizer.zero_grad()
    outputs = model(net_in)
    # Mean predictions across trial
    # Note that this will give identical gradients to computing
    # a per-prediction loss (at least for the combination of log softmax
    # activation and negative log likelihood loss which we are using here)
    outputs = th.mean(outputs, dim=2)[:, :, 0]
    loss = F.nll_loss(outputs, net_target)
    loss.backward()
    optimizer.step()

# Print some statistics each epoch
model.eval()
for setname, dataset in (('Train', train_set), ('Test', test_set)):
    # Collect all predictions and losses
    all_preds = []
    all_losses = []
    batch_sizes = []
    for batch_X, batch_y in iterator.get_batches(dataset, shuffle=False):
        net_in = np_to_var(batch_X)
        if cuda:
            net_in = net_in.cuda()
        net_target = np_to_var(batch_y)
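np_to_var in the loop above is braindecode-style shorthand for wrapping a numpy batch as a torch tensor/Variable. A minimal sketch of the assumed behavior (the real helper may take extra dtype/requires_grad arguments):

import numpy as np
import torch as th

def np_to_var(x, **tensor_kwargs):
    # Wrap a numpy array (or nested sequence) as a torch tensor.
    return th.tensor(np.asarray(x), **tensor_kwargs)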
def forward(  # type: ignore
        self,
        question: Dict[str, torch.LongTensor],
        passage: Dict[str, torch.LongTensor],
        span_start: torch.IntTensor = None,
        span_end: torch.IntTensor = None,
        metadata: List[Dict[str, Any]] = None,
        store_metrics: bool = True,
        valid_output_mask: torch.LongTensor = None,
        sent_targets: torch.Tensor = None,
        stance: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    question : Dict[str, torch.LongTensor]
        From a ``TextField``.
    passage : Dict[str, torch.LongTensor]
        From a ``TextField``. The model assumes that this passage contains the
        answer to the question, and predicts the beginning and ending positions
        of the answer within the passage.
    span_start : ``torch.IntTensor``, optional
        From an ``IndexField``. This is one of the things we are trying to
        predict - the beginning position of the answer with the passage. This
        is an `inclusive` token index. If this is given, we will compute a
        loss that gets included in the output dictionary.
    span_end : ``torch.IntTensor``, optional
        From an ``IndexField``. This is one of the things we are trying to
        predict - the ending position of the answer with the passage. This is
        an `inclusive` token index. If this is given, we will compute a loss
        that gets included in the output dictionary.
    metadata : ``List[Dict[str, Any]]``, optional
        If present, this should contain the question ID, original passage
        text, and token offsets into the passage for each instance in the
        batch. We use this for computing official metrics using the official
        SQuAD evaluation script. The length of this list should be the batch
        size, and each dictionary should have the keys ``id``,
        ``original_passage``, and ``token_offsets``. If you only want the best
        span string and don't care about official metrics, you can omit the
        ``id`` key.
    store_metrics : bool
        If true, stores metrics (if applicable) within the model metric
        tracker. If false, returns the resulting metrics immediately, without
        updating the model metric tracker.
    valid_output_mask : ``torch.LongTensor``, optional
        The locations for a valid answer. Used to limit the model's output
        space.

    Returns
    -------
    An output dictionary consisting of:
    span_start_logits : torch.FloatTensor
        A tensor of shape ``(batch_size, passage_length)`` representing
        unnormalized log probabilities of the span start position.
    span_start_probs : torch.FloatTensor
        The result of ``softmax(span_start_logits)``.
    span_end_logits : torch.FloatTensor
        A tensor of shape ``(batch_size, passage_length)`` representing
        unnormalized log probabilities of the span end position (inclusive).
    span_end_probs : torch.FloatTensor
        The result of ``softmax(span_end_logits)``.
    best_span : torch.IntTensor
        The result of a constrained inference over ``span_start_logits`` and
        ``span_end_logits`` to find the most probable span. Shape is
        ``(batch_size, 2)`` and each offset is a token index.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimised.
    best_span_str : List[str]
        If sufficient metadata was provided for the instances in the batch, we
        also return the string from the original passage that the model thinks
        is the best answer to the question.
    """
    embedded_question = self._highway_layer(self._text_field_embedder(question))
    embedded_passage = self._highway_layer(self._text_field_embedder(passage))
    batch_size = embedded_question.size(0)
    passage_length = embedded_passage.size(1)
    question_mask = util.get_text_field_mask(question).float()
    passage_mask = util.get_text_field_mask(passage).float()
    question_lstm_mask = question_mask if self._mask_lstms else None
    passage_lstm_mask = passage_mask if self._mask_lstms else None

    encoded_question = self._dropout(self._phrase_layer(embedded_question, question_lstm_mask))
    encoded_passage = self._dropout(self._phrase_layer(embedded_passage, passage_lstm_mask))
    encoding_dim = encoded_question.size(-1)

    # Shape: (batch_size, passage_length, question_length)
    passage_question_similarity = self._matrix_attention(encoded_passage, encoded_question)
    # Shape: (batch_size, passage_length, question_length)
    passage_question_attention = util.masked_softmax(passage_question_similarity, question_mask)
    # Shape: (batch_size, passage_length, encoding_dim)
    passage_question_vectors = util.weighted_sum(encoded_question, passage_question_attention)

    # We replace masked values with something really negative here, so they
    # don't affect the max below.
    masked_similarity = util.replace_masked_values(passage_question_similarity,
                                                   question_mask.unsqueeze(1),
                                                   -1e7)
    # Shape: (batch_size, passage_length)
    question_passage_similarity = masked_similarity.max(dim=-1)[0].squeeze(-1)
    # Shape: (batch_size, passage_length)
    question_passage_attention = util.masked_softmax(question_passage_similarity, passage_mask)
    # Shape: (batch_size, encoding_dim)
    question_passage_vector = util.weighted_sum(encoded_passage, question_passage_attention)
    # Shape: (batch_size, passage_length, encoding_dim)
    tiled_question_passage_vector = question_passage_vector.unsqueeze(1).expand(
        batch_size, passage_length, encoding_dim)

    # Shape: (batch_size, passage_length, encoding_dim * 4)
    final_merged_passage = torch.cat([encoded_passage,
                                      passage_question_vectors,
                                      encoded_passage * passage_question_vectors,
                                      encoded_passage * tiled_question_passage_vector],
                                     dim=-1)

    # Debate: Conditioning on whose turn it is (A/B)
    if not self.is_judge:
        turn_film_params = self._turn_film_gen(stance.to(final_merged_passage).unsqueeze(1))
        turn_gammas, turn_betas = torch.split(turn_film_params,
                                              self._modeling_layer.get_input_dim(),
                                              dim=-1)
        # NOTE: Using heuristic to get mask
        final_merged_passage_mask = (final_merged_passage != 0).float()
        final_merged_passage = self._film(
            final_merged_passage, 1. + turn_gammas, turn_betas) * final_merged_passage_mask

    modeled_passage = self._dropout(self._modeling_layer(final_merged_passage,
                                                         passage_lstm_mask))
    modeling_dim = modeled_passage.size(-1)

    # Shape: (batch_size, passage_length, encoding_dim * 4 + modeling_dim)
    span_start_input_full = torch.cat([final_merged_passage, modeled_passage], dim=-1)
    span_start_input = self._dropout(span_start_input_full)
    if not self.is_judge:
        value_head_input = span_start_input_full.detach() if self._detach_value_head \
            else span_start_input_full
        # Shape: (batch_size)
        tokenwise_values = self._value_head(value_head_input).squeeze(-1)
        value, value_loc = util.replace_masked_values(tokenwise_values,
                                                      passage_mask, -1e7).max(-1)

    # Shape: (batch_size, passage_length)
    span_start_logits = self._span_start_predictor(span_start_input).squeeze(-1)
    valid_output_mask = passage_mask if valid_output_mask is None else valid_output_mask
    # Shape: (batch_size, passage_length)
    span_start_probs = util.masked_softmax(span_start_logits, valid_output_mask)

    # Shape: (batch_size, modeling_dim)
    span_start_representation = util.weighted_sum(modeled_passage, span_start_probs)
    # Shape: (batch_size, passage_length, modeling_dim)
    tiled_start_representation = span_start_representation.unsqueeze(1).expand(
        batch_size, passage_length, modeling_dim)

    # Shape: (batch_size, passage_length, encoding_dim * 4 + modeling_dim * 3)
    span_end_representation = torch.cat([final_merged_passage,
                                         modeled_passage,
                                         tiled_start_representation,
                                         modeled_passage * tiled_start_representation],
                                        dim=-1)
    # Shape: (batch_size, passage_length, encoding_dim)
    encoded_span_end = self._dropout(self._span_end_encoder(span_end_representation,
                                                            passage_lstm_mask))
    # Shape: (batch_size, passage_length, encoding_dim * 4 + span_end_encoding_dim)
    span_end_input = self._dropout(torch.cat([final_merged_passage, encoded_span_end],
                                             dim=-1))
    span_end_logits = self._span_end_predictor(span_end_input).squeeze(-1)
    span_end_probs = util.masked_softmax(span_end_logits, valid_output_mask)

    span_start_logits = util.replace_masked_values(span_start_logits,
                                                   valid_output_mask, -1e7)
    span_end_logits = util.replace_masked_values(span_end_logits,
                                                 valid_output_mask, -1e7)
    best_span = self.get_best_span(span_start_logits, span_end_logits)

    output_dict = {
        "passage_question_attention": passage_question_attention,
        "span_start_logits": span_start_logits,
        "span_start_probs": span_start_probs,
        "span_end_logits": span_end_logits,
        "span_end_probs": span_end_probs,
        "best_span": best_span,
        "value": value if not self.is_judge else None,
        # prob(true answer)
        "prob": torch.tensor([
            span_start_probs[i, span_start[i]]
            if span_start[i] < span_start_probs.size(1) else 0.
            for i in range(batch_size)]) if self.is_judge else None,
        "prob_dist": span_start_probs,
    }

    # Compute the loss for training.
    if (span_start is not None) and self.is_judge:
        # NB: Hacky. Don't add to loss if span not in input.
        span_start[span_start >= passage_mask.size(1)] = -100
        loss = nll_loss(util.masked_log_softmax(span_start_logits, valid_output_mask),
                        span_start.squeeze(-1))
        if store_metrics:
            self._span_start_accuracy(span_start_logits, span_start.squeeze(-1))
        # NB: Hacky. Don't add to loss if span not in input.
        span_end[span_end >= passage_mask.size(1)] = -100
        loss += nll_loss(util.masked_log_softmax(span_end_logits, valid_output_mask),
                         span_end.squeeze(-1))
        if store_metrics:
            self._span_end_accuracy(span_end_logits, span_end.squeeze(-1))
            self._span_accuracy(best_span, torch.stack([span_start, span_end], -1))
        output_dict["loss"] = loss
    elif not self.is_judge:  # Debate SL
        if self.reward_method == 'sl':
            # sent_targets should be a vector of target indices
            output_dict["loss"] = nll_loss(
                util.masked_log_softmax(span_start_logits, valid_output_mask),
                sent_targets.squeeze(-1))
            if store_metrics:
                self._span_start_accuracy(span_start_logits, sent_targets.squeeze(-1))
        elif self.reward_method.startswith('sl-sents'):
            # sent_targets should be a matrix of target values
            # (non-zero only in EOS indices)
            sent_targets = util.replace_masked_values(sent_targets,
                                                      valid_output_mask, -1e7)
            output_dict["loss"] = util.masked_mean(
                ((span_start_logits - sent_targets) ** 2), valid_output_mask, 1)
            if store_metrics:
                self._span_start_accuracy(span_start_logits, sent_targets.max(-1)[1])

    # Compute the EM and F1 on SQuAD and add the tokenized input to the output.
    batch_ems = []
    batch_f1s = []
    if metadata is not None:
        output_dict['best_span_str'] = []
        question_tokens = []
        passage_tokens = []
        for i in range(batch_size):
            question_tokens.append(metadata[i]['question_tokens'])
            passage_tokens.append(metadata[i]['passage_tokens'])
            passage_str = metadata[i]['original_passage']
            offsets = metadata[i]['token_offsets']
            predicted_span = tuple(best_span[i].detach().cpu().numpy())
            start_offset = offsets[predicted_span[0]][0]
            end_offset = offsets[predicted_span[1]][1]
            best_span_string = passage_str[start_offset:end_offset]
            output_dict['best_span_str'].append(best_span_string)
            answer_texts = metadata[i].get('answer_texts', [])
            if answer_texts:
                self._squad_metrics(best_span_string, answer_texts)
                sample_squad_metrics = SquadEmAndF1()
                sample_squad_metrics(best_span_string, answer_texts)
                sample_em, sample_f1 = sample_squad_metrics.get_metric(reset=True)
                batch_ems.append(sample_em)
                batch_f1s.append(sample_f1)
        output_dict['question_tokens'] = question_tokens
        output_dict['passage_tokens'] = passage_tokens
        output_dict['em'] = torch.tensor(batch_ems)
        output_dict['f1'] = torch.tensor(batch_f1s)
    return output_dict
        [transforms.ToTensor(),
         transforms.Normalize((0.1307,), (0.3081,))]),
)

indices = np.isin(mnist_dataset.targets, keep_labels).astype("uint8")
selected_data = (th.native_masked_select(mnist_dataset.data.transpose(0, 2),
                                         th.tensor(indices))
                 .view(28, 28, -1)
                 .transpose(2, 0))
selected_targets = th.native_masked_select(mnist_dataset.targets, th.tensor(indices))

dataset = sy.BaseDataset(data=selected_data, targets=selected_targets,
                         transform=mnist_dataset.transform)

trainloader = th.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)

optimizer = optim.SGD(model.parameters(), lr=0.001)

start_time = time.time()
for batch_idx, (inputs, targets) in enumerate(trainloader):
    inputs, targets = inputs.to(device), targets.to(device)
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = F.nll_loss(outputs, targets)
    loss.backward()
    optimizer.step()

print("[PROF]", "LocalTraining", "duration", time.time() - start_time)
def loss_fn(predictions, targets):
    return F.nll_loss(predictions, targets)
def validation_step(self, val_batch, batch_idx):
    x, y = val_batch
    logits = self.forward(x)
    loss = F.nll_loss(logits, y)
    acc = self.accuracy(logits, y)
    return {"val_loss": loss, "val_accuracy": acc}
def test(model, device, test_loader, epsilon):
    # Accuracy counter
    correct = 0
    adv_examples = []

    # Loop over all examples in test set
    for data, target in test_loader:
        # Send the data and label to the device
        data, target = data.to(device), target.to(device)

        # Set requires_grad attribute of tensor. Important for Attack
        data.requires_grad = True

        # Forward pass the data through the model
        output = model(data)
        init_pred = output.max(1, keepdim=True)[1]  # get the index of the max log-probability

        # If the initial prediction is wrong, don't bother attacking, just move on
        if init_pred.item() != target.item():
            continue

        # Calculate the loss
        loss = F.nll_loss(output, target)

        # Zero all existing gradients
        model.zero_grad()

        # Calculate gradients of model in backward pass
        loss.backward()

        # Collect datagrad
        data_grad = data.grad.data

        # Call FGSM Attack
        perturbed_data = fgsm_attack(data, epsilon, data_grad)

        # Re-classify the perturbed image
        output = model(perturbed_data)

        # Check for success
        final_pred = output.max(1, keepdim=True)[1]  # get the index of the max log-probability
        if final_pred.item() == target.item():
            correct += 1
            # Special case for saving 0 epsilon examples
            if (epsilon == 0) and (len(adv_examples) < 5):
                adv_ex = perturbed_data.squeeze().detach().cpu().numpy()
                adv_examples.append((init_pred.item(), final_pred.item(), adv_ex))
        else:
            # Save some adv examples for visualization later
            if len(adv_examples) < 5:
                adv_ex = perturbed_data.squeeze().detach().cpu().numpy()
                adv_examples.append((init_pred.item(), final_pred.item(), adv_ex))

    # Calculate final accuracy for this epsilon
    final_acc = correct / float(len(test_loader))
    print("Epsilon: {}\tTest Accuracy = {} / {} = {}".format(
        epsilon, correct, len(test_loader), final_acc))

    # Return the accuracy and an adversarial example
    return final_acc, adv_examples
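The loop above delegates the actual perturbation to fgsm_attack, which is not shown here. A sketch consistent with the standard FGSM step, x' = clamp(x + epsilon * sign(grad_x L), 0, 1); this matches the usual tutorial definition, but is an assumption about the missing helper:

import torch

def fgsm_attack(image, epsilon, data_grad):
    # Perturb the input in the gradient-sign direction, keeping pixels in [0, 1].
    sign_data_grad = data_grad.sign()
    perturbed_image = image + epsilon * sign_data_grad
    return torch.clamp(perturbed_image, 0, 1)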
def forward(self, preds, target):
    n = preds.size()[-1]
    log_preds = F.log_softmax(preds, dim=-1)
    loss = reduce_loss(-log_preds.sum(dim=-1), self.reduction)
    nll = F.nll_loss(log_preds, target, reduction=self.reduction)
    return linear_combination(loss / n, nll, self.epsilon)
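This label-smoothing forward depends on reduce_loss and linear_combination, which are not shown. Sketches consistent with fastai's definitions (assumed here): the result is epsilon times the uniform cross-entropy term plus (1 - epsilon) times the standard NLL.

def linear_combination(x, y, epsilon):
    # Blend the two loss terms: epsilon * x + (1 - epsilon) * y.
    return epsilon * x + (1 - epsilon) * y

def reduce_loss(loss, reduction='mean'):
    # Apply the same reduction convention as F.nll_loss.
    if reduction == 'mean':
        return loss.mean()
    if reduction == 'sum':
        return loss.sum()
    return loss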
def training_step(self, batch, batch_nb):
    data, target = batch
    output = self.forward(data)
    loss = F.nll_loss(output, target)
    return {"loss": loss}
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, qw_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
def loss_fn(pred, target):
    return F.nll_loss(input=pred, target=target)
def forward(self,
            question: Dict[str, torch.LongTensor],
            passage: Dict[str, torch.LongTensor],
            span_start: torch.IntTensor = None,
            span_end: torch.IntTensor = None,
            yesno_list: torch.IntTensor = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
    batch_size, max_qa_count, max_q_len, _ = question['token_characters'].size()
    total_qa_count = batch_size * max_qa_count
    qa_mask = torch.ge(yesno_list, 0).view(total_qa_count)
    embedded_question = self._text_field_embedder(question, num_wrapping_dims=1)
    # total_qa_count * max_q_len * encoding_dim
    embedded_question = embedded_question.reshape(
        total_qa_count, max_q_len, self._text_field_embedder.get_output_dim())
    embedded_passage = self._text_field_embedder(passage)

    word_emb_ques, elmo_ques, ques_feat = torch.split(embedded_question,
                                                      [200, 1024, 40], dim=2)
    word_emb_pass, elmo_pass, pass_feat = torch.split(embedded_passage,
                                                      [200, 1024, 40], dim=2)

    embedded_question = torch.cat([word_emb_ques, elmo_ques], dim=2)
    embedded_passage = torch.cat([word_emb_pass, elmo_pass], dim=2)

    embedded_question = self._variational_dropout(embedded_question)
    embedded_passage = self._variational_dropout(embedded_passage)
    passage_length = embedded_passage.size(1)

    question_mask = util.get_text_field_mask(question, num_wrapping_dims=1).float()
    question_mask = question_mask.reshape(total_qa_count, max_q_len)
    passage_mask = util.get_text_field_mask(passage).float()

    repeated_passage_mask = passage_mask.unsqueeze(1).repeat(1, max_qa_count, 1)
    repeated_passage_mask = repeated_passage_mask.view(total_qa_count, passage_length)

    encode_passage = self._phrase_layer(embedded_passage, passage_mask)
    projected_passage = self.relu(
        self.projected_layer(torch.cat([encode_passage, elmo_pass], dim=2)))

    encode_question = self._phrase_layer(embedded_question, question_mask)
    projected_question = self.relu(
        self.projected_layer(torch.cat([encode_question, elmo_ques], dim=2)))

    encoded_passage = self._variational_dropout(projected_passage)
    repeated_encoded_passage = encoded_passage.unsqueeze(1).repeat(1, max_qa_count, 1, 1)
    repeated_encoded_passage = repeated_encoded_passage.view(
        total_qa_count, passage_length, self._encoding_dim)
    repeated_pass_feat = (pass_feat.unsqueeze(1).repeat(1, max_qa_count, 1, 1)).view(
        total_qa_count, passage_length, 40)
    encoded_question = self._variational_dropout(projected_question)

    # total_qa_count * max_q_len * passage_length
    # cnt * m * n
    s = torch.bmm(encoded_question, repeated_encoded_passage.transpose(2, 1))
    alpha = util.masked_softmax(s, question_mask.unsqueeze(2).expand(s.size()), dim=1)
    # cnt * n * h
    aligned_p = torch.bmm(alpha.transpose(2, 1), encoded_question)

    # cnt * m * n
    beta = util.masked_softmax(s, repeated_passage_mask.unsqueeze(1).expand(s.size()),
                               dim=2)
    # cnt * m * h
    un_aligned_q = torch.bmm(beta, repeated_encoded_passage)

    # flow
    # (b * qa) * m * h -> (b * m) * qa * h
    un_aligned_q = un_aligned_q.reshape(
        batch_size, max_qa_count, max_q_len, -1).transpose(2, 1).reshape(
        batch_size * max_q_len, max_qa_count, -1)
    tmp_q_mask = question_mask.reshape(
        batch_size, max_qa_count, max_q_len).transpose(2, 1).reshape(
        batch_size * max_q_len, max_qa_count)
    aligned_q = self.flow(un_aligned_q, tmp_q_mask).reshape(
        batch_size, max_q_len, max_qa_count, -1).transpose(2, 1).reshape(
        total_qa_count, max_q_len, self._encoding_dim)

    fused_p = self.fuse(repeated_encoded_passage, aligned_p)
    fused_q = self.fuse(encoded_question, aligned_q)

    # add manual features here
    q_aware_p = self.projected_lstm(
        torch.cat([fused_p, repeated_pass_feat], dim=2), repeated_passage_mask)

    # cnt * n * n
    self_p = torch.bmm(q_aware_p, q_aware_p.transpose(2, 1))
    for i in range(passage_length):
        self_p[:, i, i] = 0
    lamb = util.masked_softmax(
        self_p, repeated_passage_mask.unsqueeze(1).expand(self_p.size()), dim=2)
    # cnt * n * h
    self_aligned_p = torch.bmm(lamb, q_aware_p)

    # cnt * n * h
    fused_self_p = self.fuse(q_aware_p, self_aligned_p)
    contextual_p = self.contextual_layer_p(fused_self_p, repeated_passage_mask)
    contextual_q = self.contextual_layer_q(fused_q, question_mask)
    # cnt * m
    gamma = util.masked_softmax(
        self.linear_self_align(contextual_q).squeeze(2), question_mask, dim=1)
    # cnt * h
    weighted_q = torch.bmm(gamma.unsqueeze(1), contextual_q).squeeze(1)

    span_start_logits = self.bilinear_layer_s(weighted_q, contextual_p)
    span_end_logits = self.bilinear_layer_e(weighted_q, contextual_p)

    # cnt * n * 1    cnt * 1 * h
    span_yesno_logits = self.yesno_predictor(
        torch.bmm(span_end_logits.unsqueeze(2), weighted_q.unsqueeze(1)))

    span_start_logits = util.replace_masked_values(span_start_logits,
                                                   repeated_passage_mask, -1e7)
    span_end_logits = util.replace_masked_values(span_end_logits,
                                                 repeated_passage_mask, -1e7)

    best_span = self._get_best_span_yesno_followup(span_start_logits,
                                                   span_end_logits,
                                                   span_yesno_logits,
                                                   self._max_span_length)

    output_dict: Dict[str, Any] = {}

    # Compute the loss for training
    if span_start is not None:
        loss = nll_loss(util.masked_log_softmax(span_start_logits,
                                                repeated_passage_mask),
                        span_start.view(-1), ignore_index=-1)
        self._span_start_accuracy(span_start_logits, span_start.view(-1), mask=qa_mask)
        loss += nll_loss(util.masked_log_softmax(span_end_logits,
                                                 repeated_passage_mask),
                         span_end.view(-1), ignore_index=-1)
        self._span_end_accuracy(span_end_logits, span_end.view(-1), mask=qa_mask)
        self._span_accuracy(best_span[:, 0:2],
                            torch.stack([span_start, span_end],
                                        -1).view(total_qa_count, 2),
                            mask=qa_mask.unsqueeze(1).expand(-1, 2).long())

        # add a select for the right span to compute loss
        gold_span_end_loc = []
        span_end = span_end.view(total_qa_count).squeeze().data.cpu().numpy()
        for i in range(0, total_qa_count):
            gold_span_end_loc.append(max(span_end[i] * 3 + i * passage_length * 3, 0))
            gold_span_end_loc.append(max(span_end[i] * 3 + i * passage_length * 3 + 1, 0))
            gold_span_end_loc.append(max(span_end[i] * 3 + i * passage_length * 3 + 2, 0))
        gold_span_end_loc = span_start.new(gold_span_end_loc)

        pred_span_end_loc = []
        for i in range(0, total_qa_count):
            pred_span_end_loc.append(max(best_span[i][1] * 3 + i * passage_length * 3, 0))
            pred_span_end_loc.append(max(best_span[i][1] * 3 + i * passage_length * 3 + 1, 0))
            pred_span_end_loc.append(max(best_span[i][1] * 3 + i * passage_length * 3 + 2, 0))
        predicted_end = span_start.new(pred_span_end_loc)

        _yesno = span_yesno_logits.view(-1).index_select(0, gold_span_end_loc).view(-1, 3)
        loss += nll_loss(torch.nn.functional.log_softmax(_yesno, dim=-1),
                         yesno_list.view(-1), ignore_index=-1)

        _yesno = span_yesno_logits.view(-1).index_select(0, predicted_end).view(-1, 3)
        self._span_yesno_accuracy(_yesno, yesno_list.view(-1), mask=qa_mask)

        output_dict["loss"] = loss

    # Compute the EM and F1 on SQuAD and add the tokenized input to the output.
    output_dict['best_span_str'] = []
    output_dict['qid'] = []
    output_dict['yesno'] = []
    best_span_cpu = best_span.detach().cpu().numpy()
    for i in range(batch_size):
        passage_str = metadata[i]['original_passage']
        offsets = metadata[i]['token_offsets']
        f1_score = 0.0
        per_dialog_best_span_list = []
        per_dialog_yesno_list = []
        per_dialog_query_id_list = []
        for per_dialog_query_index, (iid, answer_texts) in enumerate(
                zip(metadata[i]["instance_id"], metadata[i]["answer_texts_list"])):
            predicted_span = tuple(best_span_cpu[i * max_qa_count
                                                 + per_dialog_query_index])

            start_offset = offsets[predicted_span[0]][0]
            end_offset = offsets[predicted_span[1]][1]

            yesno_pred = predicted_span[2]
            per_dialog_yesno_list.append(yesno_pred)
            per_dialog_query_id_list.append(iid)

            best_span_string = passage_str[start_offset:end_offset]
            per_dialog_best_span_list.append(best_span_string)
            if answer_texts:
                if len(answer_texts) > 1:
                    t_f1 = []
                    # Compute F1 over N-1 human references and average the scores.
                    for answer_index in range(len(answer_texts)):
                        idxes = list(range(len(answer_texts)))
                        idxes.pop(answer_index)
                        refs = [answer_texts[z] for z in idxes]
                        t_f1.append(squad_eval.metric_max_over_ground_truths(
                            squad_eval.f1_score, best_span_string, refs))
                    f1_score = 1.0 * sum(t_f1) / len(t_f1)
                else:
                    f1_score = squad_eval.metric_max_over_ground_truths(
                        squad_eval.f1_score, best_span_string, answer_texts)
            self._official_f1(100 * f1_score)
        output_dict['qid'].append(per_dialog_query_id_list)
        output_dict['best_span_str'].append(per_dialog_best_span_list)
        output_dict['yesno'].append(per_dialog_yesno_list)
    return output_dict
def train():
    min_loss = 1e10
    max_acc = 0
    patience_cnt = 0
    val_loss_values = []
    val_acc_values = []
    best_epoch = 0

    t = time.time()
    print(count_parameters(model) / 1000)
    model.train()
    step = 0
    for epoch in range(args.epochs):
        torch.cuda.empty_cache()
        loss_train = 0.0
        correct = 0
        for i, data in enumerate(train_loader):
            optimizer.zero_grad()
            data = data.to(args.device)
            out = model(data)
            loss = F.nll_loss(out, data.y)
            if torch.isnan(loss):
                print('NO')
            loss.backward()
            step += 1
            optimizer.step()
            loss_train += loss.item()
            pred = out.max(dim=1)[1]
            correct += pred.eq(data.y).sum().item()
        nni.report_intermediate_result(time.time() - t)

    t2 = time.time()
    time_cost = t2 - t
    print('{:.2f}'.format(time_cost))
    nni.report_final_result(time_cost)

    # acc_train = correct / len(train_loader.dataset)
    # acc_val, loss_val = compute_test(val_loader)
    # # # client_send(gpuid, 1)
    # # # if epoch > 5:
    # # #     client_send(gpuid, 1)
    # outs = 'Epoch: {:04d}'.format(epoch + 1) + '\tloss_train: {:.6f}'.format(loss_train) + \
    #     '\tacc_train: {:.6f}'.format(acc_train) + '\tloss_val: {:.6f}'.format(loss_val) + \
    #     '\tacc_val: {:.6f}'.format(acc_val) + '\ttime: {:.6f}s'.format(time.time() - t)
    # nni.report_intermediate_result(-loss_val)
    # print(outs)
    # logging.info(outs)
    # val_loss_values.append(loss_val)
    # val_acc_values.append(acc_val)
    # torch.save(model.state_dict(), res / '{}.pth'.format(epoch))
    # if val_loss_values[-1] < min_loss:
    #     min_loss = val_loss_values[-1]
    #     best_epoch = epoch
    #     patience_cnt = 0
    # else:
    #     patience_cnt += 1
    # # if val_acc_values[-1] > max_acc:
    # #     max_acc = val_acc_values[-1]
    # #     best_epoch = epoch
    # #     patience_cnt = 0
    # # else:
    # #     patience_cnt += 1
    # if patience_cnt == args.patience:
    #     break
    # files = glob.glob(res.as_posix() + '/*.pth')
    # for f in files:
    #     epoch_nb = int(f.split('/')[-1].split('.')[0])
    #     if epoch_nb < best_epoch:
    #         os.remove(f)
    # files = glob.glob(res.as_posix() + '/*.pth')
    # for f in files:
    #     epoch_nb = int(f.split('/')[-1].split('.')[0])
    #     if epoch_nb > best_epoch:
    #         os.remove(f)
    # outs = 'Optimization Finished! Total time elapsed: {:.6f}'.format(time.time() - t)
    # print(outs)
    return best_epoch
model = CNNModel().to(device)
learning_rate = 0.01
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Train model
iteration = 0  # renamed from `iter` to avoid shadowing the builtin
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        output = model(images)
        loss = F.nll_loss(output, labels)
        loss.backward()
        optimizer.step()
        iteration = iteration + 1
        if (i + 1) % 100 == 0:
            # `.item()` replaces the long-deprecated `loss.data[0]`
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f'
                  % (epoch + 1, num_epochs, i + 1,
                     len(train_dataset) // batch_size, loss.item()))

model.eval()
correct = 0
total = 0
for images, labels in test_loader:
    images = images.to(device)
    labels = labels.to(device)
    outputs = model(images)
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum().item()
print('Test accuracy: %.2f%%' % (100.0 * correct / total))
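F.nll_loss expects log-probabilities, so models trained this way typically end in log_softmax; on raw logits the equivalent single call is F.cross_entropy. A quick check of the equivalence:

import torch
import torch.nn.functional as F

logits = torch.randn(8, 10)            # batch of 8, 10 classes
labels = torch.randint(0, 10, (8,))
a = F.nll_loss(F.log_softmax(logits, dim=1), labels)
b = F.cross_entropy(logits, labels)
assert torch.allclose(a, b)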
def forward(self,
            question: Dict[str, torch.LongTensor],
            passage: Dict[str, torch.LongTensor],
            # passages_length: torch.LongTensor = None,
            # correct_passage: torch.LongTensor = None,
            span_start: torch.IntTensor = None,
            span_end: torch.IntTensor = None,
            metadata=None) -> Dict[str, torch.Tensor]:
    # shape: B x Tq x E
    embedded_question = self._embedder(question)
    embedded_passage = self._embedder(passage)
    batch_size = embedded_question.size(0)
    total_passage_length = embedded_passage.size(1)
    question_mask = util.get_text_field_mask(question)
    passage_mask = util.get_text_field_mask(passage)
    # shape: B x T x 2H
    encoded_question = self._dropout(self._question_encoder(embedded_question, question_mask))
    encoded_passage = self._dropout(self._passage_encoder(embedded_passage, passage_mask))
    passage_mask = passage_mask.float()
    question_mask = question_mask.float()
    encoding_dim = encoded_question.size(-1)
    # shape: B x 2H
    if encoded_passage.is_cuda:
        cuda_device = encoded_passage.get_device()
        gru_hidden = Variable(torch.zeros(batch_size, encoding_dim).cuda(cuda_device))
    else:
        gru_hidden = Variable(torch.zeros(batch_size, encoding_dim))
    question_awared_passage = []
    for timestep in range(total_passage_length):
        # shape: B x Tq = attention(B x 2H, B x Tq x 2H)
        attn_weights = self._question_attention_for_passage(
            encoded_passage[:, timestep, :], encoded_question, question_mask)
        # shape: B x 2H = weighted_sum(B x Tq x 2H, B x Tq)
        attended_question = util.weighted_sum(encoded_question, attn_weights)
        # shape: B x 4H
        passage_question_combined = torch.cat(
            [encoded_passage[:, timestep, :], attended_question], dim=-1)
        # shape: B x 4H
        gate = torch.sigmoid(self._gate(passage_question_combined))  # F.sigmoid is deprecated
        gru_input = gate * passage_question_combined
        # shape: B x 2H
        gru_hidden = self._dropout(self._gru_cell(gru_input, gru_hidden))
        question_awared_passage.append(gru_hidden)
    # shape: B x T x 2H
    # question-aware passage representation v_P
    question_awared_passage = torch.stack(question_awared_passage, dim=1)
    # compute question vector r_Q
    # shape: B x T = attention(B x 2H, B x T x 2H)
    v_r_Q_tiled = self._v_r_Q.unsqueeze(0).expand(batch_size, encoding_dim)
    attn_weights = self._question_attention_for_question(v_r_Q_tiled, encoded_question, question_mask)
    # shape: B x 2H
    r_Q = util.weighted_sum(encoded_question, attn_weights)
    # shape: B x T = attention(B x 2H, B x T x 2H)
    span_start_logits = self._passage_attention_for_answer(r_Q, question_awared_passage, passage_mask)
    span_start_logits = util.replace_masked_values(span_start_logits, passage_mask, -1e7)
    span_start_probs = util.masked_softmax(span_start_logits, passage_mask)
    span_start_log_probs = util.masked_log_softmax(span_start_logits, passage_mask)
    # shape: B x 2H
    c_t = util.weighted_sum(question_awared_passage, span_start_probs)
    # shape: B x 2H
    h_1 = self._dropout(self._answer_net(c_t, r_Q))
    span_end_logits = self._passage_attention_for_answer(h_1, question_awared_passage, passage_mask)
    span_end_logits = util.replace_masked_values(span_end_logits, passage_mask, -1e7)
    span_end_probs = util.masked_softmax(span_end_logits, passage_mask)
    span_end_log_probs = util.masked_log_softmax(span_end_logits, passage_mask)
    best_span = self.get_best_span(span_start_logits, span_end_logits)
    # num_passages = passages_length.size(1)
    # acc = Variable(torch.zeros(batch_size, num_passages + 1)).cuda(cuda_device).long()
    # acc[:, 1:num_passages+1] = torch.cumsum(passages_length, dim=1)
    # g_batch = []
    # for b in range(batch_size):
    #     g = []
    #     for i in range(num_passages):
    #         if acc[b, i+1].data[0] > acc[b, i].data[0]:
    #             attn_weights = self._passage_attention_for_ranking(
    #                 r_Q[b:b+1],
    #                 question_awared_passage[b:b+1, acc[b, i].data[0]: acc[b, i+1].data[0], :],
    #                 passage_mask[b:b+1, acc[b, i].data[0]: acc[b, i+1].data[0]])
    #             r_P = util.weighted_sum(
    #                 question_awared_passage[b:b+1, acc[b, i].data[0]: acc[b, i+1].data[0], :],
    #                 attn_weights)
    #             question_passage_combined = torch.cat([r_Q[b:b+1], r_P], dim=-1)
    #             gi = self._dropout(self._match_layer_2(
    #                 F.tanh(self._dropout(self._match_layer_1(question_passage_combined)))))
    #             g.append(gi)
    #         else:
    #             g.append(Variable(torch.zeros(1, 1)).cuda(cuda_device))
    #     g = torch.cat(g, dim=1)
    #     g_batch.append(g)
    # t2 = time.time()
    # g = torch.cat(g_batch, dim=0)
    # passage_log_probs = F.log_softmax(g, dim=-1)
    output_dict = {}
    if span_start is not None:
        AP_loss = F.nll_loss(span_start_log_probs, span_start.squeeze(-1)) + \
                  F.nll_loss(span_end_log_probs, span_end.squeeze(-1))
        # PR_loss = F.nll_loss(passage_log_probs, correct_passage.squeeze(-1))
        # loss = self._r * AP_loss + self._r * PR_loss
        self._span_start_accuracy(span_start_logits, span_start.squeeze(-1))
        self._span_end_accuracy(span_end_logits, span_end.squeeze(-1))
        self._span_accuracy(best_span, torch.stack([span_start, span_end], -1))
        output_dict['loss'] = AP_loss
    _, max_start = torch.max(span_start_probs, dim=1)
    _, max_end = torch.max(span_end_probs, dim=1)
    # t3 = time.time()
    output_dict['span_start_idx'] = max_start
    output_dict['span_end_idx'] = max_end
    # t4 = time.time()
    # global ITE
    # ITE += 1
    # if (ITE % 100 == 0):
    #     print(" gold %i:%i|predicted %i:%i" % (span_start.squeeze(-1)[0],
    #           span_end.squeeze(-1)[0], max_start.data[0], max_end.data[0]))
    if metadata is not None:
        output_dict['best_span_str'] = []
        question_tokens = []
        passage_tokens = []
        for i in range(batch_size):
            question_tokens.append(metadata[i]['question_tokens'])
            passage_tokens.append(metadata[i]['passage_tokens'])
            passage_str = metadata[i]['original_passage']
            offsets = metadata[i]['token_offsets']
            predicted_span = tuple(best_span[i].data.cpu().numpy())
            start_offset = offsets[predicted_span[0]][0]
            end_offset = offsets[predicted_span[1]][1]
            best_span_string = passage_str[start_offset:end_offset]
            output_dict['best_span_str'].append(best_span_string)
            answer_texts = metadata[i].get('answer_texts', [])
            if answer_texts:
                self._squad_metrics(best_span_string, answer_texts)
        output_dict['question_tokens'] = question_tokens
        output_dict['passage_tokens'] = passage_tokens
    # t5 = time.time()
    # print("Total: %.5f" % (t5 - t0))
    # print("Batch processing 1: %.5f" % (t2 - t1))
    # print("Batch processing 2: %.5f" % (t4 - t3))
    return output_dict
num_batch = len(dataset) // opt.batchSize  # integer batch count
miou_list = list()
for epoch in range(opt.nepoch):
    for i, data in enumerate(dataloader, 0):
        points, target = data
        points, target = Variable(points), Variable(target)
        points = points.transpose(2, 1)
        points, target = points.cuda(), target.cuda()
        optimizer.zero_grad()
        classifier = classifier.train()
        pred = classifier(points)
        pred = pred.view(-1, num_classes)
        # shift the 1-based part labels to the 0-based indices nll_loss expects
        target = target.view(-1, 1)[:, 0] - 1
        # print(pred.size(), target.size())
        loss = F.nll_loss(pred, target)
        loss.backward()
        optimizer.step()
        pred_choice = pred.data.max(1)[1]
        correct = pred_choice.eq(target.data).cpu().sum()
        print('[%d: %d/%d] train loss: %f accuracy: %f'
              % (epoch, i, num_batch, loss.item(),
                 correct.item() / float(opt.batchSize * opt.num_points)))
        if i % 100 == 0:
            j, data = next(enumerate(testdataloader, 0))
            points, target = data
            points, target = Variable(points), Variable(target)
            points = points.transpose(2, 1)
            points, target = points.cuda(), target.cuda()
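The subtraction of 1 in the target reshape above converts the dataset's 1-based part labels to the 0-based class indices F.nll_loss requires; labels outside [0, C-1] would make the loss fail. A tiny illustration:

import torch
import torch.nn.functional as F

log_probs = F.log_softmax(torch.randn(6, 4), dim=1)   # 6 points, 4 part classes
target_one_based = torch.tensor([1, 2, 3, 4, 1, 2])
loss = F.nll_loss(log_probs, target_one_based - 1)    # labels now in [0, 3]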
def get_cls_loss(pred, label, select):
    # No rows selected: contribute zero loss.
    if len(select.size()) == 0 or select.size() == torch.Size([0]):
        return 0
    # Keep only the selected rows of predictions and labels.
    pred = torch.index_select(pred, 0, select)
    label = torch.index_select(label, 0, select)
    return F.nll_loss(pred, label)
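A hypothetical usage sketch of get_cls_loss: restrict the loss to selected rows (for example, non-background anchors), assuming pred already holds log-probabilities:

import torch
import torch.nn.functional as F

pred = F.log_softmax(torch.randn(5, 3), dim=1)
label = torch.tensor([0, 2, 1, 1, 0])
select = torch.nonzero(label > 0).squeeze(1)   # rows with non-background labels
loss = get_cls_loss(pred, label, select)       # NLL over the 3 selected rows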
def mnist_tutorial(nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                   train_end=-1, test_end=-1,
                   learning_rate=LEARNING_RATE):
    """
    MNIST cleverhans tutorial
    :param nb_epochs: number of epochs to train the model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Train a pytorch MNIST model
    torch_model = PytorchMnistModel()
    if torch.cuda.is_available():
        torch_model = torch_model.cuda()
    report = AccuracyReport()

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=True, download=True,
                       transform=transforms.ToTensor()),
        batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=False, transform=transforms.ToTensor()),
        batch_size=batch_size)

    # Truncate the datasets so that the test runs more quickly
    train_loader.dataset.train_data = train_loader.dataset.train_data[:train_end]
    test_loader.dataset.test_data = test_loader.dataset.test_data[:test_end]

    # Train our model
    optimizer = optim.Adam(torch_model.parameters(), lr=learning_rate)
    train_loss = []
    total = 0
    correct = 0
    step = 0
    for _epoch in range(nb_epochs):
        for xs, ys in train_loader:
            xs, ys = Variable(xs), Variable(ys)
            if torch.cuda.is_available():
                xs, ys = xs.cuda(), ys.cuda()
            optimizer.zero_grad()
            preds = torch_model(xs)
            loss = F.nll_loss(preds, ys)
            loss.backward()   # calc gradients
            train_loss.append(loss.data.item())
            optimizer.step()  # update parameters

            preds_np = preds.data.cpu().numpy()
            # compare numpy predictions against numpy labels
            correct += (np.argmax(preds_np, axis=1) == ys.data.cpu().numpy()).sum()
            total += len(xs)
            step += 1
            if total % 1000 == 0:
                acc = float(correct) / total
                print('[%s] Training accuracy: %.2f%%' % (step, acc * 100))
                total = 0
                correct = 0

    # Evaluate on clean data
    total = 0
    correct = 0
    for xs, ys in test_loader:
        xs, ys = Variable(xs), Variable(ys)
        if torch.cuda.is_available():
            xs, ys = xs.cuda(), ys.cuda()
        preds = torch_model(xs)
        preds_np = preds.data.cpu().numpy()
        correct += (np.argmax(preds_np, axis=1) == ys.data.cpu().numpy()).sum()
        total += len(xs)
    acc = float(correct) / total
    report.clean_train_clean_eval = acc
    print('[%s] Clean accuracy: %.2f%%' % (step, acc * 100))

    # We use tf for evaluation on adversarial data
    sess = tf.Session()
    x_op = tf.placeholder(tf.float32, shape=(None, 1, 28, 28,))

    # Convert pytorch model to a tf_model and wrap it in cleverhans
    tf_model_fn = convert_pytorch_model_to_tf(torch_model)
    cleverhans_model = CallableModelWrapper(tf_model_fn, output_layer='logits')

    # Create an FGSM attack
    fgsm_op = FastGradientMethod(cleverhans_model, sess=sess)
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    adv_x_op = fgsm_op.generate(x_op, **fgsm_params)
    adv_preds_op = tf_model_fn(adv_x_op)

    # Run an evaluation of our model against fgsm
    total = 0
    correct = 0
    for xs, ys in test_loader:
        adv_preds = sess.run(adv_preds_op, feed_dict={x_op: xs})
        correct += (np.argmax(adv_preds, axis=1) == ys.numpy()).sum()
        total += len(xs)
    acc = float(correct) / total
    print('Adv accuracy: {:.3f}'.format(acc * 100))
    report.clean_train_adv_eval = acc
    return report
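The FGSM step itself needs no TensorFlow round-trip. A minimal pure-PyTorch sketch, assuming the model outputs log-probabilities and the inputs live in [0, 1]:

import torch
import torch.nn.functional as F

def fgsm(model, x, y, eps):
    x = x.clone().detach().requires_grad_(True)
    loss = F.nll_loss(model(x), y)   # model assumed to output log-probabilities
    loss.backward()
    # One signed-gradient ascent step on the input, clipped to valid pixels.
    return (x + eps * x.grad.sign()).clamp(0.0, 1.0).detach()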
def forward(self,
            question: Dict[str, torch.LongTensor],
            passage: Dict[str, torch.LongTensor],
            list_,
            passages_length: torch.LongTensor = None,
            correct_passage: torch.LongTensor = None,
            span_start: torch.IntTensor = None,
            span_end: torch.IntTensor = None) -> Dict[str, torch.Tensor]:
    # shape: B x N x T x E
    embedded_passage_list = self._embedder(list_)
    (batch_size, num_passages, max_p, embedding_size) = embedded_passage_list.size()
    # shape: B x Tq x E
    embedded_question = self._embedder(question)
    embedded_passage = embedded_passage_list.view(batch_size, -1, embedding_size)
    # embedded_passage = self._embedder(passage)
    # batch_size = embedded_question.size(0)
    total_passage_length = embedded_passage.size(1)
    question_mask = util.get_text_field_mask(question)
    # passage_mask = util.get_text_field_mask(passage)
    passage_list_mask = util.get_text_field_mask(list_, 1)
    passage_mask = passage_list_mask.view(batch_size, -1)
    # shape: B x T x 2H
    encoded_question = self._dropout(self._question_encoder(embedded_question, question_mask))
    encoded_passage = self._dropout(self._passage_encoder(embedded_passage, passage_mask))
    passage_mask = passage_mask.float()
    question_mask = question_mask.float()
    encoding_dim = encoded_question.size(-1)
    # encoded_passage_list = self._dropout(self._passage_encoder(embedded_passage_list, passage_list_mask))
    # shape: B x 2H
    if encoded_passage.is_cuda:
        cuda_device = encoded_passage.get_device()
        gru_hidden = Variable(torch.zeros(batch_size, encoding_dim).cuda(cuda_device))
    else:
        gru_hidden = Variable(torch.zeros(batch_size, encoding_dim))
    question_awared_passage = []
    for timestep in range(total_passage_length):
        u_t_P = encoded_passage[:, timestep, :]
        # shape: B x Tq = attention(B x 2H, B x Tq x 2H)
        attn_weights = self._question_attention_for_passage(
            encoded_passage[:, timestep, :], encoded_question, question_mask)
        # shape: B x 2H = weighted_sum(B x Tq x 2H, B x Tq)
        attended_question = util.weighted_sum(encoded_question, attn_weights)
        # shape: B x 4H
        passage_question_combined = torch.cat(
            [encoded_passage[:, timestep, :], attended_question], dim=-1)
        # shape: B x 4H
        gate = torch.sigmoid(self._gate(passage_question_combined))  # F.sigmoid is deprecated
        gru_input = gate * passage_question_combined
        # shape: B x 2H
        gru_hidden = self._dropout(self._gru_cell(gru_input, gru_hidden))
        question_awared_passage.append(gru_hidden)
    # shape: B x T x 2H
    # question-aware passage representation v_P
    question_awared_passage = torch.stack(question_awared_passage, dim=1)
    # compute question vector r_Q
    # shape: B x T = attention(B x 2H, B x T x 2H)
    v_r_Q_tiled = self._v_r_Q.unsqueeze(0).expand(batch_size, encoding_dim)
    attn_weights = self._question_attention_for_question(v_r_Q_tiled, encoded_question, question_mask)
    # shape: B x 2H
    r_Q = util.weighted_sum(encoded_question, attn_weights)
    # shape: B x T = attention(B x 2H, B x T x 2H)
    span_start_logits = self._passage_attention_for_answer(r_Q, question_awared_passage, passage_mask)
    span_start_logits = util.replace_masked_values(span_start_logits, passage_mask, -1e7)
    span_start_probs = util.masked_softmax(span_start_logits, passage_mask)
    span_start_log_probs = util.masked_log_softmax(span_start_logits, passage_mask)
    # shape: B x 2H
    c_t = util.weighted_sum(question_awared_passage, span_start_probs)
    # shape: B x 2H
    h_1 = self._dropout(self._answer_net(c_t, r_Q))
    span_end_logits = self._passage_attention_for_answer(h_1, question_awared_passage, passage_mask)
    span_end_logits = util.replace_masked_values(span_end_logits, passage_mask, -1e7)
    span_end_probs = util.masked_softmax(span_end_logits, passage_mask)
    span_end_log_probs = util.masked_log_softmax(span_end_logits, passage_mask)
    # num_passages = passages_length.size(1)
    # cum_passages = torch.cumsum(passages_length, dim=1)
    g = []
    for i in range(num_passages):
        attn_weights = self._passage_attention_for_ranking(
            r_Q,
            question_awared_passage[:, i * max_p: (i + 1) * max_p, :],
            passage_mask[:, i * max_p: (i + 1) * max_p])
        r_P = util.weighted_sum(
            question_awared_passage[:, i * max_p: (i + 1) * max_p, :], attn_weights)
        question_passage_combined = torch.cat([r_Q, r_P], dim=-1)
        gi = self._dropout(self._match_layer_2(
            torch.tanh(self._match_layer_1(question_passage_combined))))  # F.tanh is deprecated
        g.append(gi)
    # compute r_P
    # shape: B x T = attention(B x 2H, B x T x 2H)
    # attn_weights = self._passage_attention_for_ranking(r_Q, question_awared_passage, passage_mask)
    # shape: B x 2H
    # r_P = util.weighted_sum(question_awared_passage, attn_weights)
    # shape: B x 4H
    # question_passage_combined = torch.cat([r_Q, r_P], dim=-1)
    # shape: B x 10
    # g = self._dropout(self._match_layer_2(F.tanh(self._match_layer_1(question_passage_combined))))
    # cum_passages = torch.cumsum(passages_length, dim=1)
    # for b in range(batch_size):
    #     for i in range(num_passages):
    #         attn_weights = self._passage_attention_for_ranking(r_Q[b], question_awared_passage
    padded_span_start = span_start.clone()
    padded_span_end = span_end.clone()
    cumsum = torch.cumsum(passage_mask.long(), dim=1)
    for b in range(batch_size):
        padded_span_start[b] = (cumsum[b] == span_start[b] + 1).nonzero()[0][0]
        padded_span_end[b] = (cumsum[b] == span_end[b] + 1).nonzero()[0][0]
    g = torch.cat(g, dim=1)
    passage_log_probs = F.log_softmax(g, dim=-1)
    output_dict = {}
    if span_start is not None:
        AP_loss = F.nll_loss(span_start_log_probs, padded_span_start.squeeze(-1)) + \
                  F.nll_loss(span_end_log_probs, padded_span_end.squeeze(-1))
        PR_loss = F.nll_loss(passage_log_probs, correct_passage.squeeze(-1))
        loss = self._r * AP_loss + self._r * PR_loss
        output_dict['loss'] = loss
    _, max_start = torch.max(span_start_probs, dim=1)
    _, max_end = torch.max(span_end_probs, dim=1)
    # max_start = max_start.cpu().data[0]
    # max_end = max_end.cpu().data[0]
    # un-pad the predicted indices back into the concatenated-passage positions
    for b in range(batch_size):
        max_start.data[b] = cumsum.data[b, max_start.data[b]] - 1
        max_end.data[b] = cumsum.data[b, max_end.data[b]] - 1
    output_dict['span_start_idx'] = max_start
    output_dict['span_end_idx'] = max_end
    self._num_iter += 1
    if (self._num_iter % 50 == 0):
        print(" gold %i:%i|predicted %i:%i"
              % (span_start.squeeze(-1)[0], span_end.squeeze(-1)[0],
                 max_start.cpu().data[0], max_end.cpu().data[0]))
    return output_dict
def test(self): print("Starting attack on", self.modelName, "...") # Accuracy counter allAdversarialExamples = [] allAccuracies = [] for epsilon in self.epsilons: correct = 0 loss = 0 currentExamples = [] # Loop over all examples in test set for batch in tqdm(self.model.testDL): # Get image data and true classification from batch data, target = batch # Send the data and label to the device data, target = data.to(self.device), target.to(self.device) # Set requires_grad attribute of tensor. Important for Attack data.requires_grad = True # Forward pass the data through the model probabilities, prediction = self.predict(data) # Calculate the loss currentLoss = F.nll_loss(probabilities, target) # Zero all existing gradients self.model.zero_grad() # Calculate gradients of model in backward pass currentLoss.backward(retain_graph=True) # Call FGSM Attack dataGrad = data.grad perturbedData = self.attack(data, dataGrad, epsilon) # Re-classify the perturbed image probabilities, finalPred = self.predict(perturbedData) for i in range(len(data)): # If initial prediction was incorrect, skip image if prediction[i].item() != target[i].item(): continue # Save adversarial example adv_ex = perturbedData.squeeze().detach().cpu().numpy() npProb = probabilities.detach().numpy() # Get one image from batch if self.model.batchSize == 64: adv_ex = adv_ex[i] npProb = npProb[i] # Check for success if finalPred[i].item() == target[i].item(): correct += 1 # Special case for saving 0 epsilon examples if (epsilon == 0) and (len(currentExamples) < 5): currentExamples.append((prediction[i].item(), finalPred[i].item(), adv_ex, npProb)) else: # Save some adv examples for visualization later if len(currentExamples) < 5: currentExamples.append((prediction[i].item(), finalPred[i].item(), adv_ex, npProb)) # Calculate final accuracy for this epsilon accuracy = correct / 10000 print("Epsilon: {}\tTest Accuracy = {} / {} = {}".format(epsilon, correct, 10000, accuracy)) # Sleep for console output sleep(0.1) # Append results from current epsilon to output allAccuracies.append(accuracy) allAdversarialExamples.append(currentExamples) return allAccuracies, allAdversarialExamples
def train(self, model, optimizer, data_loader, batch_size, num_epochs,
          batches_per_epoch, save_every, print_every, check_every, seeds,
          test_max_len=50, test_temperature=1.0):
    num_tokens = data_loader.get_vocab_size()
    for ind_epoch in range(num_epochs):
        epoch_start = datetime.datetime.now()
        epoch_losses = []
        for ind_batch in range(batches_per_epoch):
            batch = data_loader.get_random_train_batch(batch_size)
            model.zero_grad()
            probas, _ = model(batch)
            # position t predicts token t+1, hence the shifted views
            loss = F.nll_loss(
                probas[:, :-1].contiguous().view(-1, num_tokens),
                batch[:, 1:].contiguous().view(-1))
            epoch_losses.append(loss.item())
            loss.backward()
            optimizer.step()
        epoch_loss = np.mean(epoch_losses)
        validation_loss = self.calc_validation_loss(model, data_loader, batch_size)
        optimizer.update(validation_loss)
        if ind_epoch % save_every == 0:
            self.save_checkpoint(ind_epoch, data_loader, model, optimizer)
        if ind_epoch % print_every == 0:
            epoch_seconds = round(
                (datetime.datetime.now() - epoch_start).total_seconds(), 2)
            print("Epoch:", ind_epoch + 1,
                  "lr: %.2E" % optimizer.get_lr(),
                  "seconds:", epoch_seconds,
                  "train loss:", epoch_loss,
                  "valid loss:", validation_loss)
        if ind_epoch % check_every == 0:
            for seed in seeds:
                out = self.generate_sample(model, data_loader, seed,
                                           test_max_len, test_temperature)
                out = re.sub("_PAD_", "", out).strip()
                out = re.sub("_SEP_", " # " * 10, out)
                print(out)
                print()
            print()
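The shifted views above align each position with the next token. A shape-level sketch of that alignment, assuming batch-first (B, T) token tensors:

import torch
import torch.nn.functional as F

B, T, V = 2, 5, 11   # batch, sequence length, vocabulary size
log_probs = F.log_softmax(torch.randn(B, T, V), dim=-1)
tokens = torch.randint(0, V, (B, T))
loss = F.nll_loss(
    log_probs[:, :-1].contiguous().view(-1, V),   # predictions for steps 0..T-2
    tokens[:, 1:].contiguous().view(-1))          # gold tokens for steps 1..T-1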
def train_adv(args, epoch, model, trainLoader, optimizer, trainF, config, scheduler):
    model.train()
    attack = FGSM_Attack(model, F.nll_loss)
    nProcessed = 0
    nTrain = len(trainLoader.dataset)
    # nIter_per_epoch = nTrain // batch_size
    # dice_loss = dloss.DiceLoss(nclass=2)
    for batch_idx, (data, target) in enumerate(trainLoader):
        if args.cuda:
            data, target = data.cuda(), target.type(torch.LongTensor).cuda()
        data, target = Variable(data), Variable(target)
        optimizer.zero_grad()
        _, output_adv, output = attack.fgsm(data, target, softmax=F.log_softmax)
        # flatten (N, C, D, H, W) voxel predictions into per-voxel rows
        output = output.permute(0, 2, 3, 4, 1).contiguous()
        output = output.view(output.numel() // 3, 3)  # 3 labels
        # output = F.log_softmax(output, dim=-1)  # dim marked
        output_adv = output_adv.permute(0, 2, 3, 4, 1).contiguous()
        output_adv = output_adv.view(output_adv.numel() // 3, 3)  # 3 labels
        target = target.view(target.numel())
        # add CrossEntropyLoss
        loss = F.nll_loss(output, target)
        adv_loss = F.nll_loss(output_adv, target)
        # loss.backward() becomes:
        # with amp.scale_loss(loss, optimizer) as scaled_loss:
        #     scaled_loss.backward()
        loss.backward()
        adv_loss.backward()
        optimizer.step()
        # update learning rate
        scheduler(optimizer, i=batch_idx, epoch=epoch)
        nProcessed += len(data)
        # get the index of the max log-probability
        pred = torch.argmax(output, dim=-1)
        # print(output.size(), pred.size(), target.size())
        dice = evaluate_dice(pred, target, cpu=True)
        incorrect = pred.ne(target.data).cpu().sum()
        partialEpoch = int(epoch + batch_idx / len(trainLoader))
        loss_data = loss.detach().data.cpu().numpy()
        adv_loss_data = adv_loss.detach().data.cpu().numpy()
        print('Train Epoch: {} [{}/{} ({:.0f}%)], Loss: {:.4f}, '
              'Kidney_Dice: {:.6f}, Tumor_Dice: {:.6}, Adv_loss: {:.4f}'.format(
                  partialEpoch, nProcessed, nTrain,
                  100. * batch_idx / len(trainLoader),
                  loss_data, dice[0], dice[1], adv_loss_data))
        # trainF.write('{},{},{},{},{}\n'.format(partialEpoch, loss_data, dice[0], dice[1], adv_loss_data))
        trainF.flush()
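The permute/view dance above flattens (N, C, D, H, W) log-probabilities into (N*D*H*W, C) rows. F.nll_loss also accepts the K-dimensional form directly, which is equivalent and skips the reshape; a quick check:

import torch
import torch.nn.functional as F

log_probs = F.log_softmax(torch.randn(2, 3, 4, 4, 4), dim=1)  # N, C=3, D, H, W
target = torch.randint(0, 3, (2, 4, 4, 4))                     # N, D, H, W
flat = F.nll_loss(
    log_probs.permute(0, 2, 3, 4, 1).contiguous().view(-1, 3),
    target.view(-1))
direct = F.nll_loss(log_probs, target)
assert torch.allclose(flat, direct)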
def forward(self,  # type: ignore
            question: Dict[str, torch.LongTensor],
            passage: Dict[str, torch.LongTensor],
            span_start: torch.IntTensor = None,
            span_end: torch.IntTensor = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    question : Dict[str, torch.LongTensor]
        From a ``TextField``.
    passage : Dict[str, torch.LongTensor]
        From a ``TextField``.  The model assumes that this passage contains the
        answer to the question, and predicts the beginning and ending positions
        of the answer within the passage.
    span_start : ``torch.IntTensor``, optional
        From an ``IndexField``.  This is one of the things we are trying to
        predict - the beginning position of the answer within the passage.
        This is an `inclusive` token index.  If this is given, we will compute
        a loss that gets included in the output dictionary.
    span_end : ``torch.IntTensor``, optional
        From an ``IndexField``.  This is one of the things we are trying to
        predict - the ending position of the answer within the passage.  This
        is an `inclusive` token index.  If this is given, we will compute a
        loss that gets included in the output dictionary.
    metadata : ``List[Dict[str, Any]]``, optional
        If present, this should contain the question ID, original passage text,
        and token offsets into the passage for each instance in the batch.  We
        use this for computing official metrics using the official SQuAD
        evaluation script.  The length of this list should be the batch size,
        and each dictionary should have the keys ``id``, ``original_passage``,
        and ``token_offsets``.  If you only want the best span string and don't
        care about official metrics, you can omit the ``id`` key.

    Returns
    -------
    An output dictionary consisting of:
    span_start_logits : torch.FloatTensor
        A tensor of shape ``(batch_size, passage_length)`` representing
        unnormalized log probabilities of the span start position.
    span_start_probs : torch.FloatTensor
        The result of ``softmax(span_start_logits)``.
    span_end_logits : torch.FloatTensor
        A tensor of shape ``(batch_size, passage_length)`` representing
        unnormalized log probabilities of the span end position (inclusive).
    span_end_probs : torch.FloatTensor
        The result of ``softmax(span_end_logits)``.
    best_span : torch.IntTensor
        The result of a constrained inference over ``span_start_logits`` and
        ``span_end_logits`` to find the most probable span.  Shape is
        ``(batch_size, 2)`` and each offset is a token index.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimised.
    best_span_str : List[str]
        If sufficient metadata was provided for the instances in the batch, we
        also return the string from the original passage that the model thinks
        is the best answer to the question.
    """
    embedded_question = self._highway_layer(self._text_field_embedder(question))
    embedded_passage = self._highway_layer(self._text_field_embedder(passage))
    batch_size = embedded_question.size(0)
    passage_length = embedded_passage.size(1)
    question_mask = util.get_text_field_mask(question).float()
    passage_mask = util.get_text_field_mask(passage).float()
    question_lstm_mask = question_mask if self._mask_lstms else None
    passage_lstm_mask = passage_mask if self._mask_lstms else None

    encoded_question = self._dropout(
        self._phrase_layer(embedded_question, question_lstm_mask))
    encoded_passage = self._dropout(
        self._phrase_layer(embedded_passage, passage_lstm_mask))
    encoding_dim = encoded_question.size(-1)

    # New question/passage self-attention encodings
    sa_encoded_question = self._self_attention_layer(embedded_question, question_lstm_mask)
    sa_encoded_passage = self._self_attention_layer(embedded_passage, passage_lstm_mask)

    # Shape: (batch_size, passage_length, question_length)
    passage_question_similarity = self._matrix_attention(encoded_passage, encoded_question)
    sa_passage_question_similarity = self._sa_matrix_attention(
        sa_encoded_passage, sa_encoded_question)
    # Broadcast the question mask over the passage dimension.  The original
    # used an in-place `unsqueeze_`, which also mutated `question_mask` itself;
    # we make that explicit here since the unsqueezed mask is reused below.
    question_mask = question_mask.unsqueeze(1)
    passage_question_attention = util.masked_softmax(
        passage_question_similarity, question_mask)
    sa_passage_question_attention = util.masked_softmax(
        sa_passage_question_similarity, question_mask)
    # Shape: (batch_size, passage_length, encoding_dim)
    passage_question_vectors = util.weighted_sum(
        encoded_question, passage_question_attention)
    sa_passage_question_vectors = util.weighted_sum(
        sa_encoded_question, sa_passage_question_attention)

    # We replace masked values with something really negative here, so they
    # don't affect the max below.
    masked_similarity = util.replace_masked_values(
        passage_question_similarity, question_mask, -1e7)
    # Shape: (batch_size, passage_length)
    question_passage_similarity = masked_similarity.max(dim=-1)[0].squeeze(-1)
    # Shape: (batch_size, passage_length)
    question_passage_attention = util.masked_softmax(
        question_passage_similarity, passage_mask)
    # Shape: (batch_size, encoding_dim)
    question_passage_vector = util.weighted_sum(
        encoded_passage, question_passage_attention)
    # Shape: (batch_size, passage_length, encoding_dim)
    tiled_question_passage_vector = question_passage_vector.unsqueeze(1).expand(
        batch_size, passage_length, encoding_dim)

    # Shape: (batch_size, passage_length, encoding_dim * 4 + 2 * sa_dim)
    final_merged_passage = torch.cat([
        encoded_passage, sa_encoded_passage, sa_passage_question_vectors,
        passage_question_vectors,
        encoded_passage * passage_question_vectors,
        encoded_passage * tiled_question_passage_vector
    ], dim=-1)
    modeled_passage = self._dropout(
        self._modeling_layer(final_merged_passage, passage_lstm_mask))
    modeling_dim = modeled_passage.size(-1)

    # Shape: (batch_size, passage_length, encoding_dim * 4 + modeling_dim + 2 * selfattention_dim)
    span_start_input = self._dropout(
        torch.cat([final_merged_passage, modeled_passage], dim=-1))
    # Shape: (batch_size, passage_length)
    span_start_logits = self._span_start_predictor(span_start_input).squeeze(-1)
    span_start_probs = util.masked_softmax(span_start_logits, passage_mask)
    # Shape: (batch_size, modeling_dim)
    span_start_representation = util.weighted_sum(modeled_passage, span_start_probs)
    # Shape: (batch_size, passage_length, modeling_dim)
    tiled_start_representation = span_start_representation.unsqueeze(1).expand(
        batch_size, passage_length, modeling_dim)
    # Shape: (batch_size, passage_length, encoding_dim * 4 + modeling_dim * 3)
    span_end_representation = torch.cat([
        final_merged_passage, modeled_passage, tiled_start_representation,
        modeled_passage * tiled_start_representation
    ], dim=-1)
    # Shape: (batch_size, passage_length, encoding_dim)
    encoded_span_end = self._dropout(
        self._span_end_encoder(span_end_representation, passage_lstm_mask))
    # Shape: (batch_size, passage_length, encoding_dim * 4 + span_end_encoding_dim)
    span_end_input = self._dropout(
        torch.cat([final_merged_passage, encoded_span_end], dim=-1))
    span_end_logits = self._span_end_predictor(span_end_input).squeeze(-1)
    span_end_probs = util.masked_softmax(span_end_logits, passage_mask)
    span_start_logits = util.replace_masked_values(span_start_logits, passage_mask, -1e7)
    span_end_logits = util.replace_masked_values(span_end_logits, passage_mask, -1e7)
    best_span = self.get_best_span(span_start_logits, span_end_logits)

    # No-answer ("na") prediction from the span logits
    span_start_logits_do = self._dropout(span_start_logits)
    na_logits_start = self._na_dense(passage_length)(span_start_logits_do)
    span_end_logits_do = self._dropout(span_end_logits)
    na_logits_end = self._na_dense(passage_length)(span_end_logits_do)
    na_logits = Softmax(dim=1)(na_logits_start) * Softmax(dim=1)(na_logits_end)
    na_probs = Softmax(dim=1)(na_logits)
    na_gt = (span_start == -1)
    na_inv = (1.0 - na_gt)

    output_dict = {
        "passage_question_attention": passage_question_attention,
        "span_start_logits": span_start_logits,
        "span_start_probs": span_start_probs,
        "span_end_logits": span_end_logits,
        "span_end_probs": span_end_probs,
        "best_span": best_span,
        "na_logits": na_logits,
        "na_probs": na_probs,
    }

    # Compute the loss for training.
    if span_start is not None:
        loss = 0.0
        # loss for answer existence
        loss += CrossEntropyLoss()(
            na_probs.type(torch.cuda.FloatTensor),
            na_gt.squeeze(-1).type(torch.cuda.LongTensor))
        self._na_accuracy(
            na_probs.type(torch.cuda.FloatTensor),
            na_gt.squeeze(-1).type(torch.cuda.FloatTensor))
        # loss for the start position, masked to answerable instances
        preds_start = (na_inv.type(torch.cuda.FloatTensor) *
                       util.masked_log_softmax(
                           span_start_logits.type(torch.cuda.FloatTensor),
                           passage_mask.type(torch.cuda.FloatTensor))
                       ).type(torch.cuda.FloatTensor)
        y_start = (na_inv.squeeze(-1).type(torch.cuda.ByteTensor) *
                   span_start.squeeze(-1).type(torch.cuda.ByteTensor)
                   ).type(torch.cuda.LongTensor)
        loss += nll_loss(preds_start, y_start)
        # accuracy for the start position
        acc_p_start = na_inv.type(torch.cuda.FloatTensor) * \
            span_start_logits.type(torch.cuda.FloatTensor)
        acc_y_start = na_inv.squeeze(-1).type(torch.cuda.FloatTensor) * \
            span_start.squeeze(-1).type(torch.cuda.FloatTensor)
        self._span_start_accuracy(acc_p_start, acc_y_start)
        # loss for the end position
        preds_end = (na_inv.type(torch.cuda.FloatTensor) *
                     util.masked_log_softmax(
                         span_end_logits.type(torch.cuda.FloatTensor),
                         passage_mask.type(torch.cuda.FloatTensor))
                     ).type(torch.cuda.FloatTensor)
        y_end = (na_inv.squeeze(-1).type(torch.cuda.ByteTensor) *
                 span_end.squeeze(-1).type(torch.cuda.ByteTensor)
                 ).type(torch.cuda.LongTensor)
        loss += nll_loss(preds_end, y_end)
        # accuracy for the end position
        acc_p_end = na_inv.type(torch.cuda.FloatTensor) * \
            span_end_logits.type(torch.cuda.FloatTensor)
        acc_y_end = na_inv.squeeze(-1).type(torch.cuda.FloatTensor) * \
            span_end.squeeze(-1).type(torch.cuda.FloatTensor)
        self._span_end_accuracy(acc_p_end, acc_y_end)
        # accuracy for the whole span
        acc_p = na_inv.type(torch.cuda.FloatTensor) * best_span.type(torch.cuda.FloatTensor)
        acc_y = na_inv.type(torch.cuda.FloatTensor) * torch.cat([
            span_start.type(torch.cuda.FloatTensor),
            span_end.type(torch.cuda.FloatTensor)
        ], -1)
        self._span_accuracy(acc_p, acc_y)
        output_dict["loss"] = loss

    # Compute the EM and F1 on SQuAD and add the tokenized input to the output.
    if metadata is not None:
        output_dict['best_span_str'] = []
        question_tokens = []
        passage_tokens = []
        for i in range(batch_size):
            question_tokens.append(metadata[i]['question_tokens'])
            passage_tokens.append(metadata[i]['passage_tokens'])
            passage_str = metadata[i]['original_passage']
            offsets = metadata[i]['token_offsets']
            predicted_span = tuple(best_span[i].detach().cpu().numpy())
            start_offset = offsets[predicted_span[0]][0]
            end_offset = offsets[predicted_span[1]][1]
            best_span_string = passage_str[start_offset:end_offset]
            output_dict['best_span_str'].append(best_span_string)
            answer_texts = metadata[i].get('answer_texts', [])
            if answer_texts:
                self._squad_metrics(best_span_string, answer_texts)
        output_dict['question_tokens'] = question_tokens
        output_dict['passage_tokens'] = passage_tokens
    return output_dict
def train(dataset, dataset_folder, task, number_of_points, batch_size, epochs,
          learning_rate, output_folder, number_of_workers, model_checkpoint):
    train_dataset = DATASETS[dataset](dataset_folder,
                                      task=task,
                                      number_of_points=number_of_points)
    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True,
                                                   num_workers=number_of_workers)
    test_dataset = DATASETS[dataset](dataset_folder,
                                     task=task,
                                     train=False,
                                     number_of_points=number_of_points)
    test_dataloader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=True,
                                                  num_workers=number_of_workers)

    if task == 'classification':
        model = ClassificationPointNet(num_classes=train_dataset.NUM_CLASSIFICATION_CLASSES,
                                       point_dimension=train_dataset.POINT_DIMENSION)
    elif task == 'segmentation':
        model = SegmentationPointNet(num_classes=train_dataset.NUM_SEGMENTATION_CLASSES,
                                     point_dimension=train_dataset.POINT_DIMENSION)
    else:
        raise ValueError('Unknown task!')

    if torch.cuda.is_available():
        model.cuda()
    if model_checkpoint:
        model.load_state_dict(torch.load(model_checkpoint))

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    mb = master_bar(range(epochs))
    if not os.path.isdir(output_folder):
        os.mkdir(output_folder)
    with open(os.path.join(output_folder, 'training_log.csv'), 'w+') as fid:
        fid.write('train_loss,test_loss,train_accuracy,test_accuracy\n')

    train_loss = []
    test_loss = []
    train_acc = []
    test_acc = []
    for epoch in mb:
        epoch_train_loss = []
        epoch_train_acc = []
        batch_number = 0
        for data in progress_bar(train_dataloader, parent=mb):
            batch_number += 1
            points, targets = data
            if torch.cuda.is_available():
                points, targets = points.cuda(), targets.cuda()
            if points.shape[0] <= 1:
                continue  # skip size-1 batches
            optimizer.zero_grad()
            model = model.train()
            preds, feature_transform = model(points)
            if task == 'segmentation':
                preds = preds.view(-1, train_dataset.NUM_SEGMENTATION_CLASSES)
                targets = targets.view(-1)
            # orthogonality regularizer on the predicted feature transform
            identity = torch.eye(feature_transform.shape[-1])
            if torch.cuda.is_available():
                identity = identity.cuda()
            regularization_loss = torch.norm(
                identity - torch.bmm(feature_transform,
                                     feature_transform.transpose(2, 1)))
            loss = F.nll_loss(preds, targets) + 0.001 * regularization_loss
            epoch_train_loss.append(loss.cpu().item())
            loss.backward()
            optimizer.step()
            preds = preds.data.max(1)[1]
            corrects = preds.eq(targets.data).cpu().sum()
            if task == 'classification':
                accuracy = corrects.item() / float(batch_size)
            elif task == 'segmentation':
                accuracy = corrects.item() / float(batch_size * number_of_points)
            epoch_train_acc.append(accuracy)
            mb.child.comment = 'train loss: %f, train accuracy: %f' % (
                np.mean(epoch_train_loss), np.mean(epoch_train_acc))

        epoch_test_loss = []
        epoch_test_acc = []
        for batch_number, data in enumerate(test_dataloader):
            points, targets = data
            if torch.cuda.is_available():
                points, targets = points.cuda(), targets.cuda()
            model = model.eval()
            preds, feature_transform = model(points)
            if task == 'segmentation':
                preds = preds.view(-1, train_dataset.NUM_SEGMENTATION_CLASSES)
                targets = targets.view(-1)
            loss = F.nll_loss(preds, targets)
            epoch_test_loss.append(loss.cpu().item())
            preds = preds.data.max(1)[1]
            corrects = preds.eq(targets.data).cpu().sum()
            if task == 'classification':
                accuracy = corrects.item() / float(batch_size)
            elif task == 'segmentation':
                accuracy = corrects.item() / float(batch_size * number_of_points)
            epoch_test_acc.append(accuracy)

        mb.write('Epoch %s: train loss: %s, val loss: %f, train accuracy: %s, val accuracy: %f'
                 % (epoch, np.mean(epoch_train_loss), np.mean(epoch_test_loss),
                    np.mean(epoch_train_acc), np.mean(epoch_test_acc)))
        if test_acc and np.mean(epoch_test_acc) > np.max(test_acc):
            torch.save(model.state_dict(),
                       os.path.join(output_folder, 'shapenet_%s_model.pth' % task))
        with open(os.path.join(output_folder, 'training_log.csv'), 'a') as fid:
            fid.write('%s,%s,%s,%s,%s\n' % (epoch,
                                            np.mean(epoch_train_loss),
                                            np.mean(epoch_test_loss),
                                            np.mean(epoch_train_acc),
                                            np.mean(epoch_test_acc)))
        train_loss.append(np.mean(epoch_train_loss))
        test_loss.append(np.mean(epoch_test_loss))
        train_acc.append(np.mean(epoch_train_acc))
        test_acc.append(np.mean(epoch_test_acc))

    plot_losses(train_loss, test_loss,
                save_to_file=os.path.join(output_folder, 'loss_plot.png'))
    plot_accuracies(train_acc, test_acc,
                    save_to_file=os.path.join(output_folder, 'accuracy_plot.png'))
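The regularization term above is PointNet's feature-transform penalty: it pushes each predicted transform A towards orthogonality through the Frobenius norm of I - A A^T. Pulled out as a standalone helper:

import torch

def transform_regularizer(feature_transform):
    # feature_transform: (batch, K, K), as returned by a PointNet T-net
    k = feature_transform.shape[-1]
    identity = torch.eye(k, device=feature_transform.device)
    return torch.norm(
        identity - torch.bmm(feature_transform,
                             feature_transform.transpose(2, 1)))

reg = transform_regularizer(torch.randn(8, 64, 64))  # weighted by e.g. 0.001 above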
# Transfer to GPU (guard implied by the else branch below; `non_blocking`
# replaces the old `async` keyword, which is reserved in Python 3.7+)
if self.use_cuda:
    inputs = [e if e is None else Variable(e.cuda(non_blocking=True))
              for e in ex[:5]]
    target_s = Variable(ex[5].cuda(non_blocking=True))
    target_e = Variable(ex[6].cuda(non_blocking=True))
else:
    inputs = [e if e is None else Variable(e) for e in ex[:5]]
    target_s = Variable(ex[5])
    target_e = Variable(ex[6])

# Run forward
score_s, score_e = self.network(*inputs)

# Compute loss and accuracies
loss = F.nll_loss(score_s, target_s) + F.nll_loss(score_e, target_e)

# Clear gradients and run backward
self.optimizer.zero_grad()
loss.backward()

# Clip gradients (clip_grad_norm_ replaces the deprecated clip_grad_norm)
torch.nn.utils.clip_grad_norm_(self.network.parameters(),
                               self.args.grad_clipping)

# Update parameters
self.optimizer.step()
self.updates += 1

# Reset any partially fixed parameters (e.g. rare words)
self.reset_parameters()
def forward(self, x, targets=None, img_dim=None):
    # Tensors for cuda support
    FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
    LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
    ByteTensor = torch.cuda.ByteTensor if x.is_cuda else torch.ByteTensor

    self.img_dim = img_dim
    num_samples = x.size(0)
    grid_size = x.size(2)
    # print('in models: x size ', x.size())

    prediction = (
        x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
        .permute(0, 1, 3, 4, 2)
        .contiguous()
    )

    # Get outputs
    x = torch.sigmoid(prediction[..., 0])          # Center x
    y = torch.sigmoid(prediction[..., 1])          # Center y
    w = prediction[..., 2]                         # Width
    h = prediction[..., 3]                         # Height
    pred_conf = torch.sigmoid(prediction[..., 4])  # Conf
    pred_cls = prediction[..., 5:]                 # Cls pred.
    if self.loss_type == "bce":
        pred_cls = torch.sigmoid(pred_cls)
    elif self.loss_type == "hierarchical_loss":
        pred_cls = self.logsoftmax(pred_cls)

    # If the grid size does not match the current one, compute new offsets
    if grid_size != self.grid_size:
        self.compute_grid_offsets(grid_size, cuda=x.is_cuda)

    # Add offset and scale with anchors
    pred_boxes = FloatTensor(prediction[..., :4].shape)
    pred_boxes[..., 0] = x.data + self.grid_x
    pred_boxes[..., 1] = y.data + self.grid_y
    pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
    pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h

    output = torch.cat(
        (
            pred_boxes.view(num_samples, -1, 4) * self.stride,
            pred_conf.view(num_samples, -1, 1),
            pred_cls.view(num_samples, -1, self.num_classes),
        ),
        -1,
    )

    if targets is None:
        return output, 0
    else:
        iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
            pred_boxes=pred_boxes,
            pred_cls=pred_cls,
            target=targets,
            anchors=self.scaled_anchors,
            ignore_thres=self.ignore_thres,
        )
        if (iou_scores is None or class_mask is None or obj_mask is None
                or noobj_mask is None or tx is None or ty is None
                or tw is None or th is None or tcls is None):
            print('Exception in build targets')
            return None, None

        # Loss: mask outputs to ignore non-existing objects (except with conf. loss)
        try:
            loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])
            loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
            loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
            loss_h = self.mse_loss(h[obj_mask], th[obj_mask])
            loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
            loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
            loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
            if self.loss_type == "bce":
                loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
            elif self.loss_type == "ce":
                loss_cls = self.ce_loss(pred_cls[obj_mask], torch.argmax(tcls, 4)[obj_mask])
            elif self.loss_type == "hierarchical_ce":
                # sum the log-probabilities of each class and its parent level
                pred_cls_obj_mask = pred_cls[obj_mask]
                pred_cls_obj_mask_level2 = pred_cls_obj_mask[..., self.class_hierarchy[:, 0]]
                pred_cls_obj_mask_level1 = pred_cls_obj_mask[..., self.class_hierarchy[:, 1]]
                pred_cls_obj_mask = pred_cls_obj_mask_level2 + pred_cls_obj_mask_level1
                loss_cls = F.nll_loss(pred_cls_obj_mask, torch.argmax(tcls, 4)[obj_mask])
            total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls
            if math.isnan(total_loss):
                return None, None
        except Exception:
            print('Exception in loss computation')
            return None, None

        # Metrics
        # print('class_mask[obj_mask] ', class_mask[obj_mask])
        cls_acc = 100 * class_mask[obj_mask].mean()
        conf_obj = pred_conf[obj_mask].mean()
        conf_noobj = pred_conf[noobj_mask].mean()
        conf50 = (pred_conf > 0.5).float()
        iou50 = (iou_scores > 0.5).float()
        iou75 = (iou_scores > 0.75).float()
        detected_mask = conf50 * class_mask * tconf
        precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
        recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
        recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

        self.metrics = {
            "loss": to_cpu(total_loss).item(),
            "x": to_cpu(loss_x).item(),
            "y": to_cpu(loss_y).item(),
            "w": to_cpu(loss_w).item(),
            "h": to_cpu(loss_h).item(),
            "conf": to_cpu(loss_conf).item(),
            "cls": to_cpu(loss_cls).item(),
            "cls_acc": to_cpu(cls_acc).item(),
            "recall50": to_cpu(recall50).item(),
            "recall75": to_cpu(recall75).item(),
            "precision": to_cpu(precision).item(),
            "conf_obj": to_cpu(conf_obj).item(),
            "conf_noobj": to_cpu(conf_noobj).item(),
            "grid_size": grid_size,
        }
        return output, total_loss
def train():
    model.train()
    optimizer.zero_grad()
    F.nll_loss(model()[data.train_mask], data.y[data.train_mask]).backward()
    optimizer.step()
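A matching evaluation step for the one-graph train() above; a sketch assuming the usual Planetoid-style boolean masks (train_mask, val_mask, and test_mask on the data object are assumptions):

import torch

def test():
    model.eval()
    with torch.no_grad():
        log_probs = model()
    accs = []
    for mask in (data.train_mask, data.val_mask, data.test_mask):
        pred = log_probs[mask].max(1)[1]
        accs.append(pred.eq(data.y[mask]).sum().item() / int(mask.sum()))
    return accs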
def expected_loss(self, target, forward_result):
    (a2, logprobs_out) = forward_result
    return F.nll_loss(logprobs_out, target)
def train(epoch):
    n_iter = 0
    correct = 0
    network.train()
    # Two optimizers so each loss only updates its own parameter group
    optimizer1 = optim.SGD([{'params': network.conv1.parameters()},
                            {'params': network.conv2.parameters()}],
                           lr=learning_rate, momentum=momentum)
    optimizer2 = optim.SGD([{'params': network.fc1.parameters()},
                            {'params': network.fc2.parameters()}],
                           lr=learning_rate, momentum=momentum)
    for batch_idx, (data, target) in enumerate(train_loader):
        for i in range(2):  # for each of the loss functions
            n_iter += 1
            # `requires_grad` is the attribute; the original assigned to
            # `requires_grad_` (a method name), which has no effect.
            if i == 0:
                network.conv1.weight.requires_grad = True
                # network.pool.weight.requires_grad = True
                network.conv2.weight.requires_grad = True
                network.fc1.weight.requires_grad = False
                network.fc2.weight.requires_grad = False
                network.fc3.weight.requires_grad = False
                network.conv1.bias.requires_grad = True
                # network.pool.bias.requires_grad = True
                network.conv2.bias.requires_grad = True
                network.fc1.bias.requires_grad = False
                network.fc2.bias.requires_grad = False
                network.fc3.bias.requires_grad = False
            else:
                network.conv1.weight.requires_grad = False
                # network.pool.weight.requires_grad = False
                network.conv2.weight.requires_grad = False
                network.fc1.weight.requires_grad = True
                network.fc2.weight.requires_grad = True
                network.fc3.weight.requires_grad = True
                network.conv1.bias.requires_grad = False
                # network.pool.bias.requires_grad = False
                network.conv2.bias.requires_grad = False
                network.fc1.bias.requires_grad = True
                network.fc2.bias.requires_grad = True
                network.fc3.bias.requires_grad = True

            preds, out = network.forward(data)  # forward propagation
            if i == 0:
                optimizer1.zero_grad()
                # get weight matrices and biases of the network as a list
                layers = [p.data for p in network.parameters()]
                hi = torch.cat([layers[6], layers[7].unsqueeze(1)], 1)
                ones = torch.ones(target.numel()).unsqueeze(1)
                xj = torch.cat([out, ones], 1)
                loss = fp_loss(hi, xj)
                train_losses_fp.append(loss.item())
                loss.backward()
                optimizer1.step()
            else:
                optimizer2.zero_grad()
                loss = F.nll_loss(preds, target)
                train_losses_ce.append(loss.item())
                loss.backward()
                optimizer2.step()

            pred = preds.data.max(1, keepdim=True)[1]
            correct = pred.eq(target.data.view_as(pred)).sum().item()
            # 64 is the hard-coded training batch size
            train_accuracy = correct, 64, 100. * (correct / 64)
            train_accuracies.append(train_accuracy)
            if batch_idx % log_interval == 0:
                string = "FP" if i == 0 else "CE"
                print(string,
                      'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                          epoch, batch_idx * len(data), len(train_loader.dataset),
                          100. * batch_idx / len(train_loader), loss.item()))
                pred = preds.data.max(1, keepdim=True)[1]
                correct = pred.eq(target.data.view_as(pred)).sum().item()
                print(string, 'Train set: Accuracy: {}/{} ({:.0f}%)\n'.format(
                    correct, 64, 100. * correct / 64))
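With requires_grad set correctly, a common companion idiom is to hand each optimizer only the still-trainable parameters. A minimal sketch on a throwaway network (the layer shapes here are illustrative assumptions):

import torch.nn as nn
import torch.optim as optim

net = nn.Sequential(nn.Conv2d(1, 8, 3), nn.Flatten(), nn.Linear(8 * 26 * 26, 10))
for p in net[0].parameters():   # freeze the conv layer
    p.requires_grad = False
# The optimizer only ever sees the trainable subset.
optimizer = optim.SGD((p for p in net.parameters() if p.requires_grad),
                      lr=0.01, momentum=0.9)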
cudnn.benchmark = True
optimizer = optim.Adam(model.parameters())
print("Model Training Starts:")
for epochs in range(num_echos):
    # training part
    print("Training Epochs ", epochs)
    model.train()
    train_accu = []
    for index, (data, target) in enumerate(train_loader):
        data, target = Variable(data), Variable(target)
        if use_cuda:
            data, target = data.cuda(), target.cuda()
        output = model(data)
        optimizer.zero_grad()
        loss = F.nll_loss(output, target)
        loss.backward()
        if epochs > 5:
            # Workaround from the original script: cap Adam's per-parameter
            # step count, apparently to sidestep a numerical issue in the
            # step-dependent bias correction on the setup this was written for.
            for group in optimizer.param_groups:
                for p in group['params']:
                    state = optimizer.state[p]
                    if state['step'] >= 1024:
                        state['step'] = 1000
        optimizer.step()
        prediction = output.data.max(1)[1]
        accuracy = (float(prediction.eq(target.data).sum())
                    / float(batch_size_train)) * 100.0
        train_accu.append(accuracy)
        # if index % 100 == 1:
        #     print("Step:", index, " Training Accuracy: ", accuracy)
    accu_train = np.mean(train_accu)