def process_function(engine, batch):
    x, y = _prepare_batch_fp16(batch, device=device, non_blocking=True)

    if config['enable_mixup']:
        x, y = mixup_data(x, y, config['mixup_alpha'], config['mixup_proba'])

    optimizer.zero_grad()
    y_pred = model(x)
    loss = criterion(y_pred, y)
    loss.backward()

    if config['clip_gradients'] is not None:
        clip_grad_norm_(model.parameters(), config['clip_gradients'])

    if config['use_adamw']:
        # Decoupled (AdamW-style) weight decay: shrink each parameter directly
        # instead of adding an L2 term to the loss.
        for group in optimizer.param_groups:
            for param in group['params']:
                param.data.add_(param.data, alpha=-weight_decay / batch_size * group['lr'])

    optimizer.step()
    return loss.item()
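# The `use_adamw` branch above applies decoupled weight decay by hand. A minimal
# sketch of the same idea using torch.optim.AdamW, which performs that decay
# inside step(); the model and the lr / weight_decay values here are
# illustrative stand-ins, not taken from the config above.
import torch
import torch.nn as nn

model_example = nn.Linear(32, 10)  # placeholder for the fp16 model used above

# AdamW decays parameters toward zero inside step(), so no manual
# "param -= lr * weight_decay * param" loop is needed.
optimizer_example = torch.optim.AdamW(model_example.parameters(), lr=1e-3, weight_decay=1e-2)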
def run_train_iter(self, x, y, manual_verified, epoch_number=-1):
    """
    Receives the inputs and targets for the model and runs a training iteration.
    Returns loss and accuracy metrics.
    :param x: The inputs to the model. A numpy array of shape batch_size, channels, height, width
    :param y: The targets for the model. A numpy array of shape batch_size, num_classes
    :return: the loss and accuracy for this batch
    """
    # sets model to training mode (in case batch normalization or other methods
    # have different procedures for training and evaluation)
    self.train()

    if len(y.shape) > 1:
        y = np.argmax(y, axis=1)  # convert one-hot encoded labels to single integer labels
    y_no_cuda = y

    if type(x) is np.ndarray:
        # send data to device as torch tensors
        x = torch.Tensor(x).float().to(device=self.device)
        y = torch.Tensor(y).long().to(device=self.device)
        y_no_cuda = torch.Tensor(y_no_cuda).long()
    x = x.to(self.device)
    y = y.to(self.device)

    if self.mixup:
        inputs, targets_a, targets_b, y_, lam = MixUp.mixup_data(
            x, y, y_no_cuda, self.num_classes, self.alpha, use_cuda=self.use_gpu)
        if self.stack:
            # Stack the clean and mixed inputs so a single forward pass yields
            # predictions for both halves of the batch.
            x_stack = torch.stack((x, inputs), 0)
            x_stack = x_stack.view((self.batch_size, 1, self.heigth, self.width))
            out = self.model.forward_train(x_stack)
            loss_mix = MixUp.mixup_criterion(
                out[:int(self.batch_size / 2)], targets_a, targets_b, lam, self.device)
            loss_smooth = CustomLosses.loss_function(
                out[int(self.batch_size / 2):], y, y_no_cuda, self.num_classes,
                self.device, self.eps_smooth, self.loss_function,
                array_manual_label=manual_verified, consider_manual=self.consider_manual)
            loss = (loss_mix + loss_smooth) / 2
        else:
            out = self.model.forward_train(inputs)  # forward the mixed data in the model
            loss = MixUp.mixup_criterion(out, targets_a, targets_b, lam, self.device)
    else:
        out = self.model.forward_train(x)
        loss = CustomLosses.loss_function(
            out, y, y_no_cuda, self.num_classes, self.device, self.eps_smooth,
            self.loss_function, array_manual_label=manual_verified,
            consider_manual=self.consider_manual)

    self.optimizer.zero_grad()  # set all weight grads from previous training iters to 0
    loss.backward()             # backpropagate to compute gradients for current iter loss
    self.optimizer.step()       # update network parameters

    _, predicted = torch.max(out.data, 1)  # get argmax of predictions
    if self.stack:
        # only the second half of the stacked batch is compared against y
        accuracy = np.mean(list(predicted[int(self.batch_size / 2):].eq(y.data).cpu()))
    else:
        accuracy = np.mean(list(predicted.eq(y.data).cpu()))  # compute accuracy

    return loss.data.detach().cpu().numpy(), accuracy
def forward(self, X):
    # sample the mixup coefficient from Beta(beta, beta); beta <= 0 disables mixing
    if self.beta > 0:
        lam = np.random.beta(self.beta, self.beta)
    else:
        lam = 1
    X, permutation, lam = mixup_data(X, lam)
    # gradient reversal (scaled by alpha) before the remaining layers
    X = ReverseLayerF.apply(X, self.alpha)
    for layer in self.layers:
        X = layer(X)
    return X, permutation, lam
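# mixup_data is not defined in this snippet. A plausible minimal version,
# assumed from the call site above, mixes the batch with a randomly permuted
# copy of itself and returns the permutation so the caller can mix the
# corresponding targets or losses.
import torch

def mixup_data(X, lam):
    # hypothetical helper: blend each sample with a random partner from the batch
    permutation = torch.randperm(X.size(0), device=X.device)
    X_mixed = lam * X + (1 - lam) * X[permutation]
    return X_mixed, permutation, lam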
def train(epoch):
    logf.write('\nEpoch: %d' % epoch)
    print('Epoch: %d' % epoch)
    net.train()
    train_loss, correct, total = 0, 0, 0
    batch_accs = []
    batch_losses = []
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        if args.use_mix_up:
            inputs, targets_a, targets_b, lam = mixup_data(
                args, inputs, targets, args.mix_up_alpha, args.use_uniform_mixup, use_cuda)
            inputs, targets_a, targets_b = map(Variable, (inputs, targets_a, targets_b))
            outputs = net(inputs)
            loss = mixup_criterion(criterion, outputs, targets_a, targets_b, lam)
            train_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += targets.size(0)
            # mixup accuracy: weight correctness against both mixed targets by lam
            correct += (lam * predicted.eq(targets_a.data).cpu().sum().float()
                        + (1 - lam) * predicted.eq(targets_b.data).cpu().sum().float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        else:
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

        cur_loss = train_loss / (batch_idx + 1)
        acc = 100. * correct / total
        logf.write('[%d]Loss: %.3f | Acc: %.3f%% (%d/%d)\n' % (batch_idx, cur_loss, acc, correct, total))
        if batch_idx % 100 == 0:
            print('[%d]Loss: %.3f | Acc: %.3f%% (%d/%d)' % (batch_idx, cur_loss, acc, correct, total))
        batch_accs.append(acc)
        batch_losses.append(cur_loss)

    acc = float(correct) / total
    print('Train Acc:{}'.format(acc))
    return np.mean(batch_losses), acc
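# For reference, the mixup_data / mixup_criterion pair called above follows the
# standard input-mixing recipe. A minimal sketch, ignoring the extra `args` and
# uniform-sampling options passed in this snippet:
import numpy as np
import torch

def mixup_data(x, y, alpha=1.0):
    # sample the mixing coefficient, then blend each sample with a random partner
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1.0
    index = torch.randperm(x.size(0), device=x.device)
    mixed_x = lam * x + (1 - lam) * x[index]
    return mixed_x, y, y[index], lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    # lam-weighted combination of the losses against both sets of targets
    return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)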
def warmup_cudnn(model, criterion, batch_size, config):
    # run forward and backward pass of the model on a batch of random inputs
    # to allow benchmarking of cudnn kernels
    x = torch.Tensor(np.random.rand(batch_size, 3, 32, 32)).cuda()
    x = x.half()
    y = torch.LongTensor(np.random.randint(0, 10, batch_size)).cuda()

    if config['enable_mixup']:
        x, y = mixup_data(x, y, config['mixup_alpha'], config['mixup_proba'])

    model.train(True)
    y_pred = model(x)
    loss = criterion(y_pred, y)
    loss.backward()

    model.zero_grad()
    torch.cuda.synchronize()
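# The warm-up only matters when cuDNN autotuning is enabled: the first iteration
# for each new input shape is spent benchmarking kernels. An assumed usage
# pattern (the batch sizes are illustrative):
import torch

torch.backends.cudnn.benchmark = True  # let cuDNN pick the fastest kernels per input shape

for bs in (512, 256):
    # one throwaway forward/backward per batch size used in training, so the
    # kernel-selection cost is not paid during the timed epochs
    warmup_cudnn(model, criterion, bs, config)
torch.cuda.synchronize()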
def process_function(engine, batch):
    x, y = _prepare_batch_fp16(batch, device=device, non_blocking=True)

    if config['enable_mixup']:
        x, y = mixup_data(x, y, config['mixup_alpha'], config['mixup_proba'])

    optimizer.zero_grad()
    y_pred = model(x)
    loss = criterion(y_pred, y)
    loss.backward()

    if config['clip_gradients'] is not None:
        clip_grad_norm_(model.parameters(), config['clip_gradients'])

    optimizer.step()
    return loss.item()
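# _prepare_batch_fp16 is not shown in these snippets. An assumed minimal version
# would unpack the batch, move it to the target device, and cast the inputs to
# half precision to match the fp16 model:
import torch

def _prepare_batch_fp16(batch, device=None, non_blocking=False):
    # hypothetical helper matching the calls above
    x, y = batch
    x = x.to(device=device, non_blocking=non_blocking).half()
    y = y.to(device=device, non_blocking=non_blocking)
    return x, y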
def iterate(epoch, phase):
    if phase == 'train':
        is_train = True
    elif phase == 'valid':
        is_train = False
    else:
        raise ValueError('Unrecognized phase: ' + str(phase))

    if is_train:
        net.train()
        # learning rate scheduling (manual schedule unless Adam is used)
        if config['optimizer']['use_adam'] is False:
            lr = optim.get_epoch_lr(epoch)
            optim.set_lr(optimizer, lr)
    else:
        net.eval()

    phase_dataloader = dataloaders[phase]
    acc_loss = 0.
    is_saved = False
    global best_valid_loss
    global global_iter_valid
    global global_iter_train

    with torch.set_grad_enabled(is_train):
        for batch_idx, (inputs, targets) in enumerate(phase_dataloader):
            inputs = inputs.to(device)
            targets = targets.to(device)

            if is_train:
                # mixup
                inputs, targets_a, targets_b, lam = mixup.mixup_data(
                    inputs, targets, device, float(config['params']['mixup_alpha']))
                # label smoothing on both sets of mixed targets
                targets_a = label_smoothing.smooth_one_hot(
                    true_labels=targets_a, classes=num_classes,
                    smoothing=float(config['params']['label_smoothing']))
                targets_b = label_smoothing.smooth_one_hot(
                    true_labels=targets_b, classes=num_classes,
                    smoothing=float(config['params']['label_smoothing']))
            else:
                targets = label_smoothing.smooth_one_hot(
                    true_labels=targets, classes=num_classes, smoothing=0.0)

            if config['model']['type'] == 'arcface':
                logits = net(inputs, targets_a) if is_train else net(inputs, targets)
            else:
                logits = net(inputs)
            outputs = logits.log_softmax(dim=1)

            if is_train:
                loss = mixup.mixup_criterion(criterion, outputs, targets_a, targets_b, lam)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            else:
                loss = criterion(outputs, targets)

            preds = outputs.argmax(dim=1, keepdim=True)
            if is_train:
                targets_a = targets_a.argmax(dim=1, keepdim=True)
                targets_b = targets_b.argmax(dim=1, keepdim=True)
                accuracy = (lam * preds.eq(targets_a).float().sum()
                            + (1 - lam) * preds.eq(targets_b).float().sum())
                accuracy = accuracy / (targets_a.shape[0] + targets_b.shape[0])
            else:
                targets = targets.argmax(dim=1, keepdim=True)
                accuracy = preds.eq(targets).float().mean()

            acc_loss += loss.item()
            avg_loss = acc_loss / (batch_idx + 1)
            print('[%s] epoch: %3d | iter: %4d | loss: %.3f | avg_loss: %.3f | accuracy: %.3f'
                  % (phase, epoch, batch_idx, loss.item(), avg_loss, accuracy))

            if is_train:
                summary_writer.add_scalar('train/loss', loss.item(), global_iter_train)
                summary_writer.add_scalar('train/acc', accuracy, global_iter_train)
                global_iter_train += 1
            else:
                summary_writer.add_scalar('valid/loss', loss.item(), global_iter_valid)
                summary_writer.add_scalar('valid/acc', accuracy, global_iter_valid)
                global_iter_valid += 1

    state = {
        'net': net.state_dict(),
        'loss': best_valid_loss,
        'epoch': epoch,
        'lr': config['optimizer']['lr'],
        'batch': config['params']['batch_size'],
        'global_train_iter': global_iter_train,
        'global_valid_iter': global_iter_valid,
        'optimizer': optimizer.state_dict()
    }

    if is_train:
        print('[Train] Saving..')
        torch.save(state, os.path.join(config['exp']['path'], 'latest.pth'))
    else:
        # keep a checkpoint only when the validation loss improves
        if avg_loss < best_valid_loss:
            best_valid_loss = avg_loss
            is_saved = True
        if is_saved:
            print('[Valid] Saving..')
            torch.save(state, os.path.join(config['exp']['path'], 'best.pth'))
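# label_smoothing.smooth_one_hot is referenced above but not defined here. A
# common implementation, assumed for this sketch, puts `1 - smoothing` on the
# true class and spreads `smoothing` uniformly over the remaining classes; the
# resulting soft targets pair with the log_softmax outputs and a soft-target
# criterion such as KL divergence.
import torch

def smooth_one_hot(true_labels, classes, smoothing=0.0):
    # assumed helper: smoothing=0.0 reduces to a plain one-hot encoding
    confidence = 1.0 - smoothing
    with torch.no_grad():
        dist = torch.full((true_labels.size(0), classes),
                          smoothing / (classes - 1),
                          device=true_labels.device)
        dist.scatter_(1, true_labels.unsqueeze(1), confidence)
    return dist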
[backbone, margin], optimizer = amp.initialize([backbone, margin], optimizer, opt_level='O1', verbosity=0)

iter_idx = 0
for epoch in range(epochs):
    print('\nEpoch: {}'.format(epoch + 1))
    train_loss = 0
    correct = 0
    total = 0
    batch_idx = 0
    since = time.time()
    for inputs, targets in train_loader:
        backbone.train()
        margin.train()
        inputs, targets = inputs.to(device), targets.to(device)

        if mixup:
            inputs, targets_a, targets_b, lam = mixup_data(inputs, targets, alpha)
            feature = backbone(inputs)
            outputs = margin(feature, targets_a, targets_b, lam, device=device,
                             mixed_precision=mixed_precision)
            loss = mixup_criterion(criterion, outputs, targets_a, targets_b, lam)
        else:
            feature = backbone(inputs)
            outputs = margin(feature, targets, device=device, mixed_precision=mixed_precision)
            loss = criterion(outputs, targets)

        if mixed_precision:
            # scale the loss so fp16 gradients do not underflow (apex amp)
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        # gradient accumulation: only step every `accumulate` iterations
        if iter_idx % accumulate == 0:
            optimizer.step()
            optimizer.zero_grad()
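# The apex amp API used above has since been superseded by PyTorch's native AMP.
# A rough sketch of the same mixed-precision step with torch.cuda.amp (all names
# other than autocast / GradScaler come from the loop above, and gradient
# accumulation is omitted for brevity):
import torch

scaler = torch.cuda.amp.GradScaler()

for inputs, targets in train_loader:
    inputs, targets = inputs.to(device), targets.to(device)
    with torch.cuda.amp.autocast():
        feature = backbone(inputs)
        outputs = margin(feature, targets, device=device, mixed_precision=True)
        loss = criterion(outputs, targets)
    # scale the loss so fp16 gradients do not underflow, unscale inside step()
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad()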
def train(args, batch_size):
    current_epoch = 0
    width = height = 320
    data_dir = 'data/data/'
    dataset = ImageDataset(data_dir, 'data/training.csv', width, height)
    category_map = get_categories('data/species.csv')
    file_count = len(dataset) // batch_size

    model = seresnext50_32x4d(True, num_classes=len(category_map), drop_rate=0)
    model = torch.nn.DataParallel(model).cuda()

    if args.resume_from is not None:
        current_epoch = get_epoch(args.resume_from)
        model_dict = torch.load(args.resume_from).module.state_dict()
        model.module.load_state_dict(model_dict)
        print("resume from ", args.resume_from)

    lr = 0.005
    optimizer = SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=0.0001)
    print(current_epoch)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 3, gamma=0.5, last_epoch=-1)

    epochs = 15
    loss_fn = nn.CrossEntropyLoss()
    show_loss_loop = 10
    alpha = 0.1

    for epoch in range(current_epoch, epochs):
        show_cate_loss = 0
        dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=4)
        start = time.time()
        for i, data in enumerate(dataloader):
            optimizer.zero_grad()
            if random.random() < 0.5:
                # apply mixup to roughly half of the batches
                inputs = to_tensor(data[0]).cuda()
                targets = data[1].cuda()
                inputs, targets_a, targets_b, lam = mixup_data(inputs, targets, alpha, True)
                inputs, targets_a, targets_b = map(Variable, (inputs, targets_a, targets_b))
                outputs = model(inputs)
                cate_loss = mixup_criterion(loss_fn, outputs, targets_a, targets_b, lam)
            else:
                im = to_tensor(data[0]).cuda()
                category_id = data[1].cuda()
                cate_fc = model(im)
                cate_loss = loss_fn(cate_fc, category_id)
            cate_loss.backward()
            optimizer.step()

            show_cate_loss += cate_loss.item()
            if (i + 1) % show_loss_loop == 0:
                end = time.time()
                use_time = (end - start) / show_loss_loop
                start = end
                need_time = ((file_count * (epochs - epoch) - i) * use_time) / 60 / 60
                show_cate_loss /= show_loss_loop
                print("epoch: {}/{} iter:{}/{} lr:{:.5f}, cate_loss:{:.5f}, use_time:{:.2f}/iter, need_time:{:.2f} h"
                      .format(epoch + 1, epochs, i + 1, file_count, lr_scheduler.get_lr()[0],
                              show_cate_loss, use_time, need_time))
                show_cate_loss = 0

        lr_scheduler.step()
        torch.save(model, 'models/epoch_{}.pth'.format(epoch + 1))