def train(self, epoch):
    """Run one training epoch over ``self.train_loader``.

    Fix: drops the deprecated ``Variable`` wrapper and ``.data`` access
    (obsolete since PyTorch 0.4) in favor of ``loss.item()`` / ``argmax``.

    Args:
        epoch: epoch index — currently unused in the body, kept for the
            caller-facing interface (presumably used by schedulers/logging
            elsewhere — TODO confirm).

    Returns:
        (average training loss, training accuracy) from the epoch's meters.
    """
    self.model.train()
    train_loss = MovingAverageMeter()
    train_acc = AccuracyMeter()
    for x, y in self.train_loader:
        if self.use_cuda:
            x = x.cuda()
            y = y.cuda()
        output = self.model(x)
        loss = F.cross_entropy(output, y)
        # Standard SGD step: clear stale grads, backprop, apply update.
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        train_loss.update(loss.item())
        # Predicted class = index of the max logit per sample.
        y_pred = output.argmax(dim=1)
        correct = int(y_pred.eq(y).sum().item())
        train_acc.update(correct, x.size(0))
    return train_loss.average, train_acc.accuracy
def train_loop(self):
    """One training epoch with a SAM-style two-step optimizer.

    Accumulates per-epoch targets/predictions on ``self.tys`` / ``self.tps``
    and scalar stats on ``self.tloss`` / ``self.tacc``.
    NOTE(review): the optimizer must expose ``first_step``/``second_step``
    (SAM interface) — confirm against the optimizer construction site.
    """
    self.model.train()
    ys, ps = [], []
    _loss, _acc = AverageMeter(), AccuracyMeter()
    with tqdm(total=len(self.dl_train.dataset), ncols=100, leave=False, desc=f"{self.cepoch} train") as t:
        for x, y in self.dl_train:
            x_, y_ = x.cuda(), y.cuda()
            p_ = self.model(x_)
            loss = self.criterion(p_, y_)
            # SAM
            # Step 1: backprop the base loss, then perturb weights toward
            # the local sharpness direction (first_step clears grads).
            loss.backward()
            clip_grad_norm_(self.model.parameters(), 1.0)
            self.optimizer.first_step(zero_grad=True)
            # Step 2: recompute loss at the perturbed weights and apply the
            # actual update. Order of these calls is load-bearing.
            self.criterion(self.model(x_), y_).backward()
            clip_grad_norm_(self.model.parameters(), 1.0)
            self.optimizer.second_step(zero_grad=True)
            # Stats use the FIRST (unperturbed) forward pass.
            _loss.update(loss.item())
            _acc.update(y_, p_)
            ys.append(y)
            ps.append(p_.detach().cpu())
            t.set_postfix_str(f"loss:{loss.item():.6f} acc:{_acc():.2f}%", refresh=False)
            t.update(len(y))
    self.tys = torch.cat(ys)
    self.tps = torch.cat(ps).softmax(dim=1)
    self.tloss = _loss()
    # Epoch accuracy in percent, recomputed from the stored predictions.
    self.tacc = (self.tys == torch.argmax(self.tps, dim=1)).sum().item() / len(self.tys) * 100
def eval(network, dataloader, device, tencrop):
    """Evaluate per-layer rotation-prediction accuracy.

    Fixes: ``outputs[idx].to(device)`` previously discarded its result
    (a no-op — Tensor.to is not in-place); softmax was hard-coded to
    ``.cuda()`` despite the ``device`` parameter; evaluation now runs
    under ``torch.no_grad()``.

    Args:
        network: model returning a list of per-layer outputs.
        dataloader: yields (image, rotation-label) pairs.
        device: torch device for inputs/outputs.
        tencrop: if True, inputs are (bs, ncrops, c, h, w) and crop
            predictions are softmax-averaged per sample.

    Returns:
        List of AccuracyMeter, one per layer (only indices >= 13 updated,
        matching the module-level ``n_layers``/``generate_acc`` convention).
    """
    network.eval()
    softmax = nn.Softmax(dim=1).to(device)  # honor the device parameter
    accs = [AccuracyMeter() for _ in range(n_layers)]
    pbar = tqdm(dataloader)
    with torch.no_grad():
        for data in pbar:
            img = data[0].to(device)
            rot = data[1].long().to(device)
            if tencrop:
                # Fold crops into the batch dimension for a single forward pass.
                bs, ncrops, c, h, w = img.size()
                img = img.view(-1, c, h, w)
            outputs = network(img)
            for idx in range(n_layers):
                outputs[idx] = outputs[idx].to(device)  # was a discarded no-op
                if tencrop:
                    # Average the softmax over the ten crops per sample.
                    outputs[idx] = softmax(outputs[idx])
                    outputs[idx] = torch.squeeze(outputs[idx].view(bs, ncrops, -1).mean(1))
            for idx in range(13, n_layers):
                accuracy(outputs[idx], rot, accs[idx])
            str_content = generate_acc(n_layers, start=13)
            flt_content = [accs[idx].get() for idx in range(13, n_layers)]
            pbar.set_postfix(info=str_content.format(*flt_content))
    return accs
def validate():
    """Evaluate the global ``net`` on ``valid_loader``.

    Returns:
        (mean validation loss, validation accuracy) from the meters.
    """
    net.eval()
    loss_meter = AverageMeter()
    acc_meter = AccuracyMeter()
    with torch.no_grad():
        for batch_x, batch_y in valid_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            logits = net(batch_x)
            batch_loss = F.cross_entropy(logits, batch_y)
            # Top-1 prediction per sample.
            predictions = logits.data.max(dim=1)[1]
            n_correct = int(predictions.eq(batch_y.data).cpu().sum())
            loss_meter.update(float(batch_loss.data), number=batch_x.size(0))
            acc_meter.update(n_correct, number=batch_x.size(0))
    return loss_meter.average, acc_meter.accuracy
def validate(self):
    """Evaluate ``self.model`` on ``self.valid_loader``.

    Fix: replaces deprecated ``.data`` access with ``loss.item()`` /
    ``argmax`` and drops the unused ``enumerate`` index. Behavior and
    return value are unchanged.

    Returns:
        (mean validation loss, validation accuracy) from the meters.
    """
    self.model.eval()
    valid_loss = AverageMeter()
    valid_acc = AccuracyMeter()
    with torch.no_grad():
        for x, y in self.valid_loader:
            x = x.to(self.device)
            y = y.to(self.device)
            output = self.model(x)
            loss = F.cross_entropy(output, y)
            valid_loss.update(loss.item(), x.size(0))
            # Top-1 prediction per sample.
            y_pred = output.argmax(dim=1)
            correct = int(y_pred.eq(y).sum().item())
            valid_acc.update(correct, x.size(0))
    return valid_loss.average, valid_acc.accuracy
def train():
    """Train the global ``net`` for one epoch over ``train_loader``.

    Returns:
        (mean training loss, training accuracy) from the meters.
    """
    net.train()
    loss_meter = AverageMeter()
    acc_meter = AccuracyMeter()
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        logits = net(batch_x)
        batch_loss = F.cross_entropy(logits, batch_y)
        # Clear stale grads, backprop, apply the SGD update.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        # Top-1 prediction per sample.
        predictions = logits.data.max(dim=1)[1]
        n_correct = int(predictions.eq(batch_y.data).cpu().sum())
        loss_meter.update(float(batch_loss.data), number=batch_x.size(0))
        acc_meter.update(n_correct, number=batch_x.size(0))
    return loss_meter.average, acc_meter.accuracy
def validate(self):
    """Evaluate ``self.model`` on ``self.valid_loader``.

    Fix: ``Variable(x, volatile=True)`` was removed in PyTorch 0.4+
    (``volatile`` is ignored/raises); the modern equivalent is running
    the loop under ``torch.no_grad()``. Also drops deprecated ``.data``
    access. Interface and return value unchanged.

    Returns:
        (mean validation loss, validation accuracy) from the meters.
    """
    self.model.eval()
    valid_loss = AverageMeter()
    valid_acc = AccuracyMeter()
    with torch.no_grad():  # replaces Variable(..., volatile=True)
        for x, y in self.valid_loader:
            if self.use_cuda:
                x = x.cuda()
                y = y.cuda()
            output = self.model(x)
            loss = F.cross_entropy(output, y)
            valid_loss.update(loss.item(), x.size(0))
            # Top-1 prediction per sample.
            y_pred = output.argmax(dim=1)
            correct = int(y_pred.eq(y).sum().item())
            valid_acc.update(correct, x.size(0))
    return valid_loss.average, valid_acc.accuracy
def train(i_epoch, network, criterion, optimizer, dataloader, device):
    """Train one epoch, summing the per-layer rotation losses.

    NOTE(review): this calls ``network.eval()`` in a *train* function —
    possibly deliberate (e.g. freezing BN/dropout statistics while training
    linear probes) but worth confirming; a typo for ``network.train()``
    would silently change BN behavior.

    Returns:
        (losses, accs): per-layer AvgMeter / AccuracyMeter lists.
    """
    network.eval()
    losses = []
    accs = []
    # One meter pair per layer; only layers >= 13 are reported below.
    for idx in range(n_layers):
        losses.append(AvgMeter())
        accs.append(AccuracyMeter())
    pbar = tqdm(dataloader)
    for data in pbar:
        img = data[0].to(device)
        rot = data[1].long().to(device)
        outputs = network(img)
        for idx in range(n_layers):
            # NOTE(review): result of .to(device) is discarded here — Tensor.to
            # is not in-place, so this line is a no-op; confirm intent.
            outputs[idx].to(device)
        optimizer.zero_grad()
        all_loss = []
        # Compute each layer's loss and update its accuracy meter.
        for idx in range(n_layers):
            all_loss.append(criterion(outputs[idx], rot))
            accuracy(outputs[idx], rot, accs[idx])
        loss = 0
        # Sum the layer losses into one scalar for a single backward pass.
        for idx in range(n_layers):
            loss += all_loss[idx]
            #all_loss[idx].backward()
            losses[idx].add(all_loss[idx].item())
        loss.backward()
        optimizer.step()
        lr = optimizer.param_groups[0]['lr']
        # Progress-bar string: loss/acc per reported layer plus current lr.
        str_content = generate_lossacc(n_layers, start=13)
        # str_content = 'c1:{:.4f}/{:.4f} c2:{:.4f}/{:.4f} c3:{:.4f}/{:.4f} c4:{:.4f}/{:.4f} c5:{:.4f}/{:.4f}, lr:{}'
        flt_content = []
        for idx in range(13, n_layers):
            flt_content.append(losses[idx].get())
            flt_content.append(accs[idx].get())
        flt_content.append(lr)
        pbar.set_description("Epoch:{}".format(i_epoch))
        pbar.set_postfix(info=str_content.format(*flt_content))
    return losses, accs
def model_train(model, config, criterion, trainloader, testloader, validloader, model_name):
    """Full training run: SGD (optionally wrapped in SWA), cosine LR schedule,
    per-epoch train/valid metering, final test evaluation, checkpointing and
    metric/LR plots.

    NOTE(review): ``validloader`` is accepted but never used — both the
    per-epoch "valid" metering and the final test call read ``testloader``;
    confirm whether that is intentional.

    Returns:
        (return_dict, model): summary metrics dict (plus 'time_to_94' when
        94% validation accuracy was reached) and the trained model.
    """
    num_epochs = config['budget']
    success = False
    time_to_94 = None
    lrs = list()
    logging.info(f"weight decay:\t{config['weight_decay']}")
    logging.info(f"momentum :\t{config['momentum']}")
    base_optimizer = optim.SGD(model.parameters(), lr=config['base_lr'],
                               weight_decay=config['weight_decay'],
                               momentum=config['momentum'])
    if config['swa']:
        # Stochastic Weight Averaging wrapper around the base SGD optimizer.
        optimizer = torchcontrib.optim.SWA(base_optimizer)
        # lr_scheduler = SWAResNetLR(optimizer, milestones=config['milestones'], schedule=config['schedule'], swa_start=config['swa_start'], swa_init_lr=config['swa_init_lr'], swa_step=config['swa_step'], base_lr=config['base_lr'])
    else:
        optimizer = base_optimizer
        # lr_scheduler = PiecewiseLinearLR(optimizer, milestones=config['milestones'], schedule=config['schedule'])
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, num_epochs)
    #lr_scheduler = PiecewiseLinearLR(optimizer, milestones=config['milestones'], schedule=config['schedule'])
    # Create a timestamped model directory plus a summary subdir for meters.
    save_model_str = './models/'
    if not os.path.exists(save_model_str):
        os.mkdir(save_model_str)
    save_model_str += f'model_({datetime.datetime.now()})'
    if not os.path.exists(save_model_str):
        os.mkdir(save_model_str)
    summary_dir = f'{save_model_str}/summary'
    if not os.path.exists(summary_dir):
        os.mkdir(summary_dir)
    c = datetime.datetime.now()  # wall-clock start of the whole run
    train_meter = AccuracyMeter(model_dir=summary_dir, name='train')
    test_meter = AccuracyMeter(model_dir=summary_dir, name='test')
    valid_meter = AccuracyMeter(model_dir=summary_dir, name='valid')
    for epoch in range(num_epochs):
        lr = lr_scheduler.get_lr()[0]
        lrs.append(lr)
        logging.info('epoch %d, lr %e', epoch, lr)
        # NOTE(review): ``time`` here shadows any imported time module within
        # this function; it is a timedelta returned by train()/infer().
        train_acc, train_obj, time = train(trainloader, model, criterion,
                                           optimizer, model_name,
                                           config['grad_clip'],
                                           config['prefetch'])
        train_meter.update({
            'acc': train_acc,
            'loss': train_obj
        }, time.total_seconds())
        lr_scheduler.step()
        # Accumulate SWA weights every swa_step epochs once past swa_start.
        if config['swa'] and ((epoch + 1) >= config['swa_start']) and (
                (epoch + 1 - config['swa_start']) % config['swa_step'] == 0):
            optimizer.update_swa()
        # NOTE(review): "valid" metrics are computed on testloader — see
        # the docstring note about the unused validloader.
        valid_acc, valid_obj, time = infer(testloader, model, criterion,
                                           name=model_name,
                                           prefetch=config['prefetch'])
        valid_meter.update({
            'acc': valid_acc,
            'loss': valid_obj
        }, time.total_seconds())
        if valid_acc >= 94:
            success = True
            time_to_94 = train_meter.time
            logging.info(f'Time to reach 94% {time_to_94}')
        # wandb.log({"Test Accuracy":valid_acc, "Test Loss": valid_obj, "Train Accuracy":train_acc, "Train Loss": train_obj})
    a = datetime.datetime.now() - c  # total run duration
    if config['swa']:
        # Swap in the averaged weights and refresh BatchNorm statistics.
        optimizer.swap_swa_sgd()
        optimizer.bn_update(trainloader, model)
    test_acc, test_obj, time = infer(testloader, model, criterion,
                                     name=model_name,
                                     prefetch=config['prefetch'])
    test_meter.update({
        'acc': test_acc,
        'loss': test_obj
    }, time.total_seconds())
    torch.save(model.state_dict(), f'{save_model_str}/state')
    # wandb.save('model.h5')
    train_meter.plot(save_model_str)
    valid_meter.plot(save_model_str)
    # Plot the realized LR schedule across epochs.
    plt.plot(lrs)
    plt.title('LR vs epochs')
    plt.xlabel('Epochs')
    plt.ylabel('LR')
    plt.xticks(np.arange(0, num_epochs, 5))
    plt.savefig(f'{save_model_str}/lr_schedule.png')
    plt.close()
    device = get('device')
    device_name = cpuinfo.get_cpu_info(
    )['brand'] if device.type == 'cpu' else torch.cuda.get_device_name(0)
    total_time = round(a.total_seconds(), 2)
    logging.info(
        f'test_acc: {test_acc}, save_model_str:{save_model_str}, total time :{total_time} and device used {device_name}'
    )
    _, cnt, time = train_meter.get()
    time_per_step = round(time / cnt, 2)  # average seconds per training step
    return_dict = {
        'test_acc': test_acc,
        'save_model_str': save_model_str,
        'training_time_per_step': time_per_step,
        'total_train_time': time,
        'total_time': total_time,
        'device_used': device_name,
        'train_acc': train_acc
    }
    if success:
        return_dict['time_to_94'] = time_to_94
    return return_dict, model