def train(net):
    net.train()
    priorbox = PriorBox()
    with torch.no_grad():
        priors = priorbox.forward()
        priors = priors.to(device)
    dataloader = DataLoader(VOCDetection(), batch_size=2,
                            collate_fn=detection_collate, num_workers=12)
    for epoch in range(1000):
        loss_ls, loss_cs = [], []
        load_t0 = time.time()
        if epoch > 500:
            adjust_learning_rate(optimizer, 1e-4)
        for images, targets in dataloader:
            images = images.to(device)
            targets = [anno.to(device) for anno in targets]
            out = net(images)
            optimizer.zero_grad()
            loss_l, loss_c = criterion(out, priors, targets)
            loss = 2 * loss_l + loss_c
            loss.backward()
            optimizer.step()
            loss_cs.append(loss_c.item())
            loss_ls.append(loss_l.item())
        load_t1 = time.time()
        print(f'{np.mean(loss_cs)}, {np.mean(loss_ls)} time:{load_t1 - load_t0}')
    torch.save(net.state_dict(), 'Final_FaceBoxes.pth')
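# Every snippet in this file relies on some variant of adjust_learning_rate.
# Where an absolute rate is passed (as in the 1e-4 call above), the helper
# presumably just overwrites the rate of every parameter group. This is a
# minimal sketch under that assumption, not the verified helper from any of
# the original repositories.
def adjust_learning_rate(optimizer, lr):
    """Set the learning rate of every parameter group to `lr`."""
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr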
def train(self, traindataloader, valdataloader, startepoch, endepoch):
    for epoch in range(startepoch, endepoch + 1):
        train = self._epoch(traindataloader, epoch)
        if epoch % self.opts.valInterval == 0:
            with torch.no_grad():
                test = self._epoch(valdataloader, epoch, 'val')
            with open(self.File, 'a') as writer:
                writer.write(train + ' ' + test + '\n')
        else:
            with open(self.File, 'a') as writer:
                writer.write(train + '\n')
        if epoch % self.opts.saveInterval == 0:
            state = {
                'epoch': epoch + 1,
                'model_state': self.model.state_dict(),
                'optimizer_state': self.optimizer.state_dict(),
            }
            path = os.path.join(self.opts.saveDir, 'model_{}.pth'.format(epoch))
            torch.save(state, path)
        adjust_learning_rate(self.optimizer, epoch, self.opts.dropLR,
                             self.opts.dropMag)
    loss_final = self._epoch(valdataloader, -1, 'val')
    return
def train(self, epoch, epochs):
    adjust_learning_rate(self.optimizer, epoch, self.lr, epochs)
    self.model.train()
    total_loss, total_num, train_bar = 0.0, 0, tqdm(self.train_loader)
    for x, target in train_bar:
        batch_size = len(target)
        x1, x2 = x[0].cuda(), x[1].cuda()
        _, online_projection_1 = self.model(x1)
        _, online_projection_2 = self.model(x2)
        online_prediction_1 = self.predictor(online_projection_1)
        online_prediction_2 = self.predictor(online_projection_2)
        with torch.no_grad():
            _, target_projection_1 = self.target_encoder(x1)
            _, target_projection_2 = self.target_encoder(x2)
        # symmetric loss between online predictions and target projections
        loss = self.L(online_prediction_1, target_projection_2.detach()) \
            + self.L(online_prediction_2, target_projection_1.detach())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.update_moving_average(self.target_ema_updater,
                                   self.target_encoder, self.model)
        total_num += batch_size
        total_loss += loss.item() * batch_size
        train_bar.set_description('Train Epoch: [{}/{}] Loss: {:.4f}'.format(
            epoch, epochs, total_loss / total_num))
    return total_loss / total_num
def train(train_loader, model, optimizer, criterion,
          regularizer=None, lr_schedule=None):
    loss_sum = 0.0
    correct = 0.0
    num_iters = len(train_loader)
    model.train()
    for iter, (input, target) in enumerate(train_loader):
        if lr_schedule is not None:
            lr = lr_schedule(iter / num_iters)
            utils.adjust_learning_rate(optimizer, lr)
        input = input.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        output = model(input)
        loss = criterion(output, target)
        if regularizer is not None:
            loss += regularizer(model)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()  # update weights
        loss_sum += loss.item() * input.size(0)
        pred = output.data.argmax(1, keepdim=True)
        correct += pred.eq(target.data.view_as(pred)).sum().item()
    # flatten the last batch's gradients into a single vector
    grad = np.concatenate(
        [p.grad.data.cpu().numpy().ravel() for p in model.parameters()])
    return grad
def main():
    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    use_gpu = torch.cuda.is_available()
    if args.use_cpu:
        use_gpu = False
    sys.stdout = Logger(osp.join(args.save_dir, 'log' + '.txt'))
    if use_gpu:
        print("Currently using GPU: {}".format(args.gpu))
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU")
    with open(loader_path, 'rb') as f:
        trainloader, testloader = pickle.load(f)
    print("Creating model: {}".format(args.model))
    model = models.create(name=args.model, num_classes=num_classes,
                          feature_dim=feature_dim)
    if use_gpu:
        model = nn.DataParallel(model).cuda()
    criterion_xent = nn.CrossEntropyLoss()
    criterion_cent = CenterLoss(num_classes=num_classes,
                                feat_dim=args.featdim, use_gpu=use_gpu)
    optimizer_model = torch.optim.SGD(model.parameters(), lr=args.lr_model,
                                      weight_decay=5e-04, momentum=0.9)
    optimizer_centloss = torch.optim.SGD(criterion_cent.parameters(),
                                         lr=args.lr_cent)
    if args.stepsize > 0:
        scheduler = lr_scheduler.StepLR(optimizer_model,
                                        step_size=args.stepsize,
                                        gamma=args.gamma)
    start_time = time.time()
    total_loss_list = []
    train_acc, test_acc = 0, 0
    for epoch in range(args.max_epoch):
        adjust_learning_rate(optimizer_model, epoch)
        print("==> Epoch {}/{}".format(epoch + 1, args.max_epoch))
        loss_list, train_acc = train(model, criterion_xent, criterion_cent,
                                     optimizer_model, optimizer_centloss,
                                     trainloader, use_gpu, num_classes, epoch)
        total_loss_list.append(loss_list)
        if args.stepsize > 0:
            scheduler.step()
        if args.eval_freq > 0 and (epoch + 1) % args.eval_freq == 0 \
                or (epoch + 1) == args.max_epoch:
            print("==> Test")
            test_acc = test(model, testloader, use_gpu, num_classes, epoch)
    total_loss_list = np.array(total_loss_list).ravel()
    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    print("Finished. Total elapsed time (h:m:s): {}".format(elapsed))
    return total_loss_list, train_acc, test_acc
def train(x_train, x_train_external, y_train):
    # model
    num_class = np.shape(y_train)[1]
    num_external = np.shape(x_train_external)[1]
    model = ECGNet(BasicBlock, [3, 4, 6, 3], num_classes=num_class,
                   num_external=num_external)
    model = model.to(device)
    # optimizer and loss
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    criterion1 = nn.BCEWithLogitsLoss()
    lr = config.lr
    start_epoch = 1
    stage = 1
    best_auc = -1
    # =========> start training <=========
    print("*" * 10, "step into stage %02d lr %.5f" % (stage, lr))
    for epoch in range(start_epoch, config.max_epoch + 1):
        since = time.time()
        train_loss, train_auc = train_epoch(model, optimizer, criterion1,
                                            x_train, x_train_external, y_train)
        print('#epoch:%02d stage:%d train_loss:%.4f train_auc:%.4f time:%s'
              % (epoch, stage, train_loss, train_auc,
                 utils.print_time_cost(since)))
        if epoch in config.stage_epoch:
            stage += 1
            lr /= config.lr_decay
            print("*" * 10, "step into stage %02d lr %.5f" % (stage, lr))
            utils.adjust_learning_rate(optimizer, lr)
    return model
def run_train(self):
    start = time.time()
    # epochs
    for epoch in range(self.start_epoch, self.num_epochs):
        # train an epoch
        self.train(epoch=epoch)
        # time per epoch
        epoch_time = time.time() - start
        print('Epoch: [{0}] finished, time consumed: {epoch_time:.3f}'.format(
            epoch, epoch_time=epoch_time))
        # decay learning rate every epoch
        adjust_learning_rate(self.optimizer, self.lr_decay)
        # save checkpoint
        if self.checkpoint_path is not None:
            save_checkpoint(epoch=epoch,
                            model=self.model,
                            model_name=self.model_name,
                            optimizer=self.optimizer,
                            dataset_name=self.dataset_name,
                            word_map=self.word_map,
                            checkpoint_path=self.checkpoint_path,
                            checkpoint_basename=self.checkpoint_basename)
        start = time.time()
def train(mix_trainloader, model, interp, optimizer, args):
    """Train the model on the mixed self-training loader."""
    tot_iter = len(mix_trainloader)
    for i_iter, batch in enumerate(mix_trainloader):
        images, labels, name = batch
        labels = labels.long()
        optimizer.zero_grad()
        adjust_learning_rate(optimizer, i_iter, tot_iter, args)
        if args.info_max_loss:
            pred = model(images.to(device), training=True)
            loss = self_training_regularized_infomax(pred, labels.to(device),
                                                     args)
        elif args.unc_noise:
            pred, noise_pred = model(images.to(device), training=True)
            loss = self_training_regularized_infomax_cct(
                pred, labels.to(device), noise_pred, args)
        else:
            pred = model(images.to(device))
            loss = F.cross_entropy(pred, labels.to(device), ignore_index=255)
        loss.backward()
        optimizer.step()
        logger.info('iter = {} of {} completed, loss = {:.4f}'.format(
            i_iter + 1, tot_iter, loss.item()))
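# The call above passes the iteration index and the total iteration count,
# which points to an iteration-based schedule. A plausible sketch is the
# polynomial ("poly") decay common in segmentation self-training; args.power
# is an assumed hyperparameter, not confirmed by the source.
def adjust_learning_rate(optimizer, i_iter, tot_iter, args):
    """Polynomial decay of the base rate args.lr towards zero over tot_iter."""
    lr = args.lr * (1 - i_iter / tot_iter) ** args.power
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr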
def train(x_train, x_train_external, y_train):
    # model
    num_class = np.shape(y_train)[1]
    model = ResNet34(num_classes=num_class)
    model = model.to(device)
    # optimizer and loss
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    wc = y_train.sum(axis=0)
    wc = 1. / (np.log(wc + 1) + 1)
    w = torch.tensor(wc, dtype=torch.float).to(device)
    criterion1 = utils.WeightedMultilabel(w)
    lr = config.lr
    start_epoch = 1
    stage = 1
    best_auc = -1
    # =========> start training <=========
    for epoch in range(start_epoch, config.max_epoch + 1):
        train_loss, train_auc = train_epoch(model, optimizer, criterion1,
                                            x_train, x_train_external, y_train)
        if epoch in config.stage_epoch:
            stage += 1
            lr /= config.lr_decay
            utils.adjust_learning_rate(optimizer, lr)
    return model
def train_imagenet(model, args):
    optimizer = optim.SGD(model.parameters(), weight_decay=args.weight_decay,
                          lr=args.lr, momentum=args.momentum)
    criterion = nn.CrossEntropyLoss()
    train_loader = train_imagenet_loader(args)
    val_loader = val_imagenet_loader(args)
    if os.path.exists(args.model_dir):
        shutil.rmtree(args.model_dir)
    os.makedirs(args.model_dir)
    best_prec = 0.0
    for epoch in range(1, args.epochs + 1):
        adjust_learning_rate(optimizer, epoch, args)
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)
        # evaluate on validation set
        cur_prec, _ = validate(val_loader, model, criterion, args)
        # remember best prec@1 and save checkpoint
        is_best = cur_prec > best_prec
        if is_best:
            best_prec = cur_prec
            cur_model_name = (args.model_name + "-" + str(epoch).zfill(2)
                              + "-{:.3f}.pth".format(best_prec))
            torch.save(model.state_dict(),
                       os.path.join(args.model_dir, cur_model_name))
            print('Save weights at {}/{}'.format(args.model_dir,
                                                 cur_model_name))
def update_learning_rate(epoch, ite):
    lr_adapted = args.lr * args.droplr ** np.sum(args.adlr < epoch)
    if lr_current != lr_adapted:
        print('Learning rate is adapted: {} -> {}'.format(lr_current,
                                                          lr_adapted))
        utils.adjust_learning_rate(optimizer, lr_adapted)
    return lr_adapted
def train(base_datamgr, base_set, aux_iter, val_loader, model,
          start_epoch, stop_epoch, params):
    # for validation
    max_acc = 0
    total_it = 0
    # training
    for epoch in range(start_epoch, stop_epoch):
        if params.adj_lr:
            learning_rate_adj = params.LUT_lr
            model_lr = utils.adjust_learning_rate(model.model_optim, epoch,
                                                  learning_rate_adj)
            ft_lr = utils.adjust_learning_rate(model.ft_optim, epoch,
                                               learning_rate_adj)
        # randomly split seen domains into pseudo-seen and pseudo-unseen domains
        random_set = random.sample(base_set, k=2)
        ps_set = random_set[0]
        pu_set = random_set[1:]
        ps_loader = base_datamgr.get_data_loader(
            os.path.join(params.data_dir, ps_set, 'base.json'),
            aug=params.train_aug)
        pu_loader = base_datamgr.get_data_loader(
            [os.path.join(params.data_dir, dataset, 'base.json')
             for dataset in pu_set],
            aug=params.train_aug)
        base_loader = base_datamgr.get_data_loader(
            [os.path.join(params.data_dir, dataset, 'base.json')
             for dataset in base_set],
            aug=params.train_aug)
        # train loop
        model.train()
        if params.feature_wise_type == 'FT':
            total_it = model.train_loop(epoch, base_loader, total_it)
        else:
            total_it = model.trainall_loop(epoch, ps_loader, pu_loader,
                                           aux_iter, total_it)
        # validate
        model.eval()
        with torch.no_grad():
            acc = model.test_loop(val_loader)
        # save
        if acc > max_acc:
            print(f"best model! accuracy: {acc}, save...")
            max_acc = acc
            outfile = os.path.join(params.checkpoint_dir, 'best_model.tar')
            model.save(outfile, epoch)
        else:
            print('GG!! best accuracy {:f}'.format(max_acc))
        if ((epoch + 1) % params.save_freq == 0) or (epoch == stop_epoch - 1):
            outfile = os.path.join(params.checkpoint_dir,
                                   '{:d}.tar'.format(epoch + 1))
            model.save(outfile, epoch)
    return
def train(args, encoder, decoder, loader, decoder_optimizer,
          encoder_optimizer, device, criterion):
    decoder.train()  # train mode (dropout and batchnorm are used)
    encoder.train()
    losses = AverageMeter()  # loss (per word decoded)
    top3accs = AverageMeter()  # top-3 accuracy
    i = 0
    for data in tqdm(loader):
        if i % args.lr_update_freq == 0 and i > 0:
            adjust_learning_rate(decoder_optimizer, args.decay_rate)
            if args.fine_tune_encoder:
                adjust_learning_rate(encoder_optimizer, args.decay_rate)
        imgs = data[0]
        caps = data[1]
        caplens = data[2]
        # Forward pass
        imgs = encoder(imgs.to(device)).to(device)
        caps = caps.to(imgs.device)
        caplens = caplens.to(imgs.device)
        scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(
            imgs, caps, caplens)
        targets = caps_sorted[:, 1:]  # targets are all words after <start>
        # remove padded timesteps
        scores = pack_padded_sequence(scores, decode_lengths,
                                      batch_first=True).to(device)
        targets = pack_padded_sequence(targets, decode_lengths,
                                       batch_first=True).to(device)
        # Calculate loss
        loss = criterion(scores.data, targets.data).to(imgs.device)
        # Add doubly stochastic attention regularization
        loss += args.alphac * ((1. - alphas.sum(dim=1).to(device)) ** 2).mean()
        # Back prop.
        decoder_optimizer.zero_grad()
        if encoder_optimizer is not None:
            encoder_optimizer.zero_grad()
        loss.backward()
        if args.gradient_clip is not None:
            clip_gradient(decoder_optimizer, args.gradient_clip)
            if encoder_optimizer is not None:
                clip_gradient(encoder_optimizer, args.gradient_clip)
        # Update weights
        decoder_optimizer.step()
        if encoder_optimizer is not None:
            encoder_optimizer.step()
        # Keep track of metrics
        top3 = accuracy(scores.data, targets.data, 3)
        losses.update(loss.item(), sum(decode_lengths))
        top3accs.update(top3, sum(decode_lengths))
        # Print status
        if i % args.print_freq == 0:
            print('Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Top-3 Accuracy {top3.val:.3f} ({top3.avg:.3f})'.format(
                      loss=losses, top3=top3accs))
        if i % args.checkpoint_freq == 0 and args.checkpoint_freq > 0:
            save_checkpoint(args.model_path, i, encoder, decoder,
                            encoder_optimizer, decoder_optimizer, 0, False)
        i += 1
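# Several snippets (this one, and the later ones calling the helper with 0.8
# or 0.1) pass a decay factor rather than an absolute rate. A minimal sketch
# under that convention multiplies each group's current rate by the factor;
# the exact helper in the original repositories may differ.
def adjust_learning_rate(optimizer, shrink_factor):
    """Shrink the learning rate of every parameter group by shrink_factor."""
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * shrink_factor
    print("New learning rate: {}".format(optimizer.param_groups[0]['lr']))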
def run(dataset, net_type, train=True):
    # Hyper Parameter settings
    train_ens = cfg.train_ens
    valid_ens = cfg.valid_ens
    test_ens = cfg.test_ens
    n_epochs = cfg.n_epochs
    lr_start = cfg.lr_start
    num_workers = cfg.num_workers
    valid_size = cfg.valid_size
    batch_size = cfg.batch_size

    trainset, testset, inputs, outputs = data.getDataset_regression(dataset)
    train_loader, valid_loader, test_loader = data.getDataloader(
        trainset, testset, valid_size, batch_size, num_workers)
    net = getModel(net_type, inputs, outputs).to(device)
    print(len(train_loader))
    print(len(valid_loader))
    print(len(test_loader))

    ckpt_dir = f'checkpoints/regression/{dataset}/bayesian'
    ckpt_name = f'checkpoints/regression/{dataset}/bayesian/model_{net_type}.pt'
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir, exist_ok=True)

    criterion = metrics.ELBO_regression_hetero(len(trainset)).to(device)
    if train:
        optimizer = Adam(net.parameters(), lr=lr_start)
        valid_loss_max = np.Inf
        for epoch in range(n_epochs):  # loop over the dataset multiple times
            cfg.curr_epoch_no = epoch
            utils.adjust_learning_rate(
                optimizer, metrics.lr_linear(epoch, 0, n_epochs, lr_start))
            train_loss, train_mse, train_kl = train_model(
                net, optimizer, criterion, train_loader, num_ens=train_ens)
            valid_loss, valid_mse = validate_model(
                net, criterion, valid_loader, num_ens=valid_ens)
            print('Epoch: {} \tTraining Loss: {:.4f} \tTraining MSE: {:.4f} '
                  '\tValidation Loss: {:.4f} \tValidation MSE: {:.4f} '
                  '\ttrain_kl_div: {:.4f}'.format(
                      epoch, train_loss, train_mse, valid_loss, valid_mse,
                      train_kl))
            # save model if validation loss has decreased
            if valid_loss <= valid_loss_max:
                print('Validation loss decreased ({:.6f} --> {:.6f}). '
                      'Saving model ...'.format(valid_loss_max, valid_loss))
                torch.save(net.state_dict(), ckpt_name)
                valid_loss_max = valid_loss
    # test saved model
    best_model = getModel(net_type, inputs, outputs).to(device)
    best_model.load_state_dict(torch.load(ckpt_name))
    test_loss, test_mse = test_model(best_model, criterion, test_loader,
                                     num_ens=test_ens)
    print('Test Loss: {:.4f} \tTest MSE: {:.4f} '.format(test_loss, test_mse))
    test_uncertainty(best_model, testset[:100], data='ccpp')
def train():
    cfg = json.load(open(cfg_path))
    enable_cuda = cfg['train']['enable_cuda']
    gpus = cfg['train']['GPUS']
    device = 'cuda:%d' % gpus[0] if enable_cuda else 'cpu'
    enable_multi_gpus = enable_cuda and len(gpus) > 1
    batch_size = cfg['train']['batch_size']
    batch_size = len(gpus) * batch_size if enable_multi_gpus else batch_size

    def transform_fn(image, boxes):
        sel_box_idx = random.randrange(0, len(boxes))
        to_size = random.choice(cfg['network']['anchor_sizes'])
        to_size = random.randint(round(to_size * 0.7), round(to_size * 1.3))
        img, boxes = transform.select_crop_face(
            image, boxes, cfg['train']['image_shape'], sel_box_idx, to_size)
        return img, boxes

    wider_train_dataset = WiderTrain(img_dir=cfg['wider_train']['image_dir'],
                                     anno_path=cfg['wider_train']['txt_path'],
                                     transform=transform_fn)
    dataloader = DataLoader(wider_train_dataset, batch_size=batch_size,
                            collate_fn=wider_train_dataset.collate_fn,
                            num_workers=12, drop_last=True, shuffle=True)
    print('building model...')
    detector = Detector(cfg=cfg)
    if enable_multi_gpus:
        detector = th.nn.DataParallel(detector, device_ids=gpus)
    detector.to(device)
    detector.train(True)
    optimizer = th.optim.Adam(detector.parameters())
    print('model is built.')

    train_step = 0
    for epoch in range(50):
        for i, sample in enumerate(dataloader):
            optimizer.zero_grad()
            sample['image'] = sample['image'].to(device)
            sample['boxes'] = sample['boxes'].to(device)
            loss = detector(sample)
            if enable_multi_gpus:
                loss = sum(loss)
            loss.backward()
            optimizer.step()
            lr = _lr_adjust(train_step)
            ut.adjust_learning_rate(optimizer, lr)
            # if i % 100 == 0:
            print("epoch %d, step %d, total_step %d, loss %.3f"
                  % (epoch, i, train_step, loss.item()))
            train_step += 1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--cuda', default='True',
                        help='whether to run on CUDA ("True"/"False")')
    parser.add_argument('--gpu_id', default='0',
                        help='id of the GPU to use')
    args = parser.parse_args()
    # compare against the literal string, since bool('False') is True
    (train_data_loader, val_data_loader, network, optimizer,
     train_writer, test_writer) = setup(args.cuda == 'True', int(args.gpu_id))
    # Everything seems fine. Make a code log with the exp name.
    utils.save_exp_information()
    # init values
    train_jter_count, val_jter_count, best_loss = (0, 0, np.inf)
    start_time = time.time()
    for epoch in range(gv.total_epochs):
        train_st_time = time.time()
        utils.adjust_learning_rate(optimizer, epoch, gv.orig_lr)
        train_data_loader.shuffle_index()
        train_jter_count = train(train_data_loader, network, optimizer,
                                 train_writer, train_jter_count)
        print('==========TRAIN Epoch', epoch + 1,
              "COMPLETE ====================")
        val_st_time = time.time()
        loss, val_jter_count = val(val_data_loader, network, test_writer,
                                   train_jter_count)
        print('==========VAL Epoch', epoch + 1,
              "COMPLETE ====================")
        print('==========TIME FROM START: ', time.time() - start_time,
              ' =============')
        if loss < best_loss:
            print('========== BEST MODEL TILL NOW! =============')
            best_loss_ = True
            best_loss = loss
        else:
            best_loss_ = False
        # add an is-best checker
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': 'res18',
                'loss': loss,
                'model_state_dict': network.state_dict(),
                'optimizer': optimizer.state_dict(),
            },
            filename='weights/the_real_simple_GRU_' + str(epoch + 1) + '.pth',
            is_best=best_loss_)
        print('==========Total Time for epoch: ', time.time() - val_st_time,
              ' =============')
    train_writer.close()
    test_writer.close()
def main():
    global opt, best_prec1
    opt = parser.parse_args()
    opt.logdir = opt.logdir + '/' + opt.name
    logger = 'hi'  # placeholder; a real Logger is not wired up here
    best_prec1 = 0
    print(opt)
    # Initialize the model, criterion and the optimizer
    model = init.load_model(opt)
    model, criterion, optimizer = init.setup(model, opt)
    # Display the model structure
    print(model)
    # Setup trainer and validator
    trainer = train.Trainer(model, criterion, optimizer, opt, logger)
    validator = train.Validator(model, criterion, opt, logger)
    # Load model from a checkpoint if mentioned in opts
    if opt.resume:
        if os.path.isfile(opt.resume):
            model, optimizer, opt, best_prec1 = init.resumer(opt, model,
                                                             optimizer)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))
    cudnn.benchmark = True
    # Setup the train and validation data loaders
    dataloader = init_data.load_data(opt)
    train_loader = dataloader.train_loader
    val_loader = dataloader.val_loader
    for epoch in range(opt.start_epoch, opt.epochs):
        utils.adjust_learning_rate(opt, optimizer, epoch)
        print("Starting epoch number:", epoch + 1,
              "Learning rate:", optimizer.param_groups[0]["lr"])
        if not opt.testOnly:
            # Train the network over the training data
            trainer.train(train_loader, epoch, opt)
        # if opt.tensorboard:
        #     logger.scalar_summary('learning_rate', opt.lr, epoch)
        # Measure the validation accuracy
        acc = validator.validate(val_loader, epoch, opt)
        best_prec1 = max(acc, best_prec1)
        if best_prec1 == acc:
            # Save the new model if the accuracy beats the previous best
            init.save_checkpoint(opt, model, optimizer, best_prec1, epoch)
        print('Best accuracy: [{0:.3f}]\t'.format(best_prec1))
def od_mmd_train(init_lr_da, init_lr_kd, epochs, growth_rate, alpha, gamma,
                 init_beta, distils, source_dataloader, targets_dataloader,
                 targets_testloader, optimizer_das, optimizer_kds, criterion,
                 device, batch_norm, is_scheduler_da=True,
                 is_scheduler_kd=True, scheduler_da=None, scheduler_kd=None,
                 is_cst=True):
    total_loss_arr = []
    teacher_da_temp_loss_arr = []
    kd_temp_loss_arr = []
    teacher_target_acc_arr = []
    student_target_acc_arr = []
    best_student_acc = 0.
    best_teacher_acc = 0.
    epochs += 1
    for epoch in range(1, epochs):
        beta = init_beta * torch.exp(growth_rate * (epoch - 1))
        beta = beta.to(device)
        if is_scheduler_da:
            new_lr_da = init_lr_da / np.power(
                (1 + 10 * (epoch - 1) / epochs), 0.75)
            for optimizer_da in optimizer_das:
                adjust_learning_rate(optimizer_da, new_lr_da)
        if is_scheduler_kd:
            new_lr_kd = init_lr_kd / np.power(
                (1 + 10 * (epoch - 1) / epochs), 0.75)
            for optimizer_kd in optimizer_kds:
                adjust_learning_rate(optimizer_kd, new_lr_kd)
        total_loss_1, total_loss_2, teacher_da_temp_loss_1 = od_mmd_one_epoch(
            epoch, epochs, distils, source_dataloader, targets_dataloader,
            optimizer_das, optimizer_kds, criterion, device, alpha, beta,
            gamma, batch_norm, is_cst)
        students_targets_acc = np.zeros(len(distils))
        for i, d in enumerate(targets_testloader):
            students_targets_acc[i] = eval(distils[i].s_net, device, d, False)
        total_target_acc = students_targets_acc.mean()
        print(f'epoch : {epoch}\tacc : {total_target_acc}')
        if total_target_acc > best_student_acc:
            best_student_acc = total_target_acc
            torch.save({'student_model': distils[0].s_net.state_dict(),
                        'acc': best_student_acc,
                        'epoch': epoch}, "./student_model.pth")
        if scheduler_da is not None:
            scheduler_da.step()
        if scheduler_kd is not None:
            scheduler_kd.step()
        if epoch == 150 or epoch == 250:  # step the KD rate down at fixed epochs
            for optimizer_kd in optimizer_kds:
                for param_group in optimizer_kd.param_groups:
                    param_group['lr'] = param_group['lr'] * .1
    return best_student_acc
def main():
    # load the vocabulary and the datasets
    voc = Lang('data/WORDMAP.json')
    print("vocabulary size " + str(voc.n_words))
    train_data = SaDataset('train', voc)
    val_data = SaDataset('valid', voc)
    # initialize the model
    encoder = EncoderRNN(voc.n_words, hidden_size, encoder_n_layers, dropout)
    # move the model to the device (GPU memory if available, otherwise RAM)
    encoder = encoder.to(device)
    # initialize the optimizer; Adam performs the gradient-descent updates
    print('Building optimizers ...')
    # arguments: the parameters to optimize and the learning rate
    optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    # best accuracy so far
    best_acc = 0
    epochs_since_improvement = 0
    # epochs: number of training passes
    for epoch in range(0, epochs):
        # Decay learning rate if there is no improvement for 8 consecutive
        # epochs, and terminate training after 20
        if epochs_since_improvement == 20:
            break
        if epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0:
            adjust_learning_rate(optimizer, 0.8)
        # train for one epoch
        train(epoch, train_data, encoder, optimizer)
        # validate to guard against overfitting
        val_acc, val_loss = valid(val_data, encoder)
        print('\n * ACCURACY - {acc:.3f}, LOSS - {loss:.3f}\n'.format(
            acc=val_acc, loss=val_loss))
        # check for improvement
        is_best = val_acc > best_acc
        best_acc = max(best_acc, val_acc)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n"
                  % (epochs_since_improvement,))
        else:
            epochs_since_improvement = 0
        # Save checkpoint
        save_checkpoint(epoch, encoder, optimizer, val_acc, is_best)
        # Reshuffle the training and validation samples
        np.random.shuffle(train_data.samples)
        np.random.shuffle(val_data.samples)
def train(args):
    torch.manual_seed(7)
    np.random.seed(7)
    checkpoint = args.checkpoint
    start_epoch = 0
    best_acc = 0
    epochs_since_improvement = 0
    logger = get_logger()
    if checkpoint is None:
        model = CarRecognitionNet()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                     weight_decay=1e-6)
    else:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']
    # Move to GPU, if available
    model = model.to(device)
    # build the datasets
    loader = fetch_dataloaders(args.image_folder, [0.8, 0.2],
                               batchsize=args.batch_size)
    train_loader = loader['train']
    valid_loader = loader['valid']
    for epoch in range(start_epoch, args.end_epoch):
        # stop if the model has not improved for a long time
        if epochs_since_improvement > 50:
            break
        # adjust the learning rate
        if epochs_since_improvement > 0 and epochs_since_improvement % 15 == 0:
            adjust_learning_rate(optimizer, 0.1)
        train_loss, train_acc = __train(train_loader=train_loader,
                                        model=model,
                                        optimizer=optimizer,
                                        epoch=epoch,
                                        logger=logger)
        vail_loss, vail_acc = __valid(valid_loader=valid_loader,
                                      model=model,
                                      logger=logger)
        is_best = vail_acc > best_acc
        best_acc = max(vail_acc, best_acc)
        if not is_best:
            epochs_since_improvement += 1
            logger.info("Epochs since last improvement: %d\n"
                        % (epochs_since_improvement,))
        else:
            epochs_since_improvement = 0
        # Save checkpoint
        save_checkpoint("OURS_RES50", epoch, epochs_since_improvement,
                        model, optimizer, best_acc, is_best)
def train_epoch(net, epoch, data_loader, optimizer, input_file, device,
                config, epoch_size):
    net.train()
    total_loss = 0.0
    dataprocess = tqdm(data_loader)
    for iteration, batch_item in enumerate(dataprocess):
        image, mask = batch_item['image'], batch_item['mask']
        image = image.to(device)
        mask = mask.to(device)
        # adjust the current learning rate
        utils.adjust_learning_rate(optimizer, config.LR_STRATEGY, epoch,
                                   iteration, epoch_size)
        optimizer.zero_grad()
        # forward
        out = net(image)
        # loss
        loss = utils.create_loss(out, mask, config.NUM_CLASSES)
        total_loss += loss.item()
        # backward
        loss.backward()
        # update weights
        optimizer.step()
        # progress-bar display
        dataprocess.set_description_str("epoch:{}".format(epoch))
        dataprocess.set_postfix_str("loss:{:.4f}".format(loss.item()))
    input_file.write("Epoch:{}, loss is {:.4f} \n".format(
        epoch, total_loss / len(data_loader)))
    input_file.flush()
def train(epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    # switch to train mode
    net.train()
    end = time.time()
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        # compute output
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        # measure accuracy and record loss
        prec1, prec5 = accuracy(outputs, targets, topk=(1, 5))
        losses.update(loss.item(), inputs.size(0))
        top1.update(prec1[0], inputs.size(0))
        top5.update(prec5[0], inputs.size(0))
        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        utils.adjust_learning_rate(optimizer, epoch, batch_idx,
                                   len(train_loader), args.ne, args.lr)
        if batch_idx % 200 == 0:
            batch_time.update(time.time() - end)
            end = time.time()
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch, batch_idx, len(train_loader),
                      batch_time=batch_time, data_time=data_time,
                      loss=losses, top1=top1, top5=top5))
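# Here adjust_learning_rate receives the epoch, the batch index, the number
# of batches, the total epoch count (args.ne) and the base rate, which points
# to a per-batch schedule. One common choice matching that signature is
# cosine annealing evaluated at a fractional epoch; treat this sketch as an
# assumption, not the verified original.
import math

def adjust_learning_rate(optimizer, epoch, batch_idx, num_batches,
                         n_epochs, lr_init):
    """Cosine annealing from lr_init to zero, updated every batch."""
    t = (epoch + batch_idx / num_batches) / n_epochs
    lr = 0.5 * lr_init * (1 + math.cos(math.pi * t))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr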
def run(self):
    batch_size = self.config['batch_size']
    learning_rate = self.config['learning_rate']

    # Create Models
    self.itc_attentioner = itc_model.Attentioner().cuda()
    self.mfd_attentioner = mfd_model.attentioner().cuda()
    image_smoother = itc_model.ImageSmoother(
        kernel_size=self.config['smoother_kernel'])
    image_smoother = image_smoother.cuda()
    self.image_smoother = image_smoother
    variance_pool2d = ops.VariancePool2d(
        kernel_size=self.config['variance_kernel'], same=True)
    variance_pool2d = variance_pool2d.cuda()
    self.variance_pool2d = variance_pool2d

    # Model Summary
    self.logger.debug('ITC Attentioner Architecture')
    summary(self.itc_attentioner, (3, 224, 224), batch_size=batch_size)
    self.logger.debug('MFD Attentioner Architecture')
    summary(self.mfd_attentioner, (3, 224, 224), batch_size=batch_size)

    # Load Pretrained Models
    self.load_pretrained_models()
    model_params = []
    model_params += self.itc_attentioner.parameters()
    self.optm = torch.optim.Adam(model_params, lr=learning_rate)

    # Restore Model
    if not self.args.restart:
        self.load_checkpoint()

    # Setup Global Train Index
    self.gidx = self.epoch * len(self.dataset_train)

    # Initial Validation
    # self.valid = DataObject()
    # self.run_valid()

    total_epochs = self.config['epochs']
    for _ in range(self.epoch, total_epochs):
        utils.adjust_learning_rate(learning_rate, self.optm, self.epoch)
        self.train = DataObject()
        self.run_train()
        self.valid = DataObject()
        self.run_valid()
        self.epoch += 1
def run(dataset, net_type):
    # Hyper Parameter settings
    train_ens = cfg.train_ens
    valid_ens = cfg.valid_ens
    n_epochs = cfg.n_epochs
    lr_start = cfg.lr_start
    num_workers = cfg.num_workers
    valid_size = cfg.valid_size
    batch_size = cfg.batch_size

    trainset, testset, inputs, outputs = data.getDataset(dataset)
    train_loader, valid_loader, test_loader = data.getDataloader(
        trainset, testset, valid_size, batch_size, num_workers)
    net = getModel(net_type, inputs, outputs).to(device)

    ckpt_dir = f'checkpoints/{dataset}/bayesian'
    ckpt_name = f'checkpoints/{dataset}/bayesian/model_{net_type}.pt'
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir, exist_ok=True)

    criterion = metrics.ELBO(len(trainset)).to(device)
    optimizer = Adam(net.parameters(), lr=lr_start)
    valid_loss_max = np.Inf
    for epoch in range(n_epochs):  # loop over the dataset multiple times
        utils.adjust_learning_rate(
            optimizer, metrics.lr_linear(epoch, 0, n_epochs, lr_start))
        train_loss, train_acc, train_kl = train_model(
            net, optimizer, criterion, train_loader, num_ens=train_ens)
        valid_loss, valid_acc = validate_model(
            net, criterion, valid_loader, num_ens=valid_ens)
        print('Epoch: {} \tTraining Loss: {:.4f} \tTraining Accuracy: {:.4f} '
              '\tValidation Loss: {:.4f} \tValidation Accuracy: {:.4f} '
              '\ttrain_kl_div: {:.4f}'.format(
                  epoch, train_loss, train_acc, valid_loss, valid_acc,
                  train_kl))
        # save model if validation loss has decreased
        if valid_loss <= valid_loss_max:
            print('Validation loss decreased ({:.6f} --> {:.6f}). '
                  'Saving model ...'.format(valid_loss_max, valid_loss))
            torch.save(net.state_dict(), ckpt_name)
            valid_loss_max = valid_loss
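# metrics.lr_linear(epoch, 0, n_epochs, lr_start) is used by both Bayesian
# runners above. A sketch consistent with those arguments decays the rate
# linearly from start_value (at decay_start) down to zero (at total_epochs);
# treat the exact form as an assumption.
def lr_linear(epoch, decay_start, total_epochs, start_value):
    """Linear decay from start_value at decay_start to zero at total_epochs."""
    if epoch < decay_start:
        return start_value
    return start_value * float(total_epochs - epoch) \
        / float(total_epochs - decay_start)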
def adjust_boost_lr(dataloader, model,
                    criterion=torch.nn.CrossEntropyLoss(),
                    device=torch.device('cuda'),
                    lr_initialization=[0.01, 0.1, 1., 10., 100., 1000.,
                                       10000., 100000., 1000000.],
                    n_steps=1000, step_size=1000000.):
    model.eval().to(device)
    # collect logits, targets and model predictions over the loader
    with torch.no_grad():
        logits = []
        targets = []
        predicts = []
        for iter, (input, target, logit) in enumerate(dataloader):
            input = input.to(device)
            predict = model(input)
            logits.append(logit)
            targets.append(target)
            predicts.append(predict.cpu())
        logits = torch.cat(logits, dim=0).detach()
        targets = torch.cat(targets, dim=0).detach()
        predicts = torch.cat(predicts, dim=0).detach()
    # coarse grid search over candidate boosting rates
    lr_initialization = torch.tensor(lr_initialization)
    results = torch.zeros(lr_initialization.shape, dtype=torch.float)
    for iter, lr in enumerate(lr_initialization):
        results[iter] = criterion(logits + lr * predicts, targets)
    learning_rate = float(lr_initialization[torch.argmin(results)])
    # refine the rate by gradient descent on the same objective
    learning_rate = torch.tensor([learning_rate], requires_grad=True)
    optim = torch.optim.SGD([learning_rate], lr=1., momentum=0.5)
    learning_rates = np.arange(1., 0., -1 / n_steps, dtype=float) \
        * float(learning_rate) * step_size
    for iter, loc_lr in enumerate(learning_rates):
        utils.adjust_learning_rate(optim, loc_lr)
        loss = criterion(logits + learning_rate * predicts, targets)
        loss.backward()
        optim.step()
        if iter % 100 == 99:
            print('[', iter, '] lr :', learning_rate,
                  'grad :', learning_rate.grad)
        optim.zero_grad()
    return float(learning_rate.detach())
def train(config, train_iter, val_iter, model, criterion, optimizer, epoch):
    global iteration, n_total, train_loss, n_bad_loss
    global init, best_val_loss

    print("=> EPOCH {}".format(epoch))
    train_iter.init_epoch()
    for batch in train_iter:
        model = model.to('cuda')
        # batch = batch.to('cuda')
        iteration += 1
        model.train()
        output, _, __ = model(batch.grapheme.to('cuda'),
                              batch.phoneme[:-1].to('cuda'))
        target = batch.phoneme[1:].to('cuda')
        loss = criterion(output.view(output.size(0) * output.size(1), -1),
                         target.view(target.size(0) * target.size(1)))
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), config.clip,
                                       float('inf'))
        optimizer.step()

        n_total += batch.batch_size
        train_loss += loss.item() * batch.batch_size

        if iteration % config.log_every == 0:
            train_loss /= n_total
            val_loss = validate(val_iter, model, criterion)
            print(" % Time: {:5.0f} | Iteration: {:5} | Batch: {:4}/{}"
                  " | Train loss: {:.4f} | Val loss: {:.4f}".format(
                      time.time() - init, iteration, train_iter.iterations,
                      len(train_iter), train_loss, val_loss))
            # test for val_loss improvement
            n_total = train_loss = 0
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                n_bad_loss = 0
                torch.save(model.state_dict(), config.best_model)
            else:
                n_bad_loss += 1
            if n_bad_loss == config.n_bad_loss:
                best_val_loss = val_loss
                n_bad_loss = 0
                adjust_learning_rate(optimizer, config.lr_decay)
                new_lr = optimizer.param_groups[0]['lr']
                print("=> Adjust learning rate to: {}".format(new_lr))
                if new_lr < config.lr_min:
                    return True
    return False
def swa_train(model, swa_model, train_iter, valid_iter, optimizer, criterion,
              pretrain_epochs, swa_epochs, swa_lr, cycle_length, device,
              writer, cpt_filename):
    swa_n = 1
    swa_model.load_state_dict(copy.deepcopy(model.state_dict()))
    utils.save_checkpoint(
        cpt_directory, 1,
        '{}-swa-{:2.4f}-{:03d}-{}'.format(date, swa_lr, cycle_length,
                                          cpt_filename),
        state_dict=model.state_dict(),
        swa_state_dict=swa_model.state_dict(),
        swa_n=swa_n,
        optimizer=optimizer.state_dict())
    for e in range(swa_epochs):
        epoch = e + pretrain_epochs
        time_ep = time.time()

        lr = utils.schedule(epoch, cycle_length, lr_init, swa_lr)
        utils.adjust_learning_rate(optimizer, lr)
        train_res = utils.train_epoch(model, train_iter, optimizer,
                                      criterion, device)
        valid_res = utils.evaluate(model, valid_iter, criterion, device)

        utils.moving_average(swa_model, model, swa_n)
        swa_n += 1
        utils.bn_update(train_iter, swa_model)
        swa_res = utils.evaluate(swa_model, valid_iter, criterion, device)

        time_ep = time.time() - time_ep
        values = [epoch + 1, lr, swa_lr, cycle_length, train_res['loss'],
                  valid_res['loss'], swa_res['loss'], None, None, time_ep]
        writer.writerow(values)
        table = tabulate.tabulate([values], columns,
                                  tablefmt='simple', floatfmt='8.4f')
        if epoch % 20 == 0:
            table = table.split('\n')
            table = '\n'.join([table[1]] + table)
        else:
            table = table.split('\n')[2]
        print(table)

        utils.save_checkpoint(
            cpt_directory, epoch + 1,
            '{}-swa-{:2.4f}-{:03d}-{}'.format(date, swa_lr, cycle_length,
                                              cpt_filename),
            state_dict=model.state_dict(),
            swa_state_dict=swa_model.state_dict(),
            swa_n=swa_n,
            optimizer=optimizer.state_dict())
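# utils.schedule(epoch, cycle_length, lr_init, swa_lr) matches the cyclical
# schedule from the SWA paper, which decays linearly from lr_init to swa_lr
# within each cycle and then restarts. A sketch under that assumption:
def schedule(epoch, cycle_length, lr_init, swa_lr):
    """Cyclic linear decay from lr_init to swa_lr over each cycle."""
    t = ((epoch % cycle_length) + 1) / cycle_length
    return (1 - t) * lr_init + t * swa_lr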
def main(m):
    best_error = 100
    opt = parser_params()
    if opt.dataset == 'cifar10':
        train_loader, test_loader = cifar10_dataloaders(
            batch_size=opt.batch_size, num_workers=opt.num_workers)
        n_cls = 10
    else:
        raise NotImplementedError(opt.dataset)
    print(opt.model[m])
    model = model_dict[opt.model[m]](num_classes=n_cls)
    optimizer = optim.SGD(model.parameters(), lr=opt.learning_rate,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)
    criterion = nn.CrossEntropyLoss()
    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()
        cudnn.benchmark = True
    for epoch in range(1, opt.epochs + 1):
        # model 4 warms up with a smaller learning rate in its first epoch
        if m == 4 and epoch == 1:
            opt.learning_rate = 0.01
        else:
            opt.learning_rate = 0.1
        adjust_learning_rate(epoch, opt, optimizer)
        print("==> training...")
        train_error, train_loss = train(epoch, train_loader, model,
                                        criterion, optimizer,
                                        list_loss_train[m])
        print('epoch {} | train_loss: {}'.format(epoch, train_loss))
        print('epoch {} | train_error: {}'.format(epoch, train_error))
        test_error, test_loss = test(test_loader, model, criterion,
                                     list_loss_test[m])
        print('epoch {} | test_loss: {}'.format(epoch, test_loss))
        print('epoch {} | test_error: {}'.format(epoch, test_error))
        print('iterations: {}'.format(epoch * len(train_loader)))
        if best_error > test_error:
            best_error = test_error
            print('Min error: ', best_error)
def main():
    global opt, best_prec1
    opt = parser.parse_args()
    opt.logdir = opt.logdir + '/' + opt.name
    logger = None  # Logger(opt.logdir)
    opt.lr = opt.maxlr
    print(opt)
    best_prec1 = 0
    cudnn.benchmark = True
    model = init_model.load_model(opt)
    if opt.model_def.startswith('alexnet') or opt.model_def.startswith('vgg'):
        model.features = torch.nn.DataParallel(model.features)
        model.cuda()
    elif opt.ngpus > 1:
        model = torch.nn.DataParallel(model).cuda()
    print(model)
    model, criterion, optimizer = init_model.setup(model, opt)
    trainer = train.Trainer(model, criterion, optimizer, opt, logger)
    validator = train.Validator(model, criterion, opt, logger)
    if opt.resume:
        if os.path.isfile(opt.resume):
            model, optimizer, opt, best_acc = init_model.resumer(opt, model,
                                                                 optimizer)
        else:
            print("=> no checkpoint found at '{}'".format(opt.resume))
    dataloader = init_data.load_data(opt)
    train_loader = dataloader.train_loader
    # print(utils.get_mean_and_std(train_loader))
    val_loader = dataloader.val_loader
    for epoch in range(opt.start_epoch, opt.epochs):
        utils.adjust_learning_rate(opt, optimizer, epoch)
        print("Starting epoch number:", epoch, "Learning rate:", opt.lr)
        if not opt.testOnly:
            trainer.train(train_loader, epoch, opt)
            if opt.tensorboard:
                logger.scalar_summary('learning_rate', opt.lr, epoch)
        prec1 = validator.validate(val_loader, epoch, opt)
        best_prec1 = max(prec1, best_prec1)
        init_model.save_checkpoint(opt, model, optimizer, best_prec1, epoch)
        print('Best Prec@1: [{0:.3f}]\t'.format(best_prec1))
def train(x_train, x_val, x_train_external, x_val_external, y_train, y_val,
          num_class):
    # model
    model = ECGNet(BasicBlock, [3, 4, 6, 3], num_classes=num_class)
    model = model.to(device)
    # optimizer and loss
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
    # optimizer = optim.RMSprop(model.parameters(), lr=config.lr)
    wc = y_train.sum(axis=0)
    wc = 1. / (np.log(wc) + 1)
    # Label-weight penalty: the closer a label is to the other labels, the
    # easier it is to confuse, so it gets a larger weight score and deserves
    # more attention; the weights below are already normalized.
    # weight = np.array([0.9608, 0.9000, 0.8373, 0.8373, 0.8706, 0.6412,
    #                    0.8373, 0.9118, 1.0, 0.9255, 0.9118, 0.9892, 0.9588,
    #                    0.9118, 0.9118, 0.8137, 0.9608, 1.0, 0.9118, 0.9588,
    #                    0.9588, 0.9863, 0.8373, 0.9892, 0.9588, 0.9118,
    #                    0.9863])
    # wc = weight * wc
    w = torch.tensor(wc, dtype=torch.float).to(device)
    criterion1 = utils.WeightedMultilabel(w)
    criterion2 = nn.BCEWithLogitsLoss()
    lr = config.lr
    start_epoch = 1
    stage = 1
    best_auc = -1
    # =========> start training <=========
    print("*" * 10, "step into stage %02d lr %.5f" % (stage, lr))
    for epoch in range(start_epoch, config.max_epoch + 1):
        since = time.time()
        train_loss, train_auc = train_epoch(model, optimizer, criterion1,
                                            x_train, x_train_external,
                                            y_train, num_class)
        val_loss, val_auc = val_epoch(model, criterion2, x_val,
                                      x_val_external, y_val, num_class)
        print('#epoch:%02d stage:%d train_loss:%.4f train_auc:%.4f '
              'val_loss:%.4f val_auc:%.4f time:%s'
              % (epoch, stage, train_loss, train_auc, val_loss, val_auc,
                 utils.print_time_cost(since)))
        if epoch in config.stage_epoch:
            stage += 1
            lr /= config.lr_decay
            print("*" * 10, "step into stage %02d lr %.5f" % (stage, lr))
            utils.adjust_learning_rate(optimizer, lr)
    return model
def train(train_loader, model, optimizer, start_iter, num_iters):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    total_losses = AverageMeter()
    rpn_losses = AverageMeter()
    odn_losses = AverageMeter()
    rpn_ce_losses = AverageMeter()
    rpn_box_losses = AverageMeter()
    odn_ce_losses = AverageMeter()
    odn_box_losses = AverageMeter()

    # switch to train mode
    end_iter = start_iter + num_iters - 1
    model.train()
    end = time.time()
    for i, (inputs, anns) in enumerate(train_loader):
        i += start_iter
        lr = adjust_learning_rate(optimizer, args.lr, args.decay_rate,
                                  i, args.niters)  # TODO: add custom schedule
        # measure data loading time
        data_time.update(time.time() - end)
        optimizer.zero_grad()
        # forward images one by one (TODO: support batch mode later,
        # or multiprocess)
        for j, input in enumerate(inputs):
            input_anns = anns[j]  # anns of this input
            if len(input_anns) == 0:
                continue
            gt_bbox = np.vstack([ann['bbox'] + [ann['ordered_id']]
                                 for ann in input_anns])
            im_info = [[input.size(1), input.size(2),
                        input_anns[0]['scale_ratio']]]
            input_var = input.unsqueeze(0).cuda()
            cls_prob, bbox_pred, rois = model(input_var, im_info, gt_bbox)
            loss = model.loss
            loss.backward()
            # record loss
            total_losses.update(loss.item(), input_var.size(0))
            rpn_losses.update(model.rpn.loss.item(), input_var.size(0))
            rpn_ce_losses.update(model.rpn.cross_entropy.item(),
                                 input_var.size(0))
            rpn_box_losses.update(model.rpn.loss_box.item(),
                                  input_var.size(0))
            odn_losses.update(model.odn.loss.item(), input_var.size(0))
            odn_ce_losses.update(model.odn.cross_entropy.item(),
                                 input_var.size(0))
            odn_box_losses.update(model.odn.loss_box.item(),
                                  input_var.size(0))
        # do SGD step
        optimizer.step()
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if args.print_freq > 0 and (i + 1) % args.print_freq == 0:
            print('iter: [{0}] '
                  'Time {batch_time.val:.3f} '
                  'Data {data_time.val:.3f} '
                  'Loss {total_losses.val:.4f} '
                  'RPN {rpn_losses.val:.4f} '
                  '{rpn_ce_losses.val:.4f} '
                  '{rpn_box_losses.val:.4f} '
                  'ODN {odn_losses.val:.4f} '
                  '{odn_ce_losses.val:.4f} '
                  '{odn_box_losses.val:.4f} '.format(
                      i, batch_time=batch_time, data_time=data_time,
                      total_losses=total_losses, rpn_losses=rpn_losses,
                      rpn_ce_losses=rpn_ce_losses,
                      rpn_box_losses=rpn_box_losses, odn_losses=odn_losses,
                      odn_ce_losses=odn_ce_losses,
                      odn_box_losses=odn_box_losses))
        del inputs
        del anns
        if i == end_iter:
            break
    print('iter: [{0}-{1}] '
          'Time {batch_time.avg:.3f} '
          'Data {data_time.avg:.3f} '
          'Loss {total_losses.avg:.4f} '
          'RPN {rpn_losses.avg:.4f} '
          '{rpn_ce_losses.avg:.4f} '
          '{rpn_box_losses.avg:.4f} '
          'ODN {odn_losses.avg:.4f} '
          '{odn_ce_losses.avg:.4f} '
          '{odn_box_losses.avg:.4f} '.format(
              start_iter, end_iter, batch_time=batch_time,
              data_time=data_time, total_losses=total_losses,
              rpn_losses=rpn_losses, rpn_ce_losses=rpn_ce_losses,
              rpn_box_losses=rpn_box_losses, odn_losses=odn_losses,
              odn_ce_losses=odn_ce_losses, odn_box_losses=odn_box_losses))
    if args.tensorboard:
        log_value('train_total_loss', total_losses.avg, end_iter)
        log_value('train_rpn_loss', rpn_losses.avg, end_iter)
        log_value('train_rpn_ce_loss', rpn_ce_losses.avg, end_iter)
        log_value('train_rpn_box_loss', rpn_box_losses.avg, end_iter)
        log_value('train_odn_loss', odn_losses.avg, end_iter)
        log_value('train_odn_ce_loss', odn_ce_losses.avg, end_iter)
        log_value('train_odn_box_loss', odn_box_losses.avg, end_iter)
    return total_losses.avg
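# The call above, adjust_learning_rate(optimizer, args.lr, args.decay_rate,
# i, args.niters), takes a base rate, a decay rate, the current iteration and
# a step size, which fits an iteration-level step decay. This sketch assumes
# the rate is multiplied by decay_rate once per niters iterations; the actual
# helper in the source repository may differ.
def adjust_learning_rate(optimizer, base_lr, decay_rate, cur_iter, niters):
    """Step decay: scale base_lr by decay_rate every niters iterations."""
    lr = base_lr * (decay_rate ** (cur_iter // niters))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr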