def augment(out_dir, chkpt_path, train_loader, valid_loader, model, writer,
            logger, device, config):
    w_optim = utils.get_optim(model.weights(), config.w_optim)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        w_optim, config.epochs, eta_min=config.w_optim.lr_min)

    init_epoch = -1
    if chkpt_path is not None:
        logger.info("Resuming from checkpoint: %s" % chkpt_path)
        checkpoint = torch.load(chkpt_path)
        model.load_state_dict(checkpoint['model'])
        w_optim.load_state_dict(checkpoint['w_optim'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        init_epoch = checkpoint['epoch']
    else:
        logger.info("Starting new training run")

    # note: count (M) comes from param_count, size (MB) from param_size
    logger.info("Model params count: {:.3f} M, size: {:.3f} MB".format(
        utils.param_count(model), utils.param_size(model)))

    # training loop
    logger.info('begin training')
    best_top1 = 0.
    tot_epochs = config.epochs
    for epoch in range(init_epoch + 1, tot_epochs):
        drop_prob = config.drop_path_prob * epoch / tot_epochs
        model.drop_path_prob(drop_prob)
        lr = lr_scheduler.get_lr()[0]

        # training
        train(train_loader, None, model, writer, logger, None, w_optim, None,
              lr, epoch, tot_epochs, device, config)
        lr_scheduler.step()

        # validation
        cur_step = (epoch + 1) * len(train_loader)
        top1 = validate(valid_loader, model, writer, logger, epoch,
                        tot_epochs, cur_step, device, config)

        # save
        if best_top1 < top1:
            best_top1 = top1
            is_best = True
        else:
            is_best = False
        if config.save_freq != 0 and epoch % config.save_freq == 0:
            save_checkpoint(out_dir, model, w_optim, None, lr_scheduler,
                            epoch, logger)
        print("")

    logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
    tprof.stat_acc('model_' + NASModule.get_device()[0])
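# Hedged sketch: `save_checkpoint` is called above but not defined in this
# section. Whatever its real implementation, it must write the same keys the
# resume path reads ('model', 'w_optim', 'lr_scheduler', 'epoch'). The file
# name and the unused architect-optimizer slot `a_optim` are assumptions for
# illustration only.
import os
import torch

def save_checkpoint(out_dir, model, w_optim, a_optim, lr_scheduler, epoch,
                    logger):
    path = os.path.join(out_dir, 'chkpt_{:03d}.pt'.format(epoch))
    torch.save({
        'model': model.state_dict(),
        'w_optim': w_optim.state_dict(),
        'lr_scheduler': lr_scheduler.state_dict(),
        'epoch': epoch,
    }, path)
    logger.info("Saved checkpoint to: %s" % path)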
def __init__(self):
    self.config = SearchConfig()
    self.writer = None
    if self.config.tb_dir != "":
        from torch.utils.tensorboard import SummaryWriter
        self.writer = SummaryWriter(self.config.tb_dir, flush_secs=20)
    init_gpu_params(self.config)
    set_seed(self.config)
    self.logger = FileLogger('./log', self.config.is_master,
                             self.config.is_master)
    self.load_data()
    self.logger.info(self.config)
    self.model = SearchCNNController(self.config, self.n_classes,
                                     self.output_mode)
    self.load_model()
    self.init_kd_component()
    if self.config.n_gpu > 0:
        self.model.to(device)
    if self.config.n_gpu > 1:
        self.model = torch.nn.parallel.DistributedDataParallel(
            self.model, device_ids=[self.config.local_rank],
            find_unused_parameters=True)
    self.model_to_print = self.model if not self.config.multi_gpu \
        else self.model.module
    self.architect = Architect(self.model, self.teacher_model, self.config,
                               self.emd_tool)
    mb_params = param_size(self.model)
    self.logger.info("Model size = {:.3f} MB".format(mb_params))
    self.eval_result_map = []
    self.init_optim()
def main():
    if not torch.cuda.is_available():
        logger.info("no gpu device available")
        sys.exit(1)
    logger.info("*** Begin {} ***".format(config.stage))

    # set default gpu device
    torch.cuda.set_device(config.gpus[0])

    # set random seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)
    torch.backends.cudnn.benchmark = True

    # get data with meta info
    logger.info("preparing data...")
    input_size, channels_in, num_classes, train_data, valid_data = \
        load_dataset(dataset=config.dataset,
                     data_dir=config.data_dir,
                     cutout_length=config.cutout_length,
                     validation=True,
                     auto_aug=config.auto_aug)
    valid_loader = torch.utils.data.DataLoader(dataset=valid_data,
                                               batch_size=config.batch_size,
                                               shuffle=False,
                                               num_workers=config.num_workers,
                                               pin_memory=True)

    logger.info("loading model...")
    if config.load_model_dir is not None:
        model = torch.load(config.load_model_dir)
    else:
        model = utils.load_checkpoint(config.model_dir)
    model = model.to(device)
    model_size = utils.param_size(model)
    logger.info("model_size: {:.3f} MB".format(model_size))

    if config.label_smooth > 0:
        criterion = utils.CrossEntropyLabelSmooth(num_classes,
                                                  config.label_smooth)
    else:
        criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)

    logger.info("start testing...")
    best_top1 = test(valid_loader, model, criterion)
    logger.info("Final Prec@1: {:.4%}".format(best_top1))
    logger.info("*** Finish {} ***".format(config.stage))
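# Hedged sketch: `utils.CrossEntropyLabelSmooth` is used by several scripts
# here but is not defined in this section. The name and the
# (num_classes, epsilon) call signature match the widely used label-smoothing
# loss; a standard implementation consistent with that usage:
import torch
import torch.nn as nn

class CrossEntropyLabelSmooth(nn.Module):
    """Cross entropy against smoothed targets:
    (1 - epsilon) on the true class, epsilon / num_classes elsewhere."""

    def __init__(self, num_classes, epsilon):
        super().__init__()
        self.num_classes = num_classes
        self.epsilon = epsilon
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs, targets):
        log_probs = self.logsoftmax(inputs)
        # one-hot encode the integer targets, then smooth
        targets = torch.zeros_like(log_probs).scatter_(
            1, targets.unsqueeze(1), 1)
        targets = (1 - self.epsilon) * targets \
            + self.epsilon / self.num_classes
        return (-targets * log_probs).mean(0).sum()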
def make_param_declaration(self, assign_node):
    lhs = assign_node.targets[0].id
    size = u.param_size(assign_node)
    decl = u.stmt_from_str(
        "self._m_%s = tf.Variable(init_params(%s), name='%s')"
        % (lhs, size, lhs))
    softmax = u.stmt_from_str(
        "%s = tpt.softmax(self._m_%s, scope='%s_softmax')"
        % (lhs, lhs, lhs))
    return [decl, softmax], ("self._m_%s" % lhs, lhs)
def get_model(config, device, dev_list, genotype=None):
    mtype = config.type
    configure_ops(config)
    if mtype in model_creator:
        config.augment = genotype is not None
        net, arch = model_creator[mtype](config)
        crit = get_net_crit(config).to(device)
        prim = gt.get_primitives()
        model = NASController(config, net, crit, prim, dev_list).to(device)
        if config.augment:
            print("genotype = {}".format(genotype))
            model.build_from_genotype(genotype)
            model.to(device=device)
        if config.verbose:
            print(model)
        mb_params = param_size(model)
        n_params = param_count(model)
        print("Model params count: {:.3f} M, size: {:.3f} MB".format(
            n_params, mb_params))
        NASModule.set_device(dev_list)
        return model, arch
    else:
        raise Exception("invalid model type")
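# Hedged sketch: `param_size` / `param_count` are assumed utilities, not
# defined in this section. Their usage above implies a count in millions of
# parameters and a size in MB. A minimal version; the size assumes 4-byte
# float32 weights (some repositories instead report parameter count / 1024^2
# as "MB", so treat the exact unit as an assumption):
def param_count(model):
    """Trainable parameter count, in millions."""
    return sum(p.numel() for p in model.parameters()
               if p.requires_grad) / 1e6

def param_size(model):
    """Approximate model size in MB, assuming float32 parameters."""
    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return n_params * 4 / 1024 ** 2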
def evaluation(sample, name):
    geno = eval(convert_sample_to_genotype(sample))
    logger.info('Model sample: {}'.format(sample))
    logger.info('Genotype: {}'.format(str(geno)))

    # get data with meta info
    input_size, input_channels, n_classes, train_data, valid_data = utils.get_data(
        'cifar10', args.data_path, config['image_size'],
        config['cutout_length'], validation=True)

    criterion = nn.CrossEntropyLoss().to(device)
    use_aux = True

    # change size of input image
    input_size = config['image_size']
    model = AugmentCNN(input_size, input_channels, config['init_channels'],
                       10, config['layers'], use_aux, geno)
    mb_params = utils.param_size(model)
    logger.info("Model size = {:.3f} MB".format(mb_params))
    model = nn.DataParallel(model, device_ids=[0]).to(device)

    # weights optimizer
    optimizer = torch.optim.SGD(model.parameters(), config['lr'],
                                momentum=0.9, weight_decay=3e-4)

    # get data loaders
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=config['batch_size'],
                                               shuffle=True, num_workers=4,
                                               pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(valid_data,
                                               batch_size=config['batch_size'],
                                               shuffle=True, num_workers=4,
                                               pin_memory=True)

    # lr scheduler
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, config['epochs'])

    best_top1 = 0.
    len_train_loader = len(train_loader)

    # training loop
    for epoch in range(config['epochs']):
        lr_scheduler.step()
        drop_prob = 0.2 * epoch / config['epochs']
        model.module.drop_path_prob(drop_prob, config['fp'])

        train(train_loader, model, optimizer, criterion, epoch)

        cur_step = (epoch + 1) * len_train_loader
        top1 = validate(valid_loader, model, criterion, epoch, cur_step)
        if best_top1 < top1:
            best_top1 = top1
            is_best = True
        else:
            is_best = False
        # utils.save_checkpoint(model, config.path, is_best)

    logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
    return best_top1, geno
def main():
    logger.info("Logger is set - training start")

    # set default gpu device id
    torch.cuda.set_device(config.gpus[0])

    # set seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)
    torch.backends.cudnn.benchmark = True

    # get data with meta info
    input_size, input_channels, n_classes, train_data, valid_data = utils.get_data(
        config.dataset, config.data_path, config.cutout_length,
        validation=True)

    criterion = nn.CrossEntropyLoss().to(device)
    use_aux = config.aux_weight > 0.
    model = AugmentCNN(input_size, input_channels, config.init_channels,
                       n_classes, config.layers, use_aux, config.genotype)
    model = nn.DataParallel(model, device_ids=config.gpus).to(device)

    # model size
    mb_params = utils.param_size(model)
    logger.info("Model size = {:.3f} MB".format(mb_params))

    # weights optimizer
    optimizer = torch.optim.SGD(model.parameters(), config.lr,
                                momentum=config.momentum,
                                weight_decay=config.weight_decay)

    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=config.batch_size,
                                               shuffle=True,
                                               num_workers=config.workers,
                                               pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(valid_data,
                                               batch_size=config.batch_size,
                                               shuffle=False,
                                               num_workers=config.workers,
                                               pin_memory=True)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, config.epochs)
    best_top1 = 0.

    # training loop
    for epoch in range(config.epochs):
        lr_scheduler.step()
        drop_prob = config.drop_path_prob * epoch / config.epochs
        model.module.drop_path_prob(drop_prob)

        # training
        train(train_loader, model, optimizer, criterion, epoch)

        # validation
        cur_step = (epoch + 1) * len(train_loader)
        top1 = validate(valid_loader, model, criterion, epoch, cur_step)

        # save
        if best_top1 < top1:
            best_top1 = top1
            is_best = True
        else:
            is_best = False
        timebudget.report()
        utils.save_checkpoint(model, config.path, is_best)
        print("")

    logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
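# Hedged sketch: `model.module.drop_path_prob(drop_prob)` above ramps the
# drop-path probability linearly from 0 to config.drop_path_prob over
# training. In DARTS-style models, drop-path (stochastic depth) zeroes an
# entire branch per sample; a common per-op implementation consistent with
# that behavior (the model method is assumed to propagate `drop_prob` to
# each cell's ops):
import torch

def drop_path(x, drop_prob, training=True):
    """Drop the whole path of each sample with probability `drop_prob`,
    rescaling survivors so the expected activation is unchanged."""
    if training and drop_prob > 0.:
        keep_prob = 1. - drop_prob
        # one Bernoulli draw per sample, broadcast over C, H, W
        mask = torch.empty(x.size(0), 1, 1, 1,
                           device=x.device).bernoulli_(keep_prob)
        x = x / keep_prob * mask
    return x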
def main():
    config = RetrainConfig()
    main_proc = not config.distributed or config.local_rank == 0
    if config.distributed:
        torch.cuda.set_device(config.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method=config.dist_url,
                                             rank=config.local_rank,
                                             world_size=config.world_size)
    if main_proc:
        os.makedirs(config.output_path, exist_ok=True)
    if config.distributed:
        torch.distributed.barrier()

    logger = utils.get_logger(os.path.join(config.output_path, 'search.log'))
    if main_proc:
        config.print_params(logger.info)
    utils.reset_seed(config.seed)

    loaders, samplers = get_augment_datasets(config)
    train_loader, valid_loader = loaders
    train_sampler, valid_sampler = samplers
    train_loader = CyclicIterator(train_loader, train_sampler)
    # valid_loader = CyclicIterator(valid_loader, valid_sampler, False)

    model = Model(config.dataset, config.layers,
                  in_channels=config.input_channels,
                  channels=config.init_channels, retrain=True).cuda()
    if config.label_smooth > 0:
        criterion = utils.CrossEntropyLabelSmooth(config.n_classes,
                                                  config.label_smooth)
    else:
        criterion = nn.CrossEntropyLoss()

    fixed_arc_path = config.arc_checkpoint
    with open(fixed_arc_path, "r") as f:
        fixed_arc = json.load(f)
    fixed_arc = utils.encode_tensor(fixed_arc, torch.device("cuda"))
    genotypes = utils.parse_results(fixed_arc, n_nodes=4)
    genotypes_dict = {i: genotypes for i in range(3)}
    apply_fixed_architecture(model, fixed_arc_path)

    param_size = utils.param_size(model, criterion, [3, 512, 512])
    if main_proc:
        logger.info("Param size: %.6f", param_size)
        logger.info("Genotype: %s", genotypes)

    # change training hyperparameters according to cell size
    if 'cifar' in config.dataset:
        if param_size < 3.0:
            config.weight_decay = 3e-4
            config.drop_path_prob = 0.2
        elif param_size < 3.5:
            config.weight_decay = 3e-4
            config.drop_path_prob = 0.3
        else:
            config.weight_decay = 5e-4
            config.drop_path_prob = 0.3

    if config.distributed:
        apex.parallel.convert_syncbn_model(model)
        model = DistributedDataParallel(model, delay_allreduce=True)

    optimizer = torch.optim.AdamW(model.parameters(), config.lr)
    # lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    #     optimizer, config.epochs, eta_min=1E-6)

    best_top1 = 0.
    epoch = 0
    try:
        checkpoint = torch.load(config.model_checkpoint)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch = checkpoint['epoch']
        loss = checkpoint['loss']
        model.eval()
        print("----------------------------")
        print("MODEL LOADED FROM CHECKPOINT " + config.model_checkpoint)
        print("----------------------------")
    except Exception:
        print("----------------------------")
        print("MODEL NOT LOADED FROM CHECKPOINT")
        print("----------------------------")

    # for epoch in range(0, epoch):
    #     lr_scheduler.step()
    for epoch in range(epoch, config.epochs):
        drop_prob = config.drop_path_prob * epoch / config.epochs
        if config.distributed:
            model.module.drop_path_prob(drop_prob)
        else:
            model.drop_path_prob(drop_prob)

        # training
        if config.distributed:
            train_sampler.set_epoch(epoch)
        train(logger, config, train_loader, model, optimizer, criterion,
              epoch, main_proc)

        if epoch % config.log_frequency == 0:
            # validation
            top1 = validate(logger, config, valid_loader, model, criterion,
                            epoch, main_proc)
            best_top1 = max(best_top1, top1)
        # lr_scheduler.step()

    logger.info("Final best Prec@1 = %.4f", best_top1)
def exp(args, fold_idx, train_set, test_set):
    path = args.save_root + args.result_dir
    if not os.path.isdir(path):
        os.makedirs(path)
        os.makedirs(path + '/models')
        os.makedirs(path + '/logs')
    logger = eegdg_logger(path + f'/logs/{fold_idx}')
    with open(path + '/args.txt', 'w') as f:
        f.write(str(args))

    import torch.cuda
    cuda = torch.cuda.is_available()  # use the GPU if one is available
    device = 'cuda' if cuda else 'cpu'
    if cuda:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    seed = args.seed
    random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)

    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=args.batch_size,
                                              shuffle=False)

    model = models.get_model(args)
    mb_params = utils.param_size(model)
    print(f"Model size = {mb_params:.4f} MB")
    if cuda:
        model.cuda(device=device)
    print(model)

    optimizer = torch.optim.RMSprop(model.parameters(), lr=args.lr,
                                    weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=args.epochs - 1)

    results_columns = ['valid_loss', 'test_loss',
                       'valid_accuracy', 'test_accuracy']
    df = pd.DataFrame(columns=results_columns)

    max_acc = 0
    max_epoch = 0
    for epochidx in range(1, args.epochs):
        print(epochidx)
        start = time.time()
        train(10, model, device, train_loader, optimizer, scheduler, cuda,
              args.gpuidx)
        print(f'total time: {time.time()-start}')

        train_loss, train_score = eval(model, device, train_loader)
        test_loss, test_score = eval(model, device, test_loader)

        scheduler.step()
        lr = scheduler.get_last_lr()[0]
        print(f'LR : {lr}')
        logger.log_training(train_loss, train_score, test_loss, test_score,
                            lr, epochidx)

        if test_score >= max_acc:
            max_acc = test_score
            torch.save(model.state_dict(),
                       os.path.join(path, 'models',
                                    f"model_fold{fold_idx}_max.pt"))
            max_epoch = epochidx
        torch.save(model.state_dict(),
                   os.path.join(path, 'models',
                                f"model_fold{fold_idx}_last.pt"))
        print(f'current max acc : {max_acc:.4f} at epoch {max_epoch}')

    best_model = models.get_model(args)
    best_model.load_state_dict(
        torch.load(os.path.join(path, 'models',
                                f"model_fold{fold_idx}_last.pt"),
                   map_location=device))
    if cuda:
        best_model.cuda(device=device)
    print("last accuracy")
    _, _ = eval(best_model, device, test_loader)
    df = utils.get_testset_accuracy(best_model, device, test_set, args)
    logger.close()
    return df
def exp(args, fold_idx, train_set, valid_set, test_set):
    path = args.save_root + args.result_dir
    if not os.path.isdir(path):
        os.makedirs(path)
        os.makedirs(path + '/models')
    with open(path + '/args.txt', 'w') as f:
        f.write(str(args))

    import torch.cuda
    cuda = torch.cuda.is_available()
    device = 'cuda' if cuda else 'cpu'
    if cuda:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    seed = args.seed
    random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)

    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               pin_memory=False)
    valid_loader = torch.utils.data.DataLoader(valid_set,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=args.batch_size,
                                              shuffle=False)

    model = Net(args.n_class, args.n_ch, args.n_time)
    mb_params = utils.param_size(model)
    print(f"Model size = {mb_params:.4f} MB")
    if cuda:
        model.cuda(device=device)
    print(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=args.epochs)

    results_columns = ['valid_loss', 'test_loss',
                       'valid_accuracy', 'test_accuracy']
    df = pd.DataFrame(columns=results_columns)

    valid_min_loss = float('inf')
    best_acc_loss = 0
    best_loss_epoch = 0
    for epochidx in range(1, args.epochs):
        print(epochidx)
        train(10, model, device, train_loader, optimizer, scheduler, cuda,
              args.gpuidx)
        valid_loss, valid_score, _ = eval(model, device, valid_loader)
        test_loss, test_score, _ = eval(model, device, test_loader)

        results = {'valid_loss': valid_loss, 'test_loss': test_loss,
                   'valid_accuracy': valid_score,
                   'test_accuracy': test_score}
        # pandas >= 2.0 removed DataFrame.append; use concat instead
        df = pd.concat([df, pd.DataFrame([results])], ignore_index=True)
        print(results)

        scheduler.step()
        lr = scheduler.get_last_lr()[0]
        print(f'LR : {lr}')

        if valid_loss < valid_min_loss:
            valid_min_loss = valid_loss
            best_acc_loss = test_score
            torch.save(model.state_dict(),
                       os.path.join(path, 'models',
                                    f"model_fold{fold_idx}.pt"))
            best_loss_epoch = epochidx
        print(f'current best(loss) acc : {best_acc_loss:.4f} '
              f'at epoch {best_loss_epoch}')

    best_model = Net(args.n_class, args.n_ch, args.n_time)
    best_model.load_state_dict(
        torch.load(os.path.join(path, 'models', f"model_fold{fold_idx}.pt"),
                   map_location=device))
    if cuda:
        best_model.cuda(device=device)
    print("best accuracy")
    _, test_score, _ = eval(best_model, device, test_loader)
    utils.enablePrint()
    print(f"subject:{fold_idx}, acc:{test_score}")
    df = pd.DataFrame(np.array(test_score).reshape(-1, 1),
                      columns=['sess2-on'])
    print(f"all acc: {np.mean(test_score):.4f}")
    return df
def main():
    logger.info("Logger is set - training start")
    fileRoot = r'/home/hlu/Data/VIPL'
    saveRoot = r'/home/hlu/Data/VIPL_STMap' + str(config.fold_num) \
        + str(config.fold_index)
    n_classes = 1
    input_channels = 3
    input_size = np.array([64, 300])
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    toTensor = transforms.ToTensor()
    resize = transforms.Resize(size=(64, 300))

    # set default gpu device id
    torch.cuda.set_device(config.gpus[0])

    # set seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)
    # enable cudnn autotuner for speed
    torch.backends.cudnn.benchmark = True

    # get data with meta info
    if config.reData == 1:
        test_index, train_index = MyDataset.CrossValidation(
            fileRoot, fold_num=config.fold_num,
            fold_index=config.fold_index)
        Train_Indexa = MyDataset.getIndex(fileRoot, train_index,
                                          saveRoot + '_Train',
                                          'STMap_YUV_Align_CSI_POS.png',
                                          15, 300)
        Test_Indexa = MyDataset.getIndex(fileRoot, test_index,
                                         saveRoot + '_Test',
                                         'STMap_YUV_Align_CSI_POS.png',
                                         15, 300)
    train_data = MyDataset.Data_STMap(
        root_dir=(saveRoot + '_Train'), frames_num=300,
        transform=transforms.Compose([resize, toTensor, normalize]))
    valid_data = MyDataset.Data_STMap(
        root_dir=(saveRoot + '_Test'), frames_num=300,
        transform=transforms.Compose([resize, toTensor, normalize]))
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=config.batch_size,
                                               shuffle=True,
                                               num_workers=config.workers,
                                               pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(valid_data,
                                               batch_size=config.batch_size,
                                               shuffle=False,
                                               num_workers=config.workers,
                                               pin_memory=True)

    # loss
    criterion = nn.L1Loss().to(device)

    # net
    Model_name = config.name + 'fn' + str(config.fold_num) \
        + 'fi' + str(config.fold_index)
    use_aux = config.aux_weight > 0.
    if config.reTrain == 1:
        model = torch.load(
            os.path.join(config.path, Model_name + 'best.pth.tar'),
            map_location=device)
        print('load ' + Model_name + ' right')
        model = nn.DataParallel(model, device_ids=config.gpus).to(device)
    else:
        model = AugmentCNN(input_size, input_channels, config.init_channels,
                           n_classes, config.layers, use_aux,
                           config.genotype)
        model._init_weight()
        model = nn.DataParallel(model, device_ids=config.gpus).to(device)

    # model size
    mb_params = utils.param_size(model)
    logger.info("Model size = {:.3f} MB".format(mb_params))

    # weights optimizer
    optimizer = torch.optim.Adam(model.parameters(), config.lr)

    best_losses = 10

    # training loop
    for epoch in range(config.epochs):
        # training
        train(train_loader, model, optimizer, criterion, epoch)

        # validation
        cur_step = (epoch + 1) * len(train_loader)
        best_losses = validate(valid_loader, model, criterion, epoch,
                               cur_step, best_losses)

    logger.info("Final best loss = {:.4f}".format(best_losses))
def main():
    logger.info("Logger is set - training start")
    logger.info("Torch version is: {}".format(torch.__version__))
    logger.info("Torch_vision version is: {}".format(torchvision.__version__))

    # set default gpu device id
    torch.cuda.set_device(config.gpus[0])

    # set seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)

    # use deterministic cudnn
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.enabled = True

    # get data with meta info
    input_size, input_channels, n_classes, train_data, valid_data = utils.get_data(
        config.dataset, config.data_path, config.cutout_length,
        validation=True)

    criterion = nn.CrossEntropyLoss().to(device)
    use_aux = config.aux_weight > 0.
    model = AugmentCNN(input_size, input_channels, config.init_channels,
                       n_classes, config.layers, use_aux, config.genotype)

    # model size
    mb_params = utils.param_size(model)
    logger.info("Model size = {:.3f} MB".format(mb_params))

    model = nn.DataParallel(model, device_ids=config.gpus).to(device)

    # weights optimizer
    optimizer = torch.optim.SGD(model.parameters(), config.lr,
                                momentum=config.momentum,
                                weight_decay=config.weight_decay)

    # get data loader
    if config.data_loader_type == 'Torch':
        train_loader = torch.utils.data.DataLoader(
            train_data, batch_size=config.batch_size, shuffle=True,
            num_workers=config.workers, pin_memory=True)
        valid_loader = torch.utils.data.DataLoader(
            valid_data, batch_size=config.batch_size, shuffle=False,
            num_workers=config.workers, pin_memory=True)
    elif config.data_loader_type == 'DALI':
        config.dataset = config.dataset.lower()
        if config.dataset == 'cifar10':
            from DataLoaders_DALI import cifar10
            train_loader = cifar10.get_cifar_iter_dali(
                type='train', image_dir=config.data_path,
                batch_size=config.batch_size, num_threads=config.workers)
            valid_loader = cifar10.get_cifar_iter_dali(
                type='val', image_dir=config.data_path,
                batch_size=config.batch_size, num_threads=config.workers)
        else:
            raise NotImplementedError
    else:
        raise NotImplementedError

    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, config.epochs)
    best_top1 = 0.
    if config.data_loader_type == 'DALI':
        len_train_loader = get_train_loader_len(config.dataset.lower(),
                                                config.batch_size,
                                                is_train=True)
    else:
        len_train_loader = len(train_loader)

    # training loop
    for epoch in range(config.epochs):
        lr_scheduler.step()
        drop_prob = config.drop_path_prob * epoch / config.epochs
        model.module.drop_path_prob(drop_prob)

        # training
        train(train_loader, model, optimizer, criterion, epoch)

        # validation
        cur_step = (epoch + 1) * len_train_loader
        top1 = validate(valid_loader, model, criterion, epoch, cur_step)

        # save
        if best_top1 < top1:
            best_top1 = top1
            is_best = True
        else:
            is_best = False
        utils.save_checkpoint(model, config.path, is_best)
        print("")

    logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
def exp(args, fold_idx, train_set, valid_set, test_set):
    path = args.save_root + args.result_dir
    if not os.path.isdir(path):
        os.makedirs(path)
        os.makedirs(path + '/models')
        os.makedirs(path + '/logs')
    logger = eegdg_logger(path + f'/logs/{fold_idx}')
    with open(path + '/args.txt', 'w') as f:
        f.write(str(args))

    import torch.cuda
    cuda = torch.cuda.is_available()  # use the GPU if one is available
    device = 'cuda' if cuda else 'cpu'
    if cuda:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    seed = args.seed
    random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)

    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=args.batch_size,
                                               shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid_set,
                                               batch_size=args.batch_size,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_set,
                                              batch_size=args.batch_size,
                                              shuffle=False)

    model = models.get_model(args)
    mb_params = utils.param_size(model)
    print(f"Model size = {mb_params:.4f} MB")
    if cuda:
        model.cuda(device=device)
    print(model)

    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                weight_decay=1e-4, momentum=0.9)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=args.epochs - 1)

    results_columns = ['valid_loss', 'test_loss',
                       'valid_accuracy', 'test_accuracy']
    df = pd.DataFrame(columns=results_columns)

    valid_acc = 0
    valid_min_loss = 100
    best_acc = 0
    best_acc_loss = 0
    max_acc = 0
    best_epoch = best_loss_epoch = max_epoch = 0
    n_epochs_stop = 200
    epochs_no_improve = 0
    early_stop = False
    for epochidx in range(1, args.epochs):
        print(epochidx)
        start = time.time()
        train(10, model, device, train_loader, optimizer, scheduler, cuda,
              args.gpuidx)
        print(f'total time: {time.time()-start}')

        utils.blockPrint()
        train_loss, train_score = eval(model, device, train_loader)
        valid_loss, valid_score = eval(model, device, valid_loader)
        test_loss, test_score = eval(model, device, test_loader)
        utils.enablePrint()

        scheduler.step()
        lr = scheduler.get_last_lr()[0]
        print(f'LR : {lr}')
        logger.log_training(train_loss, train_score, test_loss, test_score,
                            lr, epochidx)

        results = {'valid_loss': valid_loss, 'test_loss': test_loss,
                   'valid_accuracy': valid_score,
                   'test_accuracy': test_score}
        # pandas >= 2.0 removed DataFrame.append; use concat instead
        df = pd.concat([df, pd.DataFrame([results])], ignore_index=True)
        print(results)

        if valid_score > valid_acc:
            valid_acc = valid_score
            best_acc = test_score
            torch.save(model.state_dict(),
                       os.path.join(path, 'models',
                                    f"model_fold{fold_idx}_best.pt"))
            best_epoch = epochidx

        if valid_loss < valid_min_loss:  # model improved
            valid_min_loss = valid_loss
            best_acc_loss = test_score
            torch.save(model.state_dict(),
                       os.path.join(path, 'models',
                                    f"model_fold{fold_idx}_best(loss).pt"))
            best_loss_epoch = epochidx
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1

        if test_score > max_acc:
            max_acc = test_score
            torch.save(model.state_dict(),
                       os.path.join(path, 'models',
                                    f"model_fold{fold_idx}_max.pt"))
            max_epoch = epochidx

        print(f'current best acc : {best_acc:.4f} at epoch {best_epoch}')
        print(f'current best(loss) acc : {best_acc_loss:.4f} '
              f'at epoch {best_loss_epoch}')
        print(f'current max acc : {max_acc:.4f} at epoch {max_epoch}')

        if epochidx > 5 and epochs_no_improve == n_epochs_stop:
            print('Early stopping!')
            early_stop = True
            break

    if early_stop:
        print("Stopped")

    best_model = models.get_model(args)
    best_model.load_state_dict(
        torch.load(os.path.join(path, 'models',
                                f"model_fold{fold_idx}_best.pt"),
                   map_location=device))
    if cuda:
        best_model.cuda(device=device)
    print("best accuracy")
    _, _ = eval(best_model, device, test_loader)
    df = utils.get_testset_accuracy(best_model, device, test_set, args)
    return df
def evaluate(self, trial_no, trial_hyperparams):
    """Evaluates objective function

    Trains the child model k times with same augmentation hyperparameters.
    k is determined by the user by `opt_samples` argument.

    Args:
        trial_no (int): no of trial. needed for recording to notebook
        trial_hyperparams (list)
    Returns:
        float: trial-cost = 1 - avg. rewards from samples
    """
    augmented_data = augment_by_policy(self.data["X_train"],
                                       self.data["y_train"],
                                       *trial_hyperparams)
    sample_rewards = []

    # pytorch training hyperparameters
    layers = 2
    init_channels = 24
    use_aux = True
    epochs = 30
    lr = 0.01
    momentum = 0.995
    weight_decay = 0.995
    drop_path_prob = 0.2
    genotype = "Genotype(normal=[[('dil_conv_3x3', 0), ('sep_conv_5x5', 1)], [('sep_conv_3x3', 1), ('avg_pool_3x3', 0)], [('dil_conv_3x3', 1), ('dil_conv_3x3', 0)], [('sep_conv_3x3', 3), ('skip_connect', 1)]], normal_concat=range(2, 6), reduce=[[('sep_conv_3x3', 1), ('dil_conv_5x5', 0)], [('skip_connect', 0), ('sep_conv_5x5', 1)], [('sep_conv_5x5', 1), ('sep_conv_5x5', 0)], [('max_pool_3x3', 1), ('sep_conv_3x3', 0)]], reduce_concat=range(2, 6))"

    model = AugmentCNN(self.input_size, self.input_channels, init_channels,
                       self.n_classes, layers, use_aux, genotype)
    model = nn.DataParallel(model, device_ids=[0]).to(device)

    # model size
    mb_params = utils.param_size(model)
    logger.info("Model size = {:.3f} MB".format(mb_params))

    # weights optimizer
    optimizer = torch.optim.SGD(model.parameters(), lr, momentum=momentum,
                                weight_decay=weight_decay)

    """
    for sample_no in range(1, self.opt_samples + 1):
        self.child_model.load_pre_augment_weights()
        # TRAIN
        history = self.child_model.fit(self.data, augmented_data)
        #
        reward = self.calculate_reward(history)
        sample_rewards.append(reward)
        self.notebook.record(
            trial_no, trial_hyperparams, sample_no, reward, history
        )
    """

    # NOTE: criterion, lr_scheduler and the train/valid data loaders were
    # not defined in the original snippet; the two definitions below are
    # minimal assumptions consistent with the sibling training scripts.
    criterion = nn.CrossEntropyLoss().to(device)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                              epochs)

    best_top1 = -9999
    for epoch in range(epochs):
        lr_scheduler.step()
        drop_prob = drop_path_prob * epoch / epochs
        model.module.drop_path_prob(drop_prob)

        # training
        train(train_loader, model, optimizer, criterion, epoch)

        # validation
        cur_step = (epoch + 1) * len(train_loader)
        top1 = validate(valid_loader, model, criterion, epoch, cur_step)

        # save
        if best_top1 < top1:
            best_top1 = top1
            is_best = True
        else:
            is_best = False
        print('best_top1:', best_top1)

    # sample_rewards.append(reward)
    # self.notebook.record(
    #     trial_no, trial_hyperparams, sample_no, reward, history
    # )
    # trial_cost = 1 - np.mean(sample_rewards)
    # self.notebook.save()
    log_and_print(
        f"{str(trial_no)}, {str(best_top1)}, {str(trial_hyperparams)}",
        self.logging,
    )
    # return trial_cost
    return best_top1
def main():
    logger.info("Logger is set - training start")

    # set default gpu device id
    torch.cuda.set_device(config.gpus[0])

    # set seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)
    torch.backends.cudnn.benchmark = True

    # get data with meta info
    input_size, input_channels, n_classes, train_data, valid_data = utils.get_data(
        config.dataset, config.data_path, config.cutout_length,
        validation=True, autoaugment=config.autoaugment)

    if config.label_smooth != 0:
        criterion = utils.CrossEntropyLabelSmooth(
            n_classes, config.label_smooth).to(device)
    else:
        criterion = nn.CrossEntropyLoss().to(device)

    use_aux = config.aux_weight > 0.
    if config.dataset in utils.LARGE_DATASETS:
        model = AugmentCNNImageNet(input_size, input_channels,
                                   config.init_channels, n_classes,
                                   config.layers, use_aux, config.genotype)
    else:
        model = AugmentCNN(input_size, input_channels, config.init_channels,
                           n_classes, config.layers, use_aux,
                           config.genotype, SSC=config.SSC)
    model = nn.DataParallel(model, device_ids=config.gpus).to(device)

    # model size
    mb_params = utils.param_size(model)
    logger.info("Model size = {:.3f} MB".format(mb_params))

    # weights optimizer; with cosine power annealing (p != 1) the base lr is
    # set to 1.0 and the schedule supplies the actual lr values
    if config.p != 1:
        optimizer = torch.optim.SGD(model.parameters(), 1.,
                                    momentum=config.momentum,
                                    weight_decay=config.weight_decay)
    else:
        optimizer = torch.optim.SGD(model.parameters(), config.lr,
                                    momentum=config.momentum,
                                    weight_decay=config.weight_decay)

    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=config.batch_size,
                                               shuffle=True,
                                               num_workers=config.workers,
                                               pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(valid_data,
                                               batch_size=config.batch_size,
                                               shuffle=False,
                                               num_workers=config.workers,
                                               pin_memory=True)

    if config.p == 1:
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, config.epochs)
    else:
        lr_cpa = utils.cosine_power_annealing_lr(nepochs=config.epochs,
                                                 min_lr=config.lr_min,
                                                 max_lr=config.lr,
                                                 p=config.p)
        # assumes cosine_power_annealing_lr returns a per-epoch array of lrs,
        # which LambdaLR indexes (base lr is 1.0 above); clamp to the last
        # entry so the final step() call stays in range
        lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer, lambda epoch: lr_cpa[min(epoch, len(lr_cpa) - 1)])
    best_top1 = 0.

    # training loop
    for epoch in range(config.epochs):
        lr_scheduler.step()
        drop_prob = config.drop_path_prob * epoch / config.epochs
        model.module.drop_path_prob(drop_prob)

        # training
        train(train_loader, model, optimizer, criterion, epoch)

        # validation
        cur_step = (epoch + 1) * len(train_loader)
        top1 = validate(valid_loader, model, criterion, epoch, cur_step)

        # save
        if best_top1 < top1:
            best_top1 = top1
            is_best = True
        else:
            is_best = False
        utils.save_checkpoint(model, config.path, is_best)
        print("")

    logger.info("Final best Prec@1 = {:.4%} for job {}".format(
        best_top1, config.name))
def main():
    if not torch.cuda.is_available():
        logger.info("no gpu device available")
        sys.exit(1)
    logger.info("*** Begin {} ***".format(config.stage))

    # set default gpu device
    torch.cuda.set_device(config.gpus[0])

    # set random seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)
    torch.backends.cudnn.benchmark = True

    # get data with meta info
    logger.info("preparing data...")
    input_size, channels_in, num_classes, train_data, valid_data = \
        load_dataset(dataset=config.dataset,
                     data_dir=config.data_dir,
                     cutout_length=config.cutout_length,
                     validation=True,
                     auto_aug=config.auto_aug)
    train_loader = torch.utils.data.DataLoader(dataset=train_data,
                                               batch_size=config.batch_size,
                                               shuffle=True,
                                               num_workers=config.num_workers,
                                               pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(dataset=valid_data,
                                               batch_size=config.batch_size,
                                               shuffle=False,
                                               num_workers=config.num_workers,
                                               pin_memory=True)

    logger.info("parsing genotypes...")
    genotypes = parse_genotypes()
    logger.info(genotypes)

    logger.info("building model...")
    model = AugmentCNN(input_size=input_size,
                       channels_in=channels_in,
                       channels_init=config.init_channels,
                       num_cells=config.num_cells,
                       num_nodes=config.num_nodes,
                       num_classes=num_classes,
                       stem_multiplier=3,
                       auxiliary=(config.aux_weight > 0),
                       genotypes=genotypes,
                       alpha_share=config.alpha_share)
    model = model.to(device)
    model_size = utils.param_size(model)
    logger.info("model_size: {:.3f} MB".format(model_size))

    if config.label_smooth > 0:
        criterion = utils.CrossEntropyLabelSmooth(num_classes,
                                                  config.label_smooth)
    else:
        criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)

    optimizer = torch.optim.SGD(params=model.parameters(),
                                lr=config.learning_rate,
                                momentum=config.momentum,
                                weight_decay=config.weight_decay)
    if config.power_lr:
        lr_scheduler = utils.CosinePowerAnnealingLR(
            optimizer=optimizer, T_max=config.epochs, p=2)
    else:
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer=optimizer, T_max=config.epochs)

    logger.info("start training...")
    history_top1 = []
    best_top1 = 0.0
    for epoch in range(config.epochs):
        lr_scheduler.step()
        lr = lr_scheduler.get_lr()[0]
        logger.info("epoch: {:d}, lr: {:e}".format(epoch, lr))

        drop_prob = config.drop_path_prob * epoch / config.epochs
        model.drop_path_prob(drop_prob)

        train(train_loader, model, criterion, optimizer, epoch)
        global_step = (epoch + 1) * len(train_loader) - 1
        valid_top1 = valid(valid_loader, model, criterion, epoch,
                           global_step)
        history_top1.append(valid_top1)

        if epoch == 0 or best_top1 < valid_top1:
            best_top1 = valid_top1
            is_best = True
        else:
            is_best = False
        utils.save_checkpoint(model, config.model_dir, is_best=is_best)

    with open(os.path.join(config.stage_dir, "history_top1.pk"), "wb") as f:
        pickle.dump(history_top1, f)
    logger.info("Final best valid Prec@1: {:.4%}".format(best_top1))
    logger.info("*** Finish {} ***".format(config.stage))
def main(HIDDEN_NODE_FC1, HIDDEN_NODE_FC2, HIDDEN_NODE_FC3):
    args = parse_args_for_train(HIDDEN_NODE_FC1, HIDDEN_NODE_FC2,
                                HIDDEN_NODE_FC3)
    custom_train_data_from_txt = CustomDatasetFromTxt(args.dataset,
                                                      train=True)
    custom_test_data_from_txt = CustomDatasetFromTxt(args.dataset,
                                                     train=False)
    if custom_train_data_from_txt.data_len > 10000:
        args.batch_size = 1000
        args.log_interval = custom_train_data_from_txt.data_len / 10

    writer = SummaryWriter(log_dir=os.path.join(args.log_path, 'tensorboard'))
    writer.add_text('config', utils.as_markdown(args), 0)
    logger = utils.get_logger(
        os.path.join(args.log_path, "{}.log".format("automl_nn_ax")))
    mlp_params = argparse.Namespace(FC1=HIDDEN_NODE_FC1,
                                    FC2=HIDDEN_NODE_FC2,
                                    FC3=HIDDEN_NODE_FC3, sync=False)
    utils.print_params(mlp_params, logger.info)

    train_loader = torch.utils.data.DataLoader(
        dataset=custom_train_data_from_txt,
        batch_size=args.batch_size,
        shuffle=True)
    validate_loader = torch.utils.data.DataLoader(
        dataset=custom_test_data_from_txt,
        batch_size=custom_test_data_from_txt.data_len,
        shuffle=False)
    input_size, out_size = custom_train_data_from_txt.input_out_size()

    torch.cuda.set_device(args.gpu)

    # set seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.benchmark = True

    model = MLPNet(input_size, HIDDEN_NODE_FC1, HIDDEN_NODE_FC2,
                   HIDDEN_NODE_FC3, out_size)
    NN = MLP_network(input_size, HIDDEN_NODE_FC1, HIDDEN_NODE_FC2,
                     HIDDEN_NODE_FC3, out_size)
    test_mlp_data_flow = TestMLP_network(NN)
    res_map = test_mlp_data_flow.test_eyeriss_isca16()
    model = model.to(device)
    if args.multi_gpu:
        model = torch.nn.DataParallel(model)

    start_epoch = 0
    best_top1 = 0.
    if args.resume:
        logger.info("===> resume from the checkpoint")
        assert os.path.isdir(args.log_path), \
            'Error: no checkpoint path found!'
        checkpoint_file = os.path.join(args.log_path, 'best.pth.tar')
        checkpoint = torch.load(checkpoint_file)
        model.load_state_dict(checkpoint['net'])
        best_top1 = checkpoint['acc']
        start_epoch = checkpoint['epoch']

    # model size
    mb_params = utils.param_size(model)
    logger.info("model size: {:.3f} MB".format(mb_params))

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), args.w_lr)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100,
                                                   gamma=0.2)

    for epoch in range(start_epoch, start_epoch + args.epochs):
        lr_scheduler.step()

        # training
        train(train_loader, model, optimizer, criterion, epoch, logger, args)
        top1 = validate(validate_loader, model, criterion, epoch, logger,
                        args)
        if best_top1 < top1:
            best_top1 = top1
            state = {
                'net': model.state_dict(),
                'acc': best_top1,
                'epoch': epoch,
            }
            utils.save_checkpoint(state, args.log_path, True)

    logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
    logger.info("total_cost:{%d},total_time:{%d}"
                % (int(res_map['total_cost']), int(res_map['total_time'])))
    return [best_top1, res_map['total_cost'], res_map['total_time']]
def start_run():
    config = Config()
    if os.path.exists(config.path):
        while True:
            cont_str = input("Name has been used. Continue and delete "
                             "other log files? (y/n)")
            if cont_str.lower() == 'n':
                exit()
            elif cont_str.lower() == 'y':
                shutil.rmtree(config.path)
                break
            else:
                print("Invalid input.")
    device = torch.device("cuda")

    # tensorboard
    writer = SummaryWriter(log_dir=os.path.join(config.path, "tb"))
    writer.add_text('config', config.as_markdown(), 0)

    logger = utils.get_logger(
        os.path.join(config.path, "{}.log".format(config.name)))
    config.print_params(logger.info)
    logger.info("Logger is set - training start")

    # set gpu device id
    logger.info("Set GPU device {}".format(config.gpu))
    torch.cuda.set_device(config.gpu)

    # set seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)
    torch.backends.cudnn.benchmark = True

    # TODO: fix folds/cv
    data_params, train_data, valid_data = utils.get_data(
        config.prop_mouse_data_to_use)

    model = UNet(config.total_channels_to_add, data_params['num_classes'],
                 data_params['input_channels'], config.shake_drop,
                 not config.no_scse, config.num_downsamples,
                 config.num_blocks_per_downsample)
    model = model.to(device)
    logger.info("Model Size (MB): {}".format(utils.param_size(model)))

    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=config.batch_size,
                                               shuffle=True,
                                               num_workers=config.workers,
                                               pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(valid_data,
                                               batch_size=config.batch_size,
                                               shuffle=False,
                                               num_workers=config.workers,
                                               pin_memory=True)
    nb_iters_train = config.epochs * len(train_loader)

    if config.lr_finder:
        w_sched_lr = utils.ExpFinderSchedule(config.w_lr_start,
                                             config.w_lr_end,
                                             nb_iters_train)
    else:
        w_sched_lr = utils.PiecewiseLinearOrCos(
            [0.0, config.first_prop * nb_iters_train, nb_iters_train],
            np.array([config.w_lr_start, config.w_lr_middle,
                      config.w_lr_end]),
            [False, True])
    if config.wd_finder:
        weight_decay = utils.ExpFinderSchedule(config.w_weight_decay,
                                               config.w_weight_decay_end,
                                               nb_iters_train)
    else:
        weight_decay = config.w_weight_decay

    w_optim = Adam(model.parameters(), lr=w_sched_lr,
                   weight_decay=weight_decay)

    cur_step = 0
    best_iou = 0.

    # training loop
    for epoch in range(config.epochs):
        cur_step = train(train_loader, model, w_optim, epoch, writer, device,
                         config, logger, cur_step)

        if (epoch + 1) % config.val_freq == 0:
            # validation
            total_iou = validate(valid_loader, model, epoch, cur_step,
                                 writer, device, config, logger)
            saves = ['checkpoint']
            is_best = best_iou < total_iou

            # save
            if is_best:
                best_iou = total_iou
                saves.append('best')
            utils.save_item(model, config.path, saves)
            print("")

    logger.info("Final best iou = {:.4%}".format(best_iou))
def main():
    logger.info("Logger is set - training start")

    # set gpu device id
    logger.info("Set GPU device {}".format(config.gpu))
    torch.cuda.set_device(config.gpu)

    # set seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)
    torch.backends.cudnn.benchmark = True

    # get dataset
    train_data, valid_data, data_shape = get_dataset(config.data,
                                                     config.data_path,
                                                     config.aug_lv)

    # build model
    criterion = nn.CrossEntropyLoss().to(device)
    model = FractalNet(data_shape, config.columns, config.init_channels,
                       p_ldrop=config.p_ldrop,
                       dropout_probs=config.dropout_probs,
                       gdrop_ratio=config.gdrop_ratio, gap=config.gap,
                       init=config.init, pad_type=config.pad,
                       doubling=config.doubling,
                       dropout_pos=config.dropout_pos,
                       consist_gdrop=config.consist_gdrop)
    model = model.to(device)

    # model size
    m_params = utils.param_size(model)
    logger.info("Models:\n{}".format(model))
    logger.info("Model size (# of params) = {:.3f} M".format(m_params))

    # weights optimizer
    optimizer = torch.optim.SGD(model.parameters(), config.lr,
                                momentum=config.momentum)

    # setup data loader
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=config.batch_size,
                                               shuffle=True,
                                               num_workers=config.workers,
                                               pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(valid_data,
                                               batch_size=config.batch_size,
                                               shuffle=False,
                                               num_workers=config.workers,
                                               pin_memory=True)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.lr_milestone)
    best_top1 = 0.

    # training loop
    for epoch in range(config.epochs):
        lr_scheduler.step()

        # training
        train(train_loader, model, optimizer, criterion, epoch)

        # validation
        cur_step = (epoch + 1) * len(train_loader)
        top1 = validate(valid_loader, model, criterion, epoch, cur_step)

        # save
        if best_top1 < top1:
            best_top1 = top1
            is_best = True
        else:
            is_best = False
        utils.save_checkpoint(model.state_dict(), config.path, is_best)
        print("")

    logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
def main(config, writer, logger):
    logger.info("Logger is set - training augment start")

    # get data with meta info
    input_size, input_channels, n_classes, train_data, valid_data = utils.get_data(
        config.dataset, config.data_path, config.cutout_length,
        validation=True)

    criterion = nn.CrossEntropyLoss().cuda()
    use_aux = config.aux_weight > 0.
    model = AugmentCNN(input_size, input_channels, config.init_channels,
                       n_classes, config.layers, use_aux,
                       config.genotype).cuda()

    # model size
    mb_params = utils.param_size(model)
    logger.info("Model size = {:.3f} MB".format(mb_params))

    # weights optimizer
    optimizer = torch.optim.SGD(model.parameters(), config.lr,
                                momentum=config.momentum,
                                weight_decay=config.weight_decay)

    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=config.batch_size,
                                               shuffle=True,
                                               num_workers=config.workers,
                                               pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(valid_data,
                                               batch_size=config.batch_size,
                                               shuffle=False,
                                               num_workers=config.workers,
                                               pin_memory=True)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, config.epochs)
    best_top1 = 0.

    # training loop
    for epoch in range(config.epochs):
        lr_scheduler.step()
        drop_prob = config.drop_path_prob * epoch / config.epochs
        model.drop_path_prob(drop_prob)

        # training
        train(train_loader, model, optimizer, criterion, epoch, config,
              writer, logger)

        # validation
        cur_step = (epoch + 1) * len(train_loader)
        top1 = validate(valid_loader, model, criterion, epoch, cur_step,
                        config, writer, logger)

        # save
        if best_top1 < top1:
            best_top1 = top1
            is_best = True
        else:
            is_best = False
        utils.save_checkpoint(model, config.path, is_best)
        print("")

    logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
def main():
    config = RetrainConfig()
    main_proc = not config.distributed or config.local_rank == 0
    if config.distributed:
        torch.cuda.set_device(config.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method=config.dist_url,
                                             rank=config.local_rank,
                                             world_size=config.world_size)
    if main_proc:
        os.makedirs(config.output_path, exist_ok=True)
    if config.distributed:
        torch.distributed.barrier()

    logger = utils.get_logger(os.path.join(config.output_path, 'search.log'))
    if main_proc:
        config.print_params(logger.info)
    utils.reset_seed(config.seed)

    loaders, samplers = get_augment_datasets(config)
    train_loader, valid_loader = loaders
    train_sampler, valid_sampler = samplers

    model = Model(config.dataset, config.layers,
                  in_channels=config.input_channels,
                  channels=config.init_channels, retrain=True).cuda()
    if config.label_smooth > 0:
        criterion = utils.CrossEntropyLabelSmooth(config.n_classes,
                                                  config.label_smooth)
    else:
        criterion = nn.CrossEntropyLoss()

    fixed_arc_path = os.path.join(config.output_path, config.arc_checkpoint)
    with open(fixed_arc_path, "r") as f:
        fixed_arc = json.load(f)
    fixed_arc = utils.encode_tensor(fixed_arc, torch.device("cuda"))
    genotypes = utils.parse_results(fixed_arc, n_nodes=4)
    genotypes_dict = {i: genotypes for i in range(3)}
    apply_fixed_architecture(model, fixed_arc_path)

    param_size = utils.param_size(
        model, criterion,
        [3, 32, 32] if 'cifar' in config.dataset else [3, 224, 224])
    if main_proc:
        logger.info("Param size: %.6f", param_size)
        logger.info("Genotype: %s", genotypes)

    # change training hyperparameters according to cell size
    if 'cifar' in config.dataset:
        if param_size < 3.0:
            config.weight_decay = 3e-4
            config.drop_path_prob = 0.2
        elif param_size < 3.5:
            config.weight_decay = 3e-4
            config.drop_path_prob = 0.3
        else:
            config.weight_decay = 5e-4
            config.drop_path_prob = 0.3

    if config.distributed:
        apex.parallel.convert_syncbn_model(model)
        model = DistributedDataParallel(model, delay_allreduce=True)

    optimizer = torch.optim.SGD(model.parameters(), config.lr,
                                momentum=config.momentum,
                                weight_decay=config.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, config.epochs, eta_min=1E-6)

    best_top1 = best_top5 = 0.
    for epoch in range(config.epochs):
        drop_prob = config.drop_path_prob * epoch / config.epochs
        if config.distributed:
            model.module.drop_path_prob(drop_prob)
        else:
            model.drop_path_prob(drop_prob)

        # training
        if config.distributed:
            train_sampler.set_epoch(epoch)
        train(logger, config, train_loader, model, optimizer, criterion,
              epoch, main_proc)

        # validation
        top1, top5 = validate(logger, config, valid_loader, model, criterion,
                              epoch, main_proc)
        best_top1 = max(best_top1, top1)
        best_top5 = max(best_top5, top5)
        lr_scheduler.step()

    logger.info("Final best Prec@1 = %.4f Prec@5 = %.4f",
                best_top1, best_top5)
def main():
    logger.info("Logger is set - training start")

    # set default gpu device id
    torch.cuda.set_device(config.gpus[0])

    # set seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)
    torch.backends.cudnn.benchmark = True

    # get data with meta info (train/test/val splits)
    input_size, input_channels, n_classes, train_data, test_dat, val_dat = utils.get_data(
        config.dataset, config.data_path, cutout_length=0, validation=True,
        validation2=True)
    print('input_size', input_size)

    criterion = nn.CrossEntropyLoss().to(device)
    use_aux = config.aux_weight > 0.
    model = AugmentCNN(input_size, input_channels, config.init_channels,
                       n_classes, config.layers, use_aux, config.genotype)
    model = nn.DataParallel(model, device_ids=config.gpus).to(device)

    # model size
    mb_params = utils.param_size(model)
    logger.info("Model size = {:.3f} MB".format(mb_params))

    # weights optimizer
    optimizer = torch.optim.SGD(model.parameters(), config.lr,
                                momentum=config.momentum,
                                weight_decay=config.weight_decay)

    # samplers over the full train/val/test index ranges
    n_train = len(train_data)
    n_val = len(val_dat)
    n_test = len(test_dat)
    indices1 = list(range(n_train))
    indices2 = list(range(n_val))
    indices3 = list(range(n_test))
    train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices1)
    valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices2)
    test_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices3)
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=config.batch_size,
                                               sampler=train_sampler,
                                               num_workers=config.workers,
                                               pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(val_dat,
                                               batch_size=config.batch_size,
                                               sampler=valid_sampler,
                                               num_workers=config.workers,
                                               pin_memory=True)
    test_loader = torch.utils.data.DataLoader(test_dat,
                                              batch_size=config.batch_size,
                                              sampler=test_sampler,
                                              num_workers=config.workers,
                                              pin_memory=True)

    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, config.epochs)

    best_top1 = 0.

    # training loop
    for epoch in range(config.epochs):
        lr_scheduler.step()
        drop_prob = config.drop_path_prob * epoch / config.epochs
        model.module.drop_path_prob(drop_prob)

        # training
        train(train_loader, model, optimizer, criterion, epoch)

        # validation
        cur_step = (epoch + 1) * len(train_loader)
        top1 = validate(valid_loader, model, criterion, epoch, cur_step)

        # save
        if best_top1 < top1:
            best_top1 = top1
            is_best = True
        else:
            is_best = False
        utils.save_checkpoint2(model, epoch, optimizer, criterion,
                               config.path, is_best)
        print("")

    logger.info("Final best Prec@1 = {:.4%}".format(best_top1))