def main():
    args = parse_args()
    update_config(cfg, args)

    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    # Set the random seed manually for reproducibility.
    np.random.seed(cfg.SEED)
    torch.manual_seed(cfg.SEED)
    torch.cuda.manual_seed_all(cfg.SEED)

    # Loss
    criterion = CrossEntropyLoss(cfg.MODEL.NUM_CLASSES).cuda()

    # Model and optimizer
    print(f"Defining network with {cfg.MODEL.LAYERS} layers...")
    model = Network(cfg.MODEL.INIT_CHANNELS, cfg.MODEL.NUM_CLASSES,
                    cfg.MODEL.LAYERS, criterion, primitives_2,
                    drop_path_prob=cfg.TRAIN.DROPPATH_PROB)
    model = model.cuda()

    # Exclude the architecture parameters (alphas) from the weight optimizer.
    arch_params = list(map(id, model.arch_parameters()))
    weight_params = filter(lambda p: id(p) not in arch_params,
                           model.parameters())

    # Optimizer
    optimizer = optim.Adam(
        weight_params,
        lr=cfg.TRAIN.LR
    )

    # Resume; make log dir and logger
    if args.load_path and os.path.exists(args.load_path):
        checkpoint_file = os.path.join(args.load_path, 'Model',
                                       'checkpoint_best.pth')
        assert os.path.exists(checkpoint_file)
        checkpoint = torch.load(checkpoint_file)

        # Load checkpoint
        begin_epoch = checkpoint['epoch']
        last_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        best_acc1 = checkpoint['best_acc1']
        optimizer.load_state_dict(checkpoint['optimizer'])
        args.path_helper = checkpoint['path_helper']
        logger = create_logger(args.path_helper['log_path'])
        logger.info("=> loaded checkpoint '{}'".format(checkpoint_file))
    else:
        exp_name = args.cfg.split('/')[-1].split('.')[0]
        args.path_helper = set_path('logs_search', exp_name)
        logger = create_logger(args.path_helper['log_path'])
        begin_epoch = cfg.TRAIN.BEGIN_EPOCH
        best_acc1 = 0.0
        last_epoch = -1
    logger.info(args)
    logger.info(cfg)

    # Copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, 'models', cfg.MODEL.NAME + '.py'),
        args.path_helper['ckpt_path'])

    # Datasets and dataloaders
    # The toy dataset is downloaded with 10 items for each partition.
    # Remove the sample_size parameter to use the full toy dataset.
    asv_train, asv_dev, asv_eval = asv_toys(sample_size=10)

    train_dataset = asv_train  # MNIST('mydata', transform=totensor, train=True, download=True)
    val_dataset = asv_dev  # MNIST('mydata', transform=totensor, train=False, download=True)
    train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE,
        num_workers=cfg.DATASET.NUM_WORKERS,
        pin_memory=True,
        shuffle=True,
        drop_last=True,
    )
    print(f'search.py: Train loader of {len(train_loader)} batches')
    print(f'Tot train set: {len(train_dataset)}')
    val_loader = torch.utils.data.DataLoader(
        dataset=val_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE,
        num_workers=cfg.DATASET.NUM_WORKERS,
        pin_memory=True,
        shuffle=True,
        drop_last=True,
    )
    print(f'search.py: Val loader of {len(val_loader)} batches')
    print(f'Tot val set: {len(val_dataset)}')
    test_dataset = asv_eval  # MNIST('mydata', transform=totensor, train=False, download=True)
    test_loader = torch.utils.data.DataLoader(
        dataset=test_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE,
        num_workers=cfg.DATASET.NUM_WORKERS,
        pin_memory=True,
        shuffle=True,
        drop_last=True,
    )

    # Training setting
    writer_dict = {
        'writer': SummaryWriter(args.path_helper['log_path']),
        'train_global_steps': begin_epoch * len(train_loader),
        'valid_global_steps': begin_epoch // cfg.VAL_FREQ,
    }

    # Training loop
    architect = Architect(model, cfg)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, cfg.TRAIN.END_EPOCH, cfg.TRAIN.LR_MIN,
        last_epoch=last_epoch
    )

    for epoch in tqdm(range(begin_epoch, cfg.TRAIN.END_EPOCH),
                      desc='search progress'):
        model.train()

        genotype = model.genotype()
        logger.info('genotype = %s', genotype)

        if cfg.TRAIN.DROPPATH_PROB != 0:
            # Linearly increase drop-path probability over the search.
            model.drop_path_prob = cfg.TRAIN.DROPPATH_PROB * epoch / (cfg.TRAIN.END_EPOCH - 1)

        train(cfg, model, optimizer, train_loader, val_loader, criterion,
              architect, epoch, writer_dict)

        if epoch % cfg.VAL_FREQ == 0:
            # Get threshold and evaluate on validation set
            acc = validate_identification(cfg, model, test_loader, criterion)

            # Remember best acc@1 and save checkpoint
            is_best = acc > best_acc1
            best_acc1 = max(acc, best_acc1)

            # Save
            logger.info('=> saving checkpoint to {}'.format(args.path_helper['ckpt_path']))
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
                'arch': model.arch_parameters(),
                'genotype': genotype,
                'path_helper': args.path_helper
            }, is_best, args.path_helper['ckpt_path'],
                'checkpoint_{}.pth'.format(epoch))

        lr_scheduler.step(epoch)
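
# A minimal sketch of the weight/architecture parameter split performed in
# main() above: DARTS-style search keeps the architecture parameters (alphas)
# out of the weight optimizer by comparing parameter identities. The helper
# name `split_weight_params` is hypothetical, not part of this repo; it assumes
# a model exposing `arch_parameters()`, as the `Network` above does.
def split_weight_params(model):
    arch_ids = {id(p) for p in model.arch_parameters()}
    return [p for p in model.parameters() if id(p) not in arch_ids]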
def start(args):
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    # Seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    cudnn.benchmark = True
    cudnn.enabled = True
    logging.info("args = %s", args)

    dataset = LoadData(args.data_name)
    if args.data_name == 'SBM_PATTERN':
        in_dim = 3
        num_classes = 2
    elif args.data_name == 'SBM_CLUSTER':
        in_dim = 7
        num_classes = 6
    else:
        raise ValueError(f"unsupported dataset: {args.data_name}")
    print(f"input dimension: {in_dim}, number of classes: {num_classes}")

    criterion = MyCriterion(num_classes)
    criterion = criterion.cuda()
    model = Network(args.layers, args.nodes, in_dim, args.feature_dim,
                    num_classes, criterion, args.data_type, args.readout)
    model = model.cuda()
    logging.info("param size = %fMB", count_parameters_in_MB(model))

    train_data, val_data, test_data = dataset.train, dataset.val, dataset.test
    num_train = len(train_data)
    indices = list(range(num_train))
    split = int(np.floor(args.train_portion * num_train))
    print(f"train set full size: {num_train}; split train set size: {split}")

    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True, num_workers=args.workers, collate_fn=dataset.collate)
    valid_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:num_train]),
        pin_memory=True, num_workers=args.workers, collate_fn=dataset.collate)
    true_valid_queue = torch.utils.data.DataLoader(
        val_data, batch_size=args.batch_size,
        pin_memory=True, num_workers=args.workers, collate_fn=dataset.collate)
    test_queue = torch.utils.data.DataLoader(
        test_data, batch_size=args.batch_size,
        pin_memory=True, num_workers=args.workers, collate_fn=dataset.collate)

    optimizer = torch.optim.SGD(model.parameters(), args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.learning_rate_min)
    architect = Architect(model, args)

    # viz = Visdom(env='{} {}'.format(args.data_name, time.asctime(time.localtime(time.time()))))
    viz = None
    save_file = open(args.save_result, "w")

    for epoch in range(args.epochs):
        scheduler.step()
        lr = scheduler.get_lr()[0]
        logging.info('[LR]\t%f', lr)

        if epoch % args.save_freq == 0:
            print(model.show_genotypes())
            save_file.write(f"Epoch : {epoch}\n{model.show_genotypes()}\n")
            for i in range(args.layers):
                logging.info('layer = %d', i)
                genotype = model.show_genotype(i)
                logging.info('genotype = %s', genotype)
            '''
            w1, w2, w3 = model.show_weights(0)
            print('[1] weights in first cell\n', w1)
            print('[2] weights in middle cell\n', w2)
            print('[3] weights in last cell\n', w3)
            '''

        # Training
        macro_acc, micro_acc, loss = train(train_queue, valid_queue, model,
                                           architect, criterion, optimizer,
                                           lr, epoch, viz)
        # True validation
        macro_acc, micro_acc, loss = infer(true_valid_queue, model, criterion,
                                           stage='validating')
        # Testing
        macro_acc, micro_acc, loss = infer(test_queue, model, criterion,
                                           stage=' testing ')

    save_file.close()
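
# A standalone sketch of the index split used in start() above: the first
# `train_portion` of the official training set drives the weight updates
# (train_queue) and the remainder drives the architecture updates
# (valid_queue). `make_search_samplers` is a hypothetical helper name; it
# relies on the `np` and `torch` imports this module already uses.
def make_search_samplers(num_train, train_portion):
    indices = list(range(num_train))
    split = int(np.floor(train_portion * num_train))
    train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[:split])
    valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[split:])
    return train_sampler, valid_sampler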
def main():
    args = parse_args()
    reset_config(config, args)

    device = torch.device("cuda")

    # Tensorboard
    if not os.path.exists(config.SEARCH.PATH):
        os.makedirs(config.SEARCH.PATH)
    writer = SummaryWriter(log_dir=os.path.join(config.SEARCH.PATH, "log"))
    logger = utils.get_logger(os.path.join(config.SEARCH.PATH,
                                           "{}.log".format(config.SEARCH.NAME)))
    logger.info("Logger is set - training start")

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    # Set seed
    # np.random.seed(config.SEARCH.SEED)
    # torch.manual_seed(config.SEARCH.SEED)
    # torch.cuda.manual_seed_all(config.SEARCH.SEED)
    torch.backends.cudnn.benchmark = True

    gpus = [int(i) for i in config.GPUS.split(',')]
    criterion = JointsMSELoss(use_target_weight=config.LOSS.USE_TARGET_WEIGHT).to(device)
    model = Network(config)
    if len(gpus) > 1:
        model = nn.DataParallel(model)
    model = model.cuda()
    # Unwrapped reference so genotype/alphas are reachable whether or not
    # the model is wrapped in DataParallel.
    net = model.module if isinstance(model, nn.DataParallel) else model

    # for name, p in net.named_parameters():
    #     logger.info(name)
    mb_params = utils.param_size(model)
    logger.info("Model size = {:.3f} MB".format(mb_params))

    # Weights optimizer
    params = model.parameters()
    # arch_params = list(map(id, net.arch_parameters()))
    # weight_params = filter(lambda p: id(p) not in arch_params, model.parameters())
    # params = [{'params': weight_params},
    #           {'params': net.arch_parameters(), 'lr': 0.0004}]
    optimizer = torch.optim.Adam(params, config.SEARCH.W_LR)

    # Split data to train/validation
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_data = MPIIDataset(config, config.DATASET.ROOT,
                             config.SEARCH.TRAIN_SET, True,
                             transforms.Compose([
                                 transforms.ToTensor(),
                                 normalize,
                             ]))
    valid_data = MPIIDataset(config, config.DATASET.ROOT,
                             config.SEARCH.TEST_SET, False,
                             transforms.Compose([
                                 transforms.ToTensor(),
                                 normalize,
                             ]))
    print(len(train_data), len(valid_data))
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=config.SEARCH.BATCH_SIZE,
                                               shuffle=True,
                                               num_workers=config.WORKERS,
                                               pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(valid_data,
                                               batch_size=config.SEARCH.BATCH_SIZE,
                                               shuffle=False,
                                               num_workers=config.WORKERS,
                                               pin_memory=True)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                        config.SEARCH.LR_STEP,
                                                        config.SEARCH.LR_FACTOR)

    # Training loop
    best_top1 = 0.
    best_genotype = None
    for epoch in range(config.SEARCH.EPOCHS):
        lr_scheduler.step()

        # Training
        train(config, train_loader, model, criterion, optimizer, epoch,
              logger, writer)

        # Validation
        cur_step = (epoch + 1) * len(train_loader)
        top1 = validate(config, valid_loader, valid_data, epoch + 1, model,
                        criterion, logger, writer)

        # Log genotype
        genotype = net.genotype()
        logger.info(F.softmax(net.alphas_normal, dim=-1))
        logger.info(F.softmax(net.alphas_reduce, dim=-1))
        logger.info("genotype = {}".format(genotype))

        # Save
        state = {'state_dict': model.state_dict(),
                 'schedule': lr_scheduler.state_dict(),
                 'epoch': epoch + 1}
        if best_top1 < top1:
            best_top1 = top1
            best_genotype = genotype
            is_best = True
        else:
            is_best = False
        utils.save_checkpoint(state, config.SEARCH.PATH, is_best)

    logger.info("Final best Accuracy = {:.3f}".format(best_top1))
    logger.info("Best Genotype = {}".format(best_genotype))
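
# Assumed entry point, a minimal sketch: the original script may provide its
# own guard around main().
if __name__ == '__main__':
    main()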