def train(self):
    layers = self.layers
    switches = None
    for epoch in range(self.pdarts_epoch):
        layers = self.layers + self.pdarts_num_layers[epoch]
        model, criterion, optim, lr_scheduler = self.model_creator(layers)
        self.mutator = PdartsMutator(model, epoch, self.pdarts_num_to_drop, switches)

        for callback in self.callbacks:
            callback.build(model, self.mutator, self)
            callback.on_epoch_begin(epoch)

        darts_callbacks = []
        if lr_scheduler is not None:
            darts_callbacks.append(LRSchedulerCallback(lr_scheduler))

        self.trainer = DartsTrainer(model,
                                    mutator=self.mutator,
                                    loss=criterion,
                                    optimizer=optim,
                                    callbacks=darts_callbacks,
                                    **self.darts_parameters)
        logger.info("start pdarts training epoch %s...", epoch)

        self.trainer.train()

        switches = self.mutator.drop_paths()

        for callback in self.callbacks:
            callback.on_epoch_end(epoch)
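# NOTE (editor's sketch): the loop above assumes `self.model_creator` is a
# callable that, given a layer count, returns a (model, criterion, optimizer,
# lr_scheduler) tuple. A minimal, hypothetical implementation for illustration
# only -- the `CNN` search-space class and every hyper-parameter value below
# are assumptions, not part of this snippet:
def example_model_creator(layers):
    model = CNN(32, 3, 16, 10, layers)  # input size 32, 3 channels, 10 classes
    criterion = nn.CrossEntropyLoss()
    optim = torch.optim.SGD(model.parameters(), 0.025,
                            momentum=0.9, weight_decay=3.0E-4)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=50,
                                                              eta_min=0.001)
    return model, criterion, optim, lr_scheduler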
def main(args):
    reset_seed(args.seed)
    prepare_logger(args)
    logger.info("These are the hyper-parameters you want to tune:\n%s",
                pprint.pformat(vars(args)))

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    train_loader, test_loader = data_preprocess(args)
    # model = models.__dict__[args.model](num_classes=10)
    model = CNN(32, 3, args.channels, 10, args.layers)
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    if args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=args.initial_lr,
                               weight_decay=args.weight_decay)
    else:
        if args.optimizer == 'sgd':
            optimizer_cls = optim.SGD
        elif args.optimizer == 'rmsprop':
            optimizer_cls = optim.RMSprop
        optimizer = optimizer_cls(model.parameters(), lr=args.initial_lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay)

    if args.lr_scheduler == 'cosin':  # 'cosin' matches the CLI choice string
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs,
                                                         eta_min=args.ending_lr)
    elif args.lr_scheduler == 'linear':
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.1)

    trainer = DartsTrainer(model,
                           loss=criterion,
                           metrics=lambda output, target: accuracy(output, target),
                           optimizer=optimizer,
                           num_epochs=args.epochs,
                           dataset_train=train_loader,
                           dataset_valid=test_loader,
                           batch_size=args.batch_size,
                           log_frequency=args.log_frequency,
                           unrolled=args.unrolled,
                           callbacks=[LRSchedulerCallback(scheduler),
                                      ArchitectureCheckpoint("./checkpoints_layer5")])
    if args.visualization:
        trainer.enable_visualization()
    trainer.train()
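# NOTE (editor's sketch): the `accuracy` metric used above is imported from
# the example's utilities and is not shown here. A plausible stand-in,
# assuming the trainer accepts a plain float as the metric value:
def accuracy_sketch(output, target):
    # fraction of samples whose top-1 prediction matches the label
    return (output.argmax(dim=1) == target).float().mean().item()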
                                 tanh_constant=1.1, cell_exit_extra_step=True)
else:
    raise AssertionError

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), 0.05,
                            momentum=0.9, weight_decay=1.0E-4)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs,
                                                          eta_min=0.001)

trainer = enas.EnasTrainer(model,
                           loss=criterion,
                           metrics=accuracy,
                           reward_function=reward_accuracy,
                           optimizer=optimizer,
                           callbacks=[LRSchedulerCallback(lr_scheduler),
                                      ArchitectureCheckpoint("./checkpoints")],
                           batch_size=args.batch_size,
                           num_epochs=num_epochs,
                           dataset_train=dataset_train,
                           dataset_valid=dataset_valid,
                           log_frequency=args.log_frequency,
                           mutator=mutator)
trainer.train()
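# NOTE (editor's sketch): `reward_accuracy`, the reward the ENAS controller
# maximizes, is defined elsewhere in the example. A hypothetical version
# returning batch top-1 accuracy as a float:
def reward_accuracy_sketch(output, target, topk=(1,)):
    batch_size = target.size(0)
    _, predicted = output.topk(1, dim=1, largest=True, sorted=True)
    # reward is the fraction of correct top-1 predictions in the batch
    return (predicted.squeeze(1) == target).sum().item() / batch_size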
criterion = nn.CrossEntropyLoss()

if args.arch is not None:
    logger.info('model retraining...')
    with open(args.arch, 'r') as f:
        arch = json.load(f)
    for trial in query_nb201_trial_stats(arch, 200, 'cifar100'):
        pprint.pprint(trial)
    apply_fixed_architecture(model, args.arch)
    dataloader_train = DataLoader(dataset_train, batch_size=args.batch_size,
                                  shuffle=True, num_workers=0)
    dataloader_valid = DataLoader(dataset_valid, batch_size=args.batch_size,
                                  shuffle=True, num_workers=0)
    train(args, model, dataloader_train, dataloader_valid, criterion, optim,
          torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
    exit(0)

trainer = enas.EnasTrainer(model,
                           loss=criterion,
                           metrics=lambda output, target: accuracy(output, target, topk=(1,)),
                           reward_function=reward_accuracy,
                           optimizer=optim,
                           callbacks=[LRSchedulerCallback(lr_scheduler),
                                      ArchitectureCheckpoint("./checkpoints")],
                           batch_size=args.batch_size,
                           num_epochs=args.epochs,
                           dataset_train=dataset_train,
                           dataset_valid=dataset_valid,
                           log_frequency=args.log_frequency)
if args.visualization:
    trainer.enable_visualization()
trainer.train()
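# NOTE (editor's sketch): the `train(...)` retraining helper called above is
# defined elsewhere in the example. A minimal loop matching that call
# signature, for illustration only (the epoch count and logging style are
# assumptions):
def train_sketch(args, model, dataloader_train, dataloader_valid, criterion,
                 optimizer, device):
    model.to(device)
    for epoch in range(args.epochs):
        model.train()
        for inputs, targets in dataloader_train:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()
        # simple top-1 validation pass
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for inputs, targets in dataloader_valid:
                inputs, targets = inputs.to(device), targets.to(device)
                correct += (model(inputs).argmax(dim=1) == targets).sum().item()
                total += targets.size(0)
        logger.info('epoch %d: valid acc %.4f', epoch, correct / total)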
    lambda step: (1.0 - step / args.epochs) if step <= args.epochs else 0,
    last_epoch=-1)

train_loader = get_imagenet_iter_dali("train", args.imagenet_dir, args.batch_size,
                                      args.workers,
                                      spos_preprocessing=args.spos_preprocessing)
valid_loader = get_imagenet_iter_dali("val", args.imagenet_dir, args.batch_size,
                                      args.workers,
                                      spos_preprocessing=args.spos_preprocessing)

trainer = SPOSSupernetTrainer(model, criterion, accuracy, optimizer,
                              args.epochs, train_loader, valid_loader,
                              mutator=mutator,
                              batch_size=args.batch_size,
                              log_frequency=args.log_frequency,
                              workers=args.workers,
                              callbacks=[LRSchedulerCallback(scheduler),
                                         ModelCheckpoint("./checkpoints")])
trainer.train()
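# NOTE (editor's sketch): SPOS trains the supernet by sampling one path
# uniformly at random for every batch. Conceptually, the inner loop looks like
# the following simplification -- this is NOT the trainer's actual source:
for step, (inputs, targets) in enumerate(train_loader):
    mutator.reset()  # sample a fresh single path through the supernet
    optimizer.zero_grad()
    loss = criterion(model(inputs), targets)
    loss.backward()
    optimizer.step()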
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, eps=1e-3,
                             weight_decay=2e-6)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.epochs,
                                                          eta_min=1e-5)

trainer = TextNASTrainer(model,
                         loss=criterion,
                         metrics=lambda output, target: {"acc": accuracy(output, target)},
                         reward_function=accuracy,
                         optimizer=optimizer,
                         callbacks=[LRSchedulerCallback(lr_scheduler)],
                         batch_size=args.batch_size,
                         num_epochs=args.epochs,
                         dataset_train=None,
                         dataset_valid=None,
                         train_loader=train_loader,
                         valid_loader=valid_loader,
                         test_loader=test_loader,
                         log_frequency=args.log_frequency,
                         mutator=mutator,
                         mutator_lr=2e-3,
                         mutator_steps=500,
                         mutator_steps_aggregate=1,
                         child_steps=3000,
                         baseline_decay=0.99,
                         test_arc_per_epoch=10)
def main(args):
    reset_seed(args.seed)
    prepare_logger(args)
    logger.info("These are the hyper-parameters you want to tune:\n%s",
                pprint.pformat(vars(args)))

    if args.model == 'nas':
        logger.info("Using NAS.\n")
        if args.fix_arch:
            if not os.path.exists(args.arc_checkpoint):
                print(args.arc_checkpoint, 'does not exist; will not fix the architecture')
                args.fix_arch = False

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if args.model == 'nas':
        if not args.fix_arch:
            model = CNN(32, 3, args.channels, 10, args.layers)
            trainset, testset = data_preprocess(args)
        else:
            model = CNN(32, 3, args.channels, 10, args.layers)
            apply_fixed_architecture(model, args.arc_checkpoint)
            model.to(device)
            train_loader, test_loader = data_preprocess(args)
    else:
        train_loader, test_loader = data_preprocess(args)
        model = models.__dict__[args.model]()
        model.to(device)

    criterion = nn.CrossEntropyLoss()
    if args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=args.initial_lr,
                               weight_decay=args.weight_decay)
    else:
        if args.optimizer == 'sgd':
            optimizer_cls = optim.SGD
        elif args.optimizer == 'rmsprop':
            optimizer_cls = optim.RMSprop
        optimizer = optimizer_cls(model.parameters(), lr=args.initial_lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs,
                                                     eta_min=args.ending_lr)

    if args.model == 'nas' and not args.fix_arch:
        trainer = DartsTrainer(model,
                               loss=criterion,
                               metrics=lambda output, target: accuracyTopk(
                                   output, target, topk=(1,)),
                               optimizer=optimizer,
                               num_epochs=args.epochs,
                               dataset_train=trainset,
                               dataset_valid=testset,
                               batch_size=args.batch_size,
                               log_frequency=args.log_frequency,
                               unrolled=args.unrolled,
                               callbacks=[LRSchedulerCallback(scheduler),
                                          ArchitectureCheckpoint("./checkpoints")])
        if args.visualization:
            trainer.enable_visualization()
        trainer.train()
        trainer.export("final_arch.json")
    else:
        for epoch in range(1, args.epochs + 1):
            train(model, train_loader, criterion, optimizer, scheduler, args, epoch, device)
            top1, _ = test(model, test_loader, criterion, args, epoch, device)
            nni.report_intermediate_result(top1)
        logger.info("Final accuracy is: %.6f", top1)
        nni.report_final_result(top1)
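# NOTE (editor's sketch): `accuracyTopk` is imported from the example's
# utilities. A plausible implementation matching the call above, returning a
# dict keyed by k (the name and return format are assumptions):
def accuracyTopk_sketch(output, target, topk=(1,)):
    maxk = max(topk)
    batch_size = target.size(0)
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    # for each k, count samples with the label anywhere in the top-k predictions
    return {"acc{}".format(k): correct[:k].reshape(-1).float().sum().item() / batch_size
            for k in topk}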
def main():
    args, cfg = parse_config_args('nni.cream.supernet')

    # resolve logging
    output_dir = os.path.join(cfg.SAVE_PATH,
                              "{}-{}".format(datetime.date.today().strftime('%m%d'),
                                             cfg.MODEL))
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if args.local_rank == 0:
        logger = get_logger(os.path.join(output_dir, "train.log"))
    else:
        logger = None

    # initialize distributed parameters
    torch.cuda.set_device(args.local_rank)
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    if args.local_rank == 0:
        logger.info('Training on Process %d with %d GPUs.',
                    args.local_rank, cfg.NUM_GPU)

    # fix random seeds
    torch.manual_seed(cfg.SEED)
    torch.cuda.manual_seed_all(cfg.SEED)
    np.random.seed(cfg.SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # generate supernet
    model, sta_num, resolution = gen_supernet(
        flops_minimum=cfg.SUPERNET.FLOPS_MINIMUM,
        flops_maximum=cfg.SUPERNET.FLOPS_MAXIMUM,
        num_classes=cfg.DATASET.NUM_CLASSES,
        drop_rate=cfg.NET.DROPOUT_RATE,
        global_pool=cfg.NET.GP,
        resunit=cfg.SUPERNET.RESUNIT,
        dil_conv=cfg.SUPERNET.DIL_CONV,
        slice=cfg.SUPERNET.SLICE,
        verbose=cfg.VERBOSE,
        logger=logger)

    # number of choice blocks in supernet
    choice_num = len(model.blocks[7])
    if args.local_rank == 0:
        logger.info('Supernet created, param count: %d',
                    sum(m.numel() for m in model.parameters()))
        logger.info('resolution: %d', resolution)
        logger.info('choice number: %d', choice_num)

    # initialize flops look-up table
    model_est = FlopsEst(model)
    flops_dict, flops_fixed = model_est.flops_dict, model_est.flops_fixed

    # optionally resume from a checkpoint
    optimizer_state = None
    resume_epoch = None
    if cfg.AUTO_RESUME:
        optimizer_state, resume_epoch = resume_checkpoint(model, cfg.RESUME_PATH)

    # create optimizer and resume from checkpoint
    optimizer = create_optimizer_supernet(cfg, model, USE_APEX)
    if optimizer_state is not None:
        optimizer.load_state_dict(optimizer_state['optimizer'])
    model = model.cuda()

    # convert model to distributed mode
    if cfg.BATCHNORM.SYNC_BN:
        try:
            if USE_APEX:
                model = convert_syncbn_model(model)
            else:
                model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
            if args.local_rank == 0:
                logger.info('Converted model to use Synchronized BatchNorm.')
        except Exception as exception:
            logger.info('Failed to enable Synchronized BatchNorm. '
                        'Install Apex or use PyTorch >= 1.1. Exception: %s', exception)
    if USE_APEX:
        model = DDP(model, delay_allreduce=True)
    else:
        if args.local_rank == 0:
            logger.info("Using torch DistributedDataParallel. "
                        "Install NVIDIA Apex for Apex DDP.")
        # can use device str in Torch >= 1.1
        model = DDP(model, device_ids=[args.local_rank])

    # create learning rate scheduler
    lr_scheduler, num_epochs = create_supernet_scheduler(cfg, optimizer)
    start_epoch = resume_epoch if resume_epoch is not None else 0
    if start_epoch > 0:
        lr_scheduler.step(start_epoch)
    if args.local_rank == 0:
        logger.info('Scheduled epochs: %d', num_epochs)

    # imagenet train dataset
    train_dir = os.path.join(cfg.DATA_DIR, 'train')
    if not os.path.exists(train_dir):
        logger.info('Training folder does not exist at: %s', train_dir)
        sys.exit()

    dataset_train = Dataset(train_dir)
    loader_train = create_loader(dataset_train,
                                 input_size=(3, cfg.DATASET.IMAGE_SIZE,
                                             cfg.DATASET.IMAGE_SIZE),
                                 batch_size=cfg.DATASET.BATCH_SIZE,
                                 is_training=True,
                                 use_prefetcher=True,
                                 re_prob=cfg.AUGMENTATION.RE_PROB,
                                 re_mode=cfg.AUGMENTATION.RE_MODE,
                                 color_jitter=cfg.AUGMENTATION.COLOR_JITTER,
                                 interpolation='random',
                                 num_workers=cfg.WORKERS,
                                 distributed=True,
                                 collate_fn=None,
                                 crop_pct=DEFAULT_CROP_PCT,
                                 mean=IMAGENET_DEFAULT_MEAN,
                                 std=IMAGENET_DEFAULT_STD)

    # imagenet validation dataset
    eval_dir = os.path.join(cfg.DATA_DIR, 'val')
    if not os.path.isdir(eval_dir):
        logger.info('Validation folder does not exist at: %s', eval_dir)
        sys.exit()

    dataset_eval = Dataset(eval_dir)
    loader_eval = create_loader(dataset_eval,
                                input_size=(3, cfg.DATASET.IMAGE_SIZE,
                                            cfg.DATASET.IMAGE_SIZE),
                                batch_size=4 * cfg.DATASET.BATCH_SIZE,
                                is_training=False,
                                use_prefetcher=True,
                                num_workers=cfg.WORKERS,
                                distributed=True,
                                crop_pct=DEFAULT_CROP_PCT,
                                mean=IMAGENET_DEFAULT_MEAN,
                                std=IMAGENET_DEFAULT_STD,
                                interpolation=cfg.DATASET.INTERPOLATION)

    # whether to use label smoothing
    if cfg.AUGMENTATION.SMOOTHING > 0.:
        train_loss_fn = LabelSmoothingCrossEntropy(
            smoothing=cfg.AUGMENTATION.SMOOTHING).cuda()
        validate_loss_fn = nn.CrossEntropyLoss().cuda()
    else:
        train_loss_fn = nn.CrossEntropyLoss().cuda()
        validate_loss_fn = train_loss_fn

    mutator = RandomMutator(model)

    trainer = CreamSupernetTrainer(model, train_loss_fn, validate_loss_fn,
                                   optimizer, num_epochs, loader_train, loader_eval,
                                   mutator=mutator,
                                   batch_size=cfg.DATASET.BATCH_SIZE,
                                   log_frequency=cfg.LOG_INTERVAL,
                                   meta_sta_epoch=cfg.SUPERNET.META_STA_EPOCH,
                                   update_iter=cfg.SUPERNET.UPDATE_ITER,
                                   slices=cfg.SUPERNET.SLICE,
                                   pool_size=cfg.SUPERNET.POOL_SIZE,
                                   pick_method=cfg.SUPERNET.PICK_METHOD,
                                   choice_num=choice_num,
                                   sta_num=sta_num,
                                   acc_gap=cfg.ACC_GAP,
                                   flops_dict=flops_dict,
                                   flops_fixed=flops_fixed,
                                   local_rank=args.local_rank,
                                   callbacks=[LRSchedulerCallback(lr_scheduler),
                                              ModelCheckpoint(output_dir)])
    trainer.train()
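# NOTE (editor's sketch): `LabelSmoothingCrossEntropy` above comes from the
# example's own utilities (timm provides an equivalent). A minimal reference
# implementation of uniform label smoothing, for illustration only:
class LabelSmoothingCrossEntropySketch(nn.Module):
    def __init__(self, smoothing=0.1):
        super().__init__()
        self.smoothing = smoothing

    def forward(self, logits, target):
        logprobs = torch.nn.functional.log_softmax(logits, dim=-1)
        # negative log-likelihood of the true class
        nll = -logprobs.gather(dim=-1, index=target.unsqueeze(1)).squeeze(1)
        # cross-entropy against a uniform prior over classes
        smooth = -logprobs.mean(dim=-1)
        return ((1.0 - self.smoothing) * nll + self.smoothing * smooth).mean()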