def main():
    global args, best_prec1, dtype
    best_prec1 = 0
    args = parser.parse_args()
    dtype = torch_dtypes.get(args.dtype)
    torch.manual_seed(args.seed)
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'
    if args.save == '':
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    setup_logging(os.path.join(save_path, 'log.txt'),
                  resume=args.resume != '')
    results_path = os.path.join(save_path, 'results')
    results = ResultsLog(results_path,
                         title='Training Results - %s' % args.save)

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)

    if 'cuda' in args.device and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
        torch.cuda.set_device(args.device_ids[0])
        cudnn.benchmark = True
    else:
        args.device_ids = None

    # create model
    logging.info("creating model %s", args.model)
    model = models.__dict__[args.model]
    model_config = {'input_size': args.input_size, 'dataset': args.dataset}

    if args.model_config != '':
        model_config = dict(model_config, **literal_eval(args.model_config))

    model = model(**model_config)
    logging.info("created model with configuration: %s", model_config)

    # optionally resume from a checkpoint
    if args.evaluate:
        if not os.path.isfile(args.evaluate):
            parser.error('invalid checkpoint: {}'.format(args.evaluate))
        checkpoint = torch.load(args.evaluate)
        model.load_state_dict(checkpoint['state_dict'])
        logging.info("loaded checkpoint '%s' (epoch %s)",
                     args.evaluate, checkpoint['epoch'])
    elif args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            results.load(os.path.join(checkpoint_file, 'results.csv'))
            checkpoint_file = os.path.join(checkpoint_file,
                                           'model_best.pth.tar')
        if os.path.isfile(checkpoint_file):
            logging.info("loading checkpoint '%s'", args.resume)
            checkpoint = torch.load(checkpoint_file)
            args.start_epoch = checkpoint['epoch'] - 1
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            logging.info("loaded checkpoint '%s' (epoch %s)",
                         checkpoint_file, checkpoint['epoch'])
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info("number of parameters: %d", num_parameters)

    # Data loading code
    default_transform = {
        'train': get_transform(args.dataset,
                               input_size=args.input_size, augment=True),
        'eval': get_transform(args.dataset,
                              input_size=args.input_size, augment=False)
    }
    transform = getattr(model, 'input_transform', default_transform)
    regime = getattr(model, 'regime', [{'epoch': 0,
                                        'optimizer': args.optimizer,
                                        'lr': args.lr,
                                        'momentum': args.momentum,
                                        'weight_decay': args.weight_decay}])

    # define loss function (criterion) and optimizer
    criterion = getattr(model, 'criterion', nn.CrossEntropyLoss)()
    criterion.to(args.device, dtype)
    model.to(args.device, dtype)

    val_data = get_dataset(args.dataset, 'val', transform['eval'])
    val_loader = torch.utils.data.DataLoader(
        val_data,
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, 0)
        return

    train_data = get_dataset(args.dataset, 'train', transform['train'])
    train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True, drop_last=True)

    optimizer = OptimRegime(model.parameters(), regime)
    logging.info('training regime: %s', regime)

    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train_loss, train_prec1, train_prec5 = train(
            train_loader, model, criterion, epoch, optimizer)

        # evaluate on validation set
        val_loss, val_prec1, val_prec5 = validate(
            val_loader, model, criterion, epoch)

        # remember best prec@1 and save checkpoint
        is_best = val_prec1 > best_prec1
        best_prec1 = max(val_prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'model': args.model,
            'config': args.model_config,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'regime': regime
        }, is_best, path=save_path)

        logging.info('\n Epoch: {0}\t'
                     'Training Loss {train_loss:.4f} \t'
                     'Training Prec@1 {train_prec1:.3f} \t'
                     'Training Prec@5 {train_prec5:.3f} \t'
                     'Validation Loss {val_loss:.4f} \t'
                     'Validation Prec@1 {val_prec1:.3f} \t'
                     'Validation Prec@5 {val_prec5:.3f} \n'
                     .format(epoch + 1, train_loss=train_loss,
                             val_loss=val_loss, train_prec1=train_prec1,
                             val_prec1=val_prec1, train_prec5=train_prec5,
                             val_prec5=val_prec5))

        results.add(epoch=epoch + 1, train_loss=train_loss,
                    val_loss=val_loss,
                    train_error1=100 - train_prec1,
                    val_error1=100 - val_prec1,
                    train_error5=100 - train_prec5,
                    val_error5=100 - val_prec5)
        results.plot(x='epoch', y=['train_loss', 'val_loss'],
                     legend=['training', 'validation'],
                     title='Loss', ylabel='loss')
        results.plot(x='epoch', y=['train_error1', 'val_error1'],
                     legend=['training', 'validation'],
                     title='Error@1', ylabel='error %')
        results.plot(x='epoch', y=['train_error5', 'val_error5'],
                     legend=['training', 'validation'],
                     title='Error@5', ylabel='error %')
        results.save()
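# Hedged sketch: OptimRegime (used by main() above) consumes a list of
# epoch-indexed dicts; each entry takes effect from its 'epoch' onward and
# may update any optimizer setting. The schedule below is an illustrative
# assumption, not a regime shipped with any model here.
EXAMPLE_REGIME = [
    {'epoch': 0, 'optimizer': 'SGD', 'lr': 1e-1,
     'momentum': 0.9, 'weight_decay': 1e-4},
    {'epoch': 30, 'lr': 1e-2},  # decay the learning rate at epoch 30
    {'epoch': 60, 'lr': 1e-3},  # and again at epoch 60
]
# usage, mirroring main():
# optimizer = OptimRegime(model.parameters(), EXAMPLE_REGIME)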
def main_worker(args):
    global best_prec1, dtype
    best_prec1 = 0
    dtype = torch_dtypes.get(args.dtype)
    torch.manual_seed(args.seed)
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'
    if args.save == '':
        args.save = time_stamp
    save_path = path.join(args.results_dir, args.save)

    args.distributed = args.local_rank >= 0 or args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_init,
                                world_size=args.world_size,
                                rank=args.local_rank)
        args.local_rank = dist.get_rank()
        args.world_size = dist.get_world_size()
        if args.dist_backend == 'mpi':
            # If using MPI, select all visible devices
            args.device_ids = list(range(torch.cuda.device_count()))
        else:
            args.device_ids = [args.local_rank]

    # if not (args.distributed and args.local_rank > 0):
    if not path.exists(save_path):
        makedirs(save_path)
    dump_args(args, path.join(save_path, 'args.txt'))

    setup_logging(path.join(save_path, 'log.txt'),
                  resume=args.resume != '',
                  dummy=False)

    results_path = path.join(save_path, 'results')
    results = ResultsLog(results_path,
                         title='Training Results - %s' % args.save)

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)
    logging.info("creating model %s", args.model)

    if 'cuda' in args.device and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
        torch.cuda.set_device(args.device_ids[0])
        cudnn.benchmark = True
    else:
        args.device_ids = None

    # All parameters to the model should be passed via this dict.
    model_config = {'dataset': args.dataset,
                    'dp_type': args.dropout_type,
                    'dp_percentage': args.dropout_perc,
                    'dropout': args.drop_rate,
                    'device': args.device}
    if args.model_config != '':
        model_config = dict(model_config, **literal_eval(args.model_config))

    # create ResNet model
    model = resnet(**model_config)
    logging.info("created model with configuration: %s", model_config)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info("number of parameters: %d", num_parameters)

    # # optionally resume from a checkpoint
    # if args.evaluate:
    #     if not path.isfile(args.evaluate):
    #         parser.error('invalid checkpoint: {}'.format(args.evaluate))
    #     checkpoint = torch.load(args.evaluate, map_location="cpu")
    #     # Override configuration with checkpoint info
    #     args.model = checkpoint.get('model', args.model)
    #     args.model_config = checkpoint.get('config', args.model_config)
    #     # load checkpoint
    #     model.load_state_dict(checkpoint['state_dict'])
    #     logging.info("loaded checkpoint '%s' (epoch %s)",
    #                  args.evaluate, checkpoint['epoch'])
    #
    # if args.resume:
    #     checkpoint_file = args.resume
    #     if path.isdir(checkpoint_file):
    #         results.load(path.join(checkpoint_file, 'results.csv'))
    #         checkpoint_file = path.join(checkpoint_file,
    #                                     'model_best.pth.tar')
    #     if path.isfile(checkpoint_file):
    #         logging.info("loading checkpoint '%s'", args.resume)
    #         checkpoint = torch.load(checkpoint_file, map_location="cpu")
    #         if args.start_epoch < 0:  # not explicitly set
    #             args.start_epoch = checkpoint['epoch']
    #         best_prec1 = checkpoint['best_prec1']
    #         model.load_state_dict(checkpoint['state_dict'])
    #         optim_state_dict = checkpoint.get('optim_state_dict', None)
    #         logging.info("loaded checkpoint '%s' (epoch %s)",
    #                      checkpoint_file, checkpoint['epoch'])
    #     else:
    #         logging.error("no checkpoint found at '%s'", args.resume)
    # else:
    #     optim_state_dict = None

    # define loss function (criterion) and optimizer
    loss_params = {}
    if args.label_smoothing > 0:
        loss_params['smooth_eps'] = args.label_smoothing
    criterion = getattr(model, 'criterion', CrossEntropyLoss)(**loss_params)
    criterion.to(args.device, dtype)
    model.to(args.device, dtype)

    # Batch-norm should always be done in float
    if 'half' in args.dtype:
        FilterModules(model, module=is_bn).to(dtype=torch.float)

    # optimizer configuration
    optim_regime = getattr(model, 'regime', [{'epoch': 0,
                                              'optimizer': args.optimizer,
                                              'lr': args.lr,
                                              'momentum': args.momentum,
                                              'weight_decay': args.weight_decay}])
    optimizer = optim_regime if isinstance(optim_regime, OptimRegime) \
        else OptimRegime(model, optim_regime,
                         use_float_copy='half' in args.dtype)
    # if optim_state_dict is not None:
    #     optimizer.load_state_dict(optim_state_dict)

    trainer = Trainer(model, criterion, optimizer,
                      device_ids=args.device_ids, device=args.device,
                      dtype=dtype, distributed=args.distributed,
                      local_rank=args.local_rank, mixup=args.mixup,
                      loss_scale=args.loss_scale, grad_clip=args.grad_clip,
                      print_freq=args.print_freq,
                      adapt_grad_norm=args.adapt_grad_norm)
    if args.tensorwatch:
        trainer.set_watcher(filename=path.abspath(
            path.join(save_path, 'tensorwatch.log')),
            port=args.tensorwatch_port)

    # Evaluation Data loading code
    args.eval_batch_size = args.eval_batch_size if args.eval_batch_size > 0 \
        else args.batch_size
    val_data = DataRegime(getattr(model, 'data_eval_regime', None),
                          defaults={'datasets_path': args.datasets_dir,
                                    'name': args.dataset,
                                    'split': 'val',
                                    'augment': False,
                                    'input_size': args.input_size,
                                    'batch_size': args.eval_batch_size,
                                    'shuffle': False,
                                    'num_workers': args.workers,
                                    'pin_memory': True,
                                    'drop_last': False})

    if args.evaluate:
        results = trainer.validate(val_data.get_loader())
        logging.info(results)
        return

    # Training Data loading code
    train_data_defaults = {'datasets_path': args.datasets_dir,
                           'name': args.dataset,
                           'split': 'train',
                           'augment': True,
                           'input_size': args.input_size,
                           'batch_size': args.batch_size,
                           'shuffle': True,
                           'num_workers': args.workers,
                           'pin_memory': True,
                           'drop_last': True,
                           'distributed': args.distributed,
                           'duplicates': args.duplicates,
                           'autoaugment': args.autoaugment,
                           'cutout': {'holes': 1, 'length': 16}
                           if args.cutout else None}

    if hasattr(model, 'sampled_data_regime'):
        sampled_data_regime = model.sampled_data_regime
        probs, regime_configs = zip(*sampled_data_regime)
        regimes = []
        for config in regime_configs:
            defaults = {**train_data_defaults}
            defaults.update(config)
            regimes.append(DataRegime(None, defaults=defaults))
        train_data = SampledDataRegime(regimes, probs)
    else:
        train_data = DataRegime(getattr(model, 'data_regime', None),
                                defaults=train_data_defaults)

    logging.info('optimization regime: %s', optim_regime)
    args.start_epoch = max(args.start_epoch, 0)
    trainer.training_steps = args.start_epoch * len(train_data)

    for epoch in range(args.start_epoch, args.epochs):
        trainer.epoch = epoch
        train_data.set_epoch(epoch)
        val_data.set_epoch(epoch)
        logging.info('\nStarting Epoch: {0}\n'.format(epoch + 1))

        # train for one epoch
        train_results = trainer.train(train_data.get_loader(),
                                      chunk_batch=args.chunk_batch)

        # evaluate on validation set
        val_results = trainer.validate(val_data.get_loader())

        # # save weights heatmap
        # w = model._modules['layer3']._modules['5']._modules['conv2'] \
        #     ._parameters['weight'].view(64, -1).cpu().detach().numpy()
        # heat_maps_dir = 'C:\\Users\\Pavel\\Desktop\\targeted_dropout_pytorch\\pics\\experiment_0'
        # plot = sns.heatmap(w, center=0)
        # name = str(datetime.now()).replace(':', '_').replace('-', '_') \
        #     .replace('.', '_').replace(' ', '_') + '.png'
        # plot.get_figure().savefig(path.join(heat_maps_dir, name))
        # plt.clf()

        if args.distributed and args.local_rank > 0:
            continue

        # remember best prec@1 and save checkpoint
        is_best = val_results['prec1'] > best_prec1
        best_prec1 = max(val_results['prec1'], best_prec1)
        if args.drop_optim_state:
            optim_state_dict = None
        else:
            optim_state_dict = optimizer.state_dict()

        save_checkpoint({
            'epoch': epoch + 1,
            'model': args.model,
            'config': args.model_config,
            'state_dict': model.state_dict(),
            'optim_state_dict': optim_state_dict,
            'best_prec1': best_prec1
        }, is_best, path=save_path, save_all=False)

        logging.info('\nResults - Epoch: {0}\n'
                     'Training Loss {train[loss]:.4f} \t'
                     'Training Prec@1 {train[prec1]:.3f} \t'
                     'Training Prec@5 {train[prec5]:.3f} \t'
                     'Validation Loss {val[loss]:.4f} \t'
                     'Validation Prec@1 {val[prec1]:.3f} \t'
                     'Validation Prec@5 {val[prec5]:.3f} \t\n'
                     .format(epoch + 1, train=train_results,
                             val=val_results))

        values = dict(epoch=epoch + 1, steps=trainer.training_steps)
        values.update({'training ' + k: v for k, v in train_results.items()})
        values.update({'validation ' + k: v for k, v in val_results.items()})
        results.add(**values)

        results.plot(x='epoch', y=['training loss', 'validation loss'],
                     legend=['training', 'validation'],
                     title='Loss', ylabel='loss')
        results.plot(x='epoch', y=['training error1', 'validation error1'],
                     legend=['training', 'validation'],
                     title='Error@1', ylabel='error %')
        results.plot(x='epoch', y=['training error5', 'validation error5'],
                     legend=['training', 'validation'],
                     title='Error@5', ylabel='error %')
        if 'grad' in train_results.keys():
            results.plot(x='epoch', y=['training grad'],
                         legend=['gradient L2 norm'],
                         title='Gradient Norm', ylabel='value')
        results.save()
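# Hedged sketch: main_worker() above unpacks model.sampled_data_regime with
# zip(*...), so it expects (probability, config-override) pairs; each config
# is merged over train_data_defaults, wrapped in a DataRegime, and
# SampledDataRegime samples between the resulting loaders. The values below
# are illustrative assumptions, not a regime defined by any model here.
EXAMPLE_SAMPLED_DATA_REGIME = [
    (0.8, {'batch_size': 256, 'duplicates': 1}),  # drawn 80% of the time
    (0.2, {'batch_size': 64, 'duplicates': 4}),   # drawn 20% of the time
]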
def main():
    global args, best_prec1, dtype
    best_prec1 = 0
    args = parser.parse_args()
    dtype = torch_dtypes.get(args.dtype)
    torch.manual_seed(args.seed)
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'
    if args.save == '':
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)

    args.distributed = args.local_rank >= 0 or args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_init,
                                world_size=args.world_size,
                                rank=args.local_rank)
        args.local_rank = dist.get_rank()
        args.world_size = dist.get_world_size()
        if args.dist_backend == 'mpi':
            # If using MPI, select all visible devices
            args.device_ids = list(range(torch.cuda.device_count()))
        else:
            args.device_ids = [args.local_rank]

    if not os.path.exists(save_path) and \
            not (args.distributed and args.local_rank > 0):
        os.makedirs(save_path)

    setup_logging(os.path.join(save_path, 'log.txt'),
                  resume=args.resume != '',
                  dummy=args.distributed and args.local_rank > 0)
    results_path = os.path.join(save_path, 'results')
    results = ResultsLog(results_path,
                         title='Training Results - %s' % args.save)

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)
    logging.info("creating model %s", args.model)

    if 'cuda' in args.device and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
        torch.cuda.set_device(args.device_ids[0])
        cudnn.benchmark = True
    else:
        args.device_ids = None

    # create model
    model = models.__dict__[args.model]
    model_config = {'dataset': args.dataset}
    if args.model_config != '':
        model_config = dict(model_config, **literal_eval(args.model_config))
    model = model(**model_config)
    logging.info("created model with configuration: %s", model_config)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info("number of parameters: %d", num_parameters)

    # optionally resume from a checkpoint
    if args.evaluate:
        if not os.path.isfile(args.evaluate):
            parser.error('invalid checkpoint: {}'.format(args.evaluate))
        checkpoint = torch.load(args.evaluate)
        model.load_state_dict(checkpoint['state_dict'])
        logging.info("loaded checkpoint '%s' (epoch %s)",
                     args.evaluate, checkpoint['epoch'])
    elif args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            results.load(os.path.join(checkpoint_file, 'results.csv'))
            checkpoint_file = os.path.join(checkpoint_file,
                                           'model_best.pth.tar')
        if os.path.isfile(checkpoint_file):
            logging.info("loading checkpoint '%s'", args.resume)
            checkpoint = torch.load(checkpoint_file)
            if args.start_epoch < 0:  # not explicitly set
                args.start_epoch = checkpoint['epoch'] - 1
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            logging.info("loaded checkpoint '%s' (epoch %s)",
                         checkpoint_file, checkpoint['epoch'])
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    # define loss function (criterion) and optimizer
    loss_params = {}
    if args.label_smoothing > 0:
        loss_params['smooth_eps'] = args.label_smoothing
    criterion = getattr(model, 'criterion', CrossEntropyLoss)(**loss_params)
    criterion.to(args.device, dtype)
    model.to(args.device, dtype)

    # Batch-norm should always be done in float
    if 'half' in args.dtype:
        FilterModules(model, module=is_bn).to(dtype=torch.float)

    # optimizer configuration
    optim_regime = getattr(model, 'regime', [{'epoch': 0,
                                              'optimizer': args.optimizer,
                                              'lr': args.lr,
                                              'momentum': args.momentum,
                                              'weight_decay': args.weight_decay}])
    optimizer = optim_regime if isinstance(optim_regime, OptimRegime) \
        else OptimRegime(model, optim_regime,
                         use_float_copy='half' in args.dtype)

    trainer = Trainer(model, criterion, optimizer,
                      device_ids=args.device_ids, device=args.device,
                      dtype=dtype, distributed=args.distributed,
                      local_rank=args.local_rank, grad_clip=args.grad_clip,
                      print_freq=args.print_freq,
                      adapt_grad_norm=args.adapt_grad_norm)

    # Evaluation Data loading code
    args.eval_batch_size = args.eval_batch_size if args.eval_batch_size > 0 \
        else args.batch_size
    val_data = DataRegime(getattr(model, 'data_eval_regime', None),
                          defaults={'datasets_path': args.datasets_dir,
                                    'name': args.dataset,
                                    'split': 'val',
                                    'augment': False,
                                    'input_size': args.input_size,
                                    'batch_size': args.eval_batch_size,
                                    'shuffle': False,
                                    'num_workers': args.workers,
                                    'pin_memory': True,
                                    'drop_last': False})

    if args.evaluate:
        results = trainer.validate(val_data.get_loader())
        logging.info(results)
        return

    # Training Data loading code
    train_data = DataRegime(getattr(model, 'data_regime', None),
                            defaults={'datasets_path': args.datasets_dir,
                                      'name': args.dataset,
                                      'split': 'train',
                                      'augment': True,
                                      'input_size': args.input_size,
                                      'batch_size': args.batch_size,
                                      'shuffle': True,
                                      'num_workers': args.workers,
                                      'pin_memory': True,
                                      'drop_last': True,
                                      'distributed': args.distributed,
                                      'duplicates': args.duplicates,
                                      'cutout': {'holes': 1, 'length': 16}
                                      if args.cutout else None})

    logging.info('optimization regime: %s', optim_regime)
    args.start_epoch = max(args.start_epoch, 0)
    trainer.training_steps = args.start_epoch * len(train_data)

    for epoch in range(args.start_epoch, args.epochs):
        trainer.epoch = epoch
        train_data.set_epoch(epoch)
        val_data.set_epoch(epoch)
        logging.info('\nStarting Epoch: {0}\n'.format(epoch + 1))

        # train for one epoch
        train_results = trainer.train(train_data.get_loader(),
                                      duplicates=train_data.get('duplicates'),
                                      chunk_batch=args.chunk_batch)

        # evaluate on validation set
        val_results = trainer.validate(val_data.get_loader())

        if args.distributed and args.local_rank > 0:
            continue

        # remember best prec@1 and save checkpoint
        is_best = val_results['prec1'] > best_prec1
        best_prec1 = max(val_results['prec1'], best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'model': args.model,
            'config': args.model_config,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1
        }, is_best, path=save_path)

        logging.info('\nResults - Epoch: {0}\n'
                     'Training Loss {train[loss]:.4f} \t'
                     'Training Prec@1 {train[prec1]:.3f} \t'
                     'Training Prec@5 {train[prec5]:.3f} \t'
                     'Validation Loss {val[loss]:.4f} \t'
                     'Validation Prec@1 {val[prec1]:.3f} \t'
                     'Validation Prec@5 {val[prec5]:.3f} \t\n'
                     .format(epoch + 1, train=train_results,
                             val=val_results))

        values = dict(epoch=epoch + 1, steps=trainer.training_steps)
        values.update({'training ' + k: v for k, v in train_results.items()})
        values.update({'validation ' + k: v for k, v in val_results.items()})
        results.add(**values)

        results.plot(x='epoch', y=['training loss', 'validation loss'],
                     legend=['training', 'validation'],
                     title='Loss', ylabel='loss')
        results.plot(x='epoch', y=['training error1', 'validation error1'],
                     legend=['training', 'validation'],
                     title='Error@1', ylabel='error %')
        results.plot(x='epoch', y=['training error5', 'validation error5'],
                     legend=['training', 'validation'],
                     title='Error@5', ylabel='error %')
        if 'grad' in train_results.keys():
            results.plot(x='epoch', y=['training grad'],
                         legend=['gradient L2 norm'],
                         title='Gradient Norm', ylabel='value')
        results.save()
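# Hedged sketch: DataRegime also accepts an epoch-indexed regime (the
# model-supplied `data_regime` above), letting a model change its data
# settings mid-training; entries override the defaults from their 'epoch'
# onward. The schedule below is an illustrative assumption.
# EXAMPLE_DATA_REGIME = [
#     {'epoch': 0, 'input_size': 128, 'batch_size': 512},
#     {'epoch': 80, 'input_size': 224, 'batch_size': 256},
# ]
# train_data = DataRegime(EXAMPLE_DATA_REGIME, defaults=train_defaults)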
def main_worker(args, ml_logger):
    global best_prec1, dtype
    best_prec1 = 0
    dtype = torch_dtypes.get(args.dtype)
    torch.manual_seed(args.seed)

    args.distributed = args.local_rank >= 0 or args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_init,
                                world_size=args.world_size,
                                rank=args.local_rank)
        args.local_rank = dist.get_rank()
        args.world_size = dist.get_world_size()
        if args.dist_backend == 'mpi':
            # If using MPI, select all visible devices
            args.device_ids = list(range(torch.cuda.device_count()))
        else:
            args.device_ids = [args.local_rank]

    if not (args.distributed and args.local_rank > 0):
        if not path.exists(args.save_path):
            makedirs(args.save_path)
        export_args_namespace(args, path.join(args.save_path, 'config.json'))

    setup_logging(path.join(args.save_path, 'log.txt'),
                  resume=args.resume != '',
                  dummy=args.distributed and args.local_rank > 0)

    results_path = path.join(args.save_path, 'results')
    results = ResultsLog(results_path,
                         title='Training Results - %s' % args.save)

    logging.info("saving to %s", args.save_path)
    logging.debug("run arguments: %s", args)
    logging.info("creating model %s", args.model)

    if 'cuda' in args.device and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
        torch.cuda.set_device(args.device_ids[0])
        cudnn.benchmark = True
    else:
        args.device_ids = None

    # create model
    model = models.__dict__[args.model]
    model_config = {'dataset': args.dataset}
    if args.model_config != '':
        model_config = dict(model_config, **literal_eval(args.model_config))
    model = model(**model_config)
    logging.info("created model with configuration: %s", model_config)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info("number of parameters: %d", num_parameters)

    optim_state_dict = None
    if args.resume:
        checkpoint_file = args.resume
        if path.isdir(checkpoint_file):
            results.load(path.join(checkpoint_file, 'results.csv'))
            checkpoint_file = path.join(checkpoint_file,
                                        'model_best.pth.tar')
        if path.isfile(checkpoint_file):
            logging.info("loading checkpoint '%s'", args.resume)
            checkpoint = torch.load(checkpoint_file, map_location="cpu")
            if args.start_epoch < 0:  # not explicitly set
                args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optim_state_dict = checkpoint.get('optim_state_dict', None)
            logging.info("loaded checkpoint '%s' (epoch %s)",
                         checkpoint_file, checkpoint['epoch'])
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    # define loss function (criterion) and optimizer
    loss_params = {}
    if args.label_smoothing > 0:
        loss_params['smooth_eps'] = args.label_smoothing
    criterion = getattr(model, 'criterion', CrossEntropyLoss)(**loss_params)
    criterion.to(args.device, dtype)
    model.to(args.device, dtype)

    # Batch-norm should always be done in float
    if 'half' in args.dtype:
        FilterModules(model, module=is_bn).to(dtype=torch.float)

    # optimizer configuration
    optim_regime = getattr(model, 'regime', [{'epoch': 0,
                                              'optimizer': args.optimizer,
                                              'lr': args.lr,
                                              'momentum': args.momentum,
                                              'weight_decay': args.weight_decay}])
    optimizer = optim_regime if isinstance(optim_regime, OptimRegime) \
        else OptimRegime(model, optim_regime,
                         use_float_copy='half' in args.dtype)
    if optim_state_dict is not None:
        optimizer.load_state_dict(optim_state_dict)

    trainer = Trainer(model, criterion, optimizer,
                      device_ids=args.device_ids, device=args.device,
                      dtype=dtype, print_freq=args.print_freq,
                      distributed=args.distributed,
                      local_rank=args.local_rank, mixup=args.mixup,
                      cutmix=args.cutmix, loss_scale=args.loss_scale,
                      grad_clip=args.grad_clip,
                      adapt_grad_norm=args.adapt_grad_norm)

    # Evaluation Data loading code
    args.eval_batch_size = args.eval_batch_size if args.eval_batch_size > 0 \
        else args.batch_size
    val_data = DataRegime(getattr(model, 'data_eval_regime', None),
                          defaults={'datasets_path': args.datasets_dir,
                                    'name': args.dataset,
                                    'split': 'val',
                                    'augment': False,
                                    'input_size': args.input_size,
                                    'batch_size': args.eval_batch_size,
                                    'shuffle': False,
                                    'num_workers': args.workers,
                                    'pin_memory': True,
                                    'drop_last': False})

    # Training Data loading code
    train_data_defaults = {'datasets_path': args.datasets_dir,
                           'name': args.dataset,
                           'split': 'train',
                           'augment': True,
                           'input_size': args.input_size,
                           'batch_size': args.batch_size,
                           'shuffle': True,
                           'num_workers': args.workers,
                           'pin_memory': True,
                           'drop_last': True,
                           'distributed': args.distributed,
                           'duplicates': args.duplicates,
                           'autoaugment': args.autoaugment,
                           'cutout': {'holes': 1, 'length': 16}
                           if args.cutout else None}

    if hasattr(model, 'sampled_data_regime'):
        sampled_data_regime = model.sampled_data_regime
        probs, regime_configs = zip(*sampled_data_regime)
        regimes = []
        for config in regime_configs:
            defaults = {**train_data_defaults}
            defaults.update(config)
            regimes.append(DataRegime(None, defaults=defaults))
        train_data = SampledDataRegime(regimes, probs)
    else:
        train_data = DataRegime(getattr(model, 'data_regime', None),
                                defaults=train_data_defaults)

    logging.info('optimization regime: %s', optim_regime)
    logging.info('data regime: %s', train_data)
    args.start_epoch = max(args.start_epoch, 0)
    trainer.training_steps = args.start_epoch * len(train_data)

    if 'zeroBN' in model_config:
        # hot start
        num_steps = int(len(train_data.get_loader()) * 0.5)
        trainer.train(train_data.get_loader(),
                      chunk_batch=args.chunk_batch, num_steps=num_steps)
        for m in model.modules():
            if isinstance(m, ZeroBN):
                m.max_sparsity = args.max_sparsity
                m.max_cos_sim = args.max_cos_sim
                if args.preserve_cosine:
                    if args.layers_cos_sim1 in m.fullName:
                        m.preserve_cosine = args.preserve_cosine
                        m.cos_sim = args.cos_sim1
                    if args.layers_cos_sim2 in m.fullName:
                        m.preserve_cosine = args.preserve_cosine
                        m.cos_sim = args.cos_sim2
                    if args.layers_cos_sim3 in m.fullName:
                        m.preserve_cosine = args.preserve_cosine
                        m.cos_sim = args.cos_sim3
                if args.min_cos_sim:
                    if args.layers_min_cos_sim1 in m.fullName:
                        m.min_cos_sim = args.min_cos_sim
                        m.cos_sim_min = args.cos_sim_min1
                    if args.layers_min_cos_sim2 in m.fullName:
                        m.min_cos_sim = args.min_cos_sim
                        m.cos_sim_min = args.cos_sim_min2

    for epoch in range(args.start_epoch, args.epochs):
        trainer.epoch = epoch
        train_data.set_epoch(epoch)
        val_data.set_epoch(epoch)
        logging.info('\nStarting Epoch: {0}\n'.format(epoch + 1))

        if 'zeroBN' in model_config:
            trainer.collectStat(train_data.get_loader(), num_steps=1,
                                prunRatio=args.stochasticPrunning,
                                cos_sim=args.cos_sim,
                                cos_sim_max=args.cos_sim_max)
            trainer.collectStat(train_data.get_loader(), num_steps=1,
                                prunRatio=args.stochasticPrunning,
                                cos_sim=args.cos_sim,
                                cos_sim_max=args.cos_sim_max)
        # torch.cuda.empty_cache()

        # train for one epoch
        train_results = trainer.train(train_data.get_loader(), ml_logger,
                                      chunk_batch=args.chunk_batch)

        # evaluate on validation set
        val_results = trainer.validate(val_data.get_loader())
        ml_logger.log_metric('Val Acc1', val_results['prec1'], step='auto')
        # torch.cuda.empty_cache()

        if args.distributed and args.local_rank > 0:
            continue

        # remember best prec@1 and save checkpoint
        is_best = val_results['prec1'] > best_prec1
        best_prec1 = max(val_results['prec1'], best_prec1)
        if args.drop_optim_state:
            optim_state_dict = None
        else:
            optim_state_dict = optimizer.state_dict()

        save_checkpoint({
            'epoch': epoch + 1,
            'model': args.model,
            'config': args.model_config,
            'state_dict': model.state_dict(),
            'optim_state_dict': optim_state_dict,
            'best_prec1': best_prec1
        }, is_best, path=args.save_path, save_all=args.save_all)

        logging.info('\nResults - Epoch: {0}\n'
                     'Training Loss {train[loss]:.4f} \t'
                     'Training Prec@1 {train[prec1]:.3f} \t'
                     'Training Prec@5 {train[prec5]:.3f} \t'
                     'Validation Loss {val[loss]:.4f} \t'
                     'Validation Prec@1 {val[prec1]:.3f} \t'
                     'Validation Prec@5 {val[prec5]:.3f} \t\n'
                     .format(epoch + 1, train=train_results,
                             val=val_results))

        values = dict(epoch=epoch + 1, steps=trainer.training_steps)
        values.update({'training ' + k: v for k, v in train_results.items()})
        values.update({'validation ' + k: v for k, v in val_results.items()})
        results.add(**values)

        results.plot(x='epoch', y=['training loss', 'validation loss'],
                     legend=['training', 'validation'],
                     title='Loss', ylabel='loss')
        results.plot(x='epoch', y=['training error1', 'validation error1'],
                     legend=['training', 'validation'],
                     title='Error@1', ylabel='error %')
        results.plot(x='epoch', y=['training error5', 'validation error5'],
                     legend=['training', 'validation'],
                     title='Error@5', ylabel='error %')
        if 'grad' in train_results.keys():
            results.plot(x='epoch', y=['training grad'],
                         legend=['gradient L2 norm'],
                         title='Gradient Norm', ylabel='value')
        results.save()
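# Hedged sketch: checkpoints written by save_checkpoint() above carry the
# same keys the resume path reads ('epoch', 'model', 'config', 'state_dict',
# 'optim_state_dict', 'best_prec1'). A minimal manual reload, assuming a
# model and optimizer built exactly as in main_worker():
# checkpoint = torch.load('results/<run>/model_best.pth.tar',
#                         map_location='cpu')
# model.load_state_dict(checkpoint['state_dict'])
# if checkpoint.get('optim_state_dict') is not None:
#     optimizer.load_state_dict(checkpoint['optim_state_dict'])
# start_epoch = checkpoint['epoch']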
def main_worker(args):
    global best_prec1, dtype
    best_prec1 = 0
    dtype = torch_dtypes.get(args.dtype)
    torch.manual_seed(args.seed)
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'

    model_config = {'dataset': args.dataset, 'batch': args.batch_size}
    if args.model_config != '':
        model_config = dict(model_config, **literal_eval(args.model_config))

    # autoname
    fname = auto_name(args, model_config)
    args.save = fname
    monitor = args.monitor
    print(fname)
    save_path = path.join(args.results_dir, args.save)

    args.distributed = args.local_rank >= 0 or args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_init,
                                world_size=args.world_size,
                                rank=args.local_rank)
        args.local_rank = dist.get_rank()
        args.world_size = dist.get_world_size()
        if args.dist_backend == 'mpi':
            # If using MPI, select all visible devices
            args.device_ids = list(range(torch.cuda.device_count()))
        else:
            args.device_ids = [args.local_rank]

    if not (args.distributed and args.local_rank > 0):
        if not args.dry:
            if not path.exists(save_path):
                makedirs(save_path)
            export_args_namespace(args, path.join(save_path, 'config.json'))

    if monitor > 0 and not args.dry:
        events_path = "runs/%s" % fname
        my_file = Path(events_path)
        if my_file.is_file():
            os.remove(events_path)
        writer = SummaryWriter(log_dir=events_path, comment=str(args))
        model_config['writer'] = writer
        model_config['monitor'] = monitor
    else:
        monitor = 0
        writer = None

    if args.dry:
        model = models.__dict__[args.model]
        model = model(**model_config)
        print("created model with configuration: %s" % model_config)
        num_parameters = sum([l.nelement() for l in model.parameters()])
        print("number of parameters: %d" % num_parameters)
        return

    setup_logging(path.join(save_path, 'log.txt'),
                  resume=args.resume != '',
                  dummy=args.distributed and args.local_rank > 0)

    results_path = path.join(save_path, 'results')
    results = ResultsLog(results_path,
                         title='Training Results - %s' % args.save)

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)
    logging.info("creating model %s", args.model)

    if 'cuda' in args.device and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
        torch.cuda.set_device(args.device_ids[0])
        cudnn.benchmark = True
    else:
        args.device_ids = None

    # create model
    model = models.__dict__[args.model]
    model = model(**model_config)
    if args.sync_bn:
        model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
    logging.info("created model with configuration: %s", model_config)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info("number of parameters: %d", num_parameters)

    # optionally resume from a checkpoint
    if args.evaluate:
        if not path.isfile(args.evaluate):
            parser.error('invalid checkpoint: {}'.format(args.evaluate))
        checkpoint = torch.load(args.evaluate, map_location="cpu")
        # Override configuration with checkpoint info
        args.model = checkpoint.get('model', args.model)
        args.model_config = checkpoint.get('config', args.model_config)
        # load checkpoint
        model.load_state_dict(checkpoint['state_dict'])
        logging.info("loaded checkpoint '%s' (epoch %s)",
                     args.evaluate, checkpoint['epoch'])

    optim_state_dict = None
    if args.resume:
        checkpoint_file = args.resume
        if path.isdir(checkpoint_file):
            results.load(path.join(checkpoint_file, 'results.csv'))
            checkpoint_file = path.join(checkpoint_file,
                                        'model_best.pth.tar')
        if path.isfile(checkpoint_file):
            logging.info("loading checkpoint '%s'", args.resume)
            checkpoint = torch.load(checkpoint_file, map_location="cpu")
            if args.start_epoch < 0:  # not explicitly set
                args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optim_state_dict = checkpoint.get('optim_state_dict', None)
            logging.info("loaded checkpoint '%s' (epoch %s)",
                         checkpoint_file, checkpoint['epoch'])
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    # define loss function (criterion) and optimizer
    loss_params = {}
    if args.label_smoothing > 0:
        loss_params['smooth_eps'] = args.label_smoothing
    criterion = getattr(model, 'criterion', CrossEntropyLoss)(**loss_params)
    criterion.to(args.device, dtype)
    model.to(args.device, dtype)

    # Batch-norm should always be done in float
    if 'half' in args.dtype:
        FilterModules(model, module=is_bn).to(dtype=torch.float)

    # optimizer configuration
    optim_regime = getattr(model, 'regime', [{'epoch': 0,
                                              'optimizer': args.optimizer,
                                              'lr': args.lr,
                                              'momentum': args.momentum,
                                              'weight_decay': args.weight_decay}])
    optimizer = optim_regime if isinstance(optim_regime, OptimRegime) \
        else OptimRegime(model, optim_regime,
                         use_float_copy='half' in args.dtype)
    if optim_state_dict is not None:
        optimizer.load_state_dict(optim_state_dict)

    trainer = Trainer(model, criterion, optimizer,
                      device_ids=args.device_ids, device=args.device,
                      dtype=dtype, print_freq=args.print_freq,
                      distributed=args.distributed,
                      local_rank=args.local_rank, mixup=args.mixup,
                      cutmix=args.cutmix, loss_scale=args.loss_scale,
                      grad_clip=args.grad_clip,
                      adapt_grad_norm=args.adapt_grad_norm,
                      writer=writer, monitor=monitor)
    if args.tensorwatch:
        trainer.set_watcher(filename=path.abspath(
            path.join(save_path, 'tensorwatch.log')),
            port=args.tensorwatch_port)

    # Evaluation Data loading code
    args.eval_batch_size = args.eval_batch_size if args.eval_batch_size > 0 \
        else args.batch_size
    val_data = DataRegime(getattr(model, 'data_eval_regime', None),
                          defaults={'datasets_path': args.datasets_dir,
                                    'name': args.dataset,
                                    'split': 'val',
                                    'augment': False,
                                    'input_size': args.input_size,
                                    'batch_size': args.eval_batch_size,
                                    'shuffle': False,
                                    'num_workers': args.workers,
                                    'pin_memory': True,
                                    'drop_last': False})

    if args.evaluate:
        results = trainer.validate(val_data.get_loader())
        logging.info(results)
        return

    # Training Data loading code
    train_data_defaults = {'datasets_path': args.datasets_dir,
                           'name': args.dataset,
                           'split': 'train',
                           'augment': True,
                           'input_size': args.input_size,
                           'batch_size': args.batch_size,
                           'shuffle': True,
                           'num_workers': args.workers,
                           'pin_memory': True,
                           'drop_last': True,
                           'distributed': args.distributed,
                           'duplicates': args.duplicates,
                           'autoaugment': args.autoaugment,
                           'cutout': {'holes': 1, 'length': 16}
                           if args.cutout else None}

    if hasattr(model, 'sampled_data_regime'):
        sampled_data_regime = model.sampled_data_regime
        probs, regime_configs = zip(*sampled_data_regime)
        regimes = []
        for config in regime_configs:
            defaults = {**train_data_defaults}
            defaults.update(config)
            regimes.append(DataRegime(None, defaults=defaults))
        train_data = SampledDataRegime(regimes, probs)
    else:
        train_data = DataRegime(getattr(model, 'data_regime', None),
                                defaults=train_data_defaults)

    logging.info('optimization regime: %s', optim_regime)
    logging.info('data regime: %s', train_data)
    args.start_epoch = max(args.start_epoch, 0)
    trainer.training_steps = args.start_epoch * len(train_data)

    if args.covmat != "":
        try:
            int_covmat = int(args.covmat)
            if int_covmat < 0:
                total_layers = len(
                    [name for name, layer in model.named_children()])
                int_covmat = total_layers + int_covmat
            child_cnt = 0
        except ValueError:
            int_covmat = None

        def calc_covmat(x_, partitions=64):
            # Compute the Gram matrix of x_ block-wise and return variance
            # statistics of its diagonal vs. off-diagonal entries.
            L = x_.shape[0] // partitions
            non_diags = []
            diags = []
            for p1 in range(partitions):
                for p2 in range(partitions):
                    x = x_[p1 * L:(p1 + 1) * L]
                    y = x_[p2 * L:(p2 + 1) * L]
                    X = torch.matmul(x, y.transpose(0, 1))
                    if p1 == p2:
                        mask = torch.eye(X.shape[0], dtype=torch.bool)
                        non_diag = X[~mask].reshape(-1).cpu()
                        diag = X[mask].reshape(-1).cpu()
                        non_diags.append(non_diag)
                        diags.append(diag)
                    else:
                        non_diag = X.reshape(-1).cpu()
                        non_diags.append(non_diag)
            diags = torch.cat(diags)
            non_diags = torch.cat(non_diags)
            diag_var = diags.var()
            non_diag_var = non_diags.var()
            diags = diags - diags.mean()
            non_diags = non_diags - non_diags.mean()
            diag_small_ratio = (diags < -diags.std()).to(
                dtype=torch.float).mean()
            non_diag_small_ratio = (non_diags < -non_diags.std()).to(
                dtype=torch.float).mean()
            return (diag_var, non_diag_var,
                    diag_small_ratio, non_diag_small_ratio)

        global diag_var_mean
        global non_diag_var_mean
        global var_count
        var_count = 0
        diag_var_mean = 0
        non_diag_var_mean = 0

        def report_covmat_hook(module, input, output):
            global diag_var_mean
            global non_diag_var_mean
            global var_count
            flatten_output = output.reshape([-1, 1]).detach()
            diag_var, non_diag_var, diag_small_ratio, non_diag_small_ratio = \
                calc_covmat(flatten_output)
            diag_var_mean = diag_var_mean + diag_var
            non_diag_var_mean = non_diag_var_mean + non_diag_var
            var_count = var_count + 1
            if var_count % 10 == 1:
                print("diag_var = %.02f (%.02f), ratio: %.02f , "
                      "non_diag_var = %0.2f (%.02f), ratio: %.02f" %
                      (diag_var, diag_var_mean / var_count,
                       diag_small_ratio, non_diag_var,
                       non_diag_var_mean / var_count,
                       non_diag_small_ratio))

        for name, layer in model.named_children():
            if int_covmat is None:
                condition = (name == args.covmat)
            else:
                condition = (child_cnt == int_covmat)
                child_cnt = child_cnt + 1
            if condition:
                layer.register_forward_hook(report_covmat_hook)

    for epoch in range(args.start_epoch, args.epochs):
        trainer.epoch = epoch
        train_data.set_epoch(epoch)
        val_data.set_epoch(epoch)
        logging.info('\nStarting Epoch: {0}\n'.format(epoch + 1))

        # train for one epoch
        train_results = trainer.train(train_data.get_loader(),
                                      chunk_batch=args.chunk_batch)

        # evaluate on validation set
        val_results = trainer.validate(val_data.get_loader())

        if args.distributed and args.local_rank > 0:
            continue

        # remember best prec@1 and save checkpoint
        is_best = val_results['prec1'] > best_prec1
        best_prec1 = max(val_results['prec1'], best_prec1)
        if args.drop_optim_state:
            optim_state_dict = None
        else:
            optim_state_dict = optimizer.state_dict()

        save_checkpoint({
            'epoch': epoch + 1,
            'model': args.model,
            'config': args.model_config,
            'state_dict': model.state_dict(),
            'optim_state_dict': optim_state_dict,
            'best_prec1': best_prec1
        }, is_best, path=save_path, save_all=args.save_all)

        logging.info('\nResults - Epoch: {0}\n'
                     'Training Loss {train[loss]:.4f} \t'
                     'Training Prec@1 {train[prec1]:.3f} \t'
                     'Training Prec@5 {train[prec5]:.3f} \t'
                     'Validation Loss {val[loss]:.4f} \t'
                     'Validation Prec@1 {val[prec1]:.3f} \t'
                     'Validation Prec@5 {val[prec5]:.3f} \t\n'
                     .format(epoch + 1, train=train_results,
                             val=val_results))

        if writer is not None:
            writer.add_scalar('Train/Loss', train_results['loss'], epoch)
            writer.add_scalar('Train/Prec@1', train_results['prec1'], epoch)
            writer.add_scalar('Train/Prec@5', train_results['prec5'], epoch)
            writer.add_scalar('Val/Loss', val_results['loss'], epoch)
            writer.add_scalar('Val/Prec@1', val_results['prec1'], epoch)
            writer.add_scalar('Val/Prec@5', val_results['prec5'], epoch)
            # tmplr = optimizer.get_lr()
            # writer.add_scalar('HyperParameters/learning-rate', tmplr, epoch)

        values = dict(epoch=epoch + 1, steps=trainer.training_steps)
        values.update({'training ' + k: v for k, v in train_results.items()})
        values.update({'validation ' + k: v for k, v in val_results.items()})
        results.add(**values)

        results.plot(x='epoch', y=['training loss', 'validation loss'],
                     legend=['training', 'validation'],
                     title='Loss', ylabel='loss')
        results.plot(x='epoch', y=['training error1', 'validation error1'],
                     legend=['training', 'validation'],
                     title='Error@1', ylabel='error %')
        results.plot(x='epoch', y=['training error5', 'validation error5'],
                     legend=['training', 'validation'],
                     title='Error@5', ylabel='error %')
        if 'grad' in train_results.keys():
            results.plot(x='epoch', y=['training grad'],
                         legend=['gradient L2 norm'],
                         title='Gradient Norm', ylabel='value')
        results.save()

    logging.info(f'\nBest Validation Accuracy (top1): {best_prec1}')
    if writer:
        writer.close()
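# Hedged sketch: report_covmat_hook above flattens an activation to a column
# vector and compares the variance of diagonal vs. off-diagonal entries of
# its Gram matrix, computed block-wise to bound memory. A self-contained toy
# version of the same diagnostic (small enough to skip the partitioning):
# import torch
# x = torch.randn(4096, 1)            # stand-in for a flattened activation
# X = x @ x.t()                       # full Gram matrix
# mask = torch.eye(X.shape[0], dtype=torch.bool)
# print('diag var %.4f, off-diag var %.4f'
#       % (X[mask].var().item(), X[~mask].var().item()))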
def main_worker(args):
    global best_prec1, dtype
    best_prec1 = 0
    dtype = torch_dtypes.get(args.dtype)
    torch.manual_seed(args.seed)
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'
    if args.save == '':
        args.save = time_stamp
    save_path = path.join(args.results_dir, args.save)

    args.distributed = args.local_rank >= 0 or args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_init,
                                world_size=args.world_size,
                                rank=args.local_rank)
        args.local_rank = dist.get_rank()
        args.world_size = dist.get_world_size()
        if args.dist_backend == 'mpi':
            # If using MPI, select all visible devices
            args.device_ids = list(range(torch.cuda.device_count()))
        else:
            args.device_ids = [args.local_rank]

    if not (args.distributed and args.local_rank > 0):
        if not path.exists(save_path):
            makedirs(save_path)
        export_args_namespace(args, path.join(save_path, 'config.json'))

    setup_logging(path.join(save_path, 'log.txt'),
                  resume=args.resume != '',
                  dummy=args.distributed and args.local_rank > 0)

    results_path = path.join(save_path, 'results')
    results = ResultsLog(results_path,
                         title='Training Results - %s' % args.save)
    grad_stats_path = path.join(save_path, 'grad_stats')
    grad_stats = ResultsLog(grad_stats_path,
                            title='collect grad stats - %s' % args.save)

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)
    logging.info("creating model %s", args.model)

    if 'cuda' in args.device and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
        torch.cuda.set_device(args.device_ids[0])
        cudnn.benchmark = True
    else:
        args.device_ids = None

    # create model
    model = models.__dict__[args.model]
    model_config = {'dataset': args.dataset}
    if args.model_config != '':
        model_config = dict(model_config, **literal_eval(args.model_config))
    if args.enable_scheduler:
        model_config['fp8_dynamic'] = True
    if args.smart_loss_scale_only:
        model_config['smart_loss_scale_only'] = True
    if args.smart_loss_scale_and_exp_bits:
        model_config['smart_loss_scale_and_exp_bits'] = True
    model = model(**model_config)

    quantize_modules_name = [n for n, m in model.named_modules()
                             if isinstance(m, nn.Conv2d)]
    fp8_scheduler = FP8TrainingScheduler(
        model, model_config, args,
        collect_stats_online=False,
        start_to_collect_stats_in_epoch=3,
        collect_stats_every_epochs=10,
        online_update=False,
        first_update_with_stats_from_epoch=4,
        start_online_update_in_epoch=3,
        update_every_epochs=1,
        update_loss_scale=True,
        update_exp_bit_width=args.smart_loss_scale_and_exp_bits,
        stats_path="/data/moran/ConvNet_lowp_0/convNet.pytorch/results/2020-05-16_01-44-22/results.csv",  # ResNet18 - CIFAR-10
        # stats_path="/data/moran/ConvNet_lowp_0/convNet.pytorch/results/2020-05-19_01-27-57/results.csv",  # ResNet18 - ImageNet
        quantize_modules_name=quantize_modules_name,
        enable_scheduler=False)

    if args.sync_bn:
        model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
    logging.info("created model with configuration: %s", model_config)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info("number of parameters: %d", num_parameters)

    # optionally resume from a checkpoint
    if args.evaluate:
        if not path.isfile(args.evaluate):
            parser.error('invalid checkpoint: {}'.format(args.evaluate))
        checkpoint = torch.load(args.evaluate, map_location="cpu")
        # Override configuration with checkpoint info
        args.model = checkpoint.get('model', args.model)
        args.model_config = checkpoint.get('config', args.model_config)
        # load checkpoint
        model.load_state_dict(checkpoint['state_dict'])
        logging.info("loaded checkpoint '%s' (epoch %s)",
                     args.evaluate, checkpoint['epoch'])

    optim_state_dict = None
    if args.resume:
        checkpoint_file = args.resume
        if path.isdir(checkpoint_file):
            results.load(path.join(checkpoint_file, 'results.csv'))
            checkpoint_file = path.join(checkpoint_file,
                                        'model_best.pth.tar')
        if path.isfile(checkpoint_file):
            logging.info("loading checkpoint '%s'", args.resume)
            checkpoint = torch.load(checkpoint_file, map_location="cpu")
            if args.start_epoch < 0:  # not explicitly set
                args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optim_state_dict = checkpoint.get('optim_state_dict', None)
            logging.info("loaded checkpoint '%s' (epoch %s)",
                         checkpoint_file, checkpoint['epoch'])
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    # define loss function (criterion) and optimizer
    loss_params = {}
    if args.label_smoothing > 0:
        loss_params['smooth_eps'] = args.label_smoothing
    criterion = getattr(model, 'criterion', CrossEntropyLoss)(**loss_params)
    criterion.to(args.device, dtype)
    model.to(args.device, dtype)

    # Batch-norm should always be done in float
    if 'half' in args.dtype:
        FilterModules(model, module=is_bn).to(dtype=torch.float)

    # optimizer configuration
    optim_regime = getattr(model, 'regime', [{'epoch': 0,
                                              'optimizer': args.optimizer,
                                              'lr': args.lr,
                                              'momentum': args.momentum,
                                              'weight_decay': args.weight_decay}])
    optimizer = optim_regime if isinstance(optim_regime, OptimRegime) \
        else OptimRegime(model, optim_regime,
                         use_float_copy='half' in args.dtype)
    if optim_state_dict is not None:
        optimizer.load_state_dict(optim_state_dict)

    trainer = Trainer(model, criterion, optimizer,
                      device_ids=args.device_ids, device=args.device,
                      dtype=dtype, print_freq=args.print_freq,
                      distributed=args.distributed,
                      local_rank=args.local_rank, mixup=args.mixup,
                      cutmix=args.cutmix, loss_scale=args.loss_scale,
                      grad_clip=args.grad_clip,
                      adapt_grad_norm=args.adapt_grad_norm,
                      enable_input_grad_statistics=True,
                      exp_bits=args.exp_bits, fp_bits=args.fp_bits)
    if args.tensorwatch:
        trainer.set_watcher(filename=path.abspath(
            path.join(save_path, 'tensorwatch.log')),
            port=args.tensorwatch_port)

    # Evaluation Data loading code
    args.eval_batch_size = args.eval_batch_size if args.eval_batch_size > 0 \
        else args.batch_size
    val_data = DataRegime(getattr(model, 'data_eval_regime', None),
                          defaults={'datasets_path': args.datasets_dir,
                                    'name': args.dataset,
                                    'split': 'val',
                                    'augment': False,
                                    'input_size': args.input_size,
                                    'batch_size': args.eval_batch_size,
                                    'shuffle': False,
                                    'num_workers': args.workers,
                                    'pin_memory': True,
                                    'drop_last': False})

    if args.evaluate:
        results = trainer.validate(val_data.get_loader())
        logging.info(results)
        return

    # Training Data loading code
    train_data_defaults = {'datasets_path': args.datasets_dir,
                           'name': args.dataset,
                           'split': 'train',
                           'augment': True,
                           'input_size': args.input_size,
                           'batch_size': args.batch_size,
                           'shuffle': True,
                           'num_workers': args.workers,
                           'pin_memory': True,
                           'drop_last': True,
                           'distributed': args.distributed,
                           'duplicates': args.duplicates,
                           'autoaugment': args.autoaugment,
                           'cutout': {'holes': 1, 'length': 16}
                           if args.cutout else None}

    if hasattr(model, 'sampled_data_regime'):
        sampled_data_regime = model.sampled_data_regime
        probs, regime_configs = zip(*sampled_data_regime)
        regimes = []
        for config in regime_configs:
            defaults = {**train_data_defaults}
            defaults.update(config)
            regimes.append(DataRegime(None, defaults=defaults))
        train_data = SampledDataRegime(regimes, probs)
    else:
        train_data = DataRegime(getattr(model, 'data_regime', None),
                                defaults=train_data_defaults)

    logging.info('optimization regime: %s', optim_regime)
    logging.info('data regime: %s', train_data)
    args.start_epoch = max(args.start_epoch, 0)
    trainer.training_steps = args.start_epoch * len(train_data)

    for epoch in range(args.start_epoch, args.epochs):
        trainer.epoch = epoch
        train_data.set_epoch(epoch)
        val_data.set_epoch(epoch)
        logging.info('\nStarting Epoch: {0}\n'.format(epoch + 1))
        fp8_scheduler.schedule_before_epoch(epoch)

        # train for one epoch
        train_results, meters_grad = trainer.train(
            train_data.get_loader(),
            chunk_batch=args.chunk_batch,
            scheduled_instructions=fp8_scheduler.scheduled_instructions)

        # evaluate on validation set
        if args.calibrate_bn:
            train_data = DataRegime(None,
                                    defaults={'datasets_path': args.datasets_dir,
                                              'name': args.dataset,
                                              'split': 'train',
                                              'augment': True,
                                              'input_size': args.input_size,
                                              'batch_size': args.batch_size,
                                              'shuffle': True,
                                              'num_workers': args.workers,
                                              'pin_memory': True,
                                              'drop_last': False})
            trainer.calibrate_bn(train_data.get_loader(), num_steps=200)
        val_results, _ = trainer.validate(val_data.get_loader())

        if args.distributed and args.local_rank > 0:
            continue

        # remember best prec@1 and save checkpoint
        is_best = val_results['prec1'] > best_prec1
        best_prec1 = max(val_results['prec1'], best_prec1)
        if args.drop_optim_state:
            optim_state_dict = None
        else:
            optim_state_dict = optimizer.state_dict()

        save_checkpoint({
            'epoch': epoch + 1,
            'model': args.model,
            'config': args.model_config,
            'state_dict': model.state_dict(),
            'optim_state_dict': optim_state_dict,
            'best_prec1': best_prec1
        }, is_best, path=save_path, save_all=args.save_all)

        logging.info('\nResults - Epoch: {0}\n'
                     'Training Loss {train[loss]:.4f} \t'
                     'Training Prec@1 {train[prec1]:.3f} \t'
                     'Training Prec@5 {train[prec5]:.3f} \t'
                     'Validation Loss {val[loss]:.4f} \t'
                     'Validation Prec@1 {val[prec1]:.3f} \t'
                     'Validation Prec@5 {val[prec5]:.3f} \t\n'
                     .format(epoch + 1, train=train_results,
                             val=val_results))

        values = dict(epoch=epoch + 1, steps=trainer.training_steps)
        values.update({'training ' + k: v for k, v in train_results.items()})
        values.update({'validation ' + k: v for k, v in val_results.items()})
        values.update({'grad mean ' + k: v['mean'].avg
                       for k, v in meters_grad.items()})
        values.update({'grad std ' + k: v['std'].avg
                       for k, v in meters_grad.items()})
        results.add(**values)

        # stats were collected this epoch
        if fp8_scheduler.scheduled_instructions['collect_stat']:
            grad_stats_values = dict(epoch=epoch + 1)
            grad_stats_values.update({'grad mean ' + k: v['mean'].avg
                                      for k, v in meters_grad.items()})
            grad_stats_values.update({'grad std ' + k: v['std'].avg
                                      for k, v in meters_grad.items()})
            grad_stats.add(**grad_stats_values)
            fp8_scheduler.update_stats(grad_stats)

        results.plot(x='epoch', y=['training loss', 'validation loss'],
                     legend=['training', 'validation'],
                     title='Loss', ylabel='loss')
        results.plot(x='epoch', y=['training error1', 'validation error1'],
                     legend=['training', 'validation'],
                     title='Error@1', ylabel='error %')
        results.plot(x='epoch', y=['training error5', 'validation error5'],
                     legend=['training', 'validation'],
                     title='Error@5', ylabel='error %')
        if 'grad' in train_results.keys():
            results.plot(x='epoch', y=['training grad'],
                         legend=['gradient L2 norm'],
                         title='Gradient Norm', ylabel='value')
        results.save()
        grad_stats.save()
def main():
    global args, best_prec1, dtype
    best_prec1 = 0
    args = parser.parse_args()
    dtype = torch_dtypes.get(args.dtype)
    torch.manual_seed(args.seed)
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'
    if args.save == '':
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    setup_logging(os.path.join(save_path, 'log.txt'),
                  resume=args.resume != '')
    results_path = os.path.join(save_path, 'results')
    results = ResultsLog(results_path,
                         title='Training Results - %s' % args.save)

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)

    if 'cuda' in args.device and torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
        torch.cuda.set_device(args.device_ids[0])
        cudnn.benchmark = True
    else:
        args.device_ids = None

    # create model
    logging.info("creating model %s", args.model)
    model_builder = models.__dict__[args.model]
    model_config = {
        'input_size': args.input_size,
        'dataset': args.dataset if args.dataset != 'imaginet' else 'imagenet'
    }

    if args.model_config != '':
        model_config = dict(model_config, **literal_eval(args.model_config))

    model = model_builder(**model_config)
    model.to(args.device, dtype)

    # Data loading code
    default_transform = {
        'train': get_transform(args.dataset,
                               input_size=args.input_size, augment=True),
        'eval': get_transform(args.dataset,
                              input_size=args.input_size, augment=False)
    }
    logging.info("created model with configuration: %s", model_config)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info("number of parameters: %d", num_parameters)

    transform = getattr(model, 'input_transform', default_transform)
    regime = getattr(model, 'regime', [{'epoch': 0,
                                        'optimizer': args.optimizer,
                                        'lr': args.lr,
                                        'momentum': args.momentum,
                                        'weight_decay': args.weight_decay}])

    # define loss function (criterion) and optimizer
    criterion = getattr(model, 'criterion', nn.CrossEntropyLoss)()
    criterion.to(args.device, dtype)

    train_data = get_dataset(args.dataset, 'train', transform['train'])
    train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True, drop_last=True)

    val_data = get_dataset(args.dataset, 'val', transform['eval'])
    val_loader = torch.utils.data.DataLoader(
        val_data,
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    def load_maybe_calibrate(checkpoint):
        # Try a plain load first; for quantized models fall back to a
        # pre-measured checkpoint or calibrate quantization parameters.
        global best_prec1
        try:
            model.load_state_dict(checkpoint)
        except BaseException as e:
            if model_config.get('quantize'):
                measure_name = '{}-{}.measure'.format(args.model,
                                                      model_config['depth'])
                measure_path = os.path.join(save_path, measure_name)
                if os.path.exists(measure_path):
                    logging.info("loading measured checkpoint '%s'",
                                 measure_path)
                    checkpoint = torch.load(measure_path)
                    if 'state_dict' in checkpoint:
                        best_prec1 = checkpoint['best_prec1']
                        checkpoint = checkpoint['state_dict']
                        logging.info(
                            f"Measured checkpoint loaded, reference score top1 {best_prec1:.3f}")
                    model.load_state_dict(checkpoint)
                else:
                    if model_config.get('absorb_bn'):
                        from utils.absorb_bn import search_absorbe_bn
                        logging.info('absorbing batch normalization')
                        # rebuild a float model to absorb BN into, then
                        # transfer the absorbed weights to the quantized model
                        model_config.update({'absorb_bn': False,
                                             'quantize': False})
                        model_bn = model_builder(**model_config)
                        model_bn.load_state_dict(checkpoint)
                        search_absorbe_bn(model_bn, verbose=True)
                        model_config.update({'absorb_bn': True,
                                             'quantize': True})
                        checkpoint = model_bn.state_dict()
                    model.load_state_dict(checkpoint, strict=False)
                    logging.info("set model measure mode")
                    # set_bn_is_train(model, False)
                    set_measure_mode(model, True, logger=logging)
                    logging.info(
                        "calibrating apprentice model to get quant params")
                    model.to(args.device, dtype)
                    with torch.no_grad():
                        losses_avg, top1_avg, top5_avg = forward(
                            val_loader, model, criterion, 0,
                            training=False, optimizer=None)
                    logging.info('Measured float results:\nLoss {loss:.4f}\t'
                                 'Prec@1 {top1:.3f}\t'
                                 'Prec@5 {top5:.3f}'.format(loss=losses_avg,
                                                            top1=top1_avg,
                                                            top5=top5_avg))
                    set_measure_mode(model, False, logger=logging)
                    # logging.info("test quant model accuracy")
                    # losses_avg, top1_avg, top5_avg = validate(val_loader, model, criterion, 0)
                    # logging.info('Quantized results:\nLoss {loss:.4f}\t'
                    #              'Prec@1 {top1:.3f}\t'
                    #              'Prec@5 {top5:.3f}'.format(loss=losses_avg, top1=top1_avg, top5=top5_avg))
                    save_checkpoint({'epoch': 0,
                                     'model': args.model,
                                     'config': args.model_config,
                                     'state_dict': model.state_dict(),
                                     'best_prec1': top1_avg,
                                     'regime': regime},
                                    True, path=save_path, save_all=True,
                                    filename=measure_name)
            else:
                raise e

    # optionally resume from a checkpoint
    if args.evaluate:
        if not os.path.isfile(args.evaluate):
            parser.error('invalid checkpoint: {}'.format(args.evaluate))
        checkpoint = torch.load(args.evaluate)
        # model.load_state_dict(checkpoint['state_dict'])
        # logging.info("loaded checkpoint '%s' (epoch %s)",
        #              args.evaluate, checkpoint['epoch'])
        load_maybe_calibrate(checkpoint)
    elif args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            results.load(os.path.join(checkpoint_file, 'results.csv'))
            checkpoint_file = os.path.join(checkpoint_file,
                                           'model_best.pth.tar')
        if os.path.isfile(checkpoint_file):
            logging.info("loading checkpoint '%s'", args.resume)
            checkpoint = torch.load(checkpoint_file)
            if 'state_dict' in checkpoint:
                if checkpoint['epoch'] > 0:
                    args.start_epoch = checkpoint['epoch'] - 1
                best_prec1 = checkpoint['best_prec1']
                checkpoint = checkpoint['state_dict']
            try:
                model.load_state_dict(checkpoint)
            except BaseException as e:
                if model_config.get('quantize'):
                    if model_config.get('absorb_bn'):
                        from utils.absorb_bn import search_absorbe_bn
                        logging.info('absorbing batch normalization')
                        model_config.update({'absorb_bn': False,
                                             'quantize': False})
                        model_bn = model_builder(**model_config)
                        model_bn.load_state_dict(checkpoint)
                        search_absorbe_bn(model_bn, verbose=True)
                        model_config.update({'absorb_bn': True,
                                             'quantize': True})
                        checkpoint = model_bn.state_dict()
                    model.load_state_dict(checkpoint, strict=False)
                    model.to(args.device, dtype)
                    logging.info("set model measure mode")
                    # set_bn_is_train(model, False)
                    set_measure_mode(model, True, logger=logging)
                    logging.info(
                        "calibrating apprentice model to get quant params")
                    model.to(args.device, dtype)
                    with torch.no_grad():
                        losses_avg, top1_avg, top5_avg = forward(
                            val_loader, model, criterion, 0,
                            training=False, optimizer=None)
                    logging.info('Measured float results:\nLoss {loss:.4f}\t'
                                 'Prec@1 {top1:.3f}\t'
                                 'Prec@5 {top5:.3f}'.format(loss=losses_avg,
                                                            top1=top1_avg,
                                                            top5=top5_avg))
                    set_measure_mode(model, False, logger=logging)
                    logging.info("test quant model accuracy")
                    losses_avg, top1_avg, top5_avg = validate(
                        val_loader, model, criterion, 0)
                    logging.info('Quantized results:\nLoss {loss:.4f}\t'
                                 'Prec@1 {top1:.3f}\t'
                                 'Prec@5 {top5:.3f}'.format(loss=losses_avg,
                                                            top1=top1_avg,
                                                            top5=top5_avg))
                    save_checkpoint({'epoch': 0,
                                     'model': args.model,
                                     'config': args.model_config,
                                     'state_dict': model.state_dict(),
                                     'best_prec1': top1_avg,
                                     'regime': regime},
                                    True, path=save_path, save_freq=5)
                    # save_checkpoint(model.state_dict(), is_best=True,
                    #                 path=save_path, save_all=True)
                    logging.info(
                        f'overwriting quantization method with {args.q_method}')
                    set_global_quantization_method(model, args.q_method)
                else:
                    raise e
            logging.info("loaded checkpoint '%s' (epoch %s)",
                         checkpoint_file, args.start_epoch)
        else:
            logging.error("no checkpoint found at '%s'", args.resume)

    if args.evaluate:
        if model_config.get('quantize'):
            logging.info(
                f'overwriting quantization method with {args.q_method}')
            set_global_quantization_method(model, args.q_method)
        losses_avg, top1_avg, top5_avg = validate(val_loader, model,
                                                  criterion, 0)
        logging.info('Evaluation results:\nLoss {loss:.4f}\t'
                     'Prec@1 {top1:.3f}\t'
                     'Prec@5 {top5:.3f}'.format(loss=losses_avg,
                                                top1=top1_avg,
                                                top5=top5_avg))
        return

    optimizer = OptimRegime(model, regime)
    logging.info('training regime: %s', regime)

    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train_loss, train_prec1, train_prec5 = train(
            train_loader, model, criterion, epoch, optimizer)

        # evaluate on validation set
        val_loss, val_prec1, val_prec5 = validate(
            val_loader, model, criterion, epoch)

        # remember best prec@1 and save checkpoint
        is_best = val_prec1 > best_prec1
        best_prec1 = max(val_prec1, best_prec1)
        save_checkpoint({'epoch': epoch + 1,
                         'model': args.model,
                         'config': args.model_config,
                         'state_dict': model.state_dict(),
                         'best_prec1': best_prec1,
                         'regime': regime},
                        is_best, path=save_path)
        logging.info('\n Epoch: {0}\t'
                     'Training Loss {train_loss:.4f} \t'
                     'Training Prec@1 {train_prec1:.3f} \t'
                     'Training Prec@5 {train_prec5:.3f} \t'
                     'Validation Loss {val_loss:.4f} \t'
                     'Validation Prec@1 {val_prec1:.3f} \t'
                     'Validation Prec@5 {val_prec5:.3f} \n'
                     .format(epoch + 1,
                             train_loss=train_loss, val_loss=val_loss,
                             train_prec1=train_prec1, val_prec1=val_prec1,
                             train_prec5=train_prec5, val_prec5=val_prec5))

        results.add(epoch=epoch + 1,
                    train_loss=train_loss, val_loss=val_loss,
                    train_error1=100 - train_prec1,
                    val_error1=100 - val_prec1,
                    train_error5=100 - train_prec5,
                    val_error5=100 - val_prec5)
        results.plot(x='epoch', y=['train_loss', 'val_loss'],
                     legend=['training', 'validation'],
                     title='Loss', ylabel='loss')
        results.plot(x='epoch', y=['train_error1', 'val_error1'],
                     legend=['training', 'validation'],
                     title='Error@1', ylabel='error %')
        results.plot(x='epoch', y=['train_error5', 'val_error5'],
                     legend=['training', 'validation'],
                     title='Error@5', ylabel='error %')
        results.save()
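# Minimal entry-point sketch. The flattened source does not show a __main__
# guard or an example command, so both are assumptions; the flags mirror the
# argparse options the code above reads (args.model, args.dataset,
# args.model_config, args.resume, args.q_method, ...), e.g.:
#
#   python main.py --model resnet --dataset imagenet \
#       --model_config "{'depth': 18, 'quantize': True, 'absorb_bn': True}" \
#       --resume ./results/resnet18_fp32 --q_method <method>
#
# With a quantized config, a failed strict load falls through to calibration:
# batch norm is absorbed, measure mode collects activation statistics over one
# validation pass, and the result is saved as '<model>-<depth>.measure'.
if __name__ == '__main__':
    main()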
def main(args):
    args.model_path = os.path.join(args.model_path, str(datetime.date.today()))
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    loss_fn = nn.CrossEntropyLoss()
    softmax = nn.Softmax(dim=1)

    # Build DDxNet with 4 DDx blocks of convolutions
    model = DDxNet(args.num_channels, args.num_timesteps, DDx_block,
                   [2, 6, 8, 4], args.output_dim,
                   causal=True, use_dilation=True).to(device)

    # multi-gpu training if available
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 betas=(0.9, 0.98))
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, 'min', patience=2, factor=0.1**0.75, verbose=True)

    train_loader = get_loader(args.data_dir, 'train', args.batch_size,
                              args.shuffle)
    test_loader = get_loader(args.data_dir, 'test', args.batch_size,
                             shuffle=False)

    best_val_acc = 0.
    if not os.path.exists(os.path.join('./logs', str(datetime.date.today()))):
        os.makedirs(os.path.join('./logs', str(datetime.date.today())))
    results_file = os.path.join('./logs', str(datetime.date.today()),
                                args.results_file)
    results = ResultsLog(results_file)

    for epoch in range(args.num_epochs):
        avg_loss = 0.
        total_predlabs = []
        total_truelabs = []
        total_probs = []
        for itr, (X, y_true) in enumerate(train_loader):
            model.train()
            X = X.to(device).float()
            y_true = y_true.to(device).long()
            y_pred = model(X)
            # labels arrive one-hot encoded; CrossEntropyLoss expects indices
            loss = loss_fn(y_pred, torch.max(y_true, 1)[1])
            avg_loss += loss.item() / len(train_loader)

            optimizer.zero_grad()
            loss.backward()
            if args.clip_grad > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.clip_grad)
            optimizer.step()

            probs = softmax(y_pred)
            _, predlabs = torch.max(probs.data, 1)
            total_probs.extend(probs.data.cpu().numpy())
            total_predlabs.extend(predlabs.data.cpu().numpy())
            total_truelabs.extend(torch.max(y_true, 1)[1].data.cpu().numpy())
            batch_acc = accuracy_score(
                torch.max(y_true, 1)[1].data.cpu().numpy(),
                predlabs.data.cpu().numpy())

            if (itr + 1) % 50 == 0:
                print('Epoch: {} Iter: {}/{} Loss: {} Acc: {}'.format(
                    epoch, itr + 1, len(train_loader), loss.item(),
                    batch_acc))

            # end of epoch: summarize training metrics, then validate
            if (itr + 1) % len(train_loader) == 0:
                total_truelabs = np.array(total_truelabs)
                total_predlabs = np.array(total_predlabs)
                total_probs = np.array(total_probs)
                total_train_acc = accuracy_score(total_truelabs,
                                                 total_predlabs)
                f1 = f1_score(total_truelabs, total_predlabs, average='macro')
                res = {'epoch': epoch + (itr * 1.0 + 1.0) / len(train_loader),
                       'steps': epoch * len(train_loader) + itr + 1,
                       'train_loss': avg_loss,
                       'train_f1': f1,
                       'train_acc': total_train_acc}

                model.eval()
                with torch.no_grad():
                    total_predlabs = []
                    total_probs = []
                    total_truelabs = []
                    total_val_loss = 0.
                    for i, (dat, labs) in enumerate(test_loader):
                        dat = dat.to(device).float()
                        labs = labs.to(device).long()
                        y_pred = model(dat)
                        val_loss = loss_fn(y_pred, torch.max(labs, 1)[1])
                        probs = softmax(y_pred)
                        _, predlabs = torch.max(probs.data, 1)
                        total_probs.extend(probs.data.cpu().numpy())
                        total_predlabs.extend(predlabs.data.cpu().numpy())
                        total_truelabs.extend(
                            torch.max(labs, 1)[1].data.cpu().numpy())
                        total_val_loss += val_loss.item() / len(test_loader)

                    total_truelabs = np.array(total_truelabs)
                    total_predlabs = np.array(total_predlabs)
                    total_probs = np.array(total_probs)
                    total_val_acc = accuracy_score(total_truelabs,
                                                   total_predlabs)
                    total_val_f1 = f1_score(total_truelabs, total_predlabs,
                                            average='macro')
                    print("At Epoch: {}, Iter: {}, val_loss: {}, val_acc: {}"
                          .format(epoch, itr + 1, total_val_loss,
                                  total_val_acc))
                    print("Confusion Matrix: ")
                    print(confusion_matrix(total_truelabs, total_predlabs))

                    # keep the best model (by validation accuracy) and dump
                    # its probabilities / predictions / ground truth
                    if total_val_acc > best_val_acc:
                        best_val_acc = total_val_acc
                        print("saving model")
                        torch.save(
                            model.state_dict(),
                            os.path.join(args.model_path,
                                         args.results_file + '_model.pth'))
                        np.savetxt(os.path.join(
                            args.model_path, args.results_file + '_prob.txt'),
                            total_probs, delimiter=',')
                        np.savetxt(os.path.join(
                            args.model_path, args.results_file + '_pred.txt'),
                            total_predlabs, delimiter=',')
                        np.savetxt(os.path.join(
                            args.model_path, args.results_file + '_true.txt'),
                            total_truelabs, delimiter=',')

                res['val_loss'] = total_val_loss
                res['val_acc'] = total_val_acc
                res['val_f1'] = total_val_f1
                plot_loss = ['train_loss', 'val_loss']
                plot_acc = ['train_acc', 'val_acc']
                plot_f1 = ['train_f1', 'val_f1']
                results.add(**res)
                results.plot(x='epoch', y=plot_loss,
                             title='Multi-Class Loss', ylabel='CE Loss')
                results.plot(x='epoch', y=plot_acc,
                             title='Accuracy', ylabel='Accuracy')
                results.plot(x='epoch', y=plot_f1,
                             title='F1-Score (Macro)', ylabel='F1-Score')
                results.save()
        scheduler.step(total_val_loss, epoch)
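# Hypothetical entry point for the DDxNet script: the argparse wiring is not
# shown in this file, so the flags below only enumerate the attributes that
# main() actually reads; the names match those attributes, but the defaults
# are illustrative rather than the authors' values.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='DDxNet training')
    parser.add_argument('--model_path', default='./models')
    parser.add_argument('--data_dir', default='./data')
    parser.add_argument('--results_file', default='ddxnet')
    parser.add_argument('--num_channels', type=int, default=1)
    parser.add_argument('--num_timesteps', type=int, default=1000)
    parser.add_argument('--output_dim', type=int, default=5)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--learning_rate', type=float, default=1e-3)
    parser.add_argument('--num_epochs', type=int, default=50)
    parser.add_argument('--clip_grad', type=float, default=0.)
    parser.add_argument('--no_shuffle', dest='shuffle', action='store_false')
    main(parser.parse_args())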