def compare_loss_scaling_by_nonzero(net,
                                    cfgs,
                                    init_scale=1,
                                    dataset='mnist',
                                    verbose=False,
                                    sample_iterations=None,
                                    includes=None,
                                    manual_seed=0,
                                    device=-1,
                                    dtype=chainer.mixed16,
                                    n_epoch=10,
                                    learnrate=0.01):
    """ Collect the number of nonzero values at various points during training
        and see their relationship with the loss-scaling method.

        The model should be provided. Different loss-scaling methods are
        specified by cfgs (a list).
    """
    # history of exported results, one entry per configuration
    hists = []

    with chainer.using_config('dtype', dtype):
        # create a loss-scaled model for each configuration
        for i, cfg in enumerate(cfgs):
            net_ = net.copy(mode='copy')  # deeply copy the original link
            net_ = AdaLossScaled(net_,
                                 init_scale=init_scale,
                                 cfg=cfg,
                                 verbose=verbose)

            # prepare the hook that records necessary values
            hook = AdaLossMonitor(sample_iterations=sample_iterations,
                                  includes=includes,
                                  verbose=verbose)

            # collect data
            utils.set_random_seed(manual_seed, device=device)
            if dataset == 'mnist':
                hooks, log = train_utils.train_model_on_mnist(
                    net_,
                    epoch=n_epoch,
                    batchsize=128,
                    device=device,
                    learnrate=learnrate,
                    hooks=[hook])
            else:
                raise ValueError('dataset name not found: {}'.format(dataset))

            # prepare the sampled results
            df = hooks[0].export_history()
            hists.append(df)  # store per-config results (assumed from context)

    return hists
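

# Usage sketch (not part of the original module): the toy MLP below is an
# illustrative assumption, and every argument value is illustrative; the cfg
# keys mirror those used by the CIFAR training script further down.
if __name__ == '__main__':
    import chainer
    import chainer.functions as F
    import chainer.links as L

    class _ToyMLP(chainer.Chain):
        """ A small fully-connected network used only for this sketch. """

        def __init__(self, n_out=10):
            super(_ToyMLP, self).__init__()
            with self.init_scope():
                self.l1 = L.Linear(None, 100)
                self.l2 = L.Linear(100, n_out)

        def forward(self, x):
            return self.l2(F.relu(self.l1(x)))

    # Two hypothetical configurations that differ only in their scale bound.
    cfgs = [
        {'loss_scale_method': 'approx_range', 'scale_upper_bound': 16},
        {'loss_scale_method': 'approx_range', 'scale_upper_bound': 128},
    ]
    hists = compare_loss_scaling_by_nonzero(_ToyMLP(),
                                            cfgs,
                                            dataset='mnist',
                                            sample_iterations=[0, 100],
                                            includes=['Grad'],
                                            n_epoch=1,
                                            device=-1)
    print(len(hists), 'histories collected')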
def set_random_seed(args, device):
    # Set up random seed
    if args.manual_seed is not None:
        utils.set_random_seed(args.manual_seed, device=device)
def train_model_on_cifar(net,
                         dataset='cifar10',
                         n_epoch=164,
                         batchsize=128,
                         device=-1,
                         learnrate=0.1,
                         lr_decay=0.1,
                         schedule=None,
                         weight_decay=1e-4,
                         manual_seed=None,
                         warmup_attr_ratio=None,
                         warmup_n_epoch=None,
                         cleanup=True,
                         tmpdir=None,
                         recorder=None,
                         hooks=None):
    """ Train a model on the CIFAR dataset. """
    # Per-channel mean and std (CIFAR-10)
    _mean = np.array([0.4914, 0.4822, 0.4465],
                     dtype=chainer.get_dtype()).reshape([3, 1, 1])
    _std = np.array([0.2023, 0.1994, 0.2010],
                    dtype=chainer.get_dtype()).reshape([3, 1, 1])

    # Set up random seed
    if manual_seed is not None:
        utils.set_random_seed(manual_seed, device=device)

    if dataset == 'cifar10':
        print('Using CIFAR10 dataset.')
        class_labels = 10
        train, test = chainer.datasets.get_cifar10()
        mean = _mean
        std = _std
    elif dataset == 'cifar100':
        print('Using CIFAR100 dataset.')
        class_labels = 100
        train, test = chainer.datasets.get_cifar100()
        mean = np.array([0.5071, 0.4867, 0.4408],
                        dtype=chainer.get_dtype()).reshape([3, 1, 1])
        std = np.array([0.2675, 0.2565, 0.2761],
                       dtype=chainer.get_dtype()).reshape([3, 1, 1])
    else:
        raise RuntimeError('Invalid dataset choice.')

    train = PreprocessCIFARTrainData(train, mean=mean, std=std)
    test = PreprocessCIFARTestData(test, mean=mean, std=std)

    train_iter = chainer.iterators.SerialIterator(train, batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    # Model initialisation
    model = L.Classifier(net)
    model.to_device(device)

    # Create optimizer
    # NOTE: here the momentum is 0.9 by default
    if warmup_attr_ratio is not None:
        learnrate *= warmup_attr_ratio
    optimizer = chainer.optimizers.MomentumSGD(learnrate)
    if chainer.get_dtype() == chainer.mixed16:
        print('==> Using FP32 update for dtype=mixed16')
        optimizer.use_fp32_update()  # by default use fp32 update
        # TODO: loss scaling
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(weight_decay))

    # Set up the trigger for stopping training
    stop_trigger = (n_epoch, 'epoch')

    # Set up a trainer
    if tmpdir is None:
        tmpdir = '/tmp'
    out = tempfile.mkdtemp(prefix='{}_train-'.format(dataset), dir=tmpdir)

    updater = training.updaters.StandardUpdater(train_iter,
                                                optimizer,
                                                device=device)
    trainer = training.Trainer(updater, stop_trigger, out=out)

    if recorder is not None:
        recorder.setup(trainer)

    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))
    trainer.extend(extensions.observe_lr())
    trainer.extend(
        extensions.PrintReport([
            'epoch',
            'lr',
            'main/loss',
            'validation/main/loss',
            'main/accuracy',
            'validation/main/accuracy',
            'elapsed_time',
        ]))
    trainer.extend(extensions.snapshot(
        filename='snapshot_epoch_{.updater.epoch}', snapshot_on_error=True),
                   trigger=(1, 'epoch'))

    lr_shift = chainerlp.extensions.ExponentialShift(
        'lr',
        lr_decay,
        warmup_attr_ratio=warmup_attr_ratio,
        warmup_n_epoch=warmup_n_epoch,
        schedule=schedule)
    trainer.extend(lr_shift, trigger=(1, 'epoch'))
    trainer.extend(extensions.ProgressBar())

    # RUN
    if hooks is None:
        hooks = []
    with ExitStack() as stack:
        for hook in hooks:
            if hasattr(hook, 'trainer'):
                hook.trainer = trainer  # patch the hooks
            stack.enter_context(hook)
        trainer.run()

    log = notebook_utils.load_train_log(train_dir=out)

    if cleanup:
        print('==> Cleaning up {} ...'.format(out))
        shutil.rmtree(out)

    return hooks, log
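

# Usage sketch (not part of the original module): ResNetCIFAR and its
# (n_layer, n_class) constructor are used by the training script below, but
# the import path here is an assumption, as are all argument values.
if __name__ == '__main__':
    from chainerlp.links import ResNetCIFAR  # assumed model location

    net = ResNetCIFAR(20, n_class=10)
    hooks, log = train_model_on_cifar(net,
                                      dataset='cifar10',
                                      schedule=[81, 122],
                                      device=-1,  # CPU; pass a GPU id to speed up
                                      cleanup=True)
    print(log)  # training log parsed by notebook_utils.load_train_log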
import unittest
import tempfile
import pickle

import numpy as np

import chainer
import chainer.functions as F  # testing purpose
import chainer.links as L
from chainer import testing
from chainer import Function, FunctionNode, gradient_check, report, training, Variable
from chainer import datasets, initializers, iterators, optimizers, serializers
from chainer import Link, Chain, ChainList
from chainer.datasets import mnist  # for trainer test

from chainerlp import utils
from chainerlp.hooks.act_stat_hook import ActStatFuncHook

utils.set_random_seed(0)


class TestActStatHook(unittest.TestCase):
    def test_forward(self):
        """ ActStatFuncHook should work properly for the forward pass of a
            model. Properly means the input data should be correctly collected.
        """
        hook = ActStatFuncHook()
        with hook:
            data = np.random.random((3, 3)).astype(np.float32) - 0.5
            x = chainer.Variable(data)
            _ = F.relu(x)

        self.assertEqual(len(hook.call_history), 1)  # one history recorded
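

# The excerpt above has no test entry point; the standard unittest guard below
# is an assumption about the rest of the file that makes it runnable directly.
if __name__ == '__main__':
    unittest.main()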
def train(n_layer,
          init_scale=1,
          scale_upper_bound=128,
          accum_upper_bound=4096,
          method='approx_range',
          update_per_n_iteration=1,
          warmup_attr_ratio=None,
          warmup_n_epoch=None,
          n_class=10,
          manual_seed=0,
          train_batch=128,
          device=-1,
          learnrate=0.1):
    """ Train function """
    utils.set_random_seed(manual_seed, device=device)

    # Recorder for loss-scale values
    recorder = AdaLossRecorder(sample_per_n_iter=100)

    with chainer.using_config('dtype', chainer.mixed16):
        if n_layer == 16 or n_layer == 19:
            net_ = VGGNetCIFAR(n_layer, n_class=n_class)
        elif n_layer == 164:
            net_ = ResNetCIFARv2(n_layer, n_class=n_class)
        else:
            net_ = ResNetCIFAR(n_layer, n_class=n_class)

        net = AdaLossScaled(
            net_,
            init_scale=init_scale,
            cfg={
                'loss_scale_method': method,
                'scale_upper_bound': scale_upper_bound,
                'accum_upper_bound': accum_upper_bound,
                'recorder': recorder,
                'update_per_n_iteration': update_per_n_iteration,
                'n_uf_threshold': 1e-3,
            },
            transforms=[
                transforms.AdaLossTransformLinear(),
                transforms.AdaLossTransformConvolution2D(),
                transforms.AdaLossTransformBatchNormalization(),
                # customized transforms for chainerlp models
                chainerlp_transforms.AdaLossTransformConv2DBNActiv(),
                chainerlp_transforms.AdaLossTransformBasicBlock(),
                chainerlp_transforms.AdaLossTransformBNActivConv2D(),
                chainerlp_transforms.AdaLossTransformBottleneckv2(),
            ],
            verbose=True)

        hook = AdaLossMonitor(sample_per_n_iter=100,
                              verbose=False,
                              includes=['Grad', 'Deconvolution'])

        utils.set_random_seed(manual_seed, device=device)
        hooks, log = train_utils.train_model_on_cifar(
            net,
            dataset='cifar{}'.format(n_class),
            learnrate=learnrate,
            batchsize=train_batch,
            device=device,
            schedule=[81, 122],
            warmup_attr_ratio=warmup_attr_ratio,
            warmup_n_epoch=warmup_n_epoch,
            hooks=[hook],
            recorder=recorder)

    # post-processing
    grad_stats = hooks[0].export_history()
    loss_scale = recorder.export()

    return grad_stats, loss_scale, log
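

# Usage sketch (assumption, not taken from the source): train a 20-layer ResNet
# on CIFAR-10 with adaptive loss scaling; every argument value is illustrative.
if __name__ == '__main__':
    grad_stats, loss_scale, log = train(20,
                                        init_scale=16,
                                        method='approx_range',
                                        n_class=10,
                                        device=-1,
                                        learnrate=0.1)
    print(loss_scale)  # loss-scale values sampled by the AdaLossRecorder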