def test_imperative_i2_o1():
    import nnabla.functions as F
    x0 = nn.NdArray([2, 3, 4])
    x1 = nn.NdArray([2, 1, 1])
    x0.fill(3)
    x1.fill(0.5)
    y = F.mul2(x0, x1)
    assert np.allclose(y.data, 1.5)
def reset(self, epoch, pbar):
    self.epoch = epoch
    self.epoch_loss = 0.0
    self.epoch_error = 0
    self.batch_counter = 0
    self.pbar = pbar
    self.buff = [nn.NdArray(), nn.NdArray()]
    self.reset_buff()
    self.flush = True
def test_copy_from():
    shape = [2, 3, 4]
    src = nn.NdArray(shape)
    dst = nn.NdArray(shape)
    src.data = 0
    src.cast(dtype=np.uint8)
    dst.copy_from(src, use_current_context=False)
    assert dst.dtype == np.uint8

    from nnabla.ext_utils import get_extension_context
    with nn.context_scope(get_extension_context('cpu', dtype='float')):
        dst.copy_from(src, use_current_context=True)
    assert dst.dtype == np.float32
def test_wrong_case_ndarray_arithmetic_matmul_ops(seed, shape):
    rng = np.random.RandomState(seed)
    if not shape[0]:
        a1 = rng.randn()
        n1 = nn.NdArray()
        n1.cast(np.float32)[...] = a1
        v1 = nn.Variable()
        v1.data.cast(np.float32)[...] = a1
    else:
        a1 = rng.randn(*shape[0]).astype(np.float32)
        n1 = nn.NdArray.from_numpy_array(a1)
        v1 = nn.Variable.from_numpy_array(a1)
    if not shape[1]:
        a2 = rng.randn()
        n2 = nn.NdArray()
        n2.cast(np.float32)[...] = a2
        v2 = nn.Variable()
        v2.data.cast(np.float32)[...] = a2
    else:
        a2 = rng.randn(*shape[1]).astype(np.float32)
        n2 = nn.NdArray.from_numpy_array(a2)
        v2 = nn.Variable.from_numpy_array(a2)

    with pytest.raises(AssertionError) as excinfo:
        # NdArray @ NdArray
        ans1 = n1 @ n2
        # NdArray @ Variable
        ans2 = n1 @ v2
        # Variable @ NdArray
        ans3 = v1 @ n2
        # Variable @ Variable
        ans4 = v1 @ v2
        # numpy.ndarray or float @ NdArray
        ans5 = a1 @ n1
        # NdArray @ numpy.ndarray or float
        ans6 = n1 @ a2
        # numpy.ndarray or float @ Variable
        ans7 = a1 @ v2
        # Variable @ numpy.ndarray or float
        ans8 = v1 @ a2
def test_scalar_dot(seed, scalar, is_dynamic):
    rng = np.random.RandomState(seed)
    a1 = scalar
    a2 = rng.randn(3, 4, 5, 6).astype(np.float32)
    n = nn.NdArray.from_numpy_array(a2)
    v = nn.Variable.from_numpy_array(a2)

    ref = F.dot(a1, a2)

    ans1 = F.dot(a1, n)
    assert_allclose(ans1.data, ref)

    out1 = nn.NdArray((3, 4, 5, 6))
    F.dot(a1, n, out1)
    assert_allclose(out1.data, ref)

    with nn.auto_forward(is_dynamic):
        ans2 = F.dot(a1, v)
        if not is_dynamic:
            ans2.forward()
        assert_allclose(ans2.d, ref)

        out2 = nn.Variable((3, 4, 5, 6))
        F.dot(a1, v, out2)
        if not is_dynamic:
            out2.forward()
        assert_allclose(out2.d, ref)
def __init__(self, comm, losses, save_path=None, nimage_per_epoch=1,
             show_interval=20, show_keys=None):
    # losses: {"loss_name": loss_Variable, ...} or list of tuple(key, value)
    self.batch_cnt = 0
    self.piter = None
    self.comm = comm
    self.save_path = save_path
    self.nimage_per_epoch = nimage_per_epoch
    self.show_interval = show_interval
    self.losses = OrderedDict(losses)  # fix loss order
    self.epoch_losses = {k: 0. for k in losses.keys()}
    self.buff = {k: nn.NdArray() for k in losses.keys()}
    self.show_keys = list(losses.keys()) if show_keys is None else show_keys

    is_master = comm.rank == 0
    self.monitor = MonitorWrapper(save_path, self.epoch_losses) if (
        save_path is not None and is_master) else None

    self._reset_buffer()
def sum_grad_norm(params):
    norm = nn.NdArray()
    norm.zero()
    for p in params:
        assert isinstance(p, nn.Variable) and not p.grad.clear_called
        norm += F.sum(p.grad**2)
    return np.sqrt(norm.data)
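# Usage sketch (an assumption, not part of the original source): compute the
# global gradient norm over all registered parameters after loss.backward()
# has filled the .grad arrays.
grad_norm = sum_grad_norm(nn.get_parameters().values())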
def _reset_buffer(self):
    # reset buff
    for loss_name, loss in self.losses.items():
        if loss is None:
            continue
        self.buff[loss_name] = nn.NdArray()
        self.buff[loss_name].zero()
    self.flushed = True
def test_nd_array_data(value):
    shape = (2, 3)

    # Use default dtype (float32) in getter
    a = nn.NdArray(shape)
    with pytest.raises(Exception):
        _ = a.dtype
    _ = a.data
    assert a.dtype == np.float32

    # Use value dtype in setter
    a = nn.NdArray(shape)
    a.data = value
    if not np.isscalar(value) or \
            (np.dtype(type(value)).kind != 'f' and value > (1 << 53)):
        assert a.dtype == np.asarray(value).dtype
        assert a.data.dtype == np.asarray(value).dtype
    else:
        assert a.data.dtype == np.float32
def _sum_error(sum, error):
    ret = None
    if comm:
        # logger.log(99, "Calc error with communicator")
        var = [nn.NdArray()]
        var[0].data = error
        _all_reduce(comm, var, division=False, inplace=True)
        ret = sum + var[0].data
    else:
        ret = sum + error
    return ret
def _sum_cost():
    if comm:
        # logger.log(99, "Calc cost with communicator")
        var = [nn.NdArray()]
        var[0].data = cost.sum_iteration
        _all_reduce(comm, var, division=False, inplace=True)
        cost.sum_epoch += var[0].data
        cost.num_iteration += comm.size
    else:
        cost.sum_epoch += cost.sum_iteration
        cost.num_iteration += 1
def test_nd_array():
    shape = [2, 3, 4]
    a = nn.NdArray(shape)
    npa = np.arange(a.size).reshape(a.shape).astype(np.int32)
    a.data = npa
    b = nn.NdArray.from_numpy_array(npa)
    assert b.dtype == np.int32
    assert np.all(a.data == npa)
    assert np.all(a.data == b.data)
    assert a.shape == npa.shape
    assert b.size == np.prod(shape)
    a.cast(np.int32)
    assert a.data.dtype == np.int32
    b.zero()
    assert np.all(b.data == 0)
    a.fill(3)
    assert np.all(a.data == 3)
    b.copy_from(a)
    assert np.all(a.data == b.data)
def test_clear_called():
    a = nn.NdArray(1)
    assert a.clear_called == False
    a.fill(3)
    assert a.clear_called == False
    a.clear()
    assert a.clear_called == True
    a.fill(3)
    assert a.clear_called == False
    a.clear()
    assert a.clear_called == True
    a.zero()
    assert a.clear_called == False
    a.clear()
    assert a.clear_called == True
    a.data[0] = -1
    assert a.clear_called == False
def test_from_dlpack_given(ext_name, numpy_type, torch_type):
    ctx = get_extension_context(ext_name)
    device_name = ctx.backend[0].split(':')[0]
    if device_name == 'cudnn':
        device_name = 'cuda'  # for PyTorch
    nn.set_default_context(ctx)

    # Init PyTorch Tensor
    t = torch.ones((5, 5), dtype=torch_type,
                   device=torch.device(device_name))

    # PyTorch to DLPack
    dlp = torch.utils.dlpack.to_dlpack(t)

    # DLPack to NNabla
    a = nn.NdArray()
    nn.utils.dlpack.from_dlpack(dlp, a)
    assert a.dtype == numpy_type

    # Check if the memory locations are still same,
    # which means DlpackArray is not copied to other arrays
    # in the same ArrayGroup.
    a += 1
    assert np.all(a.data == t.to('cpu').detach().numpy().copy())
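# Sketch of the reverse direction (an assumption, not covered by the test
# above; it reuses the same imports of nnabla as nn, numpy as np, and torch):
# exporting an nn.NdArray to PyTorch through DLPack.
b = nn.NdArray.from_numpy_array(np.arange(6, dtype=np.float32).reshape(2, 3))
dlp_back = nn.utils.dlpack.to_dlpack(b)
t_back = torch.utils.dlpack.from_dlpack(dlp_back)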
class TestClearOutputGrad():

    def check_grad_cleared_flags(self, answer):
        result = clear_called_flag_recorder.get_output_clear_called_flags()
        assert len(result) == len(answer)
        for i, flags in enumerate(answer):
            assert len(result[i]) == len(flags)
            for j, flag in enumerate(flags):
                assert flag == result[i][j][1]

    def setup_method(self):
        clear_called_flag_recorder.activate_clear_called_flag_recorder()

    def teardown_method(self):
        clear_called_flag_recorder.deactivate_clear_called_flag_recorder()

    # Test for the type of grad given to backward.
    @pytest.mark.parametrize("grad", [1, None, np.ndarray([1]), nn.NdArray([1])])
    def test_clear_output_grad_argument(self, grad):
        x1 = nn.Variable([1], need_grad=True)

        xx1 = F.identity(x1)
        y1 = F.add_scalar(xx1)

        answer_grad = []
        if grad is None or isinstance(grad, nn.NdArray):
            answer_grad.append([False])  # y1
        else:
            answer_grad.append([True])  # y1
        answer_grad.append([True])  # xx1

        y1.forward(clear_no_need_grad=True)
        clear_called_flag_recorder.deactivate_clear_called_flag_recorder()
        clear_called_flag_recorder.activate_clear_called_flag_recorder()
        y1.backward(clear_buffer=True, grad=grad)

        self.check_grad_cleared_flags(answer_grad)
        assert y1.grad.clear_called == False

    # Test for an inplaced variable.
    def test_clear_output_grad_inplace(self):
        x1 = nn.Variable([1], need_grad=True)

        xx1 = F.identity(x1)
        y1 = F.add_scalar(xx1, inplace=True)
        y2 = F.add_scalar(y1)

        answer_grad = []
        answer_grad.append([True])
        answer_grad.append([True])
        answer_grad.append([True])

        y2.forward(clear_no_need_grad=True)
        clear_called_flag_recorder.deactivate_clear_called_flag_recorder()
        clear_called_flag_recorder.activate_clear_called_flag_recorder()
        y2.backward(clear_buffer=True)

        self.check_grad_cleared_flags(answer_grad)

    # Test for a variable shared with two layer functions.
    def test_clear_output_grad_shared_variable(self):
        x1 = nn.Variable([1], need_grad=True)

        xx1 = F.identity(x1)
        y1 = F.add_scalar(xx1)
        y2 = F.add_scalar(xx1)
        y3 = F.add2(y1, y2)

        answer_grad = []
        answer_grad.append([True])
        answer_grad.append([True])
        answer_grad.append([True])
        answer_grad.append([True])

        y3.forward(clear_no_need_grad=True)
        clear_called_flag_recorder.deactivate_clear_called_flag_recorder()
        clear_called_flag_recorder.activate_clear_called_flag_recorder()
        y3.backward(clear_buffer=True)

        self.check_grad_cleared_flags(answer_grad)

    # Test for a persistent variable.
    def test_clear_output_grad_persistent(self):
        x1 = nn.Variable([1], need_grad=True)

        xx1 = F.identity(x1)
        y1 = F.add_scalar(xx1)
        y2 = F.add_scalar(y1)
        xx1.persistent = True
        y2.persistent = True

        answer_grad = []
        answer_grad.append([False])  # y2
        answer_grad.append([True])  # y1
        answer_grad.append([False])  # xx1

        y2.forward(clear_no_need_grad=True)
        clear_called_flag_recorder.deactivate_clear_called_flag_recorder()
        clear_called_flag_recorder.activate_clear_called_flag_recorder()
        y2.backward(clear_buffer=True)

        self.check_grad_cleared_flags(answer_grad)

    # Test for the input variables of sink.
    # In the case where Function::prohibit_clear_input_buffers returns true,
    # these inputs must not be cleared from any function.
    def test_clear_output_grad_prohibit_clear_input(self):
        x1 = nn.Variable([1], need_grad=True)

        xx1 = F.identity(x1)
        y1 = F.add_scalar(xx1)
        y2 = F.add_scalar(xx1)
        y3 = F.sink(y1, y2)

        answer_grad = []
        answer_grad.append([True])  # y3
        answer_grad.append([False])  # y2
        answer_grad.append([False])  # y1
        answer_grad.append([True])  # xx1

        y3.forward(clear_no_need_grad=True)
        clear_called_flag_recorder.deactivate_clear_called_flag_recorder()
        clear_called_flag_recorder.activate_clear_called_flag_recorder()
        y3.backward(clear_buffer=True)

        self.check_grad_cleared_flags(answer_grad)
def test_imperative_i1_o1():
    import nnabla.functions as F
    x = nn.NdArray([2, 3, 4])
    x.fill(1)
    x1 = F.add_scalar(x, 1)
    assert np.allclose(x1.data, 2)
def test_imperative_pf():
    import nnabla.parametric_functions as PF
    x = nn.NdArray([2, 3, 4, 5])
    y = PF.batch_normalization(x)
# # NNabla Python API Demonstration Tutorial
# # (https://nnabla.readthedocs.io/en/latest/python/tutorial/python_api.html)
import matplotlib.pyplot as plt
import nnabla as nn
import nnabla.functions as F
import nnabla.parametric_functions as PF
import nnabla.solvers as S
import numpy as np

from ivory.utils.path import cache_file

# ## NdArray
a = nn.NdArray((2, 3, 4))
print(a.data)

# -
print("[Substituting random values]")
a.data = np.random.randn(*a.shape)
print(a.data)
print("[Slicing]")
a.data[0, :, ::2] = 0
print(a.data)

# -
a.fill(1)  # Filling all values with one.
print(a.data)

# -
b = nn.NdArray.from_numpy_array(np.ones(a.shape))
print(b.data)

# ## Variable
def test_ndarray_dot(seed, shape, is_dynamic):
    rng = np.random.RandomState(seed)
    if not shape[0]:
        a1 = rng.randn()
        n1 = nn.NdArray()
        n1.cast(np.float32)[...] = a1
        v1 = nn.Variable()
        v1.data.cast(np.float32)[...] = a1
    else:
        a1 = rng.randn(*shape[0]).astype(np.float32)
        n1 = nn.NdArray.from_numpy_array(a1)
        v1 = nn.Variable.from_numpy_array(a1)
    if not shape[1]:
        a2 = rng.randn()
        n2 = nn.NdArray()
        n2.cast(np.float32)[...] = a2
        v2 = nn.Variable()
        v2.data.cast(np.float32)[...] = a2
    else:
        a2 = rng.randn(*shape[1]).astype(np.float32)
        n2 = nn.NdArray.from_numpy_array(a2)
        v2 = nn.Variable.from_numpy_array(a2)

    ref = F.dot(a1, a2)

    ans1_1 = F.dot(n1, n2)
    ans1_2 = F.dot(n1, v2)
    ans1_3 = F.dot(v1, n2)
    assert_allclose(ans1_1.data, ref, atol=1e-3)
    assert_allclose(ans1_2.data, ref, atol=1e-3)
    assert_allclose(ans1_3.data, ref, atol=1e-3)

    with nn.auto_forward(is_dynamic):
        ans1_4 = F.dot(v1, v2)
    if not is_dynamic:  # forward manually when auto-forward is off
        ans1_4.forward()
    assert_allclose(ans1_4.d, ref, atol=1e-3)

    out = ref.copy()
    F.dot(a1, a2, out)
    assert_allclose(out, ref, atol=1e-3)

    out1_1 = nn.NdArray(ans1_1.shape)
    out1_1.cast(np.float32)
    F.dot(n1, n2, out1_1)
    assert_allclose(out1_1.data, ref, atol=1e-3)

    out1_2 = nn.NdArray(ans1_2.shape)
    out1_2.cast(np.float32)
    F.dot(n1, v2, out1_2)
    assert_allclose(out1_2.data, ref, atol=1e-3)

    out1_3 = nn.NdArray(ans1_3.shape)
    out1_3.cast(np.float32)
    F.dot(v1, n2, out1_3)
    assert_allclose(out1_3.data, ref, atol=1e-3)

    out1_4 = nn.Variable(ans1_4.shape)
    out1_4.data.cast(np.float32)
    with nn.auto_forward(is_dynamic):
        F.dot(v1, v2, out1_4)
    if not is_dynamic:
        out1_4.forward()
    assert_allclose(out1_4.d, ref, atol=1e-3)

    # NdArray with a wrong dtype
    out2_1 = nn.NdArray(ref.shape)
    out2_1.cast(int)
    out2_2 = nn.Variable(ref.shape)
    out2_2.data.cast(int)

    # should not exec
    with pytest.raises(ValueError) as excinfo:
        F.dot(n1, n2, out2_1)
        F.dot(n1, v2, out2_1)
        F.dot(v1, n2, out2_1)
        F.dot(v1, v2, out2_2)
def train():
    # Check NNabla version
    if utils.get_nnabla_version_integer() < 11900:
        raise ValueError(
            'Please update the nnabla version to v1.19.0 or latest version since memory efficiency of core engine is improved in v1.19.0'
        )
    parser, args = get_train_args()

    # Get context.
    ctx = get_extension_context(args.context, device_id=args.device_id)
    comm = CommunicatorWrapper(ctx)
    nn.set_default_context(comm.ctx)
    ext = import_extension_module(args.context)

    # Monitors
    # setting up monitors for logging
    monitor_path = args.output
    monitor = Monitor(monitor_path)

    monitor_best_epoch = MonitorSeries('Best epoch', monitor, interval=1)
    monitor_traing_loss = MonitorSeries('Training loss', monitor, interval=1)
    monitor_validation_loss = MonitorSeries('Validation loss', monitor, interval=1)
    monitor_lr = MonitorSeries('learning rate', monitor, interval=1)
    monitor_time = MonitorTimeElapsed("training time per iteration", monitor, interval=1)

    if comm.rank == 0:
        if not os.path.isdir(args.output):
            os.makedirs(args.output)

    # Initialize DataIterator for MUSDB18.
    train_source, valid_source, args = load_datasources(parser, args)

    train_iter = data_iterator(
        train_source,
        args.batch_size,
        RandomState(args.seed),
        with_memory_cache=False,
    )

    valid_iter = data_iterator(
        valid_source,
        1,
        RandomState(args.seed),
        with_memory_cache=False,
    )

    if comm.n_procs > 1:
        train_iter = train_iter.slice(rng=None,
                                      num_of_slices=comm.n_procs,
                                      slice_pos=comm.rank)
        valid_iter = valid_iter.slice(rng=None,
                                      num_of_slices=comm.n_procs,
                                      slice_pos=comm.rank)

    # Calculate maxiter per GPU device.
    # Change max_iter, learning_rate and weight_decay according no. of gpu devices for multi-gpu training.
    default_batch_size = 16
    train_scale_factor = (comm.n_procs * args.batch_size) / default_batch_size
    max_iter = int((train_source._size // args.batch_size) // comm.n_procs)
    weight_decay = args.weight_decay * train_scale_factor
    args.lr = args.lr * train_scale_factor

    # Calculate the statistics (mean and variance) of the dataset
    scaler_mean, scaler_std = utils.get_statistics(args, train_source)

    # clear cache memory
    ext.clear_memory_cache()

    max_bin = utils.bandwidth_to_max_bin(train_source.sample_rate, args.nfft,
                                         args.bandwidth)

    # Get X-UMX/UMX computation graph and variables as namedtuple
    model = get_model(args, scaler_mean, scaler_std, max_bin=max_bin)

    # Create Solver and set parameters.
    solver = S.Adam(args.lr)
    solver.set_parameters(nn.get_parameters())

    # Initialize Early Stopping
    es = utils.EarlyStopping(patience=args.patience)

    # Initialize LR Scheduler (ReduceLROnPlateau)
    lr_scheduler = ReduceLROnPlateau(lr=args.lr,
                                     factor=args.lr_decay_gamma,
                                     patience=args.lr_decay_patience)
    best_epoch = 0

    # AverageMeter for mean loss calculation over the epoch
    losses = utils.AverageMeter()

    # Training loop.
    for epoch in trange(args.epochs):
        # TRAINING
        losses.reset()
        for batch in range(max_iter):
            model.mixture_audio.d, model.target_audio.d = train_iter.next()
            solver.zero_grad()
            model.loss.forward(clear_no_need_grad=True)
            if comm.n_procs > 1:
                all_reduce_callback = comm.get_all_reduce_callback()
                model.loss.backward(clear_buffer=True,
                                    communicator_callbacks=all_reduce_callback)
            else:
                model.loss.backward(clear_buffer=True)
            solver.weight_decay(weight_decay)
            solver.update()
            losses.update(model.loss.d.copy(), args.batch_size)
        training_loss = losses.get_avg()

        # clear cache memory
        ext.clear_memory_cache()

        # VALIDATION
        losses.reset()
        for batch in range(int(valid_source._size // comm.n_procs)):
            x, y = valid_iter.next()
            dur = int(valid_source.sample_rate * args.valid_dur)
            sp, cnt = 0, 0
            loss_tmp = nn.NdArray()
            loss_tmp.zero()
            while 1:
                model.vmixture_audio.d = x[Ellipsis, sp:sp + dur]
                model.vtarget_audio.d = y[Ellipsis, sp:sp + dur]
                model.vloss.forward(clear_no_need_grad=True)
                cnt += 1
                sp += dur
                loss_tmp += model.vloss.data
                if x[Ellipsis, sp:sp + dur].shape[-1] < dur or x.shape[-1] == cnt * dur:
                    break
            loss_tmp = loss_tmp / cnt
            if comm.n_procs > 1:
                comm.all_reduce(loss_tmp, division=True, inplace=True)
            losses.update(loss_tmp.data.copy(), 1)
        validation_loss = losses.get_avg()

        # clear cache memory
        ext.clear_memory_cache()

        lr = lr_scheduler.update_lr(validation_loss, epoch=epoch)
        solver.set_learning_rate(lr)
        stop = es.step(validation_loss)

        if comm.rank == 0:
            monitor_best_epoch.add(epoch, best_epoch)
            monitor_traing_loss.add(epoch, training_loss)
            monitor_validation_loss.add(epoch, validation_loss)
            monitor_lr.add(epoch, lr)
            monitor_time.add(epoch)

            if validation_loss == es.best:
                best_epoch = epoch
                # save best model
                if args.umx_train:
                    nn.save_parameters(os.path.join(args.output, 'best_umx.h5'))
                else:
                    nn.save_parameters(
                        os.path.join(args.output, 'best_xumx.h5'))

        if args.umx_train:
            # Early stopping for UMX after `args.patience` (140) number of epochs
            if stop:
                print("Apply Early Stopping")
                break
def __init__(self, decay):
    self.decay = decay
    self.shadow_variable = nn.NdArray()
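# A minimal sketch (an assumption, not part of the original class) of how an
# exponential moving average can be maintained with imperative nn.NdArray
# arithmetic; `ema_update` is a hypothetical helper introduced for illustration.
import numpy as np
import nnabla as nn


def ema_update(shadow, value, decay=0.999):
    # shadow <- decay * shadow + (1 - decay) * value
    return decay * shadow + (1.0 - decay) * value


shadow = nn.NdArray.from_numpy_array(np.zeros((2, 3), dtype=np.float32))
value = nn.NdArray.from_numpy_array(np.ones((2, 3), dtype=np.float32))
shadow = ema_update(shadow, value, decay=0.9)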
def train():
    """
    Main script.
    """

    args = get_args()

    _ = nn.load_parameters(args.pretrained_model_path)
    if args.fine_tune:
        nnabla.parameter.pop_parameter('decoder/logits/affine/conv/W')
        nnabla.parameter.pop_parameter('decoder/logits/affine/conv/b')

    n_train_samples = args.train_samples
    n_val_samples = args.val_samples
    distributed = args.distributed
    compute_acc = args.compute_acc

    if distributed:
        # Communicator and Context
        from nnabla.ext_utils import get_extension_context
        extension_module = "cudnn"
        ctx = get_extension_context(
            extension_module, type_config=args.type_config)
        comm = C.MultiProcessDataParalellCommunicator(ctx)
        comm.init()
        n_devices = comm.size
        mpi_rank = comm.rank
        device_id = mpi_rank
        ctx.device_id = str(device_id)
        nn.set_default_context(ctx)
    else:
        # Get context.
        from nnabla.ext_utils import get_extension_context
        extension_module = args.context
        if args.context is None:
            extension_module = 'cpu'
        logger.info("Running in %s" % extension_module)
        ctx = get_extension_context(
            extension_module, device_id=args.device_id,
            type_config=args.type_config)
        nn.set_default_context(ctx)
        n_devices = 1
        device_id = 0

    # training data
    data = data_iterator_segmentation(
        args.train_samples, args.batch_size, args.train_dir,
        args.train_label_dir, target_width=args.image_width,
        target_height=args.image_height)
    # validation data
    vdata = data_iterator_segmentation(
        args.val_samples, args.batch_size, args.val_dir, args.val_label_dir,
        target_width=args.image_width, target_height=args.image_height)

    if distributed:
        data = data.slice(
            rng=None, num_of_slices=n_devices, slice_pos=device_id)
        vdata = vdata.slice(
            rng=None, num_of_slices=n_devices, slice_pos=device_id)
    num_classes = args.num_class

    # Workaround to start with the same initialized weights for all workers.
    np.random.seed(313)

    t_model = get_model(args, test=False)
    t_model.pred.persistent = True  # Not clearing buffer of pred in backward
    t_pred2 = t_model.pred.unlinked()
    t_e = F.sum(F.top_n_error(t_pred2, t_model.label, axis=1)
                * t_model.mask) / F.sum(t_model.mask)

    v_model = get_model(args, test=True)
    v_model.pred.persistent = True  # Not clearing buffer of pred in forward
    v_pred2 = v_model.pred.unlinked()
    v_e = F.sum(F.top_n_error(v_pred2, v_model.label, axis=1)
                * v_model.mask) / F.sum(t_model.mask)

    # Create Solver
    solver = S.Momentum(args.learning_rate, 0.9)
    solver.set_parameters(nn.get_parameters())

    # Load checkpoint
    start_point = 0
    if args.checkpoint is not None:
        # load weights and solver state info from specified checkpoint file.
        start_point = load_checkpoint(args.checkpoint, solver)

    # Setting warmup.
    base_lr = args.learning_rate / n_devices
    warmup_iter = int(1. * n_train_samples / args.batch_size /
                      args.accum_grad / n_devices) * args.warmup_epoch
    warmup_slope = base_lr * (n_devices - 1) / warmup_iter
    solver.set_learning_rate(base_lr)

    # Create monitor
    import nnabla.monitor as M
    monitor = M.Monitor(args.monitor_path)
    monitor_loss = M.MonitorSeries("Training loss", monitor, interval=10)
    monitor_err = M.MonitorSeries("Training error", monitor, interval=10)
    monitor_vloss = M.MonitorSeries("Validation loss", monitor, interval=1)
    monitor_verr = M.MonitorSeries("Validation error", monitor, interval=1)
    monitor_time = M.MonitorTimeElapsed("Training time", monitor, interval=10)
    monitor_miou = M.MonitorSeries("mean IOU", monitor, interval=10)
    monitor_vtime = M.MonitorTimeElapsed(
        "Validation time", monitor, interval=1)

    # save_nnp
    contents = save_nnp({'x': v_model.image}, {
                        'y': v_model.pred}, args.batch_size)
    save.save(os.path.join(args.model_save_path,
                           'Deeplabv3plus_result_epoch0.nnp'), contents,
              variable_batch_size=False)

    # Training loop
    for i in range(start_point, int(args.max_iter / n_devices)):
        # Save parameters
        if i % (args.model_save_interval // n_devices) == 0 and device_id == 0:
            save_checkpoint(args.model_save_path, i, solver)

        # Validation
        if i % (args.val_interval // n_devices) == 0 and i != 0:
            vmiou_local = 0.
            val_iter_local = n_val_samples // args.batch_size
            vl_local = nn.NdArray()
            vl_local.zero()
            ve_local = nn.NdArray()
            ve_local.zero()
            for j in range(val_iter_local):
                images, labels, masks = vdata.next()
                v_model.image.d = images
                v_model.label.d = labels
                v_model.mask.d = masks
                v_model.image.data.cast(np.float32, ctx)
                v_model.label.data.cast(np.int32, ctx)
                v_model.loss.forward(clear_buffer=True)
                v_e.forward(clear_buffer=True)
                vl_local += v_model.loss.data
                ve_local += v_e.data
                # Mean IOU computation
                if compute_acc:
                    vmiou_local += compute_miou(
                        num_classes, labels,
                        np.argmax(v_model.pred.d, axis=1), masks)

            vl_local /= val_iter_local
            ve_local /= val_iter_local
            if compute_acc:
                vmiou_local /= val_iter_local
                vmiou_ndarray = nn.NdArray.from_numpy_array(
                    np.array(vmiou_local))
            if distributed:
                comm.all_reduce(vl_local, division=True, inplace=True)
                comm.all_reduce(ve_local, division=True, inplace=True)
                if compute_acc:
                    comm.all_reduce(vmiou_ndarray, division=True, inplace=True)

            if device_id == 0:
                monitor_vloss.add(i * n_devices, vl_local.data.copy())
                monitor_verr.add(i * n_devices, ve_local.data.copy())
                if compute_acc:
                    monitor_miou.add(i * n_devices, vmiou_local)
                monitor_vtime.add(i * n_devices)

        # Training
        l = 0.0
        e = 0.0
        solver.zero_grad()

        e_acc = nn.NdArray(t_e.shape)
        e_acc.zero()
        l_acc = nn.NdArray(t_model.loss.shape)
        l_acc.zero()

        # Gradient accumulation loop
        for j in range(args.accum_grad):
            images, labels, masks = data.next()
            t_model.image.d = images
            t_model.label.d = labels
            t_model.mask.d = masks
            t_model.image.data.cast(np.float32, ctx)
            t_model.label.data.cast(np.int32, ctx)
            t_model.loss.forward(clear_no_need_grad=True)
            t_model.loss.backward(clear_buffer=True)  # Accumulating gradients
            t_e.forward(clear_buffer=True)
            e_acc += t_e.data
            l_acc += t_model.loss.data

        # AllReduce
        if distributed:
            params = [x.grad for x in nn.get_parameters().values()]
            comm.all_reduce(params, division=False, inplace=False)
            comm.all_reduce(l_acc, division=True, inplace=True)
            comm.all_reduce(e_acc, division=True, inplace=True)

        solver.scale_grad(1. / args.accum_grad)
        solver.weight_decay(args.weight_decay)
        solver.update()

        # Linear Warmup
        if i <= warmup_iter:
            lr = base_lr + warmup_slope * i
            solver.set_learning_rate(lr)

        if distributed:
            # Synchronize by averaging the weights over devices using allreduce
            if (i + 1) % args.sync_weight_every_itr == 0:
                weights = [x.data for x in nn.get_parameters().values()]
                comm.all_reduce(weights, division=True, inplace=True)

        if device_id == 0:
            monitor_loss.add(
                i * n_devices, (l_acc / args.accum_grad).data.copy())
            monitor_err.add(
                i * n_devices, (e_acc / args.accum_grad).data.copy())
            monitor_time.add(i * n_devices)

        # Learning rate decay at scheduled iter --> changed to poly learning rate decay policy
        # if i in args.learning_rate_decay_at:
        solver.set_learning_rate(base_lr * ((1 - i / args.max_iter)**0.1))

    if device_id == 0:
        nn.save_parameters(os.path.join(args.model_save_path,
                                        'param_%06d.h5' % args.max_iter))

    contents = save_nnp({'x': v_model.image}, {
                        'y': v_model.pred}, args.batch_size)
    save.save(os.path.join(args.model_save_path,
                           'Deeplabv3plus_result.nnp'), contents,
              variable_batch_size=False)
def train():
    # Check NNabla version
    if utils.get_nnabla_version_integer() < 11900:
        raise ValueError(
            'Please update the nnabla version to v1.19.0 or latest version since memory efficiency of core engine is improved in v1.19.0'
        )

    parser, args = get_train_args()

    # Get context.
    ctx = get_extension_context(args.context, device_id=args.device_id)
    comm = CommunicatorWrapper(ctx)
    nn.set_default_context(comm.ctx)
    ext = import_extension_module(args.context)

    # Monitors
    # setting up monitors for logging
    monitor_path = args.output
    monitor = Monitor(monitor_path)

    monitor_best_epoch = MonitorSeries('Best epoch', monitor, interval=1)
    monitor_traing_loss = MonitorSeries('Training loss', monitor, interval=1)
    monitor_validation_loss = MonitorSeries('Validation loss', monitor, interval=1)
    monitor_lr = MonitorSeries('learning rate', monitor, interval=1)
    monitor_time = MonitorTimeElapsed("training time per iteration", monitor, interval=1)

    if comm.rank == 0:
        print("Mixing coef. is {}, i.e., MDL = {}*TD-Loss + FD-Loss".format(
            args.mcoef, args.mcoef))
        if not os.path.isdir(args.output):
            os.makedirs(args.output)

    # Initialize DataIterator for MUSDB.
    train_source, valid_source, args = load_datasources(parser, args)

    train_iter = data_iterator(train_source,
                               args.batch_size,
                               RandomState(args.seed),
                               with_memory_cache=False,
                               with_file_cache=False)

    valid_iter = data_iterator(valid_source,
                               1,
                               RandomState(args.seed),
                               with_memory_cache=False,
                               with_file_cache=False)

    if comm.n_procs > 1:
        train_iter = train_iter.slice(rng=None,
                                      num_of_slices=comm.n_procs,
                                      slice_pos=comm.rank)
        valid_iter = valid_iter.slice(rng=None,
                                      num_of_slices=comm.n_procs,
                                      slice_pos=comm.rank)

    # Calculate maxiter per GPU device.
    max_iter = int((train_source._size // args.batch_size) // comm.n_procs)
    weight_decay = args.weight_decay * comm.n_procs

    print("max_iter", max_iter)

    # Calculate the statistics (mean and variance) of the dataset
    scaler_mean, scaler_std = utils.get_statistics(args, train_source)

    max_bin = utils.bandwidth_to_max_bin(train_source.sample_rate, args.nfft,
                                         args.bandwidth)

    unmix = OpenUnmix_CrossNet(input_mean=scaler_mean,
                               input_scale=scaler_std,
                               nb_channels=args.nb_channels,
                               hidden_size=args.hidden_size,
                               n_fft=args.nfft,
                               n_hop=args.nhop,
                               max_bin=max_bin)

    # Create input variables.
    mixture_audio = nn.Variable(
        [args.batch_size] + list(train_source._get_data(0)[0].shape))
    target_audio = nn.Variable(
        [args.batch_size] + list(train_source._get_data(0)[1].shape))

    vmixture_audio = nn.Variable(
        [1] + [2, valid_source.sample_rate * args.valid_dur])
    vtarget_audio = nn.Variable(
        [1] + [8, valid_source.sample_rate * args.valid_dur])

    # create training graph
    mix_spec, M_hat, pred = unmix(mixture_audio)
    Y = Spectrogram(*STFT(target_audio, n_fft=unmix.n_fft, n_hop=unmix.n_hop),
                    mono=(unmix.nb_channels == 1))
    loss_f = mse_loss(mix_spec, M_hat, Y)
    loss_t = sdr_loss(mixture_audio, pred, target_audio)
    loss = args.mcoef * loss_t + loss_f
    loss.persistent = True

    # Create Solver and set parameters.
    solver = S.Adam(args.lr)
    solver.set_parameters(nn.get_parameters())

    # create validation graph
    vmix_spec, vM_hat, vpred = unmix(vmixture_audio, test=True)
    vY = Spectrogram(*STFT(vtarget_audio, n_fft=unmix.n_fft, n_hop=unmix.n_hop),
                     mono=(unmix.nb_channels == 1))
    vloss_f = mse_loss(vmix_spec, vM_hat, vY)
    vloss_t = sdr_loss(vmixture_audio, vpred, vtarget_audio)
    vloss = args.mcoef * vloss_t + vloss_f
    vloss.persistent = True

    # Initialize Early Stopping
    es = utils.EarlyStopping(patience=args.patience)

    # Initialize LR Scheduler (ReduceLROnPlateau)
    lr_scheduler = ReduceLROnPlateau(lr=args.lr,
                                     factor=args.lr_decay_gamma,
                                     patience=args.lr_decay_patience)
    best_epoch = 0

    # Training loop.
    for epoch in trange(args.epochs):
        # TRAINING
        losses = utils.AverageMeter()
        for batch in range(max_iter):
            mixture_audio.d, target_audio.d = train_iter.next()
            solver.zero_grad()
            loss.forward(clear_no_need_grad=True)
            if comm.n_procs > 1:
                all_reduce_callback = comm.get_all_reduce_callback()
                loss.backward(clear_buffer=True,
                              communicator_callbacks=all_reduce_callback)
            else:
                loss.backward(clear_buffer=True)
            solver.weight_decay(weight_decay)
            solver.update()
            losses.update(loss.d.copy(), args.batch_size)
        training_loss = losses.avg

        # clear cache memory
        ext.clear_memory_cache()

        # VALIDATION
        vlosses = utils.AverageMeter()
        for batch in range(int(valid_source._size // comm.n_procs)):
            x, y = valid_iter.next()
            dur = int(valid_source.sample_rate * args.valid_dur)
            sp, cnt = 0, 0
            loss_tmp = nn.NdArray()
            loss_tmp.zero()
            while 1:
                vmixture_audio.d = x[Ellipsis, sp:sp + dur]
                vtarget_audio.d = y[Ellipsis, sp:sp + dur]
                vloss.forward(clear_no_need_grad=True)
                cnt += 1
                sp += dur
                loss_tmp += vloss.data
                if x[Ellipsis, sp:sp + dur].shape[-1] < dur or x.shape[-1] == cnt * dur:
                    break
            loss_tmp = loss_tmp / cnt
            if comm.n_procs > 1:
                comm.all_reduce(loss_tmp, division=True, inplace=True)
            vlosses.update(loss_tmp.data.copy(), 1)
        validation_loss = vlosses.avg

        # clear cache memory
        ext.clear_memory_cache()

        lr = lr_scheduler.update_lr(validation_loss, epoch=epoch)
        solver.set_learning_rate(lr)
        stop = es.step(validation_loss)

        if comm.rank == 0:
            monitor_best_epoch.add(epoch, best_epoch)
            monitor_traing_loss.add(epoch, training_loss)
            monitor_validation_loss.add(epoch, validation_loss)
            monitor_lr.add(epoch, lr)
            monitor_time.add(epoch)

            if validation_loss == es.best:
                # save best model
                nn.save_parameters(os.path.join(args.output, 'best_xumx.h5'))
                best_epoch = epoch

        if stop:
            print("Apply Early Stopping")
            break
def __next__(self):
    if self._first_batch is not None:
        batch = self._first_batch
        self._first_batch = None
        return batch
    if self._counter >= self._size:
        if self._auto_reset:
            self.reset()
        # raise StopIteration

    # Gather outputs
    outputs = []
    for p in self._pipes:
        outputs.append(p._share_outputs())

    for i in range(self._num_gpus):
        device_id = self._pipes[i].device_id
        # initialize dict for all output categories
        category_outputs = dict()
        # segregate outputs into categories
        for j, out in enumerate(outputs[i]):
            category_outputs[self.output_map[j]] = out

        # Change DALI TensorLists into Tensors
        category_tensors = dict()
        category_shapes = dict()
        for category, out in category_outputs.items():
            category_tensors[category] = out.as_tensor()
            category_shapes[category] = category_tensors[category].shape()

        # If we did not yet allocate memory for that batch, do it now
        if self._data_batches[i][self._current_data_batch] is None:
            self._category_nnabla_type = dict()
            self._category_device = dict()
            nnabla_gpu_device = get_extension_context('cudnn',
                                                      device_id=device_id)
            nnabla_cpu_device = get_extension_context('cpu')
            # check category and device
            for category in self._output_categories:
                self._category_nnabla_type[category] = np.dtype(
                    category_tensors[category].dtype())
                if type(category_tensors[category]) is TensorGPU:
                    self._category_device[category] = nnabla_gpu_device
                else:
                    self._category_device[category] = nnabla_cpu_device

            nnabla_tensors = dict()
            for category in self._output_categories:
                nnabla_tensors[category] = nn.NdArray(
                    category_shapes[category])

            self._data_batches[i][self._current_data_batch] = nnabla_tensors
        else:
            nnabla_tensors = self._data_batches[i][self._current_data_batch]

        # Copy data from DALI Tensors to nnabla tensors
        for category, tensor in category_tensors.items():
            feed_ndarray(tensor, nnabla_tensors[category],
                         dtype=self._category_nnabla_type[category],
                         ctx=self._category_device[category])

    for p in self._pipes:
        p._release_outputs()
        p._run()

    copy_db_index = self._current_data_batch
    # Change index for double buffering
    self._current_data_batch = (self._current_data_batch + 1) % 2
    self._counter += self._num_gpus * self.batch_size

    if (self._stop_at_epoch) and (self._counter > self._size):
        # First calculate how much data is required to return exactly self._size entries.
        diff = self._num_gpus * self.batch_size - \
            (self._counter - self._size)
        # Figure out how many GPUs to grab from.
        numGPUs_tograb = int(np.ceil(diff / self.batch_size))
        # Figure out how many results to grab from the last GPU (as a fractional GPU batch may be required to
        # bring us right up to self._size).
        mod_diff = diff % self.batch_size
        data_fromlastGPU = mod_diff if mod_diff else self.batch_size

        # Grab the relevant data.
        # 1) Grab everything from the relevant GPUs.
        # 2) Grab the right data from the last GPU.
        # 3) Append data together correctly and return.
        output = [db[copy_db_index]
                  for db in self._data_batches[0:numGPUs_tograb]]
        output[-1] = output[-1].copy()
        for category in self._output_categories:
            output[-1][category] = output[-1][category][0:data_fromlastGPU]
        return output

    return [db[copy_db_index] for db in self._data_batches]
def CNN_run(args, ops, alphas_dict):
    """
    Based on the given model architecture,
    construct CNN and execute training.
    input:
        args: arguments set by user.
        ops: operations used in the network.
        arch_dict: a dictionary containing architecture information.
    """

    data_iterator = data_iterator_cifar10
    all_data = data_iterator(args.batch_size, True)
    tdata = all_data.slice(rng=None, slice_start=0, slice_end=25000)
    vdata = all_data.slice(rng=None, slice_start=25000, slice_end=50000)

    # CIFAR10 statistics, mean and variance
    CIFAR_MEAN = np.reshape([0.49139968, 0.48215827, 0.44653124], (1, 3, 1, 1))
    CIFAR_STD = np.reshape([0.24703233, 0.24348505, 0.26158768], (1, 3, 1, 1))

    channels, image_height, image_width = 3, 32, 32
    batch_size = args.batch_size
    initial_model_lr = args.model_lr

    one_epoch = tdata.size // batch_size
    max_iter = args.epoch * one_epoch

    # Create monitor.
    monitor = Monitor(args.monitor_path)
    monitor_loss = MonitorSeries("Training loss", monitor, interval=100)
    monitor_err = MonitorSeries("Training error", monitor, interval=100)
    monitor_vloss = MonitorSeries("Validation loss", monitor, interval=100)
    monitor_verr = MonitorSeries("Validation error", monitor, interval=100)

    # prepare variables and graph used for training
    image_train = nn.Variable(
        (batch_size, channels, image_height, image_width))
    label_train = nn.Variable((batch_size, 1))
    input_image_train = {"image": image_train, "label": label_train}
    pred_train = construct_networks(args, ops, image_train, test=False)
    loss_train = loss_function(pred_train, label_train)

    # prepare solvers for model parameters
    model_params_dict = \
        {k: v for k, v in nn.get_parameters().items() if "alpha_" not in k}
    solver_model = S.Momentum(initial_model_lr)
    solver_model.set_parameters(
        {k: v for k, v in nn.get_parameters().items()
         if k in model_params_dict.keys()},
        reset=False, retain_state=True)

    # prepare solvers for architecture parameters
    solver_archs = S.Adam(alpha=args.arch_lr, beta1=0.5, beta2=0.999)
    solver_archs.set_parameters(
        {k: v for k, v in nn.get_parameters().items()
         if k in alphas_dict.keys()},
        reset=False, retain_state=True)

    # Training-loop
    for i in range(max_iter):

        # Update Model Parameters.

        if args.second_order:
            # store the weights before update.
            original_weights = {
                k: nn.Variable(v.shape, need_grad=True).apply(
                    data=nn.NdArray(v.shape).copy_from(v.data))
                for k, v in nn.get_parameters().items() if "alpha_" not in k
            }
            # gradients refuge
            accumulated_gradient = \
                {k: nn.Variable(v.shape).apply(d=0)
                 for k, v in alphas_dict.items()}

        image, label = tdata.next()
        image = image / 255.0
        image = (image - CIFAR_MEAN) / CIFAR_STD
        input_image_train["image"].d = image
        input_image_train["label"].d = label
        loss_train.forward()

        e = categorical_error(pred_train.d, input_image_train["label"].d)
        monitor_loss.add(i, loss_train.d.copy())
        monitor_err.add(i, e)

        if args.lr_control_model:
            new_lr = learning_rate_scheduler(i, max_iter, initial_model_lr, 0)
            solver_model.set_learning_rate(new_lr)

        solver_model.zero_grad()
        loss_train.backward(clear_buffer=True)

        if args.with_grad_clip_model:
            for k, v in model_params_dict.items():
                v.grad.copy_from(
                    F.clip_by_norm(v.grad, args.grad_clip_value_model))

        solver_model.weight_decay(args.weight_decay_model)
        solver_model.update()  # weights update ( w -> w')

        if args.second_order:
            updated_weights = {
                k: nn.Variable(v.shape, need_grad=True).apply(
                    data=nn.NdArray(v.shape).copy_from(v.data))
                for k, v in nn.get_parameters().items() if "alpha_" not in k
            }

        # Update Architecture Parameters.

        ve, vloss = 0., 0.
        v_image, v_label = vdata.next()
        v_image = v_image / 255.0
        v_image = (v_image - CIFAR_MEAN) / CIFAR_STD
        input_image_train["image"].d = v_image
        input_image_train["label"].d = v_label

        # compute Loss_on_valid(w', alpha)
        loss_train.forward(clear_no_need_grad=True)

        ve = categorical_error(pred_train.d, input_image_train["label"].d)
        monitor_vloss.add(i, loss_train.d.copy())
        monitor_verr.add(i, ve)

        solver_archs.zero_grad()
        solver_model.zero_grad()
        loss_train.backward(clear_buffer=True)  # its gradient is stored

        if args.second_order:
            accumulated_gradient = store_gradient(
                accumulated_gradient, alphas_dict, coeff=1.)

            # grad_alpha_L_val(w', alpha). Note that gradient stored into .data
            delta_gradient_w = {
                k: nn.Variable(v.shape).apply(
                    data=nn.NdArray(v.shape).copy_from(v.grad), need_grad=True)
                for k, v in nn.get_parameters().items() if "alpha_" not in k
            }

            epsilon = 0.01 / np.sum(
                [np.linalg.norm(v.d) for v in delta_gradient_w.values()])

            coeff = 1.0 * epsilon
            # w -> w+ (= w + epsilon*grad_Loss_on_val(w', alpha))
            weight_modify(original_weights, delta_gradient_w,
                          model_params_dict, coeff)

            input_image_train["image"].d = image  # reuse the same data
            input_image_train["label"].d = label

            # compute Loss_on_train(w+, alpha)
            loss_train.forward()
            solver_archs.zero_grad()
            solver_model.zero_grad()
            loss_train.backward(clear_buffer=True)  # its gradient is stored

            # accumulate currently registered gradient
            coeff = (-1.) * args.eta / 2. * epsilon
            accumulated_gradient = store_gradient(
                accumulated_gradient, alphas_dict, coeff)

            coeff = -1.0 * epsilon
            # w -> w- (= w - epsilon*grad_Loss_on_val(w', alpha))
            weight_modify(original_weights, delta_gradient_w,
                          model_params_dict, coeff)

            # compute Loss_on_train(w-, alpha)
            loss_train.forward()
            solver_archs.zero_grad()
            solver_model.zero_grad()
            loss_train.backward(clear_buffer=True)  # its gradient is stored

            # accumulate currently registered gradient again
            coeff = (+1.) * args.eta / 2. * epsilon
            accumulated_gradient = store_gradient(
                accumulated_gradient, alphas_dict, coeff)

            # replace the weights
            for k, v in alphas_dict.items():
                nn.parameter.set_parameter(
                    k, nn.Variable(v.shape).apply(
                        data=v.data, grad=accumulated_gradient[k],
                        need_grad=True))
            for k, v in model_params_dict.items():
                nn.parameter.set_parameter(
                    k, nn.Variable(v.shape).apply(
                        data=updated_weights[k].data, need_grad=True))

        solver_archs.weight_decay(args.weight_decay_archs)
        solver_archs.update()

        if i % 1000 == 0:
            for k, v in alphas_dict.items():
                keynames = k.split("_")
                print("\nParameters for {} cell, node {} to {};".format(
                    keynames[1], keynames[2], keynames[3]))
                show_ops_and_prob(v.d, ops)

    return alphas_dict