def test_maml_update_var():
    model = OmniglotFC(28 * 28, 5)
    model.train()
    loss_fn = F.cross_entropy_with_softmax
    old_params = list(model.parameters())
    maml = MAML(model)
    params = list(maml.named_parameters.values())
    optimizer = optim.SGD(old_params, lr=0.05)
    optimizer.zero_grad()
    support_out = model.forward(
        meg.tensor(np.random.randn(5, 28 * 28), dtype='float32'))
    support_loss = loss_fn(
        support_out, meg.tensor(np.random.randint(0, 5, (5)), dtype='int32'))
    optimizer.backward(support_loss)
    optimizer.step()
    assert id(old_params[0]) == id(params[0])

    # manually update the parameters
    grads = F.grad(support_loss, params, use_virtual_grad=False)
    fast_weights = [p - 0.5 * g for g, p in zip(grads, params)]
    named_update = dict(zip(maml.named_parameters.keys(), fast_weights))
    named_old = dict(zip(maml.named_parameters.keys(), old_params))
    maml.replace_parameter(maml.module_table, named_update)
    # once replaced with the new values, the old parameters can no longer be
    # reached through model.parameters()
    after_params = list(model.parameters())
    # the attribute accesses below are for inspection only
    maml.module_table['classifier'].bias
    named_update['classifier.bias']
    mods = list(model.modules())
    mods[1].bias
    maml.replace_parameter(maml.module_table, named_old)

def run_syncbn(trace_mode):
    x = F.ones([2, 16, 4, 4], dtype="float32")

    net = Sequential(
        Conv2d(16, 16, 1),
        SyncBatchNorm(16),
        Conv2d(16, 16, 1),
        SyncBatchNorm(16),
    )

    gm = ad.GradManager().attach(
        net.parameters(), callbacks=dist.make_allreduce_cb("MEAN")
    )
    opt = optimizer.SGD(net.parameters(), 1e-3)

    def train_func(x):
        with gm:
            y = net(x)
            loss = y.mean()
            gm.backward(loss)
            opt.step().clear_grad()
        return loss

    if trace_mode is not None:
        train_func = trace(train_func, symbolic=trace_mode)

    for _ in range(3):
        loss = train_func(x)
        loss.numpy()

def test_output_copy_trace():
    class Simple(Module):
        def __init__(self):
            super().__init__()
            self.a = Parameter([1.0], dtype=np.float32)

        def forward(self, x):
            x = x * self.a  # will result into a copy of output in grad
            x = F.exp(x)
            return x

    ys = {False: [], True: []}

    for symbolic in [False, True]:
        net = Simple()
        gm = GradManager().attach(net.parameters())
        opt = optim.SGD(net.parameters(), 1e-3, momentum=0.9)
        data = tensor(np.arange(4).reshape(2, 2), dtype="float32")

        @trace(symbolic=symbolic)
        def train_func(d):
            with gm:
                loss = net(d)
                gm.backward(loss)
                opt.step().clear_grad()
            return loss

        for i in range(3):
            y = train_func(data).numpy()
            ys[symbolic].append(y)

    for i in range(3):
        np.testing.assert_equal(ys[False][i], ys[True][i])

def test_none_in_out_grad():
    class Test(Function):
        def forward(self, a, b):
            return a, b

        def backward(self, grad_a, grad_b):
            assert grad_b is None
            return (grad_a, None)

    class Simple(Module):
        def __init__(self, a, b):
            super().__init__()
            self.a = Parameter(a, dtype=np.float32)
            self.b = Parameter(b, dtype=np.float32)
            self.layer = Test()

        def forward(self):
            aa, bb = self.layer(self.a, self.b)
            return aa, bb

    a = tensor(np.array([1.0], dtype=np.float32))
    b = tensor(np.array([2.0], dtype=np.float32))
    net = Simple(a, b)

    optim = optimizer.SGD(net.parameters(), lr=1.0)
    gm = ad.GradManager().attach(net.parameters())
    optim.clear_grad()

    with gm:
        loss, _ = net()
        gm.backward(loss)
    optim.step()

    np.testing.assert_almost_equal(
        net.a.numpy(), np.array([1.0 - 1.0], dtype=np.float32))
    np.testing.assert_almost_equal(
        net.b.numpy(), np.array([2.0 - 0.0], dtype=np.float32))

def test_single_input():
    data_shape = (9, 2, 6)
    av = np.random.random(data_shape).astype(np.float32)

    class MulFunc(Function):
        def forward(self, a):
            self.a = a
            return a * 10

        def backward(self, grad_o):
            return grad_o * 10

    class Simple(Module):
        def __init__(self, a):
            super().__init__()
            self.a = Parameter(a, dtype=np.float32)
            self.layer1 = MulFunc()

        def forward(self):
            x = self.layer1(self.a)
            return x

    net = Simple(av)
    gm = ad.GradManager().attach(net.parameters())
    opt = optimizer.SGD(net.parameters(), lr=1.0)

    opt.clear_grad()
    with gm:
        loss = net()
        gm.backward(loss.sum())
    opt.step()

    np.testing.assert_almost_equal(loss.numpy(), (av * 10))
    np.testing.assert_almost_equal(net.a.numpy(), (av - 10))

def run(step, enable_trace, use_symbolic):
    def train_func(data, net=None, opt=None):
        pred = net(data)
        opt.backward(pred)
        return pred

    if enable_trace:
        train_func = trace(train_func, symbolic=use_symbolic)

    net = Mixed()
    data = tensor()
    opt = optimizer.SGD(net.parameters(), lr=lr)

    saved_param = init_param
    for i in range(step):
        opt.zero_grad()
        data.set_value([i + 1.0])
        output = train_func(data, net=net, opt=opt)
        opt.step()

        expect_param = (
            saved_param[0] - lr * saved_param[1] * data.numpy(),
            saved_param[1] - lr * saved_param[0] * data.numpy(),
        )
        assertTensorClose(
            output.numpy(), saved_param[0] * saved_param[1] * data.numpy()
        )

        torch_param = net.torch_module._torch_params[0].detach().cpu()
        assertTensorClose(torch_param.numpy(), expect_param[0])
        assertTensorClose(net.multiplier.numpy(), expect_param[1])
        saved_param = expect_param

def test_Clone_model():
    # the new parameters must be cloned into another model before training
    # can continue
    train_loader = build_dataloader()
    image_support = meg.tensor(dtype='float32')
    label_support = meg.tensor(dtype="int32")

    model = OmniglotFC(28 * 28, 5)
    model.train()
    loss_fn = F.cross_entropy_with_softmax
    optimizer = optim.SGD(model.parameters(), lr=0.05)

    iters = iter(train_loader)
    (images_support, labels_support, images_query, labels_query) = next(iters)
    i = 0
    image_support.set_value(images_support[i])
    label_support.set_value(labels_support[i])
    image_support = F.remove_axis(image_support, 1)
    label_support = F.remove_axis(label_support, 1)

    support_out = model.forward(image_support)
    support_loss = loss_fn(support_out, label_support)

    # update only the parameters that require gradients
    params = list(model.parameters(requires_grad=True))
    params[0] = meg.tensor(np.ones((5)), dtype='float32')
    grads = F.grad(support_loss, params, use_virtual_grad=False)
    fast_weights = [p - 0.5 * g for g, p in zip(grads, params)]

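# A minimal sketch of the "clone" step the comment in test_Clone_model refers
# to, assuming the fast weights should live in a separate copy of the model so
# the original parameters stay untouched. copy.deepcopy and the attribute walk
# are plain Python; the helper name and the pairing of named_parameters() with
# fast_weights are assumptions for illustration, not this project's API.
import copy


def _clone_with_fast_weights_sketch(model, fast_weights):
    cloned = copy.deepcopy(model)
    names = [name for name, _ in cloned.named_parameters()]
    for full_name, new_value in zip(names, fast_weights):
        # walk "block.layer.weight"-style names down to the owning module
        owner = cloned
        *path, attr_name = full_name.split('.')
        for part in path:
            owner = getattr(owner, part)
        setattr(owner, attr_name, new_value)
    return cloned
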
def test_sgd_momentum():
    net = Simple()

    optim = optimizer.SGD(net.parameters(), lr=1.0, momentum=0.9)
    optim.clear_grad()
    gm = ad.GradManager().attach(net.parameters())

    data = tensor([2.34])

    # do a step of train
    with gm:
        loss = net(data)
        gm.backward(loss)
    optim.step()

    np.testing.assert_almost_equal(
        optim._state[net.a]["momentum_buffer"].numpy(), 2.34)

    # do a step of infer
    loss = net(data)
    np.testing.assert_almost_equal(loss.numpy(), 2.34 * (1.23 - 2.34), 5)
    np.testing.assert_almost_equal(
        optim._state[net.a]["momentum_buffer"].numpy(), 2.34)

    # do a step of train
    optim.clear_grad()
    with gm:
        loss = net(data)
        gm.backward(loss)
    optim.step()

    np.testing.assert_almost_equal(loss.numpy(), 2.34 * (1.23 - 2.34), 5)
    np.testing.assert_almost_equal(
        optim._state[net.a]["momentum_buffer"].numpy(), 0.9 * 2.34 + 2.34)

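# A tiny, framework-free check of the momentum arithmetic the assertions in
# test_sgd_momentum rely on: with a constant gradient of 2.34 the buffer is
# the gradient after the first step and 0.9 * buffer + grad afterwards. This
# is a sketch of plain SGD momentum, not of MegEngine's optimizer internals.
def _expected_momentum_buffer_sketch(grad, steps, momentum=0.9):
    buf = 0.0
    for _ in range(steps):
        buf = momentum * buf + grad
    return buf


assert abs(_expected_momentum_buffer_sketch(2.34, 1) - 2.34) < 1e-6
assert abs(_expected_momentum_buffer_sketch(2.34, 2) - (0.9 * 2.34 + 2.34)) < 1e-6
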
def test_grad_twice_method_3():
    # model definition
    model = CustomModel3()
    model.train()

    named_param = dict(list(model.named_parameters(requires_grad=True)))
    params = list(named_param.values())
    external_params = [
        meg.Parameter(np.random.normal(size=p.shape), dtype='float32')
        for p in params
    ]
    loss_fn = F.cross_entropy_with_softmax
    optimizer = optim.SGD(external_params, lr=0.003)

    # forward once
    optimizer.zero_grad()
    x1 = meg.tensor(np.random.randn(5, 10), dtype='float32')
    y1 = meg.tensor(np.random.randint(0, 5, (5)), dtype='int32')
    x2 = meg.tensor(np.random.randn(5, 10), dtype='float32')
    y2 = meg.tensor(np.random.randint(0, 5, (5)), dtype='int32')
    train_func3(x1, y1, x2, y2, loss_fn=loss_fn, opt=optimizer,
                net=model, params=external_params)
    optimizer.step()

def test_grad_twice_method_2():
    # model definition
    model = CustomModel()
    model.train()

    named_param = dict(list(model.named_parameters(requires_grad=True)))
    name_keys = list(named_param.keys())
    params = list(named_param.values())
    loss_fn = F.cross_entropy_with_softmax
    optimizer = optim.SGD(params, lr=0.003)

    # forward once
    optimizer.zero_grad()
    x1 = meg.tensor(np.random.randn(5, 10), dtype='float32')
    y1 = meg.tensor(np.random.randint(0, 5, (5)), dtype='int32')
    x2 = meg.tensor(np.random.randn(5, 10), dtype='float32')
    y2 = meg.tensor(np.random.randint(0, 5, (5)), dtype='int32')
    train_func(x1, y1, x2, y2, loss_fn=loss_fn, opt=optimizer,
               net=model, keys=name_keys, params=params)
    optimizer.step()

def test_grad_twice():
    # model definition
    model = M.Sequential(M.Linear(10, 20), M.Linear(20, 10), M.Linear(10, 5))
    model.train()

    named_param = dict(list(model.named_parameters(requires_grad=True)))
    named_module = dict(list(model.named_children()))
    name_keys = list(named_param.keys())
    params = list(named_param.values())
    loss_fn = F.cross_entropy_with_softmax
    optimizer = optim.SGD(params, lr=0.003)

    # forward once
    optimizer.zero_grad()
    x1 = meg.tensor(np.random.randn(5, 10), dtype='float32')
    y1 = meg.tensor(np.random.randint(0, 5, (5)), dtype='int32')
    loss = loss_fn(model(x1), y1)
    grads = F.grad(loss, params, use_virtual_grad=False,
                   return_zero_for_nodep=False)
    fast_weights = [p - 0.5 * g for g, p in zip(grads, params)]

    # manually update the parameters
    replace_parameter(named_module, dict(zip(name_keys, fast_weights)))

    # forward a second time
    x2 = meg.tensor(np.random.randn(5, 10), dtype='float32')
    y2 = meg.tensor(np.random.randint(0, 5, (5)), dtype='int32')
    loss2 = loss_fn(model(x2), y2)

    # got error
    replace_parameter(named_module, named_param)
    optimizer.backward(loss2)
    optimizer.step()

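# The grad-twice tests above lean on a replace_parameter helper that swaps the
# tensors stored on each sub-module for externally computed fast weights. The
# function below is only an illustrative sketch of that idea, assuming
# parameter names of the form "<child>.<attr>" that match the keys of
# named_children(); the project's real replace_parameter may differ.
def _replace_parameter_sketch(named_module, named_tensor):
    for full_name, new_value in named_tensor.items():
        module_name, _, attr_name = full_name.rpartition('.')
        # plain setattr keeps the replacement visible to later forward passes
        setattr(named_module[module_name], attr_name, new_value)
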
def test_clear_grad():
    class StopGradient(Function):
        def forward(self, a):
            return a

        def backward(self, *_):
            return None

    class Simple(Module):
        def __init__(self, a):
            super().__init__()
            self.a = Parameter(a, dtype=np.float32)
            self.layer = StopGradient()

        def forward(self):
            b = self.a * 3.0
            c = self.a * 4.0
            return self.layer(b) + c

    a = tensor(np.array([1.0], dtype=np.float32))
    net = Simple(a)

    optim = optimizer.SGD(net.parameters(), lr=1.0)
    gm = ad.GradManager().attach(net.parameters())
    optim.clear_grad()

    with gm:
        loss = net()
        gm.backward(loss.sum())
    optim.step()

    np.testing.assert_almost_equal(
        net.a.numpy(),
        np.array([1.0 - 4.0], dtype=np.float32),
    )

def worker():
    rank = dist.get_rank()
    size = dist.get_world_size()
    x = mge.tensor(np.random.randn(1, rank * 2 + 2), dtype=np.float32)
    m = M.Linear(rank * 2 + 2, rank * 2 + 4)
    gm = GradManager().attach(m.parameters())
    opt = optim.SGD(m.parameters(), 1e-3, momentum=0.9)

    def train_func(x):
        with gm:
            if rank != 0:
                x = dist.functional.remote_recv(
                    rank - 1, shape=(1, rank * 2 + 2), dtype=np.float32)
            y = m(x)
            if rank != size - 1:
                dist.functional.remote_send(y, dest_rank=rank + 1)
                gm.backward()
            else:
                y = y.mean()
                gm.backward(y)
            opt.step().clear_grad()

    train_funcs = [
        train_func,
        trace(symbolic=False)(train_func),
        trace(symbolic=True)(train_func),
    ]

    for func in train_funcs:
        for i in range(3):
            func(x)

def worker(rank, world_size, args):
    if world_size > 1:
        dist.init_process_group(
            master_ip="localhost",
            master_port=23456,
            world_size=world_size,
            rank=rank,
            dev=rank,
        )
        logger.info("Init process group for gpu%d done", rank)

    sys.path.insert(0, os.path.dirname(args.file))
    current_network = importlib.import_module(
        os.path.basename(args.file).split(".")[0])
    model = current_network.Net(current_network.Cfg(),
                                batch_size=args.batch_size)
    params = model.parameters(requires_grad=True)
    model.train()

    if rank == 0:
        logger.info(get_config_info(model.cfg))

    opt = optim.SGD(
        params,
        lr=model.cfg.basic_lr * world_size * model.batch_size,
        momentum=model.cfg.momentum,
        weight_decay=model.cfg.weight_decay,
    )

    if args.weight_file is not None:
        weights = mge.load(args.weight_file)
        model.backbone.bottom_up.load_state_dict(weights)

    logger.info("Prepare dataset")
    loader = build_dataloader(model.batch_size, args.dataset_dir, model.cfg)
    train_loader = iter(loader["train"])

    for epoch_id in range(model.cfg.max_epoch):
        for param_group in opt.param_groups:
            param_group["lr"] = (
                model.cfg.basic_lr * world_size * model.batch_size
                * (model.cfg.lr_decay_rate
                   ** bisect.bisect_right(model.cfg.lr_decay_sates, epoch_id))
            )

        tot_steps = model.cfg.nr_images_epoch // (model.batch_size * world_size)
        train_one_epoch(model, train_loader, opt, tot_steps, rank, epoch_id,
                        world_size)
        if rank == 0:
            save_path = "log-of-{}/epoch_{}.pkl".format(
                os.path.basename(args.file).split(".")[0], epoch_id)
            mge.save(
                {"epoch": epoch_id, "state_dict": model.state_dict()},
                save_path,
            )
            logger.info("dump weights to %s", save_path)

def test_save_load():
    net = Simple()

    optim = optimizer.SGD(net.parameters(), lr=1.0, momentum=0.9)
    optim.clear_grad()
    gm = ad.GradManager().attach(net.parameters())

    data = tensor([2.34])

    with gm:
        loss = net(data)
        gm.backward(loss)
    optim.step()

    model_name = "simple.pkl"
    print("save to {}".format(model_name))

    mge.save(
        {
            "name": "simple",
            "state_dict": net.state_dict(),
            "opt_state": optim.state_dict(),
        },
        model_name,
    )

    # Load param to cpu
    checkpoint = mge.load(model_name, map_location="cpu0")
    device_save = mge.get_default_device()
    mge.set_default_device("cpu0")
    net = Simple()
    net.load_state_dict(checkpoint["state_dict"])
    optim = optimizer.SGD(net.parameters(), lr=1.0, momentum=0.9)
    optim.load_state_dict(checkpoint["opt_state"])
    print("load done")
    os.remove("simple.pkl")

    with gm:
        loss = net([1.23])
        gm.backward(loss)
    optim.step()

    # Restore device
    mge.set_default_device(device_save)

def worker(rank, gpu_num, args):
    # enable sublinear memory optimization
    os.environ["MGB_COMP_GRAPH_OPT"] = (
        "enable_sublinear_memory_opt=1;seq_opt.enable_seq_comp_node_opt=0")
    os.environ["MGB_SUBLINEAR_MEMORY_GENETIC_NR_ITER"] = '50'

    if gpu_num > 1:
        dist.init_process_group(
            master_ip="localhost",
            master_port=23456,
            world_size=gpu_num,
            rank=rank,
            dev=rank,
        )
        logger.info("Init process group for gpu%d done", rank)

    model = network.Network()
    params = model.parameters(requires_grad=True)
    model.train()

    opt = optim.SGD(
        params,
        lr=cfg.basic_lr * gpu_num * cfg.batch_per_gpu,
        momentum=cfg.momentum,
        weight_decay=cfg.weight_decay,
    )

    if cfg.pretrain_weight is not None:
        weights = mge.load(cfg.pretrain_weight)
        del weights['fc.weight']
        del weights['fc.bias']
        model.resnet50.load_state_dict(weights)

    logger.info("Prepare dataset")
    train_loader = dataset.train_dataset(rank)
    logger.info("Training...")

    for epoch_id in range(cfg.max_epoch):
        for param_group in opt.param_groups:
            param_group["lr"] = (
                cfg.basic_lr * gpu_num * cfg.batch_per_gpu
                * (cfg.lr_decay_rate
                   ** bisect.bisect_right(cfg.lr_decay_sates, epoch_id))
            )

        max_steps = cfg.nr_images_epoch // (cfg.batch_per_gpu * gpu_num)
        train_one_epoch(model, train_loader, opt, max_steps, rank, epoch_id,
                        gpu_num)
        if rank == 0:
            save_path = os.path.join(cfg.model_dir,
                                     'epoch_{}.pkl'.format(epoch_id + 1))
            mge.save(
                {"epoch": epoch_id, "state_dict": model.state_dict()},
                save_path,
            )
            logger.info("dump weights to %s", save_path)

def test_bn_no_track_stat():
    nchannel = 3
    m = BatchNorm2d(nchannel, track_running_stats=False)

    gm = ad.GradManager().attach(m.parameters())
    optim = optimizer.SGD(m.parameters(), lr=1.0)
    optim.clear_grad()

    data = np.random.random((6, nchannel, 2, 2)).astype("float32")
    with gm:
        loss = m(data).sum()
        gm.backward(loss)
    optim.step()

def test_hello_world():
    net = Simple()

    optim = optimizer.SGD(net.parameters(), lr=1.0)
    optim.clear_grad()
    gm = ad.GradManager().attach(net.parameters())

    data = tensor([2.34])
    with gm:
        loss = net(data)
        gm.backward(loss)
    optim.step()

    np.testing.assert_almost_equal(
        net.a.numpy(), np.array([1.23 - 2.34]).astype(np.float32))

def test_optimizer_serialization():
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    opt = optimizer.SGD(mlp.parameters(), lr=0.01, momentum=0.9)

    slots = TensorDict()
    for param in mlp.parameters():
        slots[param] = np.zeros(param.shape).astype(np.float32)

    pred = mlp(data)
    loss = F.square_loss(pred, label.reshape(-1, 1))
    opt.zero_grad()
    opt.backward(loss)
    opt.step()
    for param in mlp.parameters():
        slots[param] = slots[param] * 0.9 + param.grad.numpy()

    with BytesIO() as fout:
        save(opt.state_dict(), fout)
        fout.seek(0)
        state_dict = load(fout)
        opt1 = optimizer.SGD(mlp.parameters(), lr=0.02, momentum=0.8)
        opt1.load_state_dict(state_dict)

        data.set_value(np.random.random(data_shape).astype(np.float32))
        label.set_value(np.random.randint(0, 10, label_shape))
        pred = mlp(data)
        loss = F.square_loss(pred, label.reshape(-1, 1))
        opt1.zero_grad()
        opt1.backward(loss)

        orig_params = TensorDict()
        for param in mlp.parameters():
            orig_params[param] = np.copy(param.numpy())
        opt1.step()

        for param in mlp.parameters():
            orig_param = orig_params[param]
            slots[param] = slots[param] * 0.9 + param.grad.numpy()
            assertTensorClose(param.numpy(), orig_param - 0.01 * slots[param])

def train_pipeline():
    m = ResNet18Pipeline()
    x = F.ones([32, 3, 224, 224])
    label = F.zeros([32], dtype="int32")

    gm = ad.GradManager().attach(m.parameters())
    opt = optim.SGD(m.parameters(), 1e-3, 0.9, 1e-4)

    for _ in range(2):
        m(x)
        loss = m.backward(label, gm)
        opt.step().clear_grad()
        print(loss)

def test_clip_grad_norm():
    net = Net()
    x = mge.tensor(np.random.randn(10, 3, 224, 224))
    gm = ad.GradManager().attach(net.parameters())
    opt = optim.SGD(net.parameters(), 1e-3, momentum=0.9)

    with gm:
        loss = net(x).sum()
        gm.backward(loss)
    save_grad_value(net)

    max_norm = 1.0
    original_norm = optim.clip_grad_norm(net.parameters(), max_norm=max_norm,
                                         ord=2)
    scale = max_norm / original_norm
    for param in net.parameters():
        np.testing.assert_almost_equal(param.grad.numpy(),
                                       param.grad_backup * scale)
    opt.step().clear_grad()

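# Framework-free sketch of the clipping rule the assertion in
# test_clip_grad_norm relies on: the global L2 norm over all gradients is
# computed, and each gradient is scaled by max_norm / total_norm once the
# total exceeds max_norm. This illustrates the generic technique on numpy
# arrays (np as imported in this file), not MegEngine's implementation.
def _clip_grad_norm_sketch(grads, max_norm, eps=1e-6):
    total_norm = sum(float((g ** 2).sum()) for g in grads) ** 0.5
    scale = max_norm / max(total_norm, eps)
    if scale < 1.0:
        grads = [g * scale for g in grads]
    return grads, total_norm
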
def test_detach():
    net = Simple()

    optim = optimizer.SGD(net.parameters(), lr=1.0)
    optim.clear_grad()
    gm = ad.GradManager().attach(net.parameters())

    dshape = (10, 10)
    data = tensor(np.ones(dshape).astype(np.float32))
    with gm:
        loss = net(data).sum()
        gm.backward(loss)
    optim.step()

    np.testing.assert_equal(net.a.numpy(), np.array([1.0]).astype(np.float32))
    np.testing.assert_equal(net.b.numpy(),
                            np.array([1.0 - 10.0 * 10.0]).astype(np.float32))

def test_sgd_momentum_trace():
    origin_inplace = os.getenv("MEGENGINE_INPLACE_UPDATE")
    symbolic = (True, False)
    inplace = (0, 1)
    for symbolic, inplace in itertools.product(symbolic, inplace):
        os.environ["MEGENGINE_INPLACE_UPDATE"] = str(inplace)

        @trace(symbolic=symbolic)
        def train_func(data, *, model=None, optim=None, gm=None):
            optim.clear_grad()
            with gm:
                loss = net(data)
                gm.backward(loss)
            optim.step()
            return loss

        @trace(symbolic=symbolic)
        def eval_func(data, *, model=None, optim=None, gm=None):
            loss = net(data)
            return loss

        net = Simple()
        optim = optimizer.SGD(net.parameters(), lr=1.0, momentum=0.9)
        gm = ad.GradManager().attach(net.parameters())
        data = tensor([2.34])

        train_func(data, model=net, optim=optim, gm=gm)
        np.testing.assert_almost_equal(
            optim._state[net.a]["momentum_buffer"].numpy(), 2.34)

        # do 3 steps of infer
        for _ in range(3):
            loss = eval_func(data)
            np.testing.assert_almost_equal(loss.numpy(), 2.34 * (1.23 - 2.34), 5)
            np.testing.assert_almost_equal(
                optim._state[net.a]["momentum_buffer"].numpy(), 2.34)

        # do a step of train
        train_func(data, model=net, optim=optim, gm=gm)
        np.testing.assert_almost_equal(loss.numpy(), 2.34 * (1.23 - 2.34), 5)
        np.testing.assert_almost_equal(
            optim._state[net.a]["momentum_buffer"].numpy(),
            0.9 * 2.34 + 2.34, 5)

    if origin_inplace:
        os.environ["MEGENGINE_INPLACE_UPDATE"] = origin_inplace
    else:
        del os.environ["MEGENGINE_INPLACE_UPDATE"]

def test_advance_indexing_with_subtensor():
    net = Simple2()

    gm = ad.GradManager().attach(net.parameters())
    optim = optimizer.SGD(net.parameters(), lr=1.0)
    optim.clear_grad()

    dshape = (2, 3, 4, 3, 4, 2)
    raw_data = np.arange(576).reshape(dshape).astype(np.float32)
    data = tensor(raw_data)
    answer = 1.0 - raw_data[1, ..., :, 0:4:2, 0:2].sum()

    with gm:
        loss = net(data).sum()
        gm.backward(loss)
    optim.step()

    np.testing.assert_almost_equal(net.a.numpy(),
                                   np.array([answer]).astype(np.float32))

def run_frozen_bn(BNModule, is_training, use_trace, use_symbolic):
    nchannel = 3
    m = BNModule(nchannel, freeze=True)
    if is_training:
        m.train()
    else:
        m.eval()

    var = 4.0
    bias = 1.0
    shape = (1, nchannel, 1, 1)
    m.running_var[...] = var * F.ones(shape)
    m.running_mean[...] = bias * F.ones(shape)

    saved_var = m.running_var.numpy()
    saved_mean = m.running_mean.numpy()
    saved_wt = m.weight.numpy()
    saved_bias = m.bias.numpy()

    gm = ad.GradManager().attach(m.parameters())
    optim = optimizer.SGD(m.parameters(), lr=1.0)
    optim.clear_grad()

    data = np.random.random((6, nchannel, 2, 2)).astype("float32")

    def train_fn(d):
        for _ in range(3):
            with gm:
                loss = m(d).mean()
                gm.backward(loss)
            optim.step()
        return loss

    if use_trace:
        train_fn = trace(train_fn, symbolic=use_symbolic)

    for _ in range(3):
        loss = train_fn(megengine.tensor(data))
        if not is_training:
            np.testing.assert_equal(m.running_var.numpy(), saved_var)
            np.testing.assert_equal(m.running_mean.numpy(), saved_mean)
            np.testing.assert_almost_equal(
                loss.numpy(), ((data - bias) / np.sqrt(var)).mean(), 5
            )
        np.testing.assert_equal(m.weight.numpy(), saved_wt)
        np.testing.assert_equal(m.bias.numpy(), saved_bias)

def test_load_state_dict_no_cache(monkeypatch):
    with monkeypatch.context() as mk:
        mk.setenv("MEGENGINE_INPLACE_UPDATE", "1")
        net = Net()

        optim = optimizer.SGD(net.parameters(), lr=0.1)
        gm = ad.GradManager().attach(net.parameters())
        state = {
            "fc.weight": np.array([[0]], dtype=np.float32),
            "fc.bias": np.array([0.0], dtype=np.float32),
        }
        net.load_state_dict(state)

        images = mge.tensor([[0]], dtype=np.float32)
        with gm:
            loss = net(images)
            gm.backward(loss)
            optim.step()
            optim.clear_grad()

def test_sgd_momentum(monkeypatch, trace_mode, inplace_mode):
    with monkeypatch.context() as mk:
        mk.setenv("MEGENGINE_INPLACE_UPDATE", str(int(inplace_mode)))

        def train_func(data, *, model=None, optim=None, gm=None):
            optim.clear_grad()
            with gm:
                loss = net(data)
                gm.backward(loss)
            optim.step()
            return loss

        if trace_mode is not None:
            train_func = trace(symbolic=trace_mode)(train_func)

        def eval_func(data, *, model=None, optim=None, gm=None):
            loss = net(data)
            return loss

        if trace_mode is not None:
            eval_func = trace(symbolic=trace_mode)(eval_func)

        net = Simple()
        optim = optimizer.SGD(net.parameters(), lr=1.0, momentum=0.9)
        gm = ad.GradManager().attach(net.parameters())
        data = tensor([2.34])

        train_func(data, model=net, optim=optim, gm=gm)
        np.testing.assert_almost_equal(
            optim._state[net.a]["momentum_buffer"].numpy(), 2.34)

        # do 3 steps of infer
        for _ in range(3):
            loss = eval_func(data)
            np.testing.assert_almost_equal(loss.numpy(), 2.34 * (1.23 - 2.34), 5)
            np.testing.assert_almost_equal(
                optim._state[net.a]["momentum_buffer"].numpy(), 2.34)

        # do a step of train
        train_func(data, model=net, optim=optim, gm=gm)
        np.testing.assert_almost_equal(loss.numpy(), 2.34 * (1.23 - 2.34), 5)
        np.testing.assert_almost_equal(
            optim._state[net.a]["momentum_buffer"].numpy(),
            0.9 * 2.34 + 2.34, 5)

def test_advance_indexing():
    net = Simple()

    gm = ad.GradManager().attach(net.parameters())
    optim = optimizer.SGD(net.parameters(), lr=1.0)
    optim.clear_grad()

    dshape = (10, 10)
    raw_data = np.arange(100).reshape(dshape).astype(np.float32)
    raw_mask = (np.random.random_sample(dshape) > 0.5).astype(np.bool_)
    data = tensor(raw_data)
    mask = tensor(raw_mask)
    answer = 1.0 - raw_data[raw_mask].sum()

    with gm:
        loss = net(data, mask).sum()
        gm.backward(loss)
    optim.step()

    np.testing.assert_almost_equal(net.a.numpy(),
                                   np.array([answer]).astype(np.float32))

def test_elemwise_fuse_in_grad(trace_mode):
    w = Parameter(np.ones([4, 6]), dtype="float32")

    gm = GradManager().attach(w)
    opt = optim.SGD([w], lr=0.01, momentum=0.9, weight_decay=5e-4)

    # explicitly declare opt_level as 2
    @trace(symbolic=trace_mode, opt_level=2)
    def f():
        with gm:
            wm = F.sum(w ** 2, axis=1) ** 0.5
            loss = wm.mean()
            gm.backward(loss)
        opt.step().clear_grad()
        return loss

    for i in range(3):
        y = f()
        y.numpy()

def test_clip_grad_value():
    net = Net()
    x = np.random.randn(10, 3, 224, 224).astype("float32")
    gm = ad.GradManager().attach(net.parameters())
    opt = optim.SGD(net.parameters(), 1e-3, momentum=0.9)

    with gm:
        y = net(mge.tensor(x))
        y = y.mean()
        gm.backward(y)
    save_grad_value(net)

    max_val = 5
    min_val = -2
    optim.clip_grad_value(net.parameters(), lower=min_val, upper=max_val)
    for param in net.parameters():
        np.testing.assert_almost_equal(
            param.grad.numpy(),
            np.maximum(np.minimum(param.grad_backup, max_val), min_val),
        )
    opt.step().clear_grad()