def testCompressConvBnRelu(self):
    inp, oup, kernel_size, groups = 3, 5, 3, 1
    m0 = mb.ConvBNReLU(inp, oup, kernel_size, groups=groups,
                       active_fn=nn.ReLU)
    m0.apply(random_bn)
    mask = torch.tensor([False, True, True, False, False])
    num_remain = mask.sum().item()
    m1 = mb.ConvBNReLU(inp, num_remain, kernel_size, groups=groups,
                       active_fn=nn.ReLU)
    m1.apply(random_bn)
    infos = cu.compress_conv_bn_relu(m1, m0, mask, prefix_new='new',
                                     prefix_old='old')
    inputs = torch.randn(2, 3, 10, 10)
    self._apply_info(infos)
    for m in [m0, m1]:
        m.train()
    lhs = m0(inputs)
    rhs = m1(inputs)
    assertAllClose(lhs[:, mask], rhs)
    for m in [m0, m1]:
        m.eval()
    lhs = m0(inputs)
    rhs = m1(inputs)
    assertAllClose(lhs[:, mask], rhs)
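
# A hypothetical helper mirroring what compress_conv_bn_relu has to do to the
# BatchNorm layer for the checks above to pass in both train() (batch
# statistics) and eval() (running statistics) mode: affine parameters and
# running statistics are all sliced by the same mask. The names below are
# illustrative, not the library's API:
def _sketch_compress_bn(bn_new, bn_old, mask):
    with torch.no_grad():
        bn_new.weight.copy_(bn_old.weight[mask])
        bn_new.bias.copy_(bn_old.bias[mask])
        bn_new.running_mean.copy_(bn_old.running_mean[mask])
        bn_new.running_var.copy_(bn_old.running_var[mask])
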
def testAdjustEmaRate(self):
    name = 'v'
    momentum = 0.25
    for num_repeat in [1, 5, 9]:
        values = torch.randn(5)
        # Each value repeated num_repeat consecutive times.
        values_long = values.repeat(num_repeat, 1).permute(
            (1, 0)).contiguous().view(-1)
        ema = optim.ExponentialMovingAverage(momentum)
        ema.register(name, values[0])
        for v in values:
            ema(name, v)
        lhs = ema.average(name)
        adjusted = optim.ExponentialMovingAverage.adjust_momentum(
            momentum, num_repeat)
        ema = optim.ExponentialMovingAverage(adjusted)
        ema.register(name, values[0])
        for v in values_long:
            ema(name, v)
        rhs = ema.average(name)
        assertAllClose(lhs, rhs)
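
# The identity behind adjust_momentum, assuming a plain EMA update
# avg = m * avg + (1 - m) * v: repeating the same value n times with momentum
# m' compounds to avg = m'**n * avg + (1 - m'**n) * v, so matching a single
# update with momentum m requires m' = m ** (1 / n). This sketch is inferred
# from the test above, not taken from the library source:
def _sketch_adjust_momentum(momentum, num_repeat):
    return momentum ** (1.0 / num_repeat)
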
def testCalMaskNetworkSlimmingByThreshold(self):
    x = [
        torch.tensor(val, dtype=torch.float32)
        for val in [[1, 2, 5], [3, 6, 0, 1.1]]
    ]
    mask = prune.cal_mask_network_slimming_by_threshold(x, 1.5)
    expected = [
        torch.tensor([False, True, True]),
        torch.tensor([True, True, False, False])
    ]
    assertAllClose(mask, expected)
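
# The expected masks follow from simple elementwise thresholding: a channel
# survives iff its scaling factor exceeds the threshold. A sketch of the
# assumed behaviour, not the library source:
def _sketch_mask_by_threshold(weights, threshold):
    return [weight > threshold for weight in weights]
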
def testAverageVariablesUpdateNumUpdates_Vector(self):
    ema = optim.ExponentialMovingAverage(0.25)
    name = 'tens'
    tens = _Repeat(10.0, dim=5)
    var = torch.tensor(tens)
    ema.register(name, var, zero_init=False)
    for num_updates in range(2):
        var.add_(1)
        ema(name, var, num_updates=num_updates)
    expected = _Repeat(
        (10 * 0.1 + 11 * 0.9) * 2.0 / 11.0 + 12 * 9.0 / 11.0, dim=5)
    assertAllClose(expected, ema.average(name))
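
# With num_updates supplied, the effective decay is assumed to follow the
# TensorFlow convention min(momentum, (1 + num_updates) / (10 + num_updates)):
# 1/10 = 0.1 on the first update and 2/11 on the second, which is exactly
# what the closed-form `expected` value above encodes.
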
def testRhoScheduler(self):
    prune_params = {
        'rho': 1.0,
        'epoch_free': 1,
        'epoch_warmup': 3,
        'scheduler': 'linear',
        'stepwise': True,
    }
    rho_scheduler = prune.get_rho_scheduler(prune_params, 2)
    res = [rho_scheduler(i) for i in range(15)]
    expected = [0, 0, 0, 0.25, 0.50, 0.75] + [1.0] * 9
    assertAllClose(expected, res)
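
# A minimal sketch of the linear, stepwise schedule the expectations above
# imply (steps_per_epoch=2, so rho stays 0 through the free epoch and ramps
# linearly until epoch_warmup). get_rho_scheduler's real implementation may
# be organized differently:
def _sketch_linear_rho(step, rho=1.0, epoch_free=1, epoch_warmup=3,
                       steps_per_epoch=2):
    free_steps = epoch_free * steps_per_epoch
    warmup_steps = epoch_warmup * steps_per_epoch
    if step <= free_steps:
        return 0.0
    return rho * min((step - free_steps) / (warmup_steps - free_steps), 1.0)

# This reproduces the expected sequence:
# [_sketch_linear_rho(i) for i in range(15)]
# == [0, 0, 0, 0.25, 0.5, 0.75] + [1.0] * 9
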
def testCompressConv(self):
    inp, oup, kernel_size, groups = 3, 5, 3, 1
    conv0 = nn.Conv2d(inp, oup, kernel_size, groups=groups)
    mask = torch.tensor([False, True, True, False, False])
    conv1 = nn.Conv2d(inp, mask.sum().item(), kernel_size, groups=groups)
    infos = cu.compress_conv(conv1, conv0, mask, 0)
    self._apply_info(infos)
    inputs = torch.randn(1, 3, 10, 10)
    lhs = conv0(inputs)
    rhs = conv1(inputs)
    assertAllClose(lhs[:, mask], rhs)
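
# What compress_conv must accomplish for the assertion above, written as a
# hypothetical direct copy (the real helper instead returns `infos` that
# self._apply_info executes); `_sketch_compress_conv` is illustrative, not
# the library's API:
def _sketch_compress_conv(conv_new, conv_old, mask):
    with torch.no_grad():
        # Keep only the filters for surviving output channels.
        conv_new.weight.copy_(conv_old.weight[mask])
        if conv_old.bias is not None:
            conv_new.bias.copy_(conv_old.bias[mask])
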
def testBnL1Loss(self):
    rho = 1.0
    penalties = [1.0]
    for i in range(10):
        var = torch.rand(10, requires_grad=True)
        var.grad = torch.zeros_like(var)
        update_bn_network_slimming([var], penalties, rho)
        # Clone before zeroing: on CPU, .numpy() shares storage with the
        # gradient, so zero_() would otherwise wipe lhs as well.
        lhs = var.grad.detach().clone().cpu().numpy()
        var.grad.zero_()
        loss = prune.cal_bn_l1_loss([var], penalties, rho)
        loss.backward()
        rhs = var.grad.detach().cpu().numpy()
        assertAllClose(lhs, rhs)
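
# The two code paths above are assumed to implement the same network-slimming
# penalty: a sketch of the loss form, whose gradient is
# rho * penalty * sign(w) (update_bn_network_slimming would add that gradient
# in place instead of going through autograd):
def _sketch_bn_l1_loss(weights, penalties, rho):
    return rho * sum(
        penalty * weight.abs().sum()
        for penalty, weight in zip(penalties, weights))
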
def testSaveLoad(self):
    ema = optim.ExponentialMovingAverage(0.25)
    name = 'tens'
    tens = _Repeat(10.0, dim=5)
    var = torch.tensor(tens)
    ema.register(name, var, zero_init=False)
    state_dict = ema.state_dict()
    for key in ['info', 'shadow', 'param']:
        assert key in state_dict
    assert 'tens' in state_dict['shadow']
    assertAllClose(state_dict['shadow']['tens'], var)
    ema.load_state_dict(state_dict)
    # Loading a state dict whose hyper-parameters disagree with the current
    # instance should warn instead of failing silently.
    state_dict['param']['momentum'] = 0.5
    self.assertWarns(RuntimeWarning,
                     lambda: ema.load_state_dict(state_dict))
def testCompressConvDepthwise(self):
    inp, oup, kernel_size, groups = 5, 5, 3, 5
    conv0 = nn.Conv2d(inp, oup, kernel_size, groups=groups)
    mask = torch.tensor([False, True, True, False, False])
    num_remain = mask.sum().item()
    conv1 = nn.Conv2d(num_remain, num_remain, kernel_size,
                      groups=num_remain)
    infos = cu.compress_conv(conv1, conv0, mask, 0)
    self._apply_info(infos)
    inputs = torch.randn(1, 5, 10, 10)
    lhs = conv0(inputs)
    rhs = conv1(inputs[:, mask])
    assertAllClose(lhs[:, mask], rhs)
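
# In a depthwise conv each filter reads exactly one input channel, so pruning
# output channel i removes input channel i as well: the compressed conv has
# num_remain input and output channels with groups=num_remain, and is fed
# the masked slice of the input (inputs[:, mask]) in the check above.
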
def testSgdDecay(self):
    var = np.random.randn(10)
    weight_decay = 1e-1
    # Run 1: rely on SGD's built-in (coupled) weight decay.
    var0 = torch.tensor(var, requires_grad=True, dtype=torch.float32)
    var0.grad = torch.zeros_like(var0)
    optimizer = torch.optim.SGD([var0], weight_decay=weight_decay, lr=0.1)
    optimizer.zero_grad()
    optimizer.step()
    # Run 2: no built-in decay; add the explicit L2 penalty to the loss.
    var1 = torch.tensor(var, requires_grad=True, dtype=torch.float32)
    optimizer = torch.optim.SGD([var1], weight_decay=0, lr=0.1)
    optimizer.zero_grad()
    loss = (weight_decay * 0.5) * (var1**2).sum()
    loss.backward()
    optimizer.step()
    # Note: comparing gradients assumes SGD folds the decay term into
    # p.grad in place, as older torch.optim.SGD implementations did.
    assertAllClose(to_numpy(var0.grad), to_numpy(var1.grad))
    assertAllClose(to_numpy(var0), to_numpy(var1))
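
# The equivalence being tested: SGD's coupled weight decay adds
# weight_decay * w to the gradient, which is precisely the gradient of the
# explicit penalty 0.5 * weight_decay * ||w||^2, so both runs must produce
# identical gradients and identical updated parameters.
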
def testCompress(self):
    ema = optim.ExponentialMovingAverage(0.25)
    ema.register('var_prune', torch.arange(5).float())
    ema.register('var_keep', torch.arange(5, 10).float())
    ema('var_prune', torch.arange(5).float())
    info = {
        'var_old_name': 'var_prune',
        'var_new_name': 'var_new',
        'var_new': torch.randn(3),
        'mask': torch.tensor([False, True, False, True, True]),
        'mask_hook': lambda lhs, rhs, mask: lhs.data.copy_(rhs.data[mask])
    }
    ema.compress_mask(info, verbose=False)
    self.assertTrue(info['var_new_name'] in ema._shadow)
    self.assertTrue(info['var_new_name'] in ema._info)
    self.assertTrue(info['var_old_name'] not in ema._shadow)
    self.assertTrue(info['var_old_name'] not in ema._info)
    self.assertEqual(ema._info[info['var_new_name']]['num_updates'], 1)
    assertAllClose(ema.average(info['var_new_name']), [1, 3, 4])
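
# compress_mask is expected to rename the shadow entry, apply mask_hook to
# slice the old moving average into the new 3-element variable, and carry the
# update count over: the shadow [0, 1, 2, 3, 4] masked by [F, T, F, T, T]
# leaves [1, 3, 4], matching the final assertion.
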
def testSoftmaxLabelSmoothing(self):
    # Softmax cross-entropy loss is:
    #   -\sum_i p_i \log q_i
    # where for a softmax activation
    #   \log q_i = x_i - \log \sum_j \exp x_j
    #            = x_i - x_max - \log \sum_j \exp (x_j - x_max)
    # For our activations [100, -100, -100] the log partition function
    # becomes \log(exp(0) + exp(-200) + exp(-200)) = 0,
    # so the log softmaxes become [0, -200, -200]
    # and the cross-entropy loss is:
    #   -((1 - L + L/n) * 0 - 200 * L/n - 200 * L/n) = 400 L/n
    logits = torch.tensor([[100.0, -100.0, -100.0]])
    labels = torch.tensor([0], dtype=torch.int64)
    label_smoothing = 0.1
    criterion = optim.CrossEntropyLabelSmooth(logits.size(1),
                                              label_smoothing)
    expected_value = 400.0 * label_smoothing / 3.0
    res = criterion(logits, labels).item()
    assertAllClose(res, expected_value)
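
# A minimal sketch of a smoothing criterion consistent with the derivation
# above (true class gets weight 1 - L + L/n, every class gets L/n); the
# actual optim.CrossEntropyLabelSmooth may differ in implementation details:
class _SketchCrossEntropyLabelSmooth(nn.Module):
    def __init__(self, num_classes, epsilon):
        super().__init__()
        self.num_classes = num_classes
        self.epsilon = epsilon

    def forward(self, logits, targets):
        log_probs = torch.nn.functional.log_softmax(logits, dim=1)
        one_hot = torch.zeros_like(log_probs).scatter_(
            1, targets.unsqueeze(1), 1)
        smoothed = ((1 - self.epsilon) * one_hot
                    + self.epsilon / self.num_classes)
        return (-smoothed * log_probs).sum(dim=1).mean()
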
def testCalMaskNetworkSlimmingByFlops(self):
    names = ['one', 'two']
    x = [
        torch.tensor(val, dtype=torch.float32)
        for val in [[1, 2, 5], [3, 6, 0, 1.1]]
    ]
    per_channel_flops = [3, 5]
    prune_info = prune.PruneInfo(names, [0, 1])
    prune_info.add_info_list('per_channel_flops', per_channel_flops)
    # Total prunable FLOPs: 3 * 3 + 5 * 4 = 29.
    flops_total = sum(
        flops * len(val) for flops, val in zip(per_channel_flops, x))

    flops_to_prune = 12
    mask, threshold = prune.cal_mask_network_slimming_by_flops(
        x, prune_info, flops_to_prune)
    prune_info.add_info_list('mask', mask)
    assertAllClose(threshold, 1.1)
    expected = [
        torch.tensor([False, True, True]),
        torch.tensor([True, True, False, False])
    ]
    assertAllClose(mask, expected)
    pruned_flops, info = prune.cal_pruned_flops(prune_info)
    self.assertTrue(pruned_flops >= flops_to_prune)

    flops_to_prune = 13
    mask, threshold = prune.cal_mask_network_slimming_by_flops(
        x, prune_info, flops_to_prune)
    prune_info.add_info_list('mask', mask)
    assertAllClose(threshold, 2)
    expected = [
        torch.tensor([False, False, True]),
        torch.tensor([True, True, False, False])
    ]
    assertAllClose(mask, expected)
    pruned_flops, info = prune.cal_pruned_flops(prune_info)
    self.assertTrue(pruned_flops >= flops_to_prune)
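
# A sketch of the threshold search the expectations above imply: prune
# channels in order of increasing scale magnitude until the cumulative
# pruned FLOPs strictly exceed the budget (with flops_to_prune=13 the
# cumulative count reaches exactly 13 at value 1.1 yet the search continues
# to 2). The real cal_mask_network_slimming_by_flops may be organized
# differently:
def _sketch_mask_by_flops(weights, per_channel_flops, flops_to_prune):
    pairs = sorted((value.item(), flops)
                   for values, flops in zip(weights, per_channel_flops)
                   for value in values)
    pruned = 0
    threshold = pairs[0][0]
    for value, flops in pairs:
        pruned += flops
        threshold = value
        if pruned > flops_to_prune:
            break
    return [values > threshold for values in weights], threshold
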
def _CheckDecay(self, ema, actual_decay, dim, num_updates=None,
                vars_pre_hooks=None, num_updates_post_hook=None):

    def _Update():
        nonlocal num_updates
        if vars_pre_hooks is not None:
            assert len(vals) == len(vars_pre_hooks)
            for val, var_prehook in zip(vals, vars_pre_hooks):
                var_prehook(val)
        for name, val in zip(names, vals):
            ema(name, val, num_updates)
        if num_updates_post_hook:
            # Subsequent updates use the post-hook value for num_updates.
            num_updates = num_updates_post_hook

    def _Scale(dk, steps):
        if ema._zero_debias:
            return 1 - dk**steps
        return 1

    tens = _Repeat(10.0, dim)
    thirties = _Repeat(30.0, dim)
    var0 = torch.tensor(tens)
    var1 = torch.tensor(thirties)
    # Note that tensor2 is not a Variable but just a plain Tensor resulting
    # from the sum operation.
    tensor2 = var0 + var1
    names = ['tens', 'thirties', 'tensor2']
    vals = [var0, var1, tensor2]
    zero_inits = [False, False, True]
    for name, var, zero_init in zip(names, vals, zero_inits):
        ema.register(name, var, zero_init)

    # Check that averages are initialized correctly.
    assertAllClose(tens, ema.average('tens'))
    assertAllClose(thirties, ema.average('thirties'))
    # Note that averages of Tensors initialize to zeros_like since no value
    # of the Tensor is known because the Op has not been run (yet).
    assertAllClose(_Repeat(0.0, dim), ema.average('tensor2'))

    # Update the averages and check.
    _Update()
    dk = actual_decay
    expected = _Repeat(10.0 * dk + 10.0 * (1 - dk), dim)
    assertAllClose(expected, ema.average('tens'))
    expected = _Repeat(30.0 * dk + 30.0 * (1 - dk), dim)
    assertAllClose(expected, ema.average('thirties'))
    expected = _Repeat(0.0 * dk + (10.0 + 30.0) * (1 - dk) / _Scale(dk, 1),
                       dim)
    assertAllClose(expected, ema.average('tensor2'))

    # Again, update the averages and check.
    _Update()
    expected = _Repeat(
        (10.0 * dk + 10.0 * (1 - dk)) * dk + 10.0 * (1 - dk), dim)
    assertAllClose(expected, ema.average('tens'))
    expected = _Repeat(
        (30.0 * dk + 30.0 * (1 - dk)) * dk + 30.0 * (1 - dk), dim)
    assertAllClose(expected, ema.average('thirties'))
    expected = _Repeat(((0.0 * dk + (10.0 + 30.0) * (1 - dk)) * dk +
                        (10.0 + 30.0) * (1 - dk)) / _Scale(dk, 2), dim)
    assertAllClose(expected, ema.average('tensor2'))
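
# Zero-debiasing context for _Scale above: a shadow registered with
# zero_init=True starts at 0, so after t updates the raw EMA of a constant
# signal v is only v * (1 - dk**t); dividing by 1 - dk**t (as average() is
# assumed to do for zero-initialised entries) recovers an unbiased estimate.
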
def testCompressUpdate(self):
    params, info = self._construct_info()
    params0 = copy.deepcopy(params)
    apply_gradients([p.grad for p in params], params0)
    optimizer = RMSprop(params0, lr=0.1, momentum=0.5)
    optimizer.step()

    params1 = copy.deepcopy(params)
    apply_gradients([p.grad for p in params], params1)
    optimizer1 = RMSprop(params1, lr=0.1, momentum=0.5)
    optimizer1.step()

    # Before compression both replicas must be in lockstep.
    assertAllClose(params0[1], params1[1])
    assertAllClose(params0[2], params1[2])
    assertAllClose(params0[0], params1[0])

    info['var_old'] = params1[0]
    optimizer1.compress_mask(info, verbose=True)
    optimizer1.compress_drop({'var_old': params1[2], 'type': 'variable'})
    info['mask_hook'](info['var_new'], info['var_old'], info['mask'])
    params1[0] = info['var_new']
    params1[0].grad = params0[0].grad.data[info['mask']]

    optimizer1.step()
    # params1[2] was dropped from the optimizer, so it is not updated.
    assertAllClose(params0[2], params1[2])
    optimizer.step()
    assertAllClose(params0[1], params1[1])
    assertAllClose(params0[0][info['mask']], params1[0])
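
# For the final assertions to hold, compress_mask must slice RMSprop's
# per-parameter state (square_avg and the momentum buffer) with the same
# mask applied to the variable, and compress_drop must remove the dropped
# variable from the optimizer's param groups entirely; otherwise the
# compressed replica would diverge from the masked rows of the full run.
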