Example #1
def analyzing_mask(mask_gen, layer_size, mode, iteration, iter_interval, text_dir):
    sparse_r = {}
    # NOTE: originally also gated on mode == 'test'
    if iteration % iter_interval == 0:
      mask = mask_gen.sample_mask()
      mask = ParamsFlattener(mask)
      for k, v in layer_size.items():
        r = (mask > 0.5).sum().unflat[k].tolist() / v
        sparse_r[f"sparse_{k.split('_')[1]}"] = r
      alpha = mask_gen.a
      beta = mask_gen.b
      expectation = alpha / (alpha + beta)
      bern_n = 1  # a single Bernoulli trial per mask entry
      # beta-binomial variance: n * a * b * (a + b + n) / ((a + b)^2 * (a + b + 1))
      variance = bern_n * (alpha * beta) * (alpha + beta + bern_n) / ((alpha + beta) ** 2 * (alpha + beta + 1))
      sorted_layer_0 = np.sort(tensor2numpy(expectation[0:layer_size['layer_0']]))[::-1]
      mask_thr_layer_0 = sorted_layer_0[int(sparse_r['sparse_0'] * layer_size['layer_0'])]
      sorted_layer_1 = np.sort(tensor2numpy(expectation[layer_size['layer_0']: layer_size['layer_0']+layer_size['layer_1']]))[::-1]
      mask_thr_layer_1 = sorted_layer_1[int(sparse_r['sparse_1'] * layer_size['layer_1'])]
      drop = tensor2numpy(expectation) < tensor2numpy(expectation.median())
      sparse_r_drop = np.concatenate(((tensor2numpy(expectation) < mask_thr_layer_0)[0:layer_size['layer_0']], (tensor2numpy(expectation) < mask_thr_layer_1)[layer_size['layer_0']: layer_size['layer_0']+layer_size['layer_1']]), axis=0)
      for index, drop in enumerate([drop, sparse_r_drop]):
        drop = np.array(drop, dtype=int)
        retain = 1 - drop
        certain = tensor2numpy(variance < variance.median())
        uncertain = 1 - certain
        certain_drop = certain * drop
        certain_retain = certain * retain
        uncertain_drop = uncertain * drop
        uncertain_retain = uncertain * retain
        if index == 0:
          print("\niteration {} (median) certain drop : {} certain retain : {} uncertain drop : {} uncertain_retain : {}\n".format(iteration, certain_drop.sum(), certain_retain.sum(), uncertain_drop.sum(), uncertain_retain.sum()))
        else:
          print("\niteration {} (sparse) certain drop : {} certain retain : {} uncertain drop : {} uncertain_retain : {}\n".format(iteration, certain_drop.sum(), certain_retain.sum(), uncertain_drop.sum(), uncertain_retain.sum()))
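
The analyzing_mask example above summarizes a Beta posterior over Bernoulli mask entries through its mean and variance. As a sanity check, here is a minimal, self-contained NumPy sketch (hypothetical alpha/beta values) comparing the closed-form mean and beta-binomial variance used above against Monte Carlo draws of p ~ Beta(alpha, beta), m ~ Binomial(n, p):

import numpy as np

rng = np.random.default_rng(0)
alpha, beta, n = 2.0, 5.0, 1  # hypothetical Beta parameters, single trial (bern_n = 1)
p = rng.beta(alpha, beta, size=1_000_000)
m = rng.binomial(n, p)        # one draw per sampled success probability

mean_cf = alpha / (alpha + beta)
var_cf = n * alpha * beta * (alpha + beta + n) / ((alpha + beta) ** 2 * (alpha + beta + 1))
print(f"mean: mc={m.mean():.4f}  closed-form={mean_cf:.4f}")
print(f"var : mc={m.var():.4f}  closed-form={var_cf:.4f}")
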
Example #2
 def __init__(self, params=None):
     super().__init__()
     if params is not None:
         if not isinstance(params, ParamsFlattener):
             raise TypeError("params argument has to be "
                             "an instance of ParamsFlattener!")
         self.params = params
     else:
         theta = torch.zeros(10)
         self.params = ParamsFlattener({'theta': theta})
Example #3
 def layers2params(self, layers):
     j = 0
     params = {}
     for i, layer in enumerate(layers):
         if layer.__class__.__name__ in ['Linear', 'Conv2d']:
             """FIX LATER: there is no need to use different names for the
             same matrices. (weight, mat)"""
             params['mat_' + str(j)] = C(layer.weight.data.clone())
             if layer.bias is not None:
                 params['bias_' + str(j)] = C(layer.bias.data.clone())
             j += 1
     return ParamsFlattener(params)
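
layers2params above relies on the project-specific helpers C and ParamsFlattener. A plain-PyTorch sketch of the same idea, with those helpers removed, might look like this (illustrative only):

import torch
import torch.nn as nn

def layers_to_param_dict(layers):
    """Collect cloned weights/biases of Linear/Conv2d layers into a name-indexed dict."""
    params, j = {}, 0
    for layer in layers:
        if isinstance(layer, (nn.Linear, nn.Conv2d)):
            params[f'mat_{j}'] = layer.weight.detach().clone()
            if layer.bias is not None:
                params[f'bias_{j}'] = layer.bias.detach().clone()
            j += 1
    return params

layers = [nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2)]
print({k: tuple(v.shape) for k, v in layers_to_param_dict(layers).items()})
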
Example #4
def eval_gauss_var(model_cls, data, params, n_sample=200, std=1e-4):
    assert isinstance(params, ParamsFlattener)
    params = params.detach()
    losses = []
    for _ in range(n_sample):
        p = {}
        for k, v in params.unflat.items():
            p[k] = v + C(torch.zeros(v.size()).normal_(0, std))
        params_perturbed = ParamsFlattener(p)
        model = C(model_cls(params=params_perturbed))
        losses.append(model(*data['in_train'].load()))
    return torch.stack(losses).var()
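
eval_gauss_var estimates how much the loss fluctuates under small Gaussian perturbations of the parameters. The same technique on a toy quadratic loss, without model_cls/ParamsFlattener/C (all project-specific), could be sketched as:

import torch

def eval_gauss_var_toy(loss_fn, params, n_sample=200, std=1e-4):
    """Monte Carlo estimate of Var[loss(theta + eps)], eps ~ N(0, std^2)."""
    losses = []
    for _ in range(n_sample):
        perturbed = {k: v + torch.randn_like(v) * std for k, v in params.items()}
        losses.append(loss_fn(perturbed))
    return torch.stack(losses).var()

params = {'w': torch.ones(5)}
loss_fn = lambda p: (p['w'] ** 2).sum()  # toy loss standing in for the model
print(eval_gauss_var_toy(loss_fn, params).item())
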
Example #5
 def topk(cls, grad, set_size, topk, mode='ratio'):
     # NOTE: act is used to match the size (fix later)
     # dirty work...
     assert mode in ['ratio', 'number']
     # set_size = act.tsize(1)
     if mode == 'ratio':
         topk_n = cls.compute_topk_n_with_ratio(topk, set_size)
     else:
         topk_n = cls.compute_topk_n_with_number(topk, set_size)
     # abs_sum = grad.abs().sum(0, keepdim=True)
     layer_0 = torch.cat(
         [grad.unflat['mat_0'], grad.unflat['bias_0'].unsqueeze(0)],
         dim=0).abs().sum(0)
     layer_1 = torch.cat(
         [grad.unflat['mat_1'], grad.unflat['bias_1'].unsqueeze(0)],
         dim=0).abs().sum(0)
     abs_sum = ParamsFlattener({'layer_0': layer_0, 'layer_1': layer_1})
     ids = abs_sum.topk_id(topk_n, sorted=False)
     ids = {k: v.view(1, -1).long() for k, v in ids.unflat.items()}
     mask = ParamsFlattener(
         {k: C(torch.zeros(1, v))
          for k, v in set_size.items()})
     mask = mask.scatter_float_(1, ids, 1)
     return mask
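
topk above selects, per layer, the units with the largest aggregated gradient magnitude and scatters ones into a 0/1 mask. A stand-alone sketch of that selection with plain torch.topk/scatter_ (sizes and ratio are made up for illustration):

import torch

def topk_mask(scores, k):
    """scores: 1-D importance per unit; returns a float 0/1 mask keeping the k largest."""
    ids = scores.topk(k, sorted=False).indices
    return torch.zeros_like(scores).scatter_(0, ids, 1.0)

layer_scores = {'layer_0': torch.rand(20), 'layer_1': torch.rand(10)}
ratio = 0.3
mask = {k: topk_mask(v, max(1, int(round(v.numel() * ratio)))) for k, v in layer_scores.items()}
print({k: int(v.sum()) for k, v in mask.items()})
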
Example #6
 def randk(cls, set_size, topk, mode='ratio', *args, **kwargs):
     # NOTE: act is used to match the size (fix later)
     assert mode in ['ratio', 'number']
     # set_size = act.tsize(1)
     if mode == 'ratio':
         topk_n = cls.compute_topk_n_with_ratio(topk, set_size)
     else:
         topk_n = cls.compute_topk_n_with_number(topk, set_size)
      ids = {
          k: torch.tensor(random.sample(range(v), topk_n[k])).cuda().view(1, -1).long()
          for k, v in set_size.items()
      }
     mask = ParamsFlattener(
         {k: C(torch.zeros(1, v))
          for k, v in set_size.items()})
     mask = mask.scatter_float_(1, ids, 1)
     return mask
Example #7
def inversed_masked_params(params, mask, step, r, thres=0.5):
    inv_mask = {k: (m < thres) for k, m in mask.unflat.items()}
    num_ones = {k: v.sum() for k, v in inv_mask.items()}
    for k, v in inv_mask.items():
        diff = num_ones[k] - mask.t_size(0)[k] * r[f"sparse_{k.split('_')[1]}"]
        if diff >= 0:
            m = inv_mask[k]
            nonzero_ids = m.nonzero().squeeze().tolist()
            drop_ids = random.sample(nonzero_ids, diff.tolist())
            m[drop_ids] = torch.tensor(0)
        else:
            m = inv_mask[k]
            zero_ids = (m == 0).nonzero().squeeze().tolist()
            drop_ids = random.sample(zero_ids, -diff.tolist())
            m[drop_ids] = torch.tensor(1)
        inv_mask[k] = inv_mask[k].float()
    inv_mask = ParamsFlattener(inv_mask)
    mask_layout = inv_mask.expand_as(params)
    step_sparse = step * mask_layout
    params_sparse = params + step_sparse
    params_pruned = params_sparse.prune(inv_mask)
    return params_pruned, params_sparse
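
inversed_masked_params flips a soft mask and then randomly adds or removes entries so the inverted mask hits the same per-layer keep ratio. The ratio-matching step on its own, with illustrative values and no ParamsFlattener, could look like:

import random
import torch

def inverse_mask_with_ratio(mask, r, thres=0.5):
    """Invert a 0/1 mask, then randomly flip entries until round(r * n) ones remain."""
    inv = (mask < thres).float()
    target = int(round(r * mask.numel()))
    ones = inv.nonzero(as_tuple=True)[0].tolist()
    zeros = (inv == 0).nonzero(as_tuple=True)[0].tolist()
    diff = len(ones) - target
    if diff > 0:
        inv[random.sample(ones, diff)] = 0.0    # too many ones: drop some at random
    elif diff < 0:
        inv[random.sample(zeros, -diff)] = 1.0  # too few ones: add some at random
    return inv

mask = (torch.rand(12) > 0.4).float()
inv = inverse_mask_with_ratio(mask, r=0.5)
print(int(inv.sum()), inv.tolist())
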
Example #8
    def meta_optimize(self,
                      cfg,
                      meta_optim,
                      data,
                      model_cls,
                      writer=None,
                      mode='train'):
        assert mode in ['train', 'valid', 'test']
        self.set_mode(mode)

        # mask_mode = ['no_mask', 'structured', 'unstructured'][2]
        # data_mode = ['in_train', 'in_test'][0]

        ############################################################################
        analyze_model = False
        analyze_surface = False
        ############################################################################

        result_dict = ResultDict()
        unroll_losses = 0
        walltime = Walltime()
        test_kld = torch.tensor(0.)

        params = C(model_cls()).params
        self.feature_gen.new()
        self.step_gen.new()
        sparse_r = {}  # sparsity
        iter_pbar = tqdm(range(1, cfg['iter_' + mode] + 1), 'Inner_loop')

        for iter in iter_pbar:
            debug_1 = sigint.is_active(iter == 1 or iter % 10 == 0)
            debug_2 = sigstp.is_active()

            with WalltimeChecker(walltime):
                model_train = C(model_cls(params.detach()))
                data_train = data['in_train'].load()
                train_nll, train_acc = model_train(*data_train)
                train_nll.backward()

                grad = model_train.params.grad.detach()
                # g = model_train.params.grad.flat.detach()
                # w = model_train.params.flat.detach()

                # step & mask generation
                feature, v_sqrt = self.feature_gen(grad.flat.detach())
                # step = self.step_gen(feature, v_sqrt, debug=debug_1)
                # step = params.new_from_flat(step[0])
                size = params.size().unflat()

                if cfg.mask_mode == 'structured':
                    mask = self.mask_gen(feature, size, debug=debug_1)
                    mask = ParamsFlattener(mask)
                    mask_layout = mask.expand_as(params)
                    params = params + grad.detach() * mask_layout * (
                        -cfg.inner_lr)
                elif cfg.mask_mode == 'unstructured':
                    mask_flat = self.mask_gen.unstructured(feature, size)
                    mask = params.new_from_flat(mask_flat)
                    params = params + grad.detach() * mask * (-cfg.inner_lr)
                    # update = params.new_from_flat(params.flat + grad.flat.detach() * mask * (-cfg.lr))
                    # params = params + update
                elif cfg.mask_mode == 'no_mask':
                    params = params + grad.detach() * (-cfg.inner_lr)
                else:
                    raise Exception('Unknown setting!')

                # step_masked = step * mask_layout
                # params = params + step_masked

            with WalltimeChecker(walltime if mode == 'train' else None):
                model_test = C(model_cls(params))
                if cfg.data_mode == 'in_train':
                    data_test = data_train
                elif cfg.data_mode == 'in_test':
                    data_test = data['in_test'].load()
                test_nll, test_acc = utils.isnan(*model_test(*data_test))

                if debug_2: pdb.set_trace()

                if mode == 'train':
                    unroll_losses += test_nll  # + test_kld
                    if iter % cfg.unroll == 0:
                        meta_optim.zero_grad()
                        unroll_losses.backward()
                        nn.utils.clip_grad_value_(self.parameters(), 0.01)
                        meta_optim.step()
                        unroll_losses = 0

            with WalltimeChecker(walltime):
                if not mode == 'train' or iter % cfg.unroll == 0:
                    params = params.detach_()

            ##########################################################################
            if analyze_model:
                analyzers.model_analyzer(self,
                                         mode,
                                         model_train,
                                         params,
                                         model_cls,
                                         mask.tsize(0),
                                         data,
                                         iter,
                                         cfg['iter_' + mode],
                                         analyze_mask=True,
                                         sample_mask=True,
                                         draw_loss=False)
            if analyze_surface:
                analyzers.surface_analyzer(params, best_mask, step, writer,
                                           iter)
            ##########################################################################

            result = dict(
                train_nll=train_nll.tolist(),
                test_nll=test_nll.tolist(),
                train_acc=train_acc.tolist(),
                test_acc=test_acc.tolist(),
                test_kld=test_kld.tolist(),
                walltime=walltime.time,
            )
            if not cfg.mask_mode == 'no_mask':
                result.update(
                    **mask.sparsity(overall=True),
                    **mask.sparsity(overall=False),
                )
            result_dict.append(result)
            log_pbar(result, iter_pbar)

        return result_dict, params
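
The inner update above multiplies the gradient by a mask before the SGD step; in the 'structured' branch a per-unit gate is broadcast over each layer's parameters. A toy illustration of that masked step (all shapes and the learning rate are illustrative):

import torch

inner_lr = 0.1
weight = torch.randn(4, 6, requires_grad=True)
loss = (weight ** 2).sum()
loss.backward()

unit_mask = (torch.rand(6) > 0.5).float()  # one gate per unit (column), i.e. a structured mask
# masked SGD step: gated units keep their gradient, the rest are frozen this step
weight_new = weight.detach() - inner_lr * weight.grad * unit_mask
print(unit_mask.tolist())
print((weight_new - weight.detach()).abs().sum(dim=0).tolist())
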
Example #9
    def meta_optimize(self,
                      meta_optim,
                      data,
                      model_cls,
                      optim_it,
                      unroll,
                      out_mul,
                      k_obsrv=0,
                      no_mask=False,
                      writer=None,
                      mode='train'):
        assert mode in ['train', 'valid', 'test']
        self.set_mode(mode)

        ############################################################################
        analyze_model = False
        analyze_surface = False
        ############################################################################

        result_dict = ResultDict()
        unroll_losses = 0
        walltime = Walltime()
        test_kld = torch.tensor(0.)

        params = C(model_cls()).params
        self.feature_gen.new()
        self.step_gen.new()
        sparse_r = {}  # sparsity
        iter_pbar = tqdm(range(1, optim_it + 1), 'Inner_loop')

        for iter in iter_pbar:
            debug_1 = sigint.is_active(iter == 1 or iter % 10 == 0)
            debug_2 = sigstp.is_active()

            with WalltimeChecker(walltime):
                model_train = C(model_cls(params.detach()))
                data_ = data['in_train'].load()
                train_nll, train_acc = model_train(*data_)
                train_nll.backward()

                g = model_train.params.grad.flat.detach()
                w = model_train.params.flat.detach()

                # step & mask generation
                feature, v_sqrt = self.feature_gen(g)
                step = self.step_gen(feature, v_sqrt, debug=debug_1)
                step = params.new_from_flat(step[0])
                size = params.size().unflat()

                if no_mask:
                    params = params + step
                else:
                    kld = self.mask_gen(feature, size, debug=debug_1)
                    test_kld = kld / data['in_test'].full_size / unroll
                    ## kl annealing function 'linear' / 'logistic' / None
                    test_kld2 = test_kld * kl_anneal_function(
                        anneal_function=None, step=iter, k=0.0025, x0=optim_it)
                    mask = self.mask_gen.sample_mask()
                    mask = ParamsFlattener(mask)
                    mask_layout = mask.expand_as(params)
                    step_masked = step * mask_layout
                    params = params + step_masked

            with WalltimeChecker(walltime if mode == 'train' else None):
                model_test = C(model_cls(params))
                test_nll, test_acc = utils.isnan(*model_test(
                    *data['in_test'].load()))

                if debug_2: pdb.set_trace()

                if mode == 'train':
                    unroll_losses += test_nll  # + test_kld
                    if iter % unroll == 0:
                        meta_optim.zero_grad()
                        unroll_losses.backward()
                        nn.utils.clip_grad_value_(self.parameters(), 0.01)
                        meta_optim.step()
                        unroll_losses = 0

            with WalltimeChecker(walltime):
                if not mode == 'train' or iter % unroll == 0:
                    params = params.detach_()

            ##########################################################################
            if analyze_model:
                analyzers.model_analyzer(self,
                                         mode,
                                         model_train,
                                         params,
                                         model_cls,
                                         mask.tsize(0),
                                         data,
                                         iter,
                                         optim_it,
                                         analyze_mask=True,
                                         sample_mask=True,
                                         draw_loss=False)
            if analyze_surface:
                analyzers.surface_analyzer(params, best_mask, step, writer,
                                           iter)
            ##########################################################################

            result = dict(
                train_nll=train_nll.tolist(),
                test_nll=test_nll.tolist(),
                train_acc=train_acc.tolist(),
                test_acc=test_acc.tolist(),
                test_kld=test_kld.tolist(),
                walltime=walltime.time,
            )
            if no_mask is False:
                result.update(
                    **mask.sparsity(overall=True),
                    **mask.sparsity(overall=False),
                )
            result_dict.append(result)
            log_pbar(result, iter_pbar)

        return result_dict, params
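
kl_anneal_function is called above but not defined in this excerpt. A common form of such a schedule (linear or logistic warm-up of the KL weight; the exact signature and behaviour in this codebase are an assumption) is:

import math

def kl_anneal_function(anneal_function, step, k=0.0025, x0=2500):
    """Weight multiplied onto the KL term; ramps from ~0 to 1 as step grows."""
    if anneal_function == 'logistic':
        return 1.0 / (1.0 + math.exp(-k * (step - x0)))
    if anneal_function == 'linear':
        return min(1.0, step / x0)
    return 1.0  # None: leave the KL term unscaled

print(kl_anneal_function('linear', step=500, x0=1000))      # 0.5
print(round(kl_anneal_function('logistic', step=2500), 3))  # 0.5
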
Example #10
    def meta_optimize(self,
                      meta_optim,
                      data,
                      model_cls,
                      optim_it,
                      unroll,
                      out_mul,
                      k_obsrv=1,
                      no_mask=False,
                      writer=None,
                      mode='train'):
        assert mode in ['train', 'valid', 'test']
        if no_mask is True:
            raise Exception(
                "this module currently does NOT support the no_mask option")
        self.set_mode(mode)

        ############################################################################
        n_samples = k_obsrv
        """MSG: better postfix?"""
        analyze_model = False
        analyze_surface = False
        ############################################################################

        if analyze_surface:
            writer.new_subdirs('best', 'any', 'rand', 'inv', 'dense')

        result_dict = ResultDict()
        unroll_losses = 0
        walltime = Walltime()
        test_kld = torch.tensor(0.)

        params = C(model_cls()).params
        self.feature_gen.new()
        self.step_gen.new()
        iter_pbar = tqdm(range(1, optim_it + 1), 'Inner_loop')
        set_size = {'layer_0': 500, 'layer_1': 10}  # NOTE: make it smarter

        for iter in iter_pbar:
            debug_1 = sigint.is_active(iter == 1 or iter % 10 == 0)
            debug_2 = sigstp.is_active()

            best_loss = 9999999
            best_params = None

            with WalltimeChecker(walltime):
                model_train = C(model_cls(params.detach()))
                train_nll, train_acc = model_train(*data['in_train'].load())
                train_nll.backward()

                g = model_train.params.grad.flat.detach()
                w = model_train.params.flat.detach()

                feature, v_sqrt = self.feature_gen(g)

                size = params.size().unflat()
                kld = self.mask_gen(feature, size)

                losses = []
                lips = []
                valid_mask_patience = 100
                assert n_samples > 0
                """FIX LATER: when n_samples == 0 this could behave as if the
                no_mask flag were on."""
                for i in range(n_samples):
                    # step & mask generation
                    for j in range(valid_mask_patience):
                        mask = self.mask_gen.sample_mask()
                        mask = ParamsFlattener(mask)
                        if mask.is_valid_sparsity():
                            if j > 0:
                                print(
                                    f'\n\n[!]Resampled {j + 1} times to get valid mask!'
                                )
                            break
                        if j == valid_mask_patience - 1:
                            raise Exception(
                                "[!]Could not sample valid mask for "
                                f"{j+1} trials.")

                    step_out = self.step_gen(feature, v_sqrt, debug=debug_1)
                    step = params.new_from_flat(step_out[0])

                    mask_layout = mask.expand_as(params)
                    step_sparse = step * mask_layout
                    params_sparse = params + step_sparse
                    params_pruned = params_sparse.prune(mask > 0.5)

                    if params_pruned.size().unflat()['mat_0'][1] == 0:
                        continue

                    # cand_loss = model(*outer_data_s)
                    sparse_model = C(model_cls(params_pruned.detach()))
                    loss, _ = sparse_model(*data['in_train'].load())

                    if (loss < best_loss) or i == 0:
                        best_loss = loss
                        best_params = params_sparse
                        best_pruned = params_pruned
                        best_mask = mask

                if best_params is not None:
                    params = best_params

            with WalltimeChecker(walltime if mode == 'train' else None):
                model_test = C(model_cls(params))
                test_nll, test_acc = utils.isnan(*model_test(
                    *data['in_test'].load()))
                test_kld = kld / data['in_test'].full_size / unroll
                ## kl annealing function 'linear' / 'logistic' / None
                test_kld2 = test_kld * kl_anneal_function(
                    anneal_function=None, step=iter, k=0.0025, x0=optim_it)
                total_test = test_nll + test_kld2

                if mode == 'train':
                    unroll_losses += total_test
                    if iter % unroll == 0:
                        meta_optim.zero_grad()
                        unroll_losses.backward()
                        nn.utils.clip_grad_value_(self.parameters(), 0.01)
                        meta_optim.step()
                        unroll_losses = 0

            with WalltimeChecker(walltime):
                if not mode == 'train' or iter % unroll == 0:
                    params = params.detach_()

            ##########################################################################
            """Analyzers"""
            if analyze_model:
                analyzers.model_analyzer(self,
                                         mode,
                                         model_train,
                                         params,
                                         model_cls,
                                         set_size,
                                         data,
                                         iter,
                                         optim_it,
                                         analyze_mask=True,
                                         sample_mask=True,
                                         draw_loss=False)
            if analyze_surface:
                analyzers.surface_analyzer(params, best_mask, step, writer,
                                           iter)
            ##########################################################################

            result = dict(
                train_nll=train_nll.tolist(),
                test_nll=test_nll.tolist(),
                train_acc=train_acc.tolist(),
                test_acc=test_acc.tolist(),
                test_kld=test_kld.tolist(),
                walltime=walltime.time,
                **best_mask.sparsity(overall=True),
                **best_mask.sparsity(overall=False),
            )
            result_dict.append(result)
            log_pbar(result, iter_pbar)

        return result_dict, params
Example #11
def sampling_mask(mask_gen, layer_size, model_train, params, sample_num=10000, mode='test', iteration=0, iter_interval=10, result_dir='result/mask_compare'):
    topk = True
    sparse_r = {}
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)
    mask_result = None
    # NOTE: originally also gated on mode == 'test'
    if iteration % iter_interval == 0:
        for i in range(sample_num):
            mask = mask_gen.sample_mask()
            mask = ParamsFlattener(mask)
            mask_flat = (mask.flat > 0.5).float()
            if i == 0:
                mask_cat = mask_flat.unsqueeze(dim=0)
                mask_sum = mask
            else:
                mask_cat = torch.cat((mask_cat, mask_flat.unsqueeze(dim=0)), dim=0)
                mask_sum += mask
        mask_mean, mask_var = mask_cat.mean(dim=0).squeeze(), mask_cat.var(dim=0).squeeze()
        
        mask_sum /= sample_num
        mask = mask>0.5

        grad = model_train.params.grad.detach()
        act = model_train.activations.detach()
        for k, v in layer_size.items():
            r = (mask > 0.5).sum().unflat[k].tolist() / v
            sparse_r[f"sparse_{k.split('_')[1]}"] = r

        topk_mask_gen = MaskGenerator2.topk if topk else MaskGenerator2.randk
        layer_0_topk = topk_mask_gen(grad=grad, set_size=layer_size, topk=sparse_r['sparse_0'])._unflat['layer_0'].view(-1)
        layer_0_topk = layer_0_topk>0.5
        layer_1_topk = topk_mask_gen(grad=grad, set_size=layer_size, topk=sparse_r['sparse_1'])._unflat['layer_1'].view(-1)
        layer_1_topk = layer_1_topk>0.5

        layer_0_prefer_topk = topk_mask_gen(grad=mask_sum.expand_as(params), set_size=layer_size, topk=sparse_r['sparse_0'])._unflat['layer_0'].view(-1)
        layer_1_prefer_topk = topk_mask_gen(grad=mask_sum.expand_as(params), set_size=layer_size, topk=sparse_r['sparse_1'])._unflat['layer_1'].view(-1)
        layer_0_prefer_topk = layer_0_prefer_topk>0.5
        layer_1_prefer_topk = layer_1_prefer_topk>0.5
        layer_0 = torch.cat([grad.unflat['mat_0'],
                             grad.unflat['bias_0'].unsqueeze(0)], dim=0).abs().sum(0)
        layer_1 = torch.cat([grad.unflat['mat_1'],
                             grad.unflat['bias_1'].unsqueeze(0)], dim=0).abs().sum(0)
        layer_0_abs = ParamsFlattener({'layer_0': layer_0})
        layer_1_abs = ParamsFlattener({'layer_1': layer_1})

        hist_mean, bins_mean  = np.histogram(tensor2numpy(mask_mean), bins=20)
        hist_var, bins_var = np.histogram(tensor2numpy(mask_var), bins=20)

        sorted_layer_0 = np.sort(tensor2numpy(mask_mean[0:layer_size['layer_0']]))[::-1]
        mask_thr_layer_0 = sorted_layer_0[int(sparse_r['sparse_0'] * layer_size['layer_0'])]
        sorted_layer_1 = np.sort(tensor2numpy(mask_mean[layer_size['layer_0']: layer_size['layer_0']+layer_size['layer_1']]))[::-1]
        mask_thr_layer_1 = sorted_layer_1[int(sparse_r['sparse_1'] * layer_size['layer_1'])]
        drop = tensor2numpy(mask_mean) < tensor2numpy(mask_mean.median())
        sparse_r_drop = np.concatenate(((tensor2numpy(mask_mean) < mask_thr_layer_0)[0:layer_size['layer_0']], (tensor2numpy(mask_mean) < mask_thr_layer_1)[layer_size['layer_0']: layer_size['layer_0']+layer_size['layer_1']]), axis=0)
        for index, drop in enumerate([drop, sparse_r_drop]):
            drop = np.array(drop, dtype=int)
            retain = 1 - drop
            certain = tensor2numpy(mask_var < mask_var.median())
            uncertain = 1 - certain
            certain_drop = certain * drop
            certain_retain = certain * retain
            uncertain_drop = uncertain * drop
            uncertain_retain = uncertain * retain
            if index == 0:
                print("\niteration {} (median) certain drop : {} certain retain : {} uncertain drop : {} uncertain_retain : {}\n".format(iteration, certain_drop.sum(), certain_retain.sum(), uncertain_drop.sum(), uncertain_retain.sum()))
            else:
                print("\niteration {} (sparse) certain drop : {} certain retain : {} uncertain drop : {} uncertain_retain : {}\n".format(iteration, certain_drop.sum(), certain_retain.sum(), uncertain_drop.sum(), uncertain_retain.sum()))

        sparse_r, overlap_mask_ratio_0, overlap_mask_ratio_1, overlap_prefer_ratio_0, overlap_prefer_ratio_1 = plot_masks(
            mask, layer_0_topk, layer_1_topk, mask_sum, layer_0_prefer_topk, layer_1_prefer_topk,
            layer_0, layer_1, result_dir, iteration, sparse_r, mask_mean, mask_var)
        mask_result =  dict(
                    sparse_0 = sparse_r['sparse_0'],
                    sparse_1 = sparse_r['sparse_1'],
                    overlap_mask_ratio_0 = overlap_mask_ratio_0.tolist(),
                    overlap_mask_ratio_1 = overlap_mask_ratio_1.tolist(),
                    overlap_prefer_ratio_0 = overlap_prefer_ratio_0.tolist(),
                    overlap_prefer_ratio_1 = overlap_prefer_ratio_1.tolist(),
                    certain_drop=certain_drop,
                    certain_retain=certain_retain,
                    uncertain_drop=uncertain_drop,
                    uncertain_retain=uncertain_retain
                    )

    return mask_result
Example #12
def plot_loss(model_cls, model, params, input_data, dataset, feature_gen, mask_gen, step_gen, scale_way, xmin=-2.0, xmax=0.5, num_x=20, mode='test', iteration=0, iter_interval=10, loss_dir='result/draw_loss'):
    # NOTE: originally also gated on mode == 'test'
    if ((iteration - 1) % iter_interval == 0) and (iteration > 1):
        X = np.linspace(xmin, xmax, num_x)
        Y = np.linspace(xmin, xmax, num_x)

        model_train = C(model_cls(params=params.detach()))
        #step_data = data['in_train'].load()
        step_data = input_data
        train_nll, train_acc = model_train(*step_data)
        train_nll.backward()

        g = model_train.params.grad.flat.detach()
        w = model_train.params.flat.detach()

        feature, v_sqrt = feature_gen(g)

        size = params.size().unflat()
        kld = mask_gen(feature, size)
        # step & mask generation
        mask = mask_gen.sample_mask()
        mask = ParamsFlattener(mask)
        mask_layout = mask.expand_as(params)
        step_X = step_gen(feature, v_sqrt)
        step_X = params.new_from_flat(step_X[0]) * mask_layout
        step_X = step_X.flat.view(-1)
        step_Y = model_train.params.grad.flat.view(-1) 
        step_X_ = params.new_from_flat(-1.0 * step_X)
        step_X2_ = params.new_from_flat(-10.0 * step_X)
        step_Y_ = params.new_from_flat(1.0 * step_Y)
        step_Y2_ = params.new_from_flat(0.1 * step_Y)
        #step_Y = step_Y * step_X.abs().sum() / step_Y.abs().sum()
        L2_X = (step_X * step_X).sum()
        L2_Y = (step_Y * step_Y).sum()

        layer_settings = [['mat_0', 'bias_0', 'mat_1', 'bias_1'], ['mat_0', 'bias_0'], ['mat_1', 'bias_1']]
        normalize_way = 'filter_norm'
        #result_dirs = ['loss_all_scale_{}'.format(scale_way), 'loss_layer0_scale_{}'.format(scale_way), 'loss_layer1_scale_{}'.format(scale_way)]
        result_dirs = ['loss_all_scale_{}'.format(scale_way)]
        for layer_set, result_dir in zip(layer_settings, result_dirs):
            grad_dir = os.path.join(loss_dir, normalize_way, result_dir, 'gradient')
            if not os.path.exists(grad_dir):
                os.makedirs(grad_dir)
            step_dir = os.path.join(loss_dir, normalize_way, result_dir, 'step')
            if not os.path.exists(step_dir):
                os.makedirs(step_dir)
            step_X_ = params.new_from_flat(-1.0 * step_X)
            step_Y_ = params.new_from_flat(1.0 * step_Y)
            #step_X2_ = params.new_from_flat(-10.0 * step_X)
            #step_Y2_ = params.new_from_flat(0.1 * step_Y)

            if normalize_way is not None:
                for step in (step_X_, step_Y_):
                    for matrix in ['mat_0', 'bias_0', 'mat_1', 'bias_1']:
                        di = step.unflat[matrix]
                        norm_di = torch.norm(di, 2, dim=0)
                        thetai = params.unflat[matrix]
                        if normalize_way == 'filter_norm':
                            # filter norm: rescale each direction column to the norm of
                            # the matching parameter column (was erroneously norm of di)
                            norm_thetai = torch.norm(thetai, 2, dim=0)
                            normalize_di = di * norm_thetai / (norm_di + 1e-5)
                        elif normalize_way == 'weight_norm':
                            normalize_di = di * thetai / (di + 1e-5)
                        step.unflat[matrix] = normalize_di

            abs_X = step_X_.abs().sum()
            abs_Y = step_Y_.abs().sum()
            L2_X = (step_X_ * step_X_).sum()
            L2_Y = (step_Y_ * step_Y_).sum()
            scale_X, scale_Y = 0, 0
                
            for layer in ['mat_1', 'bias_1']:
                scale_X += abs_X.unflat[layer].item()
                scale_Y += abs_Y.unflat[layer].item()
            
            scale_g_s = scale_X/scale_Y
            scale_s_g = scale_Y/scale_X

            for layer in layer_set:
                scale_X += abs_X.unflat[layer].item()
                scale_Y += abs_Y.unflat[layer].item()
            
            #step_Y_ = step_Y_ * (step_X_.abs().sum() /  step_Y_.abs().sum())
            if scale_way == 's':
                scale_s = scale_s_g
                scale_g = 1.0
            elif scale_way == 'g':
                scale_s = 1.0
                scale_g = scale_g_s
            else:
                scale_s = 1.0
                scale_g = 1.0       
            Z_X = get_1D_Loss(X, step_X_, scale_s, step_X.size(), layer_set, dataset, model_cls, params)
            Z_Y = get_1D_Loss(Y, step_Y_, scale_g, step_Y.size(), layer_set, dataset, model_cls, params)
            #Z_X2 = get_1D_Loss(X, step_X2_, scale, step_X.size(), layer_set, data['in_train'], model_cls, params)
            #Z_Y2 = get_1D_Loss(Y, step_Y2_, scale,step_Y.size(), layer_set,data['in_train'], model_cls, params)
            plot_2d(X, Z_X, os.path.join(step_dir, 'iter_{:04d}_STEPxMASK_1dLoss.png'.format(iteration)), 'L1 norm = {:.2f}'.format(scale_X*scale_s))
            plot_2d(Y, Z_Y, os.path.join(grad_dir, 'iter_{:04d}_1.0xGradient_1dLoss.png'.format(iteration)),'L1 norm = {:.2f}'.format(scale_Y*scale_g))
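
plot_loss normalizes the 1-D perturbation directions before plotting. With the 'filter_norm' option, each column of the direction is rescaled to the norm of the corresponding parameter column (filter-wise normalization as used in loss-landscape visualization). A stand-alone sketch of just that normalization, on hypothetical tensors:

import torch

def filter_normalize(direction, theta, eps=1e-5):
    """Rescale each column of `direction` to the L2 norm of the matching column of `theta`."""
    d_norm = direction.norm(p=2, dim=0)
    t_norm = theta.norm(p=2, dim=0)
    return direction * t_norm / (d_norm + eps)

theta = torch.randn(20, 10)
direction = torch.randn(20, 10)
d_hat = filter_normalize(direction, theta)
# column norms of the normalized direction now (approximately) match theta's
print(d_hat.norm(dim=0)[:3].tolist())
print(theta.norm(dim=0)[:3].tolist())
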
Example #13
    def meta_optimize(self, meta_optimizer, data, model_cls, optim_it, unroll,
                      out_mul, mode):
        assert mode in ['train', 'valid', 'test']
        if mode == 'train':
            self.train()
            # data.new_train_data()
            inner_data = data.loaders['inner_train']
            outer_data = data.loaders['inner_valid']
            # inner_data = data.loaders['train']
            # outer_data = inner_data
            drop_mode = 'soft_drop'
        elif mode == 'valid':
            self.eval()
            inner_data = data.loaders['valid']
            outer_data = None
            drop_mode = 'hard_drop'
        elif mode == 'test':
            self.eval()
            inner_data = data.loaders['test']
            outer_data = None
            drop_mode = 'hard_drop'

        # data = dataset(mode=mode)
        model = C(model_cls(sb_mode=self.sb_mode))
        model(*data.pseudo_sample())
        params = model.params

        result_dict = ResultDict()
        unroll_losses = 0
        walltime = 0

        self.feature_gen.new()
        ################
        mask_dict = ResultDict()
        analyze_mask = False
        sample_mask = False
        draw_loss = False
        ################
        iter_pbar = tqdm(range(1, optim_it + 1), 'optim_iteration')
        iter_watch = utils.StopWatch('optim_iteration')

        res1 = None
        res2 = []
        res3 = []
        lamb = []
        gamm_g = []
        gamm_l1 = []
        gamm_l2 = []
        iters = []

        for iteration in iter_pbar:
            iter_watch.touch()
            if debug_sigint.signal_on:
                debug_1 = iteration == 1 or iteration % 10 == 0
            else:
                debug_1 = False
            if debug_sigstp.signal_on:
                debug_2 = True
            else:
                debug_2 = False

            model_detached = C(model_cls(params.detach()))
            inner_data_s = inner_data.load()
            loss_detached = model_detached(*inner_data_s)
            loss_detached.backward()

            g = model_detached.params.grad.flat.detach()
            w = model_detached.params.flat.detach()

            cand_params = []
            cand_losses = []
            best_loss = 9999999
            best_params = None
            n_samples = 5

            for m in range(1):
                feature = self.feature_gen(g,
                                           w,
                                           n=n_samples,
                                           m_update=(m == 0))
                p_size = params.size().unflat()
                mask_gen_out = self.mask_gen(feature, p_size, n=n_samples)
                step_out = self.step_gen(feature, n=n_samples)

                # step & mask generation
                for i in range(n_samples):
                    mask, _, kld = mask_gen_out[i]
                    step = step_out[i]
                    mask = ParamsFlattener(mask)
                    mask_layout = mask.expand_as(params)
                    step = params.new_from_flat(step)
                    # pruning
                    step = step * mask_layout
                    params_ = params + step
                    # cand_params.append(params_.flat)
                    sparse_params = params_.prune(mask > 1e-6)
                    if sparse_params.size().unflat()['mat_0'][1] == 0:
                        continue
                    if debug_2:
                        import pdb
                        pdb.set_trace()
                    # cand_loss = model(*outer_data_s)
                    sparse_model = C(model_cls(sparse_params.detach()))
                    loss = sparse_model(*inner_data_s)
                    try:
                        if loss < best_loss:
                            best_loss = loss
                            best_params = params_
                            best_kld = kld
                    except:
                        best_params = params_
                        best_kld = kld

            if best_params is not None:
                params = best_params
            else:
                best_kld = 0  # no valid candidate this step; drop the KL term

            if mode == 'train':
                model = C(model_cls(params))
                optim_loss = model(*outer_data.load())
                if torch.isnan(optim_loss):
                    import pdb
                    pdb.set_trace()
                unroll_losses += optim_loss + best_kld / outer_data.full_size

            if mode == 'train' and iteration % unroll == 0:
                meta_optimizer.zero_grad()
                unroll_losses.backward()
                nn.utils.clip_grad_value_(self.parameters(), 0.1)
                meta_optimizer.step()
                unroll_losses = 0

            if not mode == 'train' or iteration % unroll == 0:
                # self.mask_gen.detach_lambdas_()
                params = params.detach_()

            if params is None:
                import pdb
                pdb.set_trace()

            iter_pbar.set_description(
                f'optim_iteration'
                f'[optim_loss:{loss_detached.tolist():5.5}]')
            # f' sparse_loss:{sparsity_loss.tolist():5.5}]')
            # iter_pbar.set_description(f'optim_iteration[loss:{loss_dense.tolist()}/dist:{dist.tolist()}]')
            # torch.optim.SGD(sparse_params, lr=0.1).step()
            walltime += iter_watch.touch('interval')
            ##############################################################
            text_dir = 'test/analyze_mask'
            result_dir = 'test/drawloss'
            sample_dir = 'test/mask_compare'
            iter_interval = 10
            sample_num = 10000
            if mode == 'test' and analyze_mask:
                analyzing_mask(self.mask_gen, layer_size, mode, iteration,
                               iter_interval, text_dir)
            if mode == 'test' and sample_mask:
                mask_result = sampling_mask(self.mask_gen, layer_size,
                                            model_detached, params, sample_num,
                                            mode, iteration, iter_interval,
                                            sample_dir)
                if mask_result is not None:
                    mask_dict.append(mask_result)
            if mode == 'test' and draw_loss:
                plot_loss(model_cls=model_cls,
                          model=model_detached,
                          params=params,
                          input_data=inner_data_s,
                          dataset=inner_data,
                          feature_gen=self.feature_gen,
                          mask_gen=self.mask_gen,
                          step_gen=self.step_gen,
                          scale_way=None,
                          xmin=-2.0,
                          xmax=0.5,
                          num_x=20,
                          mode=mode,
                          iteration=iteration,
                          iter_interval=iter_interval,
                          loss_dir=result_dir)
            ##############################################################
            result_dict.append(loss=loss_detached)
            if not mode == 'train':
                result_dict.append(
                    walltime=walltime,
                    # **self.params_tracker(
                    #   grad=grad,
                    #   update=batch_arg.updates,
                    # )
                )
        return result_dict