def analyzing_mask(mask_gen, layer_size, mode, iteration, iter_interval,
                   text_dir):
    sparse_r = {}
    if iteration % iter_interval == 0:
        mask = mask_gen.sample_mask()
        mask = ParamsFlattener(mask)
        for k, v in layer_size.items():
            r = (mask > 0.5).sum().unflat[k].tolist() / v
            sparse_r[f"sparse_{k.split('_')[1]}"] = r

        # Beta posterior statistics: mean a / (a + b) and beta-binomial
        # variance with a single Bernoulli trial (bern_n = 1).
        alpha = mask_gen.a
        beta = mask_gen.b
        expectation = alpha / (alpha + beta)
        bern_n = 1
        variance = bern_n * (alpha * beta) * (alpha + beta + bern_n) \
            / ((alpha + beta) * (alpha + beta) * (alpha + beta + 1))

        # Per-layer thresholds that reproduce the sampled sparsity ratios.
        sorted_layer_0 = np.sort(
            tensor2numpy(expectation[0:layer_size['layer_0']]))[::-1]
        mask_thr_layer_0 = sorted_layer_0[
            int(sparse_r['sparse_0'] * layer_size['layer_0'])]
        sorted_layer_1 = np.sort(tensor2numpy(
            expectation[layer_size['layer_0']:
                        layer_size['layer_0'] + layer_size['layer_1']]))[::-1]
        mask_thr_layer_1 = sorted_layer_1[
            int(sparse_r['sparse_1'] * layer_size['layer_1'])]

        median_drop = tensor2numpy(expectation) < tensor2numpy(
            expectation.median())
        sparse_r_drop = np.concatenate(
            ((tensor2numpy(expectation) < mask_thr_layer_0)
             [0:layer_size['layer_0']],
             (tensor2numpy(expectation) < mask_thr_layer_1)
             [layer_size['layer_0']:
              layer_size['layer_0'] + layer_size['layer_1']]), axis=0)

        for index, drop in enumerate([median_drop, sparse_r_drop]):
            drop = np.array(drop, dtype=int)
            retain = 1 - drop
            certain = tensor2numpy(variance < variance.median())
            uncertain = 1 - certain
            certain_drop = certain * drop
            certain_retain = certain * retain
            uncertain_drop = uncertain * drop
            uncertain_retain = uncertain * retain
            tag = 'median' if index == 0 else 'sparse'
            print(f"\niteration {iteration} ({tag}) "
                  f"certain drop: {certain_drop.sum()} "
                  f"certain retain: {certain_retain.sum()} "
                  f"uncertain drop: {uncertain_drop.sum()} "
                  f"uncertain retain: {uncertain_retain.sum()}\n")
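
# --- Hedged sketch (not from the original code) ------------------------------
# Self-contained illustration of the statistics above: for a Beta(a, b) mask
# posterior, mean = a / (a + b), and the beta-binomial variance with n = 1
# reduces to a*b / (a + b)^2. Sizes and parameters below are toy values.
import torch

def _demo_beta_quadrants(n_units=510):
    alpha = torch.rand(n_units) * 5 + 0.1   # hypothetical per-unit alphas
    beta = torch.rand(n_units) * 5 + 0.1    # hypothetical per-unit betas
    expectation = alpha / (alpha + beta)
    variance = alpha * beta / (alpha + beta) ** 2
    drop = (expectation < expectation.median()).int()
    certain = (variance < variance.median()).int()
    return {
        'certain_drop': int((certain * drop).sum()),
        'certain_retain': int((certain * (1 - drop)).sum()),
        'uncertain_drop': int(((1 - certain) * drop).sum()),
        'uncertain_retain': int(((1 - certain) * (1 - drop)).sum()),
    }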
def __init__(self, params=None):
    super().__init__()
    if params is not None:
        if not isinstance(params, ParamsFlattener):
            raise TypeError("params argument has to be "
                            "an instance of ParamsFlattener!")
        self.params = params
    else:
        theta = torch.zeros(10)
        self.params = ParamsFlattener({'theta': theta})
def layers2params(self, layers):
    """FIX LATER: there is no need to use different names for the same
    matrices. (weight, mat)"""
    j = 0
    params = {}
    for i, layer in enumerate(layers):
        if layer.__class__.__name__ in ['Linear', 'Conv2d']:
            params['mat_' + str(j)] = C(layer.weight.data.clone())
            if layer.bias is not None:
                params['bias_' + str(j)] = C(layer.bias.data.clone())
            j += 1
    return ParamsFlattener(params)
def eval_gauss_var(model_cls, data, params, n_sample=200, std=1e-4):
    """Estimate the loss variance under i.i.d. Gaussian weight perturbation
    (a proxy for local sharpness around `params`)."""
    assert isinstance(params, ParamsFlattener)
    params = params.detach()
    losses = []
    for _ in range(n_sample):
        p = {}
        for k, v in params.unflat.items():
            p[k] = v + C(torch.zeros(v.size()).normal_(0, std))
        params_perturbed = ParamsFlattener(p)
        model = C(model_cls(params=params_perturbed))
        losses.append(model(*data['in_train'].load()))
    return torch.stack(losses).var()
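
# --- Hedged sketch (not from the original code) ------------------------------
# The same perturbation-variance idea on a plain tensor loss, runnable without
# ParamsFlattener; `loss_fn` stands in for the (model_cls, data) protocol.
import torch

def _gauss_loss_var(loss_fn, w, n_sample=200, std=1e-4):
    losses = [loss_fn(w + torch.randn_like(w) * std) for _ in range(n_sample)]
    return torch.stack(losses).var()

# A sharper quadratic yields a larger variance under the same noise:
#   _gauss_loss_var(lambda v: (100 * v ** 2).sum(), torch.zeros(10))
#   > _gauss_loss_var(lambda v: (0.01 * v ** 2).sum(), torch.zeros(10))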
def topk(cls, grad, set_size, topk, mode='ratio'):
    # NOTE: act is used to match the size (fix later)
    assert mode in ['ratio', 'number']
    if mode == 'ratio':
        topk_n = cls.compute_topk_n_with_ratio(topk, set_size)
    else:
        topk_n = cls.compute_topk_n_with_number(topk, set_size)
    # Score each unit by the L1 norm of its (weight; bias) gradient column.
    layer_0 = torch.cat(
        [grad.unflat['mat_0'], grad.unflat['bias_0'].unsqueeze(0)],
        dim=0).abs().sum(0)
    layer_1 = torch.cat(
        [grad.unflat['mat_1'], grad.unflat['bias_1'].unsqueeze(0)],
        dim=0).abs().sum(0)
    abs_sum = ParamsFlattener({'layer_0': layer_0, 'layer_1': layer_1})
    ids = abs_sum.topk_id(topk_n, sorted=False)
    ids = {k: v.view(1, -1).long() for k, v in ids.unflat.items()}
    mask = ParamsFlattener(
        {k: C(torch.zeros(1, v)) for k, v in set_size.items()})
    mask = mask.scatter_float_(1, ids, 1)
    return mask
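
# --- Hedged sketch (not from the original code) ------------------------------
# The scatter-based top-k mask on plain tensors: score each output unit by the
# L1 norm of its (weight; bias) gradient column, then mark the top-k with 1.
import torch

def _demo_topk_mask(keep_ratio=0.5, n_in=784, n_out=500):
    mat_g = torch.randn(n_in, n_out)      # hypothetical weight gradient
    bias_g = torch.randn(n_out)           # hypothetical bias gradient
    score = torch.cat([mat_g, bias_g.unsqueeze(0)], dim=0).abs().sum(0)
    topk_n = int(keep_ratio * n_out)
    ids = score.topk(topk_n, sorted=False).indices.view(1, -1)
    return torch.zeros(1, n_out).scatter_(1, ids, 1.0)   # 0/1 unit mask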
def randk(cls, set_size, topk, mode='ratio', *args, **kwargs):
    # NOTE: act is used to match the size (fix later)
    assert mode in ['ratio', 'number']
    if mode == 'ratio':
        topk_n = cls.compute_topk_n_with_ratio(topk, set_size)
    else:
        topk_n = cls.compute_topk_n_with_number(topk, set_size)
    # Draw topk_n[k] distinct random unit indices per layer.
    rand_id = lambda k, v: random.sample(range(v), topk_n[k])
    ids = {
        k: torch.tensor(rand_id(k, v)).cuda().view(1, -1).long()
        for k, v in set_size.items()
    }
    mask = ParamsFlattener(
        {k: C(torch.zeros(1, v)) for k, v in set_size.items()})
    mask = mask.scatter_float_(1, ids, 1)
    return mask
def inversed_masked_params(params, mask, step, r, thres=0.5):
    # Invert the mask, then flip random entries so that each layer keeps
    # exactly the sparsity ratio recorded in r.
    inv_mask = {k: (m < thres) for k, m in mask.unflat.items()}
    num_ones = {k: v.sum() for k, v in inv_mask.items()}
    for k, v in inv_mask.items():
        diff = num_ones[k] - mask.t_size(0)[k] * r[f"sparse_{k.split('_')[1]}"]
        if diff >= 0:
            m = inv_mask[k]
            nonzero_ids = m.nonzero().squeeze().tolist()
            drop_ids = random.sample(nonzero_ids, diff.tolist())
            m[drop_ids] = torch.tensor(0)
        else:
            m = inv_mask[k]
            zero_ids = (m == 0).nonzero().squeeze().tolist()
            drop_ids = random.sample(zero_ids, -diff.tolist())
            m[drop_ids] = torch.tensor(1)
        inv_mask[k] = inv_mask[k].float()
    inv_mask = ParamsFlattener(inv_mask)
    mask_layout = inv_mask.expand_as(params)
    step_sparse = step * mask_layout
    params_sparse = params + step_sparse
    params_pruned = params_sparse.prune(inv_mask)
    return params_pruned, params_sparse
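
# --- Hedged sketch (not from the original code) ------------------------------
# The flip-at-random adjustment above, reduced to a 1-D 0/1 tensor: drop or
# promote random entries until the mask hits an exact keep-ratio.
import random
import torch

def _adjust_to_ratio(m, r):
    diff = int(m.sum().item()) - int(r * m.numel())
    if diff > 0:     # too many ones: zero out `diff` of them at random
        ones = m.nonzero().squeeze(1).tolist()
        m[random.sample(ones, diff)] = 0.
    elif diff < 0:   # too few ones: set `-diff` random zeros to one
        zeros = (m == 0).nonzero().squeeze(1).tolist()
        m[random.sample(zeros, -diff)] = 1.
    return m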
def meta_optimize(self, cfg, meta_optim, data, model_cls, writer=None,
                  mode='train'):
    assert mode in ['train', 'valid', 'test']
    self.set_mode(mode)

    ############################################################################
    analyze_model = False
    analyze_surface = False
    ############################################################################

    result_dict = ResultDict()
    unroll_losses = 0
    walltime = Walltime()
    test_kld = torch.tensor(0.)

    params = C(model_cls()).params
    self.feature_gen.new()
    self.step_gen.new()
    sparse_r = {}  # sparsity
    iter_pbar = tqdm(range(1, cfg['iter_' + mode] + 1), 'Inner_loop')

    for iteration in iter_pbar:
        debug_1 = sigint.is_active(iteration == 1 or iteration % 10 == 0)
        debug_2 = sigstp.is_active()

        with WalltimeChecker(walltime):
            model_train = C(model_cls(params.detach()))
            data_train = data['in_train'].load()
            train_nll, train_acc = model_train(*data_train)
            train_nll.backward()

            grad = model_train.params.grad.detach()
            # step & mask generation
            feature, v_sqrt = self.feature_gen(grad.flat.detach())
            size = params.size().unflat()

            if cfg.mask_mode == 'structured':
                mask = self.mask_gen(feature, size, debug=debug_1)
                mask = ParamsFlattener(mask)
                mask_layout = mask.expand_as(params)
                params = params + grad.detach() * mask_layout * (-cfg.inner_lr)
            elif cfg.mask_mode == 'unstructured':
                mask_flat = self.mask_gen.unstructured(feature, size)
                mask = params.new_from_flat(mask_flat)
                params = params + grad.detach() * mask * (-cfg.inner_lr)
            elif cfg.mask_mode == 'no_mask':
                params = params + grad.detach() * (-cfg.inner_lr)
            else:
                raise Exception('Unknown setting!')

        with WalltimeChecker(walltime if mode == 'train' else None):
            model_test = C(model_cls(params))
            if cfg.data_mode == 'in_train':
                data_test = data_train
            elif cfg.data_mode == 'in_test':
                data_test = data['in_test'].load()
            test_nll, test_acc = utils.isnan(*model_test(*data_test))

            if debug_2:
                pdb.set_trace()

            if mode == 'train':
                unroll_losses += test_nll  # + test_kld
                if iteration % cfg.unroll == 0:
                    meta_optim.zero_grad()
                    unroll_losses.backward()
                    nn.utils.clip_grad_value_(self.parameters(), 0.01)
                    meta_optim.step()
                    unroll_losses = 0

        with WalltimeChecker(walltime):
            if not mode == 'train' or iteration % cfg.unroll == 0:
                params = params.detach_()

        ##########################################################################
        if analyze_model:
            analyzers.model_analyzer(
                self, mode, model_train, params, model_cls, mask.tsize(0),
                data, iteration, cfg['iter_' + mode],  # total iterations (was: optim_it)
                analyze_mask=True, sample_mask=True, draw_loss=False)
        if analyze_surface:
            # NOTE: best_mask / step are not produced in this variant; define
            # them before enabling analyze_surface.
            analyzers.surface_analyzer(params, best_mask, step, writer,
                                       iteration)
        ##########################################################################

        result = dict(
            train_nll=train_nll.tolist(),
            test_nll=test_nll.tolist(),
            train_acc=train_acc.tolist(),
            test_acc=test_acc.tolist(),
            test_kld=test_kld.tolist(),
            walltime=walltime.time,
        )
        if not cfg.mask_mode == 'no_mask':
            result.update(
                **mask.sparsity(overall=True),
                **mask.sparsity(overall=False),
            )
        result_dict.append(result)
        log_pbar(result, iter_pbar)
    return result_dict, params
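
# --- Hedged sketch (not from the original code): truncated unrolling ---------
# Minimal version of the train-mode pattern above: accumulate inner-step
# losses and backprop through a meta-parameter every `unroll` steps. The
# scalar learned step size `lr` is illustrative, not the module's step_gen.
import torch
import torch.nn as nn

def _demo_truncated_unroll(unroll=5, steps=20):
    lr = nn.Parameter(torch.tensor(0.1))       # stand-in meta-parameter
    meta_optim = torch.optim.Adam([lr], lr=1e-2)
    w = torch.randn(10).requires_grad_()       # inner-problem parameters
    unroll_losses = 0
    for step in range(1, steps + 1):
        loss = (w ** 2).sum()
        g, = torch.autograd.grad(loss, w, create_graph=True)
        w = w - lr * g                         # differentiable inner update
        unroll_losses = unroll_losses + (w ** 2).sum()
        if step % unroll == 0:                 # meta-update, then truncate
            meta_optim.zero_grad()
            unroll_losses.backward()
            nn.utils.clip_grad_value_([lr], 0.01)
            meta_optim.step()
            unroll_losses = 0
            w = w.detach().requires_grad_()
    return lr.item()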
def meta_optimize(self, meta_optim, data, model_cls, optim_it, unroll,
                  out_mul, k_obsrv=0, no_mask=False, writer=None,
                  mode='train'):
    assert mode in ['train', 'valid', 'test']
    self.set_mode(mode)

    ############################################################################
    analyze_model = False
    analyze_surface = False
    ############################################################################

    result_dict = ResultDict()
    unroll_losses = 0
    walltime = Walltime()
    test_kld = torch.tensor(0.)

    params = C(model_cls()).params
    self.feature_gen.new()
    self.step_gen.new()
    sparse_r = {}  # sparsity
    iter_pbar = tqdm(range(1, optim_it + 1), 'Inner_loop')

    for iteration in iter_pbar:
        debug_1 = sigint.is_active(iteration == 1 or iteration % 10 == 0)
        debug_2 = sigstp.is_active()

        with WalltimeChecker(walltime):
            model_train = C(model_cls(params.detach()))
            data_ = data['in_train'].load()
            train_nll, train_acc = model_train(*data_)
            train_nll.backward()

            g = model_train.params.grad.flat.detach()
            w = model_train.params.flat.detach()

            # step & mask generation
            feature, v_sqrt = self.feature_gen(g)
            step = self.step_gen(feature, v_sqrt, debug=debug_1)
            step = params.new_from_flat(step[0])
            size = params.size().unflat()

            if no_mask:
                params = params + step
            else:
                kld = self.mask_gen(feature, size, debug=debug_1)
                test_kld = kld / data['in_test'].full_size / unroll
                ## kl annealing function 'linear' / 'logistic' / None
                test_kld2 = test_kld * kl_anneal_function(
                    anneal_function=None, step=iteration, k=0.0025,
                    x0=optim_it)
                mask = self.mask_gen.sample_mask()
                mask = ParamsFlattener(mask)
                mask_layout = mask.expand_as(params)
                step_masked = step * mask_layout
                params = params + step_masked

        with WalltimeChecker(walltime if mode == 'train' else None):
            model_test = C(model_cls(params))
            test_nll, test_acc = utils.isnan(
                *model_test(*data['in_test'].load()))

            if debug_2:
                pdb.set_trace()

            if mode == 'train':
                unroll_losses += test_nll  # + test_kld
                if iteration % unroll == 0:
                    meta_optim.zero_grad()
                    unroll_losses.backward()
                    nn.utils.clip_grad_value_(self.parameters(), 0.01)
                    meta_optim.step()
                    unroll_losses = 0

        with WalltimeChecker(walltime):
            if not mode == 'train' or iteration % unroll == 0:
                params = params.detach_()

        ##########################################################################
        if analyze_model:
            analyzers.model_analyzer(
                self, mode, model_train, params, model_cls, mask.tsize(0),
                data, iteration, optim_it, analyze_mask=True,
                sample_mask=True, draw_loss=False)
        if analyze_surface:
            analyzers.surface_analyzer(params, best_mask, step, writer,
                                       iteration)
        ##########################################################################

        result = dict(
            train_nll=train_nll.tolist(),
            test_nll=test_nll.tolist(),
            train_acc=train_acc.tolist(),
            test_acc=test_acc.tolist(),
            test_kld=test_kld.tolist(),
            walltime=walltime.time,
        )
        if not no_mask:
            result.update(
                **mask.sparsity(overall=True),
                **mask.sparsity(overall=False),
            )
        result_dict.append(result)
        log_pbar(result, iter_pbar)
    return result_dict, params
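
# --- Hedged sketch (not from the original code) ------------------------------
# kl_anneal_function is imported from elsewhere in the repo; its body is not
# shown here. The call sites ('linear' / 'logistic' / None, with k and x0)
# suggest the usual sentence-VAE style schedule, roughly:
import numpy as np

def _kl_anneal_function_sketch(anneal_function, step, k, x0):
    if anneal_function == 'logistic':
        return float(1 / (1 + np.exp(-k * (step - x0))))
    elif anneal_function == 'linear':
        return min(1.0, step / x0)
    return 1.0    # None: constant full KL weight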
def meta_optimize(self, meta_optim, data, model_cls, optim_it, unroll,
                  out_mul, k_obsrv=1, no_mask=False, writer=None,
                  mode='train'):
    assert mode in ['train', 'valid', 'test']
    if no_mask is True:
        raise Exception("this module currently does NOT support "
                        "the no_mask option")
    self.set_mode(mode)

    ############################################################################
    n_samples = k_obsrv
    """MSG: better postfix?"""
    analyze_model = False
    analyze_surface = False
    ############################################################################

    if analyze_surface:
        writer.new_subdirs('best', 'any', 'rand', 'inv', 'dense')

    result_dict = ResultDict()
    unroll_losses = 0
    walltime = Walltime()
    test_kld = torch.tensor(0.)

    params = C(model_cls()).params
    self.feature_gen.new()
    self.step_gen.new()
    iter_pbar = tqdm(range(1, optim_it + 1), 'Inner_loop')
    set_size = {'layer_0': 500, 'layer_1': 10}  # NOTE: make it smarter

    for iteration in iter_pbar:
        debug_1 = sigint.is_active(iteration == 1 or iteration % 10 == 0)
        debug_2 = sigstp.is_active()
        best_loss = float('inf')
        best_params = None

        with WalltimeChecker(walltime):
            model_train = C(model_cls(params.detach()))
            train_nll, train_acc = model_train(*data['in_train'].load())
            train_nll.backward()

            g = model_train.params.grad.flat.detach()
            w = model_train.params.flat.detach()

            feature, v_sqrt = self.feature_gen(g)
            size = params.size().unflat()
            kld = self.mask_gen(feature, size)

            losses = []
            lips = []
            valid_mask_patience = 100
            assert n_samples > 0
            """FIX LATER: when n_samples == 0 it can behave like
            no_mask flag is on."""
            for i in range(n_samples):
                # step & mask generation
                for j in range(valid_mask_patience):
                    mask = self.mask_gen.sample_mask()
                    mask = ParamsFlattener(mask)
                    if mask.is_valid_sparsity():
                        if j > 0:
                            print(f'\n\n[!]Resampled {j + 1} times '
                                  'to get valid mask!')
                        break
                    if j == valid_mask_patience - 1:
                        raise Exception("[!]Could not sample valid mask for "
                                        f"{j + 1} trials.")

                step_out = self.step_gen(feature, v_sqrt, debug=debug_1)
                step = params.new_from_flat(step_out[0])
                mask_layout = mask.expand_as(params)
                step_sparse = step * mask_layout
                params_sparse = params + step_sparse
                params_pruned = params_sparse.prune(mask > 0.5)

                if params_pruned.size().unflat()['mat_0'][1] == 0:
                    continue

                sparse_model = C(model_cls(params_pruned.detach()))
                loss, _ = sparse_model(*data['in_train'].load())

                if (loss < best_loss) or i == 0:
                    best_loss = loss
                    best_params = params_sparse
                    best_pruned = params_pruned
                    best_mask = mask

            if best_params is not None:
                params = best_params

        with WalltimeChecker(walltime if mode == 'train' else None):
            model_test = C(model_cls(params))
            test_nll, test_acc = utils.isnan(
                *model_test(*data['in_test'].load()))
            test_kld = kld / data['in_test'].full_size / unroll
            ## kl annealing function 'linear' / 'logistic' / None
            test_kld2 = test_kld * kl_anneal_function(
                anneal_function=None, step=iteration, k=0.0025, x0=optim_it)
            total_test = test_nll + test_kld2

            if mode == 'train':
                unroll_losses += total_test
                if iteration % unroll == 0:
                    meta_optim.zero_grad()
                    unroll_losses.backward()
                    nn.utils.clip_grad_value_(self.parameters(), 0.01)
                    meta_optim.step()
                    unroll_losses = 0

        with WalltimeChecker(walltime):
            if not mode == 'train' or iteration % unroll == 0:
                params = params.detach_()

        ##########################################################################
        """Analyzers"""
        if analyze_model:
            analyzers.model_analyzer(
                self, mode, model_train, params, model_cls, set_size, data,
                iteration, optim_it, analyze_mask=True, sample_mask=True,
                draw_loss=False)
        if analyze_surface:
            analyzers.surface_analyzer(params, best_mask, step, writer,
                                       iteration)
        ##########################################################################

        result = dict(
            train_nll=train_nll.tolist(),
            test_nll=test_nll.tolist(),
            train_acc=train_acc.tolist(),
            test_acc=test_acc.tolist(),
            test_kld=test_kld.tolist(),
            walltime=walltime.time,
            **best_mask.sparsity(overall=True),
            **best_mask.sparsity(overall=False),
        )
        result_dict.append(result)
        log_pbar(result, iter_pbar)
    return result_dict, params
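
# --- Hedged sketch (not from the original code) ------------------------------
# Generic form of the valid-mask retry loop above; `sample` and `is_valid`
# stand in for mask_gen.sample_mask and ParamsFlattener.is_valid_sparsity.
def _sample_with_patience(sample, is_valid, patience=100):
    for j in range(patience):
        candidate = sample()
        if is_valid(candidate):
            if j > 0:
                print(f'[!] Resampled {j + 1} times to get a valid sample!')
            return candidate
    raise RuntimeError(f'[!] Could not sample a valid candidate in '
                       f'{patience} trials.')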
def sampling_mask(mask_gen, layer_size, model_train, params, sample_num=10000,
                  mode='test', iteration=0, iter_interval=10,
                  result_dir='result/mask_compare'):
    topk = True
    sparse_r = {}
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)
    mask_result = None

    if iteration % iter_interval == 0:
        # Monte-Carlo estimate of per-unit mask mean / variance.
        for i in range(sample_num):
            mask = mask_gen.sample_mask()
            mask = ParamsFlattener(mask)
            mask_flat = (mask.flat > 0.5).float()
            if i == 0:
                mask_cat = mask_flat.unsqueeze(dim=0)
                mask_sum = mask
            else:
                mask_cat = torch.cat(
                    (mask_cat, mask_flat.unsqueeze(dim=0)), dim=0)
                mask_sum += mask

        mask_mean = mask_cat.mean(dim=0).squeeze()
        mask_var = mask_cat.var(dim=0).squeeze()
        mask_sum /= sample_num
        mask = mask > 0.5

        grad = model_train.params.grad.detach()
        act = model_train.activations.detach()

        for k, v in layer_size.items():
            r = (mask > 0.5).sum().unflat[k].tolist() / v
            sparse_r[f"sparse_{k.split('_')[1]}"] = r

        topk_mask_gen = MaskGenerator2.topk if topk else MaskGenerator2.randk
        layer_0_topk = topk_mask_gen(
            grad=grad, set_size=layer_size,
            topk=sparse_r['sparse_0'])._unflat['layer_0'].view(-1) > 0.5
        layer_1_topk = topk_mask_gen(
            grad=grad, set_size=layer_size,
            topk=sparse_r['sparse_1'])._unflat['layer_1'].view(-1) > 0.5
        layer_0_prefer_topk = topk_mask_gen(
            grad=mask_sum.expand_as(params), set_size=layer_size,
            topk=sparse_r['sparse_0'])._unflat['layer_0'].view(-1) > 0.5
        layer_1_prefer_topk = topk_mask_gen(
            grad=mask_sum.expand_as(params), set_size=layer_size,
            topk=sparse_r['sparse_1'])._unflat['layer_1'].view(-1) > 0.5

        layer_0 = torch.cat(
            [grad.unflat['mat_0'], grad.unflat['bias_0'].unsqueeze(0)],
            dim=0).abs().sum(0)
        layer_1 = torch.cat(
            [grad.unflat['mat_1'], grad.unflat['bias_1'].unsqueeze(0)],
            dim=0).abs().sum(0)
        layer_0_abs = ParamsFlattener({'layer_0': layer_0})
        layer_1_abs = ParamsFlattener({'layer_1': layer_1})

        hist_mean, bins_mean = np.histogram(tensor2numpy(mask_mean), bins=20)
        hist_var, bins_var = np.histogram(tensor2numpy(mask_var), bins=20)

        sorted_layer_0 = np.sort(
            tensor2numpy(mask_mean[0:layer_size['layer_0']]))[::-1]
        mask_thr_layer_0 = sorted_layer_0[
            int(sparse_r['sparse_0'] * layer_size['layer_0'])]
        sorted_layer_1 = np.sort(tensor2numpy(
            mask_mean[layer_size['layer_0']:
                      layer_size['layer_0'] + layer_size['layer_1']]))[::-1]
        mask_thr_layer_1 = sorted_layer_1[
            int(sparse_r['sparse_1'] * layer_size['layer_1'])]

        median_drop = tensor2numpy(mask_mean) < tensor2numpy(
            mask_mean.median())
        sparse_r_drop = np.concatenate(
            ((tensor2numpy(mask_mean) < mask_thr_layer_0)
             [0:layer_size['layer_0']],
             (tensor2numpy(mask_mean) < mask_thr_layer_1)
             [layer_size['layer_0']:
              layer_size['layer_0'] + layer_size['layer_1']]), axis=0)

        for index, drop in enumerate([median_drop, sparse_r_drop]):
            drop = np.array(drop, dtype=int)
            retain = 1 - drop
            certain = tensor2numpy(mask_var < mask_var.median())
            uncertain = 1 - certain
            certain_drop = certain * drop
            certain_retain = certain * retain
            uncertain_drop = uncertain * drop
            uncertain_retain = uncertain * retain
            tag = 'median' if index == 0 else 'sparse'
            print(f"\niteration {iteration} ({tag}) "
                  f"certain drop: {certain_drop.sum()} "
                  f"certain retain: {certain_retain.sum()} "
                  f"uncertain drop: {uncertain_drop.sum()} "
                  f"uncertain retain: {uncertain_retain.sum()}\n")

        (sparse_r, overlap_mask_ratio_0, overlap_mask_ratio_1,
         overlap_prefer_ratio_0, overlap_prefer_ratio_1) = plot_masks(
            mask, layer_0_topk, layer_1_topk, mask_sum, layer_0_prefer_topk,
            layer_1_prefer_topk, layer_0, layer_1, result_dir, iteration,
            sparse_r, mask_mean, mask_var)

        mask_result = dict(
            sparse_0=sparse_r['sparse_0'],
            sparse_1=sparse_r['sparse_1'],
            overlap_mask_ratio_0=overlap_mask_ratio_0.tolist(),
            overlap_mask_ratio_1=overlap_mask_ratio_1.tolist(),
            overlap_prefer_ratio_0=overlap_prefer_ratio_0.tolist(),
            overlap_prefer_ratio_1=overlap_prefer_ratio_1.tolist(),
            certain_drop=certain_drop,
            certain_retain=certain_retain,
            uncertain_drop=uncertain_drop,
            uncertain_retain=uncertain_retain,
        )
    return mask_result
def plot_loss(model_cls, model, params, input_data, dataset, feature_gen,
              mask_gen, step_gen, scale_way, xmin=-2.0, xmax=0.5, num_x=20,
              mode='test', iteration=0, iter_interval=10,
              loss_dir='result/draw_loss'):
    if ((iteration - 1) % iter_interval == 0) and (iteration > 1):
        X = np.linspace(xmin, xmax, num_x)
        Y = np.linspace(xmin, xmax, num_x)

        model_train = C(model_cls(params=params.detach()))
        step_data = input_data
        train_nll, train_acc = model_train(*step_data)
        train_nll.backward()

        g = model_train.params.grad.flat.detach()
        w = model_train.params.flat.detach()
        feature, v_sqrt = feature_gen(g)
        size = params.size().unflat()
        kld = mask_gen(feature, size)

        # step & mask generation
        mask = mask_gen.sample_mask()
        mask = ParamsFlattener(mask)
        mask_layout = mask.expand_as(params)
        step_X = step_gen(feature, v_sqrt)
        step_X = params.new_from_flat(step_X[0]) * mask_layout
        step_X = step_X.flat.view(-1)
        step_Y = model_train.params.grad.flat.view(-1)

        L2_X = (step_X * step_X).sum()
        L2_Y = (step_Y * step_Y).sum()

        layer_settings = [['mat_0', 'bias_0', 'mat_1', 'bias_1'],
                          ['mat_0', 'bias_0'],
                          ['mat_1', 'bias_1']]
        normalize_way = 'filter_norm'
        # NOTE: zip() stops at the shortest list, so only the first layer
        # setting is drawn; restore the per-layer dirs below to draw all three.
        # result_dirs = ['loss_all_scale_{}'.format(scale_way),
        #                'loss_layer0_scale_{}'.format(scale_way),
        #                'loss_layer1_scale_{}'.format(scale_way)]
        result_dirs = ['loss_all_scale_{}'.format(scale_way)]

        for layer_set, result_dir in zip(layer_settings, result_dirs):
            grad_dir = os.path.join(loss_dir, normalize_way, result_dir,
                                    'gradient')
            if not os.path.exists(grad_dir):
                os.makedirs(grad_dir)
            step_dir = os.path.join(loss_dir, normalize_way, result_dir,
                                    'step')
            if not os.path.exists(step_dir):
                os.makedirs(step_dir)

            step_X_ = params.new_from_flat(-1.0 * step_X)
            step_Y_ = params.new_from_flat(1.0 * step_Y)

            if normalize_way is not None:
                for step in (step_X_, step_Y_):
                    for matrix in ['mat_0', 'bias_0', 'mat_1', 'bias_1']:
                        di = step.unflat[matrix]
                        norm_di = torch.norm(di, 2, dim=0)
                        thetai = params.unflat[matrix]
                        if normalize_way == 'filter_norm':
                            # was torch.norm(di, ...): filter normalization
                            # scales by the parameter norm, not the
                            # direction norm
                            norm_thetai = torch.norm(thetai, 2, dim=0)
                            ## TODO: division-by-zero bug fix
                            normalize_di = di * norm_thetai / (norm_di + 1e-5)
                        elif normalize_way == 'weight_norm':
                            normalize_di = di * thetai / (di + 1e-5)
                        step.unflat[matrix] = normalize_di

            abs_X = step_X_.abs().sum()
            abs_Y = step_Y_.abs().sum()
            L2_X = (step_X_ * step_X_).sum()
            L2_Y = (step_Y_ * step_Y_).sum()

            scale_X, scale_Y = 0, 0
            for layer in ['mat_1', 'bias_1']:
                scale_X += abs_X.unflat[layer].item()
                scale_Y += abs_Y.unflat[layer].item()
            scale_g_s = scale_X / scale_Y
            scale_s_g = scale_Y / scale_X
            for layer in layer_set:
                scale_X += abs_X.unflat[layer].item()
                scale_Y += abs_Y.unflat[layer].item()

            if scale_way == 's':
                scale_s = scale_s_g
                scale_g = 1.0
            elif scale_way == 'g':
                scale_s = 1.0
                scale_g = scale_g_s
            else:
                scale_s = 1.0
                scale_g = 1.0

            Z_X = get_1D_Loss(X, step_X_, scale_s, step_X.size(), layer_set,
                              dataset, model_cls, params)
            Z_Y = get_1D_Loss(Y, step_Y_, scale_g, step_Y.size(), layer_set,
                              dataset, model_cls, params)

            plot_2d(X, Z_X,
                    os.path.join(step_dir,
                                 'iter_{:04d}_STEPxMASK_1dLoss.png'.format(
                                     iteration)),
                    'L1 norm = {:.2f}'.format(scale_X * scale_s))
            plot_2d(Y, Z_Y,
                    os.path.join(grad_dir,
                                 'iter_{:04d}_1.0xGradient_1dLoss.png'.format(
                                     iteration)),
                    'L1 norm = {:.2f}'.format(scale_Y * scale_g))
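
# --- Hedged sketch (not from the original code) ------------------------------
# Filter normalization as used above (after Li et al., "Visualizing the Loss
# Landscape of Neural Nets"): rescale each direction column to the norm of the
# corresponding parameter column, d_i <- d_i * ||theta_i|| / (||d_i|| + eps).
import torch

def _filter_normalize(d, theta, eps=1e-5):
    norm_d = torch.norm(d, 2, dim=0)
    norm_theta = torch.norm(theta, 2, dim=0)
    return d * norm_theta / (norm_d + eps)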
def meta_optimize(self, meta_optimizer, data, model_cls, optim_it, unroll,
                  out_mul, mode):
    assert mode in ['train', 'valid', 'test']
    if mode == 'train':
        self.train()
        inner_data = data.loaders['inner_train']
        outer_data = data.loaders['inner_valid']
        drop_mode = 'soft_drop'
    elif mode == 'valid':
        self.eval()
        inner_data = data.loaders['valid']
        outer_data = None
        drop_mode = 'hard_drop'
    elif mode == 'test':
        self.eval()
        inner_data = data.loaders['test']
        outer_data = None
        drop_mode = 'hard_drop'

    model = C(model_cls(sb_mode=self.sb_mode))
    model(*data.pseudo_sample())
    params = model.params

    result_dict = ResultDict()
    unroll_losses = 0
    walltime = 0
    self.feature_gen.new()

    ################
    mask_dict = ResultDict()
    analyze_mask = False
    sample_mask = False
    draw_loss = False
    ################

    iter_pbar = tqdm(range(1, optim_it + 1), 'optim_iteration')
    iter_watch = utils.StopWatch('optim_iteration')

    res1 = None
    res2 = []
    res3 = []
    lamb = []
    gamm_g = []
    gamm_l1 = []
    gamm_l2 = []
    iters = []

    for iteration in iter_pbar:
        iter_watch.touch()
        debug_1 = debug_sigint.signal_on and (
            iteration == 1 or iteration % 10 == 0)
        debug_2 = debug_sigstp.signal_on

        model_detached = C(model_cls(params.detach()))
        inner_data_s = inner_data.load()
        loss_detached = model_detached(*inner_data_s)
        loss_detached.backward()

        g = model_detached.params.grad.flat.detach()
        w = model_detached.params.flat.detach()

        cand_params = []
        cand_losses = []
        best_loss = float('inf')
        best_params = None
        n_samples = 5

        for m in range(1):
            feature = self.feature_gen(g, w, n=n_samples, m_update=(m == 0))
            p_size = params.size().unflat()
            mask_gen_out = self.mask_gen(feature, p_size, n=n_samples)
            step_out = self.step_gen(feature, n=n_samples)

            # step & mask generation
            for i in range(n_samples):
                mask, _, kld = mask_gen_out[i]
                step = step_out[i]
                mask = ParamsFlattener(mask)
                mask_layout = mask.expand_as(params)
                step = params.new_from_flat(step)
                # pruning
                step = step * mask_layout
                params_ = params + step
                sparse_params = params_.prune(mask > 1e-6)
                if sparse_params.size().unflat()['mat_0'][1] == 0:
                    continue
                if debug_2:
                    pdb.set_trace()
                sparse_model = C(model_cls(sparse_params.detach()))
                loss = sparse_model(*inner_data_s)
                try:
                    if loss < best_loss:
                        best_loss = loss
                        best_params = params_
                        best_kld = kld
                except Exception:
                    best_params = params_
                    best_kld = kld

        if best_params is not None:
            params = best_params
            best_kld = 0

        if mode == 'train':
            model = C(model_cls(params))
            optim_loss = model(*outer_data.load())
            if torch.isnan(optim_loss):
                pdb.set_trace()
            unroll_losses += optim_loss + best_kld / outer_data.full_size

        if mode == 'train' and iteration % unroll == 0:
            meta_optimizer.zero_grad()
            unroll_losses.backward()
            nn.utils.clip_grad_value_(self.parameters(), 0.1)
            meta_optimizer.step()
            unroll_losses = 0

        if not mode == 'train' or iteration % unroll == 0:
            params = params.detach_()

        if params is None:
            pdb.set_trace()

        iter_pbar.set_description(
            f'optim_iteration[optim_loss:{loss_detached.tolist():5.5}]')

        walltime += iter_watch.touch('interval')

        ##############################################################
        text_dir = 'test/analyze_mask'
        result_dir = 'test/drawloss'
        sample_dir = 'test/mask_compare'
        iter_interval = 10
        sample_num = 10000
        # NOTE: layer_size must be defined (e.g. {'layer_0': 500,
        # 'layer_1': 10}) before enabling the analyzers below.

        if mode == 'test' and analyze_mask:
            analyzing_mask(self.mask_gen, layer_size, mode, iteration,
                           iter_interval, text_dir)
        if mode == 'test' and sample_mask:
            mask_result = sampling_mask(self.mask_gen, layer_size,
                                        model_detached, params, sample_num,
                                        mode, iteration, iter_interval,
                                        sample_dir)
            if mask_result is not None:
                mask_dict.append(mask_result)
        if mode == 'test' and draw_loss:
            plot_loss(model_cls=model_cls, model=model_detached,
                      params=params, input_data=inner_data_s,
                      dataset=inner_data, feature_gen=self.feature_gen,
                      mask_gen=self.mask_gen, step_gen=self.step_gen,
                      scale_way=None, xmin=-2.0, xmax=0.5, num_x=20,
                      mode=mode, iteration=iteration,
                      iter_interval=iter_interval, loss_dir=result_dir)
        ##############################################################

        result_dict.append(loss=loss_detached)
        if not mode == 'train':
            result_dict.append(walltime=walltime)
    return result_dict
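
# --- Hedged sketch (not from the original code) ------------------------------
# The best-of-k candidate selection used in the inner loops above, reduced to
# its generic form: keep the lowest-loss candidate among k samples.
def _best_of_k(candidates, loss_fn):
    best, best_loss = None, float('inf')
    for cand in candidates:
        loss = loss_fn(cand)
        if best is None or loss < best_loss:
            best, best_loss = cand, loss
    return best, best_loss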