def eigenvalues(self, maxIter=100, tol=1e-3, top_n=1):
    """
    Compute the top_n eigenvalues using the power iteration method.

    maxIter: maximum iterations used to compute each single eigenvalue
    tol: relative tolerance between two consecutive eigenvalue estimates
        from power iteration
    top_n: number of top eigenvalues to compute

    Returns (eigenvalues, eigenvectors) as parallel lists.
    """
    assert top_n >= 1
    device = self.device
    eigenvalues = []
    eigenvectors = []
    computed_dim = 0
    while computed_dim < top_n:
        eigenvalue = None
        # Random starting vector with the same structure as the parameters.
        v = [torch.randn(p.size()).to(device) for p in self.params]
        v = normalization(v)  # normalize the vector
        for i in range(maxIter):
            # Deflation: keep v orthogonal to eigenvectors already found.
            v = orthnormal(v, eigenvectors)
            self.model.zero_grad()
            if self.full_dataset:
                tmp_eigenvalue, Hv = self.dataloader_hv_product(v)
            else:
                Hv = hessian_vector_product(self.gradsH, self.params, v)
                # Rayleigh quotient v'Hv estimates the eigenvalue.
                tmp_eigenvalue = group_product(Hv, v).cpu().item()
            v = normalization(Hv)
            if eigenvalue is None:  # fix: identity comparison, not ==
                eigenvalue = tmp_eigenvalue
            else:
                # Stop once the relative change falls below tol.
                if abs(eigenvalue - tmp_eigenvalue) / (abs(eigenvalue) +
                                                       1e-6) < tol:
                    break
                eigenvalue = tmp_eigenvalue
        eigenvalues.append(eigenvalue)
        eigenvectors.append(v)
        computed_dim += 1
    return eigenvalues, eigenvectors
def sketch(self, d, debug=False):
    """
    Sketch the implicit n x n Hessian down to a d x d matrix.

    Right-multiply by d random n x 1 column vectors to get an n x d
    matrix, then left-multiply by the transposed d x n matrix.  The
    sketch is made of Rademacher (+1/-1) variables, which also helps
    with trace estimation.

    d: sketch dimension
    debug: when True, print progress and timing information
        (fix: the original printed unconditionally and never used debug)

    Returns a d x d numpy array.
    """
    device = self.device
    if debug:
        print("starting")
        print("d = " + str(d))
        print(time.time())
    vs = []
    Hvs = []
    # Generate d Rademacher vectors v and the corresponding products Hv.
    for i in range(d):
        if debug:
            print(i)
        v = [
            torch.randint_like(p, high=2, device=device)
            for p in self.params
        ]
        for v_i in v:
            v_i[v_i == 0] = -1  # map {0, 1} -> {-1, +1}
        v = normalization(v)
        vs.append(v)
        # Calculate Hv.
        self.model.zero_grad()
        if self.full_dataset:
            _, Hv = self.dataloader_hv_product(v)
        else:
            Hv = hessian_vector_product(self.gradsH, self.params, v)
        Hvs.append(Hv)
    if debug:
        print(time.time())
    # Create the sketched matrix template.
    sketched_hessian = np.zeros((d, d))
    # Fill in the matrix as A_ij = v_i' * H * v_j, scaled by 1/d.
    for i in range(d):
        for j in range(d):
            sketched_hessian[i, j] = group_product(
                vs[i], Hvs[j]).cpu().item() / d
    if debug:
        print(time.time())
    return sketched_hessian
def density(self, iter=100, n_v=1):
    """
    Compute the estimated eigenvalue density using the stochastic
    Lanczos quadrature algorithm (SLQ).

    iter: number of Lanczos iterations used per run
    n_v: number of SLQ runs

    Returns (eigen_list_full, weight_list_full): per-run lists of Ritz
    values and their quadrature weights.
    """
    device = self.device
    eigen_list_full = []
    weight_list_full = []
    for k in range(n_v):
        # Rademacher starting vector.
        v = [
            torch.randint_like(p, high=2, device=device)
            for p in self.params
        ]
        for v_i in v:
            v_i[v_i == 0] = -1  # map {0, 1} -> {-1, +1}
        v = normalization(v)
        # Standard Lanczos algorithm initialization.
        v_list = [v]
        w_list = []
        alpha_list = []
        beta_list = []
        ############### Lanczos
        for i in range(iter):
            self.model.zero_grad()
            w_prime = [torch.zeros(p.size()).to(device) for p in self.params]
            if i == 0:
                if self.full_dataset:
                    _, w_prime = self.dataloader_hv_product(v)
                else:
                    w_prime = hessian_vector_product(
                        self.gradsH, self.params, v)
                alpha = group_product(w_prime, v)
                alpha_list.append(alpha.cpu().item())
                w = group_add(w_prime, v, alpha=-alpha)
                w_list.append(w)
            else:
                beta = torch.sqrt(group_product(w, w))
                beta_list.append(beta.cpu().item())
                if beta_list[-1] != 0.:
                    # We should re-orthonormalize against previous vectors.
                    v = orthnormal(w, v_list)
                    v_list.append(v)
                else:
                    # Breakdown: restart with a fresh random direction.
                    w = [torch.randn(p.size()).to(device) for p in self.params]
                    v = orthnormal(w, v_list)
                    v_list.append(v)
                if self.full_dataset:
                    _, w_prime = self.dataloader_hv_product(v)
                else:
                    w_prime = hessian_vector_product(
                        self.gradsH, self.params, v)
                alpha = group_product(w_prime, v)
                alpha_list.append(alpha.cpu().item())
                w_tmp = group_add(w_prime, v, alpha=-alpha)
                w = group_add(w_tmp, v_list[-2], alpha=-beta)
        # Assemble the symmetric tridiagonal Lanczos matrix T.
        T = torch.zeros(iter, iter).to(device)
        for i in range(len(alpha_list)):
            T[i, i] = alpha_list[i]
            if i < len(alpha_list) - 1:
                T[i + 1, i] = beta_list[i]
                T[i, i + 1] = beta_list[i]
        # fix: torch.eig was deprecated and removed from PyTorch; T is
        # symmetric, so torch.linalg.eigh is the correct replacement
        # (real eigenvalues, eigenvectors in columns of b_).
        a_, b_ = torch.linalg.eigh(T)
        eigen_list = a_
        weight_list = b_[0, :]**2
        eigen_list_full.append(list(eigen_list.cpu().numpy()))
        weight_list_full.append(list(weight_list.cpu().numpy()))
    return eigen_list_full, weight_list_full
def eigenvalues_lanczos(self, k, debug=False):
    """
    Compute the top k eigenvalues by the Lanczos method for
    approximating eigenvalues.

    k: number of Lanczos iterations (and eigenvalue approximations)
    debug: when True, print progress information
        (fix: the original printed unconditionally and never used debug)

    Returns the eigenvalues of the k x k tridiagonal Lanczos matrix,
    a numpy array in ascending order.
    """
    device = self.device
    # Prepare to record data.
    if self.record_data:
        now = datetime.datetime.now()
        timestamp = "_{:02d}{:02d}_{:02d}{:02d}{:02d}".format(
            now.day, now.month, now.hour, now.minute, now.second)
        save_file = self.data_save_dir + "Lanczos" + timestamp + ".txt"
    total_time_to_compute = []
    start_time = time.time()
    # Pick a random first vector, making sure it has norm 1.
    if debug:
        print("starting with q1")
    q0 = [torch.randn(p.size()).to(device) for p in self.params]
    q0 = normalization(q0)
    total_time_to_compute.append(time.time() - start_time)
    # Calculate H q0.
    self.model.zero_grad()
    if self.full_dataset:
        _, Hq0 = self.dataloader_hv_product(q0)
    else:
        Hq0 = hessian_vector_product(self.gradsH, self.params, q0)
    # First column of the (k+1) x k tridiagonal matrix T.
    qs = [q0]
    Hqs = [Hq0]
    T = np.zeros((k + 1, k))
    T[0, 0] = group_product(qs[0], Hqs[0]).cpu().item()
    r = multi_add([Hqs[0], qs[0]], [1, -1 * T[0, 0]])  # r = Hq0 - T00*q0
    T[1, 0] = norm(r)  # T10 = |r|
    if k > 1:
        # fix: guard needed — for k == 1, T has a single column and
        # T[0, 1] would raise an IndexError.
        T[0, 1] = T[1, 0]  # T symmetric
    q1 = [ri / T[1, 0] for ri in r]  # q1 = r / |r|
    total_time_to_compute.append(time.time() - start_time)
    # Calculate H q1.
    self.model.zero_grad()
    if self.full_dataset:
        _, Hq1 = self.dataloader_hv_product(q1)
    else:
        Hq1 = hessian_vector_product(self.gradsH, self.params, q1)
    qs.append(q1)
    Hqs.append(Hq1)
    # Subsequent columns (1 .. k-1): three-term Lanczos recurrence.
    for i in range(1, k):
        if debug:
            print(i)
        T[i, i] = group_product(qs[i], Hqs[i]).cpu().item()
        # r = H q_i - beta_{i-1} q_{i-1} - alpha_i q_i
        r = multi_add([Hqs[i], qs[i - 1], qs[i]],
                      [1, -1 * T[i - 1, i], -1 * T[i, i]])
        T[i + 1, i] = norm(r)
        if i != k - 1:
            T[i, i + 1] = T[i + 1, i]
        q = [ri / T[i + 1, i] for ri in r]
        total_time_to_compute.append(time.time() - start_time)
        self.model.zero_grad()
        if self.full_dataset:
            _, Hq = self.dataloader_hv_product(q)
        else:
            Hq = hessian_vector_product(self.gradsH, self.params, q)
        qs.append(q)
        Hqs.append(Hq)
    T_UH = T[0:k, 0:k]  # T_UH is the square upper-Hessenberg part of T
    if debug:
        print(T_UH)
    # Write data if applicable.
    if self.record_data:
        with open(save_file, 'w') as f:
            f.write("Total Elapsed Time(s)\tEigenvalues\n")
            for i in range(k):
                eigs_i = np.linalg.eigvalsh(T[0:i, 0:i])
                s = ""
                for e in eigs_i:
                    s += "\t" + str(e)
                s = str(total_time_to_compute[i]) + s + "\n"
                f.write(s)
    return np.linalg.eigvalsh(T_UH)
def density(self, iter=100, n_v=1, debug=False):
    """
    Compute the estimated eigenvalue density using the stochastic
    Lanczos quadrature algorithm (SLQ), recording elapsed time when
    self.record_data is set.

    iter: number of Lanczos iterations used per run
    n_v: number of SLQ runs
    debug: when True, print per-iteration progress

    Returns (eigen_list_full, weight_list_full): per-run lists of Ritz
    values and their quadrature weights.
    """
    device = self.device
    eigen_list_full = []
    weight_list_full = []
    # Prepare to record data.
    if self.record_data:
        now = datetime.datetime.now()
        timestamp = "_{:02d}{:02d}_{:02d}{:02d}{:02d}".format(
            now.day, now.month, now.hour, now.minute, now.second)
        save_file = self.data_save_dir + "ESD" + timestamp + ".txt"
    start_time = time.time()
    for k in range(n_v):
        # Rademacher starting vector.
        v = [
            torch.randint_like(p, high=2, device=device)
            for p in self.params
        ]
        for v_i in v:
            v_i[v_i == 0] = -1  # map {0, 1} -> {-1, +1}
        v = normalization(v)
        # Standard Lanczos algorithm initialization.
        v_list = [v]
        w_list = []
        alpha_list = []
        beta_list = []
        ############### Lanczos
        for i in range(iter):
            if debug:
                print("Iteration {}".format(i))
            self.model.zero_grad()
            w_prime = [
                torch.zeros(p.size()).to(device) for p in self.params
            ]
            if i == 0:
                if self.full_dataset:
                    _, w_prime = self.dataloader_hv_product(v)
                else:
                    w_prime = hessian_vector_product(
                        self.gradsH, self.params, v)
                alpha = group_product(w_prime, v)
                alpha_list.append(alpha.cpu().item())
                w = group_add(w_prime, v, alpha=-alpha)
                w_list.append(w)
            else:
                beta = torch.sqrt(group_product(w, w))
                beta_list.append(beta.cpu().item())
                if beta_list[-1] != 0.:
                    # We should re-orthonormalize against previous vectors.
                    v = orthnormal(w, v_list)
                    v_list.append(v)
                else:
                    # Breakdown: restart with a fresh random direction.
                    w = [
                        torch.randn(p.size()).to(device)
                        for p in self.params
                    ]
                    v = orthnormal(w, v_list)
                    v_list.append(v)
                if self.full_dataset:
                    _, w_prime = self.dataloader_hv_product(v)
                else:
                    w_prime = hessian_vector_product(
                        self.gradsH, self.params, v)
                alpha = group_product(w_prime, v)
                alpha_list.append(alpha.cpu().item())
                w_tmp = group_add(w_prime, v, alpha=-alpha)
                w = group_add(w_tmp, v_list[-2], alpha=-beta)
        # Assemble the symmetric tridiagonal Lanczos matrix T.
        T = torch.zeros(iter, iter).to(device)
        for i in range(len(alpha_list)):
            T[i, i] = alpha_list[i]
            if i < len(alpha_list) - 1:
                T[i + 1, i] = beta_list[i]
                T[i, i + 1] = beta_list[i]
        # fix: torch.eig was deprecated and removed from PyTorch; T is
        # symmetric, so torch.linalg.eigh is the correct replacement
        # (real eigenvalues, eigenvectors in columns of b_).
        a_, b_ = torch.linalg.eigh(T)
        eigen_list = a_
        weight_list = b_[0, :]**2
        eigen_list_full.append(list(eigen_list.cpu().numpy()))
        weight_list_full.append(list(weight_list.cpu().numpy()))
    # Write data if applicable.
    stop_time = time.time()
    if self.record_data:
        with open(save_file, 'w') as f:
            f.write("Total Elapsed Time(s)\n")
            f.write("{}\n".format(stop_time - start_time))
    return eigen_list_full, weight_list_full
def eigenvalues(self, maxIter=100, tol=1e-3, top_n=1, debug=False):
    """
    Compute the top_n eigenvalues using the power iteration method,
    recording timing and iteration counts when self.record_data is set.

    maxIter: maximum iterations used to compute each single eigenvalue
    tol: relative tolerance between two consecutive eigenvalue estimates
        from power iteration
    top_n: number of top eigenvalues to compute
    debug: when True, print progress information

    Returns (eigenvalues, eigenvectors) as parallel lists.
    """
    assert top_n >= 1
    device = self.device
    eigenvalues = []
    eigenvectors = []
    computed_dim = 0
    # Prepare to record data.
    if self.record_data:
        now = datetime.datetime.now()
        timestamp = "_{:02d}{:02d}_{:02d}{:02d}{:02d}".format(
            now.day, now.month, now.hour, now.minute, now.second)
        save_file = self.data_save_dir + "TopEigen" + timestamp + ".txt"
    total_time_to_compute = []
    iters_to_compute = []
    start_time = time.time()
    while computed_dim < top_n:
        if debug:
            print("Computing eigenvalue #{}".format(computed_dim + 1))
        eigenvalue = None
        # Random starting vector with the same structure as the parameters.
        v = [torch.randn(p.size()).to(device) for p in self.params]
        v = normalization(v)  # normalize the vector
        for i in range(maxIter):
            if debug:
                print("  Iteration {}".format(i))
            # Deflation: keep v orthogonal to eigenvectors already found.
            v = orthnormal(v, eigenvectors)
            self.model.zero_grad()
            if self.full_dataset:
                tmp_eigenvalue, Hv = self.dataloader_hv_product(v)
            else:
                Hv = hessian_vector_product(self.gradsH, self.params, v)
                tmp_eigenvalue = group_product(Hv, v).cpu().item()
            v = normalization(Hv)
            if eigenvalue is None:  # fix: identity comparison, not ==
                eigenvalue = tmp_eigenvalue
            else:
                # Stop once the relative change falls below tol.
                if abs(eigenvalue - tmp_eigenvalue) / (abs(eigenvalue) +
                                                       1e-6) < tol:
                    break
                eigenvalue = tmp_eigenvalue
        # Record data.
        total_time_to_compute.append(time.time() - start_time)
        iters_to_compute.append(i)
        eigenvalues.append(eigenvalue)
        eigenvectors.append(v)
        computed_dim += 1
    # Write data if applicable.
    if self.record_data:
        with open(save_file, 'w') as f:
            f.write("Eigenvalue\tTotal Elapsed Time(s)\t#Iterations\n")
            for i in range(top_n):
                f.write("{}\t{}\t{}\n".format(i + 1,
                                              total_time_to_compute[i],
                                              iters_to_compute[i]))
    return eigenvalues, eigenvectors
def train_hessian(args,
                  trainer,
                  task,
                  epoch_itr,
                  sample_iter=1,
                  maxIter=500,
                  tol=1e-4,
                  top_n=1,
                  ignore_grad=False):
    """Estimate the top Hessian eigenvalues over one epoch's data.

    Collects up to a fixed number of batch groups from the epoch iterator,
    then runs power iteration where each Hessian-vector product is obtained
    from trainer.task.train_step_hessian on the collected samples.

    maxIter: maximum power iterations per eigenvalue
    tol: relative tolerance for the convergence check
    top_n: number of top eigenvalues to compute
    ignore_grad: forwarded to train_step_hessian

    Returns (eigenvalues, eigenvectors) as parallel lists.
    """
    # Update parameters every N batches.
    update_freq = args.update_freq[
        epoch_itr.epoch - 1] if epoch_itr.epoch <= len(
            args.update_freq) else args.update_freq[-1]
    # Initialize data iterator.
    itr = epoch_itr.next_epoch_itr(
        fix_batches_to_gpus=args.fix_batches_to_gpus,
        shuffle=(epoch_itr.epoch >= args.curriculum),
    )
    itr = iterators.GroupedIterator(itr, update_freq)
    progress = progress_bar.build_progress_bar(
        args,
        itr,
        epoch_itr.epoch,
        no_progress_bar='simple',
    )
    extra_meters = collections.defaultdict(lambda: AverageMeter())
    valid_subsets = args.valid_subset.split(',')
    max_update = args.max_update or math.inf
    # Cap on the number of batch groups used for the Hessian estimate.
    max_iters = 10
    samples_hessian = []
    for i, samples in enumerate(progress, start=epoch_itr.iterations_in_epoch):
        if i > max_iters:
            break
        samples = [trainer._prepare_sample(sample) for sample in samples]
        samples_hessian.extend(samples)
    eigenvalues = []
    eigenvectors = []
    computed_dim = 0
    params, gradsH = get_params_grad(trainer.model)
    while computed_dim < top_n:
        eigenvalue = None
        v = [torch.randn(p.size()).cuda() for p in params]
        v = normalization(v)
        # fix: renamed the inner loop variable so it no longer shadows
        # the batch index i from the collection loop above.
        for it in range(maxIter):
            trainer.model.zero_grad()
            # Deflation: keep v orthogonal to eigenvectors already found.
            v = orthnormal(v, eigenvectors)
            (loss, sample_size, logging_output, gradsH, tmp_eigenvalue,
             Hv) = trainer.task.train_step_hessian(samples_hessian,
                                                   trainer.model,
                                                   trainer.criterion,
                                                   trainer.optimizer,
                                                   ignore_grad,
                                                   v=v)
            v = normalization(Hv)
            if eigenvalue is None:  # fix: identity comparison, not ==
                eigenvalue = tmp_eigenvalue
            else:
                # Stop once the relative change falls below tol.
                if abs(eigenvalue - tmp_eigenvalue) / (abs(eigenvalue) +
                                                       1e-6) < tol:
                    break
                eigenvalue = tmp_eigenvalue
        eigenvalues.append(eigenvalue)
        eigenvectors.append(v)
        computed_dim += 1
    return eigenvalues, eigenvectors