def reduce_dict_hvd(input_dict, average=True):
    """
    Reduce the values in ``input_dict`` across all processes so that every
    process ends up with the same reduced results.

    Args:
        input_dict (dict): all the values will be reduced.
        average (bool): whether to average (True) or sum (False).

    Returns:
        dict: same keys as ``input_dict``, holding the reduced values.
    """
    global _USE_HVD
    world_size = get_world_size()
    if world_size < 2:
        # Single process: nothing to reduce.
        return input_dict
    with torch.no_grad():
        # Sort the keys so every rank stacks values in the same order.
        keys = sorted(input_dict)
        stacked = torch.stack([input_dict[key] for key in keys], dim=0)
        if _USE_HVD:
            # TODO: check this in hvd
            hvd.allreduce_(stacked, op=hvd.Average if average else hvd.Adasum, name="reduce_dict")
        else:
            dist.all_reduce(stacked)
            if average:
                # torch.distributed all_reduce sums; divide to average.
                stacked /= world_size
        return dict(zip(keys, stacked))
def test_horovod_allreduce_inplace(self):
    """Test that the in-place allreduce correctly sums 1D, 2D, 3D tensors."""
    hvd.init()
    size = hvd.size()
    dtypes = [torch.IntTensor, torch.LongTensor,
              torch.FloatTensor, torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.IntTensor, torch.cuda.LongTensor,
                   torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
    for dtype, dim in itertools.product(dtypes, [1, 2, 3]):
        torch.manual_seed(1234)
        tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
        tensor = tensor.type(dtype)
        expected = tensor * size
        hvd.allreduce_(tensor, average=False)
        max_difference = tensor.sub(expected).max()
        # Integer reductions are exact; the float tolerance grows with the
        # number of ranks since we compare against one precise multiply.
        if size <= 3 or dtype in [torch.IntTensor, torch.LongTensor,
                                  torch.cuda.IntTensor, torch.cuda.LongTensor]:
            threshold = 0
        elif size < 10:
            threshold = 1e-4
        elif size < 15:
            threshold = 5e-4
        else:
            # Too many ranks for a meaningful precision bound; stop here.
            break
        assert max_difference <= threshold, 'hvd.allreduce produces incorrect results'
def all_reduce_and_rescale_tensors(tensors, rescale_denom):
    """All-reduce and rescale tensors at once (as a flattened tensor)

    Args:
        tensors: list of Tensors to all-reduce
        rescale_denom: denominator for rescaling summed Tensors
    """
    # Pack every tensor into one contiguous flat buffer so that a single
    # collective call covers the whole list.
    total = sum(t.numel() for t in tensors)
    flat = tensors[0].new(total).zero_()
    pos = 0
    for t in tensors:
        n = t.numel()
        flat[pos:pos + n].copy_(t.view(-1))
        pos += n
    # One all-reduce over the packed buffer, then rescale in place.
    hvd.allreduce_(flat[:pos])
    flat.div_(rescale_denom)
    # Scatter the reduced values back into the original tensors.
    pos = 0
    for t in tensors:
        n = t.numel()
        t.view(-1).copy_(flat[pos:pos + n])
        pos += n
def test_horovod_allreduce_inplace(self):
    """Test that the allreduce correctly sums 1D, 2D, 3D tensors."""
    # NOTE(review): an identical test body appears twice in this file;
    # consider deduplicating.
    hvd.init()
    size = hvd.size()
    # CPU dtypes always; CUDA dtypes only when a GPU is present.
    dtypes = [torch.IntTensor, torch.LongTensor,
              torch.FloatTensor, torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.IntTensor, torch.cuda.LongTensor,
                   torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        torch.manual_seed(1234)
        tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
        tensor = tensor.type(dtype)
        # Summing the same tensor across `size` ranks equals tensor * size.
        multiplied = tensor * size
        hvd.allreduce_(tensor, average=False)
        max_difference = tensor.sub(multiplied).max()
        # Threshold for floating point equality depends on number of
        # ranks, since we're comparing against precise multiplication.
        if size <= 3 or dtype in [torch.IntTensor, torch.LongTensor,
                                  torch.cuda.IntTensor, torch.cuda.LongTensor]:
            threshold = 0
        elif size < 10:
            threshold = 1e-4
        elif size < 15:
            threshold = 5e-4
        else:
            # No meaningful float bound at this scale; skip remaining combos.
            break
        assert max_difference <= threshold, 'hvd.allreduce produces incorrect results'
def test_stability(self): hvd.init() # TODO support non-MPI Adasum operation if not hvd.mpi_enabled(): self.skipTest("MPI not enabled") device = torch.device('cuda:{}'.format(hvd.local_rank())) if torch.cuda.is_available() else torch.device('cpu') np.random.seed(2) torch.manual_seed(2) size = hvd.size() local_size = hvd.local_size() rank = hvd.rank() for data_type in self.data_types: N = 1024 a = np.random.normal(0, np.finfo(data_type).tiny, (N, 1)).astype(np.float64) r = np.random.normal(0, 1, (size, 1)).astype(np.float64) q = np.dot(a,r.T).astype(data_type).astype(np.float64) tensor = np.zeros(N,dtype=data_type) tensor[:] = q[:,hvd.rank()] tensor = torch.from_numpy(tensor).to(device) hvd.allreduce_(tensor, op=hvd.Adasum) expected = np.sum(q,axis=1) / size comp = self.are_close(data_type, expected, tensor.cpu().numpy()) if comp: print('Stability test passed') else: print('computed: ', tensor) print('expected: ', expected) print('off by: ', self.diff_ratio(expected,tensor.cpu().numpy())) assert comp
def all_reduce_and_rescale_tensors_chunked(tensors, rescale_denom, buffer_size=10485760): """All-reduce and rescale tensors in chunks of the specified size. Args: tensors: list of Tensors to all-reduce rescale_denom: denominator for rescaling summed Tensors buffer_size: all-reduce chunk size in bytes """ # buffer size in bytes, determine equiv. # of elements based on data type buffer_t = tensors[0].new( math.ceil(buffer_size / tensors[0].element_size())).zero_() buffer = [] def all_reduce_buffer(): # copy tensors into buffer_t offset = 0 for t in buffer: numel = t.numel() buffer_t[offset:offset + numel].copy_(t.view(-1)) offset += numel # all-reduce and rescale hvd.allreduce_(buffer_t[:offset]) buffer_t.div_(rescale_denom) # copy all-reduced buffer back into tensors offset = 0 for t in buffer: numel = t.numel() t.view(-1).copy_(buffer_t[offset:offset + numel]) offset += numel filled = 0 for t in tensors: sz = t.numel() * t.element_size() if sz > buffer_size: # tensor is bigger than buffer, all-reduce and rescale directly hvd.allreduce_(t) t.div_(rescale_denom) elif filled + sz > buffer_size: # buffer is full, all-reduce and replace buffer with grad all_reduce_buffer() buffer = [t] filled = sz else: # add tensor to buffer buffer.append(t) filled += sz if len(buffer) > 0: all_reduce_buffer()
def clip_grad_norm_2_by_global_(grad, max_norm, name=None):
    """Clip ``grad`` in place by the global (all-reduced) L2 norm.

    Args:
        grad: gradient tensor to clip; rescaled in place when over the limit.
        max_norm: maximum allowed global norm.
        name: optional name forwarded to the allreduce op.

    Returns:
        The (possibly rescaled) ``grad`` tensor.
    """
    limit = float(max_norm)
    # Average the squared-gradient sum across workers, then take the root
    # to obtain the global L2 norm.
    sq_sum = torch.sum(grad.square())
    global_norm = torch.sqrt(allreduce_(sq_sum, average=True, name=name))
    # Small epsilon guards against division by a zero norm.
    scale = limit / (global_norm + 1e-6)
    if scale < 1:
        grad.data.mul_(scale)
    return grad
def all_reduce_buffer():
    # (fragment) Nested helper: relies on `buffer`, `buffer_t` and
    # `rescale_denom` from an enclosing scope not visible here — TODO confirm
    # against the enclosing function.
    # copy tensors into buffer_t
    offset = 0
    for t in buffer:
        numel = t.numel()
        buffer_t[offset:offset + numel].copy_(t.view(-1))
        offset += numel
    # all-reduce and rescale
    hvd.allreduce_(buffer_t[:offset])
    buffer_t.div_(rescale_denom)
    # copy all-reduced buffer back into tensors
    offset = 0
    for t in buffer:
        numel = t.numel()
        t.view(-1).copy_(buffer_t[offset:offset + numel])
        offset += numel
def test_stability_2(self): hvd.init() # TODO support non-MPI Adasum operation if not hvd.mpi_enabled(): return device = torch.device('cuda:{}'.format(hvd.local_rank( ))) if torch.cuda.is_available() else torch.device('cpu') np.random.seed(2) torch.manual_seed(2) size = hvd.size() local_size = hvd.local_size() rank = hvd.rank() for data_type in self.data_types: N = 1024 dt_min = np.finfo(data_type).tiny.astype(np.float64) dt_max = math.sqrt(np.finfo(data_type).max.astype(np.float64)) a = np.random.normal(0, 1, (N, 1)).astype(np.float64) r = np.array([ dt_max**(float(i + 1) / float(size)) * dt_min**(float(size - i - 1) / float(size)) for i in range(size) ]).reshape(size, 1).astype(np.float64) np.random.shuffle(r) q = np.dot(a, r.T).astype(data_type).astype(np.float64) tensor = np.zeros(N, dtype=data_type) tensor[:] = q[:, hvd.rank()] tensor = torch.from_numpy(tensor).to(device) hvd.allreduce_(tensor, op=hvd.Adasum) expected = np.sum(q, axis=1) / size comp = self.are_close(data_type, expected, tensor.cpu().numpy()) if comp: print('Stability 2 test passed') else: print('computed: ', tensor) print('expected: ', expected) print('off by: ', self.diff_ratio(expected, tensor.cpu().numpy())) assert comp
def compress(self, tensor, name=""):
    """Low-rank gradient compression via one step of power iteration.

    1D tensors are returned uncompressed. Higher-rank tensors are viewed as a
    matrix and factored into two low-rank factors (p, q); the q factor is
    cached per-name in ``self.q_memory`` to warm-start the next call.

    Returns:
        (tensors_to_send, ctx): ``([tensor], None)`` for 1D input, otherwise
        ``([], (p, q, shape))``.
    """
    if tensor.dim() == 1:
        # Rank-1 tensors (e.g. biases) are not worth compressing.
        return [tensor], None
    shape = tensor.size()
    matrix = tensor.view([shape[0], -1])
    q_factor = self.q_memory[name]
    orthogonalize(q_factor)
    # Project, all-reduce, re-orthogonalize, then project back.
    p_factor = allreduce_(torch.mm(matrix, q_factor))
    orthogonalize(p_factor)
    q_factor = allreduce_(torch.mm(matrix.t(), p_factor))
    self.q_memory[name] = q_factor
    return [], (p_factor, q_factor, shape)
def test_horovod_allreduce_multi_gpu(self):
    """Test that the allreduce works on multiple GPUs."""
    # Skip entirely on machines without CUDA devices.
    if not torch.cuda.is_available():
        return
    hvd.init()
    local_rank = hvd.local_rank()
    size = hvd.size()
    dtypes = [
        torch.cuda.IntTensor, torch.cuda.LongTensor,
        torch.cuda.FloatTensor, torch.cuda.DoubleTensor
    ]
    if _fp16_supported:
        dtypes += [torch.cuda.HalfTensor]
    step = 0
    for dtype, dim in itertools.product(dtypes, [1, 2, 3]):
        step += 1
        torch.manual_seed(1234)
        tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
        # Alternate between the two GPUs assigned to this rank.
        device = local_rank * 2 + (step + local_rank) % 2
        tensor = tensor.cuda(device).type(dtype)
        expected = tensor * size
        hvd.allreduce_(tensor, average=False)
        max_difference = tensor.sub(expected).max()
        # Integer reductions are exact; the float tolerance grows with the
        # number of ranks since we compare against one precise multiply.
        if size <= 3 or dtype in [torch.cuda.IntTensor, torch.cuda.LongTensor]:
            threshold = 0
        elif size < 10:
            threshold = 1e-4
        elif size < 15:
            threshold = 5e-4
        else:
            # No meaningful float bound at this scale; stop here.
            break
        assert max_difference <= threshold, 'hvd.allreduce produces incorrect results'
def test_horovod_allreduce_multi_gpu(self): """Test that the allreduce works on multiple GPUs.""" # Only do this test if there are GPUs available. if not torch.cuda.is_available(): return hvd.init() local_rank = hvd.local_rank() size = hvd.size() iter = 0 dtypes = [torch.cuda.IntTensor, torch.cuda.LongTensor, torch.cuda.FloatTensor, torch.cuda.DoubleTensor] dims = [1, 2, 3] for dtype, dim in itertools.product(dtypes, dims): iter += 1 torch.manual_seed(1234) tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100) device = local_rank * 2 + (iter + local_rank) % 2 tensor = tensor.cuda(device).type(dtype) multiplied = tensor * size hvd.allreduce_(tensor, average=False) max_difference = tensor.sub(multiplied).max() # Threshold for floating point equality depends on number of # ranks, since we're comparing against precise multiplication. if size <= 3 or dtype in [torch.cuda.IntTensor, torch.cuda.LongTensor]: threshold = 0 elif size < 10: threshold = 1e-4 elif size < 15: threshold = 5e-4 else: break assert max_difference <= threshold, 'hvd.allreduce produces incorrect results'
def compensate(self, tensor, name): """Update the tensor with the residuals.""" # https://github.com/synxlin/deep-gradient-compression/blob/master/dgc/memory.py grad = self.get_grad(name) if self.gradient_clipping: tensor_squ_sum = torch.sum(grad * grad) clipping_val = torch.sqrt(allreduce_(tensor_squ_sum, average=True, name=name)) grad = grad.clamp(-clipping_val, clipping_val) mmt = self.get_momentum(name) vec = self.get_velocity(name) if self.momentum_masking: mmt.mul_(self.momentum).add_(grad) vec.add_(mmt) else: vec.mul_(self.momentum).add_(grad)
def compensate(self, tensor, name):
    """Update the tensor with the residuals."""
    if self.gradient_clipping:
        # Clip element-wise against the global (averaged across workers) norm.
        sq_sum = torch.sum(tensor * tensor)
        limit = torch.sqrt(
            allreduce_(sq_sum, average=True, name=name))
        tensor = tensor.clamp(-limit, limit)
    # Fold the new value into the momentum-decayed residual for this key.
    if name in self.residuals:
        self.residuals[name] = self.momentum * self.residuals[name] + tensor
    else:
        self.residuals[name] = tensor
    # Accumulate the residual into the stored gradient and return the total;
    # on first sight of `name`, the clipped tensor itself is stored.
    if name in self.gradients:
        self.gradients[name] += self.residuals[name]
        tensor = self.gradients[name]
    else:
        self.gradients[name] = tensor
    return tensor
def array_reduce_(arr: Array, average: bool = True) -> None:
    """All-reduce a NumPy array in place across Horovod workers.

    The array is wrapped as a torch tensor sharing the same memory, so the
    reduction result is written straight back into ``arr``.
    """
    wrapped = torch.from_numpy(arr)
    hvd.allreduce_(wrapped, average=average)
model_parameters = filter(lambda p: p.requires_grad, model.parameters()) num_parameters = sum([np.prod(p.size()) for p in model_parameters]) print(f'Number of trainable parameters in model: {num_parameters}') model.logger.add_text(f'hyperparams', '{num_parameters}', 0) if root_process: print("Load data") # get dataset for training and testing of the model if root_process: train_set = datasets.MNIST(root="data/mnist", train=True, transform=transform_ops, download=True) test_set = datasets.MNIST(root="data/mnist", train=False, transform=transform_ops, download=True) # if distributed over multiple GPU's, set-up barrier a barrier ensuring that all the processes have loaded the data if distributed: hvd.allreduce_(torch.Tensor(0), name='barrier') # get dataset for training and testing of the model if not root_process: train_set = datasets.MNIST(root="data/mnist", train=True, transform=transform_ops, download=True) test_set = datasets.MNIST(root="data/mnist", train=True, transform=transform_ops, download=True) # setup data sampler if distributed: train_sampler = torch.utils.data.distributed.DistributedSampler( train_set, num_replicas=hvd.size(), rank=hvd.rank()) test_sampler = torch.utils.data.distributed.DistributedSampler( test_set, num_replicas=hvd.size(), rank=hvd.rank()) # setup mini-batch enumerator for both train-set and test-set train_loader = torch.utils.data.DataLoader(
# (fragment) Inner training-loop body: one forward/backward pass with
# gradient accumulation, periodic parameter sync, logging, and evaluation.
# Indentation reconstructed from collapsed source — TODO confirm nesting.
model.train()
output = model(data)
loss = criterion(output, target)
large_batch_loss += loss.item()
loss.backward()
if inner_loop % large_ratio == 0:
    # One optimizer step per accumulated "large batch".
    num_updates += 1
    optimizer.step()
    optimizer.zero_grad()
    # Synchronize model parameters across workers every comm_interval updates.
    if num_updates % args.comm_interval == args.comm_interval - 1:
        allreduce_parameters(model.state_dict())
if batch_idx * large_ratio % 25 == 0:
    print('Train Epoch: {} [{}/{}]\tLoss: {}'.format(epoch, batch_idx * len(data), len(train_sampler), large_batch_loss))
    # Average the current batch loss across all workers for logging.
    cur_batch_loss = torch.FloatTensor([loss.item()])
    hvd.allreduce_(cur_batch_loss)
    cur_batch_loss = float(cur_batch_loss)
    # NOTE(review): time.clock() was removed in Python 3.8 and raises
    # AttributeError on modern interpreters; use time.perf_counter().
    train_losses.append((time.clock() - start_time, epoch, batch_idx, cur_batch_loss))
    large_batch_loss = 0
if batch_idx % 100 == 0:
    # Periodic evaluation on one test batch; the iterator is re-created
    # when exhausted.
    model.eval()
    try:
        inputs, labels = next(testset_iterator)
    except StopIteration:
        testset_iterator = iter(test_loader)
        inputs, labels = next(testset_iterator)
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    accuracy = outputs.data.max(1)[1].eq(labels).sum().item() / outputs.data.shape[0]
    loss = metric_average(loss, 'avg_loss')
def forward(data_loader, model, criterion, epoch=0, training=True, optimizer=None, U=None, V=None):
    """Run one epoch of evaluation or (optionally pruned/compressed) training.

    Args:
        data_loader: iterable of (inputs, target) batches.
        model: network to run.
        criterion: loss function.
        epoch: current epoch index (drives the warm-up sparsity schedule).
        training: False = evaluate only; True = backprop and update.
        optimizer: optimizer used when training.
        U, V: per-parameter residual/velocity buffers for gradient pruning.

    Returns:
        dict with averaged loss/prec1/prec5 and the updated U, V buffers.

    NOTE(review): indentation reconstructed from collapsed source; several
    legacy/suspect spots are flagged inline rather than changed.
    """
    # hvd
    # if args.gpus and len(args.gpus) > 1:
    #     model = torch.nn.DataParallel(model, args.gpus)
    batch_time = AverageMeter()
    pruning_time = AverageMeter()
    select_time = AverageMeter()
    comm_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    end = time.time()
    # One sparsity mask per parameter tensor, kept on the GPU.
    masks = [torch.zeros(w.size()).cuda() for w in list(model.parameters())]
    for i, (inputs, target) in enumerate(data_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        if args.gpus is not None:
            # NOTE(review): `async=True` is a SyntaxError on Python >= 3.7
            # (`async` became a keyword); modern torch uses non_blocking=True.
            target = target.cuda(async=True)
        # NOTE(review): Variable/volatile is the pre-0.4 autograd API.
        input_var = Variable(inputs.type(args.type), volatile=not training)
        target_var = Variable(target)
        # compute output
        if not training:
            output = model(input_var)
            loss = criterion(output, target_var)
            # measure accuracy and record loss
            prec1, prec5 = accuracy(output.data, target_var.data, topk=(1, 5))
            # NOTE(review): loss.data[0] is the deprecated 0-dim indexing;
            # modern torch uses loss.item().
            losses.update(loss.data[0], input_var.size(0))
            top1.update(prec1[0], input_var.size(0))
            top5.update(prec5[0], input_var.size(0))
        else:
            # mini_inputs = input_var.chunk(args.batch_size // args.mini_batch_size)
            # mini_targets = target_var.chunk(args.batch_size // args.mini_batch_size)
            #TODO for debug shoul be delete
            optimizer.zero_grad()
            # fjr simulate distributed senario
            # acc_grad = []
            # if torch.cuda.is_available():
            #     acc_grad = [torch.zeros(w.size()).cuda() for w in list(model.parameters())]
            # else:
            #     print("gpu is not avaiable for acc_grad allocation")
            # for k, mini_input_var in enumerate(mini_inputs):
            output = model(input_var)
            loss = criterion(output, target_var)
            prec1, prec5 = accuracy(output.data, target_var.data, topk=(1, 5))
            losses.update(loss.data[0], input_var.size(0))
            top1.update(prec1[0], input_var.size(0))
            top5.update(prec5[0], input_var.size(0))
            loss.backward()
            if args.use_pruning:
                # Scale the clip threshold down with the number of workers.
                clip_grad_norm(model.parameters(), 5. * (hvd.size()**-0.5))
            idx = 0
            for u, v, p in zip(U, V, model.parameters()):
                prune_begin = time.time()
                if args.use_pruning:
                    # TODO how to set rho (momentum)
                    # Local momentum/velocity update on the averaged gradient.
                    g = p.grad.data / hvd.size()
                    g += p.data * args.weight_decay / hvd.size()
                    if args.use_nesterov:
                        u = args.momentum * (u + g)
                        v = v + u + g
                    else:
                        u = args.momentum * u + g
                        v = v + u
                    select_begin = time.time()
                    ratio = 1 - 0.999
                    if args.use_sync and i % args.sync_interval == 0:
                        # Periodically send everything (dense sync step).
                        masks[idx] = 1
                    else:
                        if args.use_warmup:
                            # print("iter", i, "node ", k, " pruning layer ", idx)
                            # Warm-up schedule: gradually increase sparsity
                            # over the first epochs (DGC-style).
                            if (epoch == 0):
                                ratio = 1 - 0.75
                            elif (epoch == 1):
                                ratio = 1 - 0.9375
                            elif (epoch == 2):
                                ratio = 1 - 0.984375
                            elif (epoch == 3):
                                ratio = 1 - 0.996
                            else:
                                ratio = 1 - 0.999
                        else:
                            ratio = 1 - 0.999
                        #masks[idx], compressed_val, compressed_idx = select_top_k(v, ratio, masks[idx])
                        masks[
                            idx], compressed_val, compressed_idx = select_top_k_appr(
                                v, ratio, masks[idx])
                    select_time.update(time.time() - select_begin)
                    # TODO check compress
                    p_tmp = v * masks[idx]
                    g_ref = hvd.allreduce(p_tmp, average=False)
                    # Keep only the unsent residue locally.
                    v = v * (1 - masks[idx])
                    u = u * (1 - masks[idx])
                    comm_begin = time.time()
                    g_size = p.grad.data.size()
                    msg_size = len(compressed_val)
                    # print("compressed_val size is, ", msg_size)
                    # Exchange the sparse values/indices and rebuild the
                    # dense gradient by scatter-adding each node's chunk.
                    gathered_val = hvd.allgather(compressed_val)
                    gathered_idx = hvd.allgather(compressed_idx)
                    p.grad.data = p.grad.data.view(-1)
                    p.grad.data.zero_()
                    # print("gathered_val size is, ", len(gathered_val))
                    # print("val", gathered_val)
                    # print("idx", gathered_idx)
                    for node_idx in range(hvd.size()):
                        p.grad.data[gathered_idx[node_idx * msg_size:(node_idx + 1) * msg_size]] += gathered_val[
                            node_idx * msg_size:(node_idx + 1) * msg_size]
                    p.grad.data = p.grad.data.view(g_size)
                    comm_time.update(time.time() - comm_begin)
                    U[idx] = u  #new_residue
                    V[idx] = v
                else:
                    # Dense path: plain averaged allreduce of the gradient.
                    p.grad.data = p.grad.data / hvd.size()
                    hvd.allreduce_(p.grad.data, average=False)
                idx += 1
                pruning_time.update(time.time() - prune_begin)
            # Master
            idx = 0
            if args.use_pruning:
                pass
            else:
                for p in list(model.parameters()):
                    #
                    # NOTE(review): `g` may be unbound here on the dense path
                    # (it is only assigned in the pruning branch above); this
                    # line likely used to be commented out — confirm.
                    print("accumulated sparsity is", check_sparsity(g))
                    # TODO 1. use pytorch sgd optimizer to calculate mom and weight_decay, set mom and wd
                    # used with pruning
                    # TODO 2. implement weight_decay and momentum by myself, set mom=0 and wd = 0
                    # used with baseline
                    g = p.grad.data
                    g += p.data * args.weight_decay
                    V[idx] = args.momentum * V[idx] + g
                    # NOTE(review): `k` is not defined in this scope (it only
                    # appears in commented-out code) — this would raise
                    # NameError; probably meant V[idx].
                    p.grad.data = V[k][idx]
                    clip_grad_norm(model.parameters(), 5.)
                    idx = idx + 1
            optimizer.step()
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if i % args.print_freq == 0:
            if hvd.local_rank() == 0:
                logging.info(
                    '{phase} - Epoch: [{0}][{1}/{2}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                    'Prune {pruning_time.val:.9f} ({pruning_time.avg:.3f})\t'
                    'Select {select_time.val:.9f} ({select_time.avg:.3f})\t'
                    'Communication {comm_time.val:.9f} ({comm_time.avg:.3f})\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                    'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                        epoch, i, len(data_loader),
                        phase='TRAINING' if training else 'EVALUATING',
                        batch_time=batch_time, data_time=data_time,
                        pruning_time=pruning_time, select_time=select_time,
                        comm_time=comm_time, loss=losses, top1=top1,
                        top5=top5))
    return {
        'loss': losses.avg,
        'prec1': top1.avg,
        'prec5': top5.avg,
        'U': U,
        'V': V
    }
def clip_grad_value_by_global_norm_(grad, name=None):
    """Clamp ``grad`` element-wise, in place, to +/- the global norm.

    The clip value is the square root of the squared-gradient sum averaged
    across all workers via allreduce.
    """
    sq_sum = torch.sum(grad.square())
    limit = torch.sqrt(allreduce_(sq_sum, average=True, name=name))
    grad.data.clamp_(min=-limit, max=limit)
def batch_translate(self, input_path, output_path, field=0, remove_subword_tokens=True, max_length=100, resume=False):
    """Translate a file.

    In multi-GPU mode each rank translates every hvd.size()-th line into a
    per-rank temp file, and rank 0 merges the parts into ``output_path``
    at the end. Indentation reconstructed from collapsed source — TODO
    confirm nesting of the post-translate cleanup block.
    """
    # Check whether using multiple GPUs
    try:
        import horovod.torch as hvd
    except ImportError:
        pass
    # If using multigpu, then separate the input file
    if self._is_multigpu:
        # sync_tensor doubles as the cross-rank error counter and as the
        # end-of-run barrier payload.
        sync_tensor = torch.tensor(0)
        tmp_output_path = "/tmp/{}.{}".format(os.path.basename(output_path), hvd.local_rank())
    else:
        sync_tensor = None
        tmp_output_path = output_path
    result_map = {}
    # Resume support: reload "<id>\t<translation>" pairs already computed.
    if self._is_multigpu and resume and os.path.exists(tmp_output_path):
        for line in open(tmp_output_path):
            pair = line.strip("\n").split("\t")
            if len(pair) != 2:
                print(line)
            # NOTE(review): `id` shadows the builtin of the same name.
            id, line = pair
            result_map[int(id)] = line
        print("loaded {} computed results".format(len(result_map)))
    fout = open(tmp_output_path, "w")
    test_lines = list(open(input_path))
    err = 0
    for i, line in enumerate(test_lines):
        # Gather error counts in multigpu mode
        if self._is_multigpu:
            if i % (10 * hvd.size()) == 0:
                sync_tensor.fill_(err)
                hvd.allreduce_(sync_tensor, average=False)
            # Round-robin sharding: this rank handles every size-th line.
            if i % hvd.size() != hvd.local_rank():
                continue
        # Translate
        pair = line.strip().split("\t")
        src_sent = pair[field]
        if len(src_sent.split()) > max_length:
            # Over-length sentences get a placeholder instead of a translation.
            result = "x"
        else:
            if i in result_map:
                result = result_map[i]
            else:
                result, _ = self.translate("<s> {} </s>".format(src_sent))
                if result is None:
                    result = ""
                if remove_subword_tokens:
                    # Undo sentencepiece ("▁") or BPE ("@@ ") segmentation.
                    if "▁" in result:
                        result = "".join(result.split()).replace("▁", " ").strip()
                    else:
                        result = result.replace("@@ ", "")
        if not result:
            err += 1
        # Write the results and print progress
        if self._is_multigpu:
            fout.write("{}\t{}\n".format(i, result))
        else:
            fout.write("{}\n".format(result))
        fout.flush()
        if self._is_multigpu and hvd.local_rank() == 0:
            sys.stdout.write("translating: {:.0f}% err: {} \r".format(float(i + 1) * 100 / len(test_lines), int(sync_tensor)))
        elif not self._is_multigpu:
            sys.stdout.write("translating: {:.0f}% err: {} \r".format(float(i + 1) * 100 / len(test_lines), err))
        sys.stdout.flush()
    if is_root_node():
        sys.stdout.write("\n")
    fout.close()
    if self._is_multigpu:
        # Wait for all process to end
        hvd.allreduce_(sync_tensor, average=False)
        # Concatenate all separated translation results
        if hvd.local_rank() == 0:
            results = []
            for i in range(hvd.size()):
                for line in open("/tmp/{}.{}".format(os.path.basename(output_path), i)):
                    id, result = line.strip("\n").split("\t")
                    results.append((int(id), result))
            # Restore original line order before writing the merged file.
            results.sort()
            with open(output_path, "w") as fout:
                for _, result in results:
                    fout.write(result + "\n")
def tensor_mean_and_var(t: Tensor) -> Tuple[Tensor, Tensor]:
    """Return the mean and variance of ``t`` along dim 0, averaged across
    Horovod workers.

    The mean is all-reduced first so the variance is computed around the
    global mean rather than each worker's local one.
    """
    global_mean = hvd.allreduce_(t.mean(dim=0))
    global_var = hvd.allreduce_((t - global_mean).pow(2).mean(dim=0))
    return global_mean, global_var
def get_eigen(model, inputs, targets, criterion, maxIter=50, tol=1e-3, comm=True):
    """
    Compute the top eigenvalue of the model's Hessian (and the corresponding
    eigenvector) by power iteration on Hessian-vector products.

    Change the model to evaluation mode first, otherwise the batch
    normalization layers will change. If you call this function during
    training, remember to change the mode back to training mode
    (model.train()) afterwards.

    Args:
        model, inputs, targets, criterion: network, one batch, and loss.
        maxIter: maximum number of power iterations.
        tol: convergence tolerance on the eigenvalue estimate.
        comm: if True, all-reduce the Hv products every iteration; if False,
            iterate locally and all-reduce only the final eigenvalue.

    Returns:
        (eigenvalue, eigenvector) — indentation of the convergence branch
        reconstructed from collapsed source; TODO confirm against original.
    """
    model.eval()
    # torch.no_grad()
    #model_copy = squeezenet1_1(pretrained=False)
    #model_copy.load_state_dict(model.state_dict())
    #optimizer = optim.SGD(model_copy.parameters(), lr=0.001 * hvd.size(), momentum=0.9)
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    # create_graph=True keeps the graph so Hessian-vector products can be
    # taken against these gradients.
    loss.backward(create_graph=True)
    params, gradsH = get_params_grad(model)
    # Random start vector, normalized; broadcast so all ranks iterate on the
    # same vector when communicating.
    v = [torch.randn(p.size()) for p in params]
    v = normalization(v)
    if comm:
        hvd.broadcast_parameters(v, root_rank=0)
    eigenvalue = None
    for i in range(maxIter):
        print(i)
        model.zero_grad()
        Hv = hessian_vector_product(gradsH, params, v)
        if comm:
            # Asynchronously all-reduce each component of Hv, then wait.
            # NOTE(review): the inner loop variable `i` shadows the outer
            # power-iteration counter.
            handles = []
            for i in range(len(Hv)):
                handles.append(
                    hvd.allreduce_async_(
                        Hv[i], name='reduce random vector update {}'.format(i)))
            for handle in handles:
                hvd.synchronize(handle)
        # Rayleigh-quotient estimate of the eigenvalue, then renormalize.
        eigenvalue_tmp = group_product(Hv, v).item()
        v = normalization(Hv)
        # NOTE(review): `eigenvalue == None` should idiomatically be
        # `eigenvalue is None`.
        if eigenvalue == None:
            eigenvalue = eigenvalue_tmp
        else:
            if abs(eigenvalue - eigenvalue_tmp) < tol:
                # Converged; with communication on, every rank already
                # agrees, so return directly.
                if comm:
                    return eigenvalue_tmp, v
            else:
                eigenvalue = eigenvalue_tmp
    if not comm:
        # Without per-iteration communication, average the locally converged
        # eigenvalues across ranks once at the end.
        print("{} is here".format(hvd.rank()))
        eigenvalue = torch.FloatTensor([eigenvalue])
        hvd.allreduce_(eigenvalue, name='eigenvalue')
        print("allreduced eigs for rank {}".format(hvd.rank()))
        eigenvalue = float(eigenvalue)
        if hvd.rank() == 0:
            print("No Communication eigenvalue approximated at {}".format(
                eigenvalue))
    return eigenvalue, v