def test_timeline_neighbor_allreduce(self):
    x = torch.FloatTensor(10, 10).fill_(1).mul_(bf.rank())
    x = bf.neighbor_allreduce(x, name='test_neighbor_allreduce')
    time.sleep(0.1)

    file_name = f"{self.temp_file}{bf.rank()}.json"
    with open(file_name, 'r') as tf:
        timeline_text = tf.read()
        assert 'MPI_NEIGHBOR_ALLREDUCE' in timeline_text, timeline_text
        assert 'ENQUEUE_NEIGHBOR_ALLREDUCE' in timeline_text, timeline_text
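For context, a minimal sketch of the fixture this test assumes: the Bluefog timeline is switched on before bf.init() and written to one JSON file per rank under the self.temp_file prefix. The BLUEFOG_TIMELINE environment variable, the class name, and the setUp shape are assumptions, not taken from the test above.

import os
import tempfile
import unittest

import bluefog.torch as bf


class TimelineTests(unittest.TestCase):
    def setUp(self):
        # Assumed convention: the env var holds a file-name prefix and the
        # library appends "<rank>.json" when it flushes the timeline.
        self.temp_file = os.path.join(tempfile.gettempdir(), "bf_timeline_test_")
        os.environ["BLUEFOG_TIMELINE"] = self.temp_file
        bf.init()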
def diffusion(X, y, w_opt, loss, maxite=2000, alpha=1e-1, **kwargs):
    if loss == 'logistic_regression':
        rho = kwargs.get('rho', 1e-1)
    elif loss == 'linear_regression':
        rho = 0
    else:
        raise NotImplementedError(
            'Task not supported. This example only supports' +
            ' linear_regression and logistic_regression')

    topology = bf.load_topology()
    self_weight, neighbor_weights = topology_util.GetRecvWeights(
        topology, bf.rank())

    w = torch.zeros(n, 1, dtype=torch.double, requires_grad=True)
    phi = w.clone()
    mse = []

    for i in range(maxite):
        # calculate local gradient via pytorch autograd
        loss_step(X, y, w, tensor_name='neighbor.allreduce.local_variable',
                  loss=loss, rho=rho)

        # diffusion
        with torch.no_grad():
            phi = w - alpha * w.grad.data
            w.data = bf.neighbor_allreduce(phi, self_weight=self_weight,
                                           src_weights=neighbor_weights,
                                           name='local variable')
            w.grad.data.zero_()

        # record convergence
        if bf.rank() == 0:
            mse.append(torch.norm(w.data - w_opt.data, p=2))

    return w, mse
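The routine above (and the exact_diffusion routine below) relies on a loss_step helper that is not shown here. A minimal sketch of what it might compute, assuming a squared-error loss for 'linear_regression' and an l2-regularized logistic loss with labels in {-1, +1} for 'logistic_regression'; the callers only consume w.grad, so the helper just runs backward():

import torch

def loss_step(X, y, w, tensor_name, loss, rho):
    """Hypothetical local loss evaluation; populates w.grad via autograd.

    tensor_name is accepted for signature compatibility but unused in
    this sketch.
    """
    if loss == 'linear_regression':
        loss_value = 0.5 * torch.mean((X.mm(w) - y) ** 2)
    elif loss == 'logistic_regression':
        # y is assumed to take values in {-1, +1}; rho is the l2 penalty.
        loss_value = torch.mean(torch.log(1 + torch.exp(-y * X.mm(w)))) \
            + 0.5 * rho * torch.norm(w, p=2) ** 2
    else:
        raise NotImplementedError(loss)
    loss_value.backward()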
# Average consensus either with (dynamic-topology) neighbor_allreduce or,
# in the else branch, with one-sided window operations. The flag name for
# the mode switch is assumed; the snippet only shows the two branches.
if not args.asynchronous_mode:
    dynamic_neighbor_allreduce_gen = topology_util.GetDynamicOnePeerSendRecvRanks(
        bf.load_topology(), bf.rank())
    for ite in range(args.max_iters):
        if args.enable_dynamic_topology:
            send_neighbors, recv_neighbors = next(dynamic_neighbor_allreduce_gen)
            neighbor_weights = {
                r: 1 / (len(recv_neighbors) + 1) for r in recv_neighbors}
            self_weight = 1 / (len(recv_neighbors) + 1)
            x = bf.neighbor_allreduce(x, name='x',
                                      self_weight=self_weight,
                                      neighbor_weights=neighbor_weights,
                                      send_neighbors=send_neighbors,
                                      enable_topo_check=False)
        mse.append(torch.norm(x - x_bar, p=2) / torch.norm(x_bar, p=2))
else:
    outdegree = len(bf.out_neighbor_ranks())
    indegree = len(bf.in_neighbor_ranks())

    if not bf.nccl_built():  # NCCL does not support associated P yet.
        bf.turn_on_win_ops_with_associated_p()

    bf.win_create(x, name="x", zero_init=True)
    for i in range(args.max_iters):
        if args.enable_dynamic_topology:
            num_out_neighbors = len(bf.out_neighbor_ranks())
            sent_neighbor = bf.out_neighbor_ranks()[i % num_out_neighbors]
            dst_weights = {sent_neighbor: 0.5}
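The window-ops loop above breaks off right after dst_weights is built. A hedged continuation sketch of the loop body, assuming bf.win_put accepts a dst_weights map and bf.win_update folds received neighbor values into the local buffer and returns it (both calls exist in Bluefog, but the exact keyword arguments here are assumptions):

            # hedged continuation (keyword arguments are assumptions):
            # push the weighted local value into the chosen neighbor's window,
            bf.win_put(x, name="x", dst_weights=dst_weights)
            # then fold any values received from in-neighbors into x.
            x = bf.win_update(name="x")
            mse.append(torch.norm(x - x_bar, p=2) / torch.norm(x_bar, p=2))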
def exact_diffusion(X, y, w_opt, loss, maxite=2000, alpha=1e-1,
                    use_Abar=True, **kwargs):
    if loss == 'logistic_regression':
        rho = kwargs.get('rho', 1e-1)
    elif loss == 'linear_regression':
        rho = 0
    else:
        raise NotImplementedError(
            'Task not supported. This example only supports' +
            ' linear_regression and logistic_regression')

    topology = bf.load_topology()
    self_weight, neighbor_weights = topology_util.GetRecvWeights(
        topology, bf.rank())

    if bf.rank() == 0:
        print('self weight with A: {}\n'.format(self_weight))
        print('neighbor weights with A:\n')
        for k, v in neighbor_weights.items():
            print(k, v)

    w = torch.zeros(n, 1, dtype=torch.double, requires_grad=True)
    phi, psi, psi_prev = w.clone(), w.clone(), w.clone()
    mse = []

    # construct A_bar = (A + I) / 2
    if use_Abar:
        self_weight = (self_weight + 1) / 2
        for k, v in neighbor_weights.items():
            neighbor_weights[k] = v / 2

    for i in range(maxite):
        # calculate local gradient via pytorch autograd
        loss_step(X, y, w, tensor_name='neighbor.allreduce.local_variable',
                  loss=loss, rho=rho)

        # exact diffusion
        with torch.no_grad():
            psi = w - alpha * w.grad.data
            phi = psi + w.data - psi_prev
            w.data = bf.neighbor_allreduce(phi, self_weight, neighbor_weights,
                                           name='local variable')
            psi_prev = psi.clone()
            w.grad.data.zero_()

        # record convergence
        if bf.rank() == 0:
            mse.append(torch.norm(w.data - w_opt.data, p=2))

    return w, mse
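A hypothetical driver for the two routines above on a synthetic linear-regression problem. It assumes the loss_step sketch (or the example's own helper) is in scope, defines the global n that both routines read, and uses the generating vector as a placeholder reference point w_opt rather than the true network-wide optimum:

import torch
import bluefog.torch as bf

bf.init()

m, n = 500, 10
X = torch.randn(m, n, dtype=torch.double)
w_gen = torch.randn(n, 1, dtype=torch.double)
y = X.mm(w_gen) + 0.01 * torch.randn(m, 1, dtype=torch.double)
w_opt = w_gen  # placeholder reference point for the convergence curve

w_d, mse_d = diffusion(X, y, w_opt, loss='linear_regression',
                       maxite=500, alpha=1e-2)
w_ed, mse_ed = exact_diffusion(X, y, w_opt, loss='linear_regression',
                               maxite=500, alpha=1e-2, use_Abar=True)
if bf.rank() == 0:
    print(f"diffusion deviation:       {mse_d[-1].item():.3e}")
    print(f"exact diffusion deviation: {mse_ed[-1].item():.3e}")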
def benchmark_step():
    global args, data
    for _ in range(args.internal_num_iters):
        bf.neighbor_allreduce(data)
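A minimal timing harness around benchmark_step, in the spirit of a timeit-based benchmark loop; the args.num_iters field and the printed throughput label are assumptions:

import timeit

def run_benchmark():
    # Warm-up pass so one-time setup costs do not pollute the measurement.
    benchmark_step()
    for _ in range(args.num_iters):
        elapsed = timeit.timeit(benchmark_step, number=1)
        ops_per_sec = args.internal_num_iters / elapsed
        if bf.rank() == 0:
            print(f"{elapsed:.4f} s per outer iteration, "
                  f"{ops_per_sec:.1f} neighbor_allreduce calls/s")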