def worker():
    """ Initialize the distributed environment and benchmark send/recv throughput. """
    import os
    import time

    import torch
    import torch.distributed as dist

    print("Initializing distributed pytorch")
    os.environ['MASTER_ADDR'] = str(args.master_addr)
    os.environ['MASTER_PORT'] = str(args.master_port)
    dist.init_process_group(args.backend, rank=args.rank, world_size=args.size)

    for i in range(100):
        # 250 * 1000 float32 elements (4 bytes each) per MB
        tensor = torch.ones(args.data_size_mb * 250 * 1000) * (args.rank + 1)
        # print('before: rank ', args.rank, ' has data ', tensor[0])

        start_time = time.perf_counter()
        if args.rank == 0:
            dist.send(tensor=tensor, dst=1)
        else:
            dist.recv(tensor=tensor, src=0)
        elapsed_time = time.perf_counter() - start_time

        # print('after: rank ', args.rank, ' has data ', tensor[0])
        rate = args.data_size_mb / elapsed_time
        print("Process %d transferred %d MB in %.1f ms (%.1f MB/sec)" %
              (args.rank, args.data_size_mb, elapsed_time * 1000, rate))
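# A minimal, hypothetical launcher for the benchmark above, assuming `args`
# comes from argparse at module scope with the flags the function reads
# (master_addr, master_port, backend, rank, size, data_size_mb). A sketch,
# not the original script's CLI.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--master_addr', default='127.0.0.1')
parser.add_argument('--master_port', default=6000, type=int)
parser.add_argument('--backend', default='gloo')
parser.add_argument('--rank', default=0, type=int)
parser.add_argument('--size', default=2, type=int)
parser.add_argument('--data_size_mb', default=100, type=int)
args = parser.parse_args()

if __name__ == "__main__":
    worker()  # run once per process, e.g. `--rank 0` and `--rank 1`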
def bp_send_proc(rank, bs, subbs, wid, wn, wrank, nproc, gn, gsz, bp_head_list, shared_cnters):
    #world_sz = nproc * wn * 4 #+1
    # fp_send:0; fp_recv:1; bp_send:2; bp_recv:3
    comm_rank = wid * nproc * 4 + rank * 4 + 2
    iter_thresh = int(bs / subbs)
    #init_processes(comm_rank, world_sz)
    init_processes(comm_rank, wid, wn, nproc, gn, gsz, backend='gloo')
    print("bp_send_proc comm_rank=", comm_rank)
    #if wid == 0:
    if wrank == 0:
        shared_cnters[2] = 0
        return
    local_bp_sent_counter = 0
    dst_rank = (wid - 1) * nproc * 4 + rank * 4 + 3
    while True:
        if local_bp_sent_counter < shared_cnters[2]:
            dist.send(tensor=bp_head_list[local_bp_sent_counter], dst=dst_rank)
            #print("bp send ", bp_head_list[local_bp_sent_counter].numel())
            local_bp_sent_counter += 1
        else:
            time.sleep(0.001)
        if local_bp_sent_counter == iter_thresh:
            local_bp_sent_counter = 0
            shared_cnters[2].zero_()
def transfer4backend1(tag, send_buf, flag=False):
    left, right = get_left_right(tag)
    # Send the shape first so the receiver can allocate a matching buffer.
    dist.send(tensor=torch.ShortTensor([send_buf.size()]).view(-1), dst=right)
    send_opt = dist.isend(tensor=send_buf, dst=right)
    if not flag:
        send_opt.wait()
        return None
    try:
        shape_buf = torch.zeros([1], dtype=torch.short)
        dist.recv(tensor=shape_buf, src=left)
        recv_buf = torch.zeros(torch.Size(shape_buf.tolist()))
        dist.recv(tensor=recv_buf, src=left)
    except RuntimeError:
        print("runtime error")
        return None
    send_opt.wait()
    return recv_buf
def send_tensor_helper(dist, tensor, dst_rank, group, tag, num_iterations,
                       intra_server_broadcast):
    for i in range(num_iterations):
        if intra_server_broadcast:
            dist.broadcast(tensor=tensor, group=group, src=1 - dst_rank)
        else:
            dist.send(tensor=tensor, dst=dst_rank, tag=tag)
def get(
    self,
    key: str,
    dst: Optional[torch.Tensor] = None,
    shared: bool = False,
) -> Optional[torch.Tensor]:
    """Get a tensor from the server."""
    cmd_rpc = torch.tensor(
        [GET_CMD, len(key), dst is None, 0, 0, 0], dtype=torch.long)
    td.send(cmd_rpc, self.server_rank)
    td.send(_fromstring(key), self.server_rank)
    if dst is None:
        meta = torch.full((2,), -1, dtype=torch.long)
        td.recv(meta, src=self.server_rank)
        ndim, ttype = meta
        if ndim.item() == -1:
            return None
        size = torch.full((ndim.item(),), -1, dtype=torch.long)
        td.recv(size, src=self.server_rank)
        tensor_type = _tensor_types[ttype.item()]
        if shared:
            dst_storage = tensor_type().storage_type()._new_shared(size.prod())
            dst = tensor_type(dst_storage).view(*size.tolist())
        else:
            dst = tensor_type(*size.tolist())
    td.recv(dst, src=self.server_rank)
    return dst
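# For orientation: a server replying to the GET above has to mirror the
# client's receive order (a [ndim, dtype-code] metadata pair, then the size
# vector, then the payload; metadata only when the client passed dst=None).
# A hypothetical sketch, not the library's actual handler:
def handle_get_sketch(dst_rank, tensor, ttype_code, client_has_dst):
    import torch
    import torch.distributed as td
    if tensor is None:
        td.send(torch.tensor([-1, -1], dtype=torch.long), dst_rank)  # "not found"
        return
    if not client_has_dst:
        td.send(torch.tensor([tensor.ndim, ttype_code], dtype=torch.long), dst_rank)
        td.send(torch.tensor(tensor.shape, dtype=torch.long), dst_rank)
    td.send(tensor, dst_rank)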
def test_send_recv_any_source(self):
    rank = dist.get_rank()
    tensor = _build_tensor(10, value=rank)
    recv_ranks = set()

    for dst in range(0, dist.get_world_size()):
        if dst == rank:
            # Recv mode: receive one tensor from each of the other ranks.
            for other in range(0, dist.get_world_size()):
                if other == rank:
                    continue

                output_tensor = _build_tensor(10, value=-1)
                sender = dist.recv(output_tensor)

                # Assert the scalar value "sender" that should be
                # equal to the rank of the sender is equal to all
                # values in the received tensor.
                self.assertTrue(output_tensor.eq(sender).all())
                recv_ranks.add(sender)
        else:
            # Send mode
            dist.send(tensor, dst)

    self.assertEqual(len(recv_ranks), dist.get_world_size() - 1)
    self._barrier()
def fp_send_proc(rank, bs, subbs, wid, wn, nproc, fp_tail_list, shared_cnters):
    world_sz = nproc * wn * 4  #+1
    # fp_send:0; fp_recv:1; bp_send:2; bp_recv:3
    comm_rank = wid * nproc * 4 + rank * 4
    iter_thresh = int(bs / subbs)
    init_processes(comm_rank, world_sz)
    print("fp_send_proc comm_rank=", comm_rank)
    if wid == wn - 1:
        shared_cnters[1] = 4
        return
    local_fp_sent_counter = 0
    dst_rank = (wid + 1) * nproc * 4 + rank * 4 + 1
    while True:
        #print("fp send ", local_fp_sent_counter, " ", shared_cnters[1])
        #fp_tail_tensor
        if local_fp_sent_counter < shared_cnters[1]:
            # is it okay to directly send gpu tensor?
            #print("fp send ", comm_rank, " -> ", dst_rank)
            dist.send(tensor=fp_tail_list[local_fp_sent_counter], dst=dst_rank)
            #print("fp send ", fp_tail_list[local_fp_sent_counter].numel())
            #print("fin fp send ", comm_rank, " -> ", dst_rank)
            local_fp_sent_counter += 1
        else:
            time.sleep(0.001)
        if local_fp_sent_counter == iter_thresh:
            # reset
            local_fp_sent_counter = 0
            shared_cnters[1].zero_()
def send(self, collectiveArgs, dst_rank, retFlag=False, tag=0):
    dist.send(
        tensor=collectiveArgs.ipTensor, dst=dst_rank, group=collectiveArgs.group, tag=tag
    )
def _send(tensor, tensor_name, src_rank, dst_rank, tag, sub_process_group=None):
    """
    Sends tensor by calling PyTorch's send() call.

    If the tensor is not being sent via broadcast(), it is first copied
    to the CPU.
    """
    if sub_process_group is not None:
        assert tensor.is_cuda

        # Send tensor shape.
        tensor_shape = torch.tensor(tensor.shape, dtype=torch.int)
        dist.broadcast(tensor=tensor_shape, src=src_rank,
                       group=sub_process_group)

        # Send tensor.
        contiguous_tensor = tensor.detach().clone()
        dist.broadcast(tensor=contiguous_tensor.contiguous(),
                       src=src_rank, group=sub_process_group)
    else:
        assert tensor.is_cuda
        tensor = tensor.cpu()

        # Send tensor shape.
        tensor_shape = torch.tensor(tensor.shape, dtype=torch.int)
        dist.send(tensor=tensor_shape, dst=dst_rank, tag=tag)

        # Send tensor.
        dist.send(tensor=tensor, dst=dst_rank, tag=tag)
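# The receiving side has to mirror the two-step protocol above: first recv the
# shape tensor, allocate a buffer of that shape, then recv the payload. A
# minimal sketch under those assumptions (the name, the `ndim` parameter, and
# the CUDA copy-back are illustrative, not the original code):
def _recv_sketch(src_rank, tag, ndim, device='cuda'):
    import torch
    import torch.distributed as dist
    # Step 1: shape (the receiver must know the number of dimensions up front).
    tensor_shape = torch.zeros(ndim, dtype=torch.int)
    dist.recv(tensor=tensor_shape, src=src_rank, tag=tag)
    # Step 2: data, into a buffer sized from the received shape.
    tensor = torch.zeros(tuple(tensor_shape.tolist()))
    dist.recv(tensor=tensor, src=src_rank, tag=tag)
    return tensor.to(device)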
def forward(ctx, tensor, dst, group=dist.group.WORLD, tag=0):
    ctx.save_for_backward(tensor)
    ctx.dst = dst
    ctx.group = group
    ctx.tag = tag
    dist.send(tensor, dst, group, tag)
    return tensor.new_tensor([])
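# To make this autograd.Function differentiable end to end, the backward pass
# would recv the gradient from the rank the forward pass sent to. A
# hypothetical counterpart, assuming the peer sends the gradient back with the
# same group/tag (not shown in the original):
def backward_sketch(ctx, grad_output):
    import torch
    import torch.distributed as dist
    tensor, = ctx.saved_tensors
    grad = torch.zeros_like(tensor)
    # Gradient flows back from the rank we sent the activation to.
    dist.recv(grad, ctx.dst, ctx.group, ctx.tag)
    return grad, None, None, None  # no gradients for dst/group/tag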
def send_gradient_to_server(model):
    # send the server the msg saying we're not done yet
    dist.send(torch.zeros(1), 0)
    data = flatten_many_tensors([p.grad for p in model.parameters()],
                                get_total_size(model))
    dist.send(data, 0)
    receive_model_from_server(model)
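# `flatten_many_tensors` and `get_total_size` are project helpers not shown in
# this snippet. A plausible minimal implementation, for illustration only:
import torch

def get_total_size_sketch(model):
    # Total element count across all parameters.
    return sum(p.numel() for p in model.parameters())

def flatten_many_tensors_sketch(tensors, total_size):
    # Pack all tensors into one flat buffer so a single send() suffices.
    flat = torch.zeros(total_size)
    offset = 0
    for t in tensors:
        flat[offset:offset + t.numel()] = t.detach().reshape(-1)
        offset += t.numel()
    return flat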
def backward_rank1(semaphore, start_event, start_event2):
    start_event.wait()
    batch_idx = 0
    while True:
        try:
            #semaphore.release()
            print("before grad recv...")
            grad_recv1 = torch.zeros([args.batch_size, 512, 2, 2], dtype=torch.int8)
            dist.recv(tensor=grad_recv1, src=2)
            print("after grad recv.....")
        except RuntimeError:
            print("backward runtime error")
            send_opt = dist.isend(tensor=torch.zeros(0), dst=0)
            send_opt.wait()
            break

        grad_recv1 = dequantize(grad_recv1.cuda(0).float())
        inputs, outputs = outputs_queue.get(block=False)
        inputs.requires_grad_()
        outputs.backward(grad_recv1)

        if batch_idx % args.buffer_size == 0:
            optimizer.step()
            optimizer.zero_grad()

        inputs_grad = quantize(inputs.grad, char=True).cpu()
        print(inputs_grad.size())
        if batch_idx == 0:
            start_event2.set()
        #send_opt = dist.isend(tensor=inputs_grad, dst=0)
        #send_opt.wait()
        dist.send(tensor=inputs_grad, dst=0)
        batch_idx += 1
def send_model_to_worker(model, worker_id=None):
    assert worker_id is not None
    # flatten it all so we can send in one go
    data = flatten_many_tensors([p for p in model.parameters()],
                                get_total_size(model))
    # data now contains the entire model flattened
    dist.send(data, worker_id)
def eval(layer, logger, e, save_event, data_size, testloader):
    criterion = nn.CrossEntropyLoss()
    criterion.cuda()
    layer.eval()
    with torch.no_grad():
        if dist.get_rank() == 0:
            for batch_idx, (inputs, targets) in enumerate(testloader):
                print('batch_idx: ' + str(batch_idx))
                inputs = inputs.cuda(0)
                outputs = layer(inputs)
                dist.send(tensor=outputs.cpu(), dst=1)
                print("send.....")
            e.wait()
        elif dist.get_rank() == 1:
            batch_idx = 0
            while data_size > batch_idx:
                print("batch_idx:" + str(batch_idx))
                # different models produce different activation shapes
                rec_val = torch.zeros([100, 256, 4, 4])
                dist.recv(tensor=rec_val, src=0)
                print("after recv....")
                outputs = layer(rec_val.cuda())
                dist.send(tensor=outputs.cpu(), dst=2)
                batch_idx += 1
                print("send...")
            e.wait()
        elif dist.get_rank() == 2:
            test_loss = 0
            correct = 0
            total = 0
            save_event.clear()
            global best_acc
            for batch_idx, (inputs, targets) in enumerate(testloader):
                rec_val = torch.zeros([100, 512, 2, 2])
                dist.recv(tensor=rec_val, src=1)
                outputs = layer(rec_val.cuda(0))
                targets = targets.cuda()
                loss = criterion(outputs, targets)
                test_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()
                progress_bar(
                    batch_idx, data_size, 'Loss: %.3f | Acc: %.3f%% (%d/%d)' %
                    (test_loss / (batch_idx + 1), 100. * correct / total,
                     correct, total))
            logger.error("eval:" + str(test_loss / (batch_idx + 1)))
            acc_str = "eacc: %.3f" % (100. * correct / total,)
            logger.error(acc_str)
            time.sleep(1)
            acc = 100. * correct / total
            if acc > best_acc:
                best_acc = acc
                save_event.set()
            time.sleep(1)
            e.set()
def runServer(model):
    # model = Net()
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    optimizer.zero_grad()
    numberOfTimes = dist.get_world_size() - 1
    # Touch every parameter once so the param.grad buffers exist.
    for param in model.parameters():
        param.sum().backward()
    tag = torch.zeros(1)
    while True:
        optimizer.zero_grad()
        src = dist.recv(tensor=tag)
        # print("Reached ", src)
        if tag[0] == 0:
            # Worker asks for the current parameters.
            for param in model.parameters():
                dist.send(tensor=param.data, dst=src)
        elif tag[0] == -1:
            # Worker is done.
            numberOfTimes -= 1
            if numberOfTimes == 0:
                # print("------------- Breaking ----------------")
                break
        else:
            # Worker pushes gradients; apply them, then send back fresh parameters.
            for param in model.parameters():
                dist.recv(tensor=param.grad.data, src=src)
            optimizer.step()
            optimizer.zero_grad()
            for param in model.parameters():
                dist.send(tensor=param.data, dst=src)
def start(self) -> None:
    join_count = 0
    while True:
        # 1. receive the command
        cmd_buffer = torch.full((6,), -1, dtype=torch.long)
        rank = td.recv(cmd_buffer)
        cmd = cmd_buffer[0].item()

        if cmd == STORE_CMD:
            key = self._recv_key(rank, cmd_buffer[1].item())
            self.handle_store(rank, key, cmd_buffer[2].item(),
                              cmd_buffer[3].item(), cmd_buffer[4].item(),
                              cmd_buffer[5].item())
        elif cmd == GET_CMD:
            key = self._recv_key(rank, cmd_buffer[1].item())
            self.handle_get(rank, key, cmd_buffer[2].item())
        elif cmd == SWAP_CMD:
            key = self._recv_key(rank, cmd_buffer[1].item())
            self.handle_store(rank, key, cmd_buffer[2].item(),
                              cmd_buffer[3].item(), cmd_buffer[4].item(),
                              cmd_buffer[5].item())
            self.handle_get(rank, key, False)
        elif cmd == JOIN_CMD:
            join_count += 1
            if join_count == self.num_clients:
                for r in range(self.num_clients):
                    # after sending the join cmd,
                    # each client waits on this ack to know everyone is done
                    # and it's safe to exit
                    td.send(torch.zeros((1,)), dst=r)
                break
        else:
            raise RuntimeError("Command is unknown value %d from rank %d." %
                               (cmd, rank))
def cumsum(self, dim):
    new_chunk = self.chunk.cumsum(dim)
    if self.byrow and dim == 0:
        # Sequential scan across row-partitioned ranks: each rank folds in the
        # running total received from its predecessor, so its own (updated)
        # last row becomes the offset for the next rank.
        buf = torch.zeros_like(new_chunk[-1, :])
        for i in range(self.size - 1):
            if self.rank == i:
                synchronize()
                dist.send(new_chunk[-1, :], i + 1)
            elif self.rank == i + 1:
                synchronize()
                dist.recv(buf, i)
                new_chunk += buf
        dist.barrier()
    elif not self.byrow and dim == 1:
        buf = torch.zeros_like(new_chunk[:, -1])
        for i in range(self.size - 1):
            if self.rank == i:
                synchronize()
                dist.send(new_chunk[:, -1], i + 1)
            elif self.rank == i + 1:
                synchronize()
                dist.recv(buf, i)
                # buf has shape (rows,); unsqueeze so it broadcasts across columns.
                new_chunk += buf.unsqueeze(1)
        dist.barrier()
    return THDistMat(self.shape, self.sizes, new_chunk, self.byrow)
def set_lr(self, group_name, lr):
    cmd = torch.LongTensor([
        getattr(TORCH_PARAMETER_SERVER_CMDS,
                f"SET_{group_name.upper()}_LR_CMD"), 0
    ])
    dist.send(cmd, dst=self.server_rank)
    self.lr_buffer[0] = lr
    dist.send(self.lr_buffer, dst=self.server_rank)
def get_lr(self, group_name):
    cmd = torch.LongTensor([
        getattr(TORCH_PARAMETER_SERVER_CMDS,
                f"GET_{group_name.upper()}_LR_CMD"), 0
    ])
    dist.send(cmd, dst=self.server_rank)
    dist.recv(self.lr_buffer, src=self.server_rank)
    return self.lr_buffer[0].item()
def _send_model_to_master(self):
    dist.barrier()
    self.conf.logger.log(
        f"Worker-{self.conf.graph.worker_id} (client-{self.conf.graph.client_id}) "
        f"sending the model ({self.arch}) back to Master."
    )
    flatten_model = TensorBuffer(list(self.model.state_dict().values()))
    dist.send(tensor=flatten_model.buffer, dst=0)
    dist.barrier()
def step_optim(self, group_name):
    if group_name == "entity":
        parameter_index = 0
    else:
        parameter_index = 1
    cmd = torch.LongTensor(
        [TORCH_PARAMETER_SERVER_CMDS.STEP_OPTIM_CMD, parameter_index])
    dist.send(cmd, dst=self.server_rank)
def start(self, groups: List["td.ProcessGroup"]) -> None:
    self.groups = ([groups[idx] for idx in self.group_idxs]
                   if self.group_idxs is not None else None)
    join_count = 0
    metadata_pg = self._metadata_pg()
    while True:
        # 1. receive the command
        cmd_buffer = torch.full((6,), -1, dtype=torch.long)
        rank = td.recv(cmd_buffer, group=metadata_pg)
        cmd = cmd_buffer[0].item()

        if cmd == STORE_CMD:
            key = self._recv_key(rank, cmd_buffer[1].item(), group=metadata_pg)
            self.handle_store(
                rank,
                key,
                cmd_buffer[2].item(),
                cmd_buffer[3].item(),
                cmd_buffer[4].item(),
                cmd_buffer[5].item(),
            )
        elif cmd == GET_CMD:
            key = self._recv_key(rank, cmd_buffer[1].item(), group=metadata_pg)
            self.handle_get(rank, key, cmd_buffer[2].item())
        elif cmd == SWAP_CMD:
            assert metadata_pg is None, "Swap is not used for partition servers."
            key = self._recv_key(rank, cmd_buffer[1].item())
            self.handle_store(
                rank,
                key,
                cmd_buffer[2].item(),
                cmd_buffer[3].item(),
                cmd_buffer[4].item(),
                cmd_buffer[5].item(),
            )
            self.handle_get(rank, key, False)
        elif cmd == JOIN_CMD:
            join_count += 1
            logger.info(f"ParameterServer join: join_count= {join_count}")
            if join_count == self.num_clients:
                for r in range(self.num_clients):
                    # after sending the join cmd,
                    # each client waits on this ack to know everyone is done
                    # and it's safe to exit
                    td.send(torch.zeros((1,)), dst=r)
                do_barrier = cmd_buffer[1].item()
                if do_barrier:
                    logger.info("ParameterServer barrier begin")
                    td.barrier(self.groups[0])
                    logger.info("ParameterServer barrier end")
                break
        else:
            raise RuntimeError("Command is unknown value %d from rank %d." %
                               (cmd, rank))
def runWorker(dataset, criterion, group, model):
    torch.manual_seed(1234)
    # model = Net()
    # optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    size = dist.get_world_size()
    rank = dist.get_rank()
    epoch_loss = 0.0
    numberOfSamples = 0
    train_set, bsz = partition_dataset(dataset)
    num_batches = ceil(len(train_set.dataset) / float(bsz))
    # print("started ", rank)
    t0 = time.monotonic()

    # Ask the server (rank 0) for the initial parameters.
    dist.send(tensor=torch.Tensor([0]), dst=0)
    for param in model.parameters():
        dist.recv(tensor=param.data, src=0)
    dist.barrier(group)

    for epoch in range(epochs):
        epoch_loss = 0.0
        numberOfSamples = 0
        for batch_idx, (data, target) in enumerate(train_set):
            numberOfSamples += data.size()[0]
            data, target = Variable(data), Variable(target)
            model.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            epoch_loss += loss.item()
            loss.backward()
            # Push gradients (tagged with our rank), then pull fresh parameters.
            dist.send(tensor=torch.Tensor([rank]), dst=0)
            for param in model.parameters():
                dist.send(tensor=param.grad.data, dst=0)
            for param in model.parameters():
                dist.recv(tensor=param.data, src=0)
            # print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            #     epoch, batch_idx * len(data), len(train_set.dataset),
            #     100. * batch_idx / len(train_set), loss.item()))
        dist.send(tensor=torch.Tensor([0]), dst=0)
        for param in model.parameters():
            dist.recv(tensor=param.data, src=0)
        dist.barrier(group)
        # print('Rank ', dist.get_rank(), ', epoch ', epoch, ': ',
        #       epoch_loss / num_batches)

    # Tell the server we are done.
    dist.send(tensor=torch.Tensor([-1]), dst=0)
    t0 = time.monotonic() - t0
    t0 /= epochs
    # if rank == 1:
    #     print(t0)
    # print('Rank ', dist.get_rank(), ', epoch_loss ', epoch_loss / num_batches,
    #       ', number of samples ', numberOfSamples)
    execTime = torch.Tensor([t0])
    loss_w = torch.Tensor([epoch_loss * numberOfSamples / num_batches])
    numberOfSamples = torch.Tensor([numberOfSamples])
    dist.all_reduce(loss_w, op=dist.reduce_op.SUM, group=group)
    dist.all_reduce(numberOfSamples, op=dist.reduce_op.SUM, group=group)
    dist.all_reduce(execTime, op=dist.reduce_op.SUM, group=group)
    if rank == 1:
        print("\n C4 \n")
        print(loss_w / numberOfSamples, ',', execTime / (size - 1), ' s')
def my_gather(self, rank, size, group, sendbuf, recvbuf, root):
    if rank == root:
        for idx in range(size):
            if idx != rank:
                dist.recv(recvbuf[idx], src=idx, group=group)
            else:
                recvbuf[rank] = sendbuf
    else:
        dist.send(sendbuf, group=group, dst=root)
def get(self, key: str, dst: Optional[torch.Tensor] = None,
        shared: bool = False) -> Optional[torch.Tensor]:
    """Get a tensor from the server."""
    self._validate_get(key, dst=dst, shared=shared)
    cmd_rpc = torch.tensor(
        [GET_CMD, len(key), dst is None, 0, 0, 0], dtype=torch.long)
    metadata_pg = self._metadata_pg()
    td.send(cmd_rpc, self.server_rank, group=metadata_pg)
    td.send(_fromstring(key), self.server_rank, group=metadata_pg)
    if dst is None:
        meta = torch.full((2,), -1, dtype=torch.long)
        td.recv(meta, src=self.server_rank, group=metadata_pg)
        ndim, ttype = meta
        if ndim.item() == -1:
            return None
        size = torch.full((ndim.item(),), -1, dtype=torch.long)
        td.recv(size, src=self.server_rank, group=metadata_pg)
        dtype = _dtypes[ttype.item()]
        if shared:
            dst = allocate_shared_tensor(size.tolist(), dtype=dtype)
        else:
            dst = torch.empty(size.tolist(), dtype=dtype)

    start_t = time.monotonic()
    data_pgs = self._data_pgs()
    if data_pgs is None:
        td.recv(dst, src=self.server_rank)
    else:
        outstanding_work = []
        flattened_dst = dst.flatten()
        flattened_size = flattened_dst.shape[0]
        for idx, (pg, slice_) in enumerate(
                zip(
                    data_pgs,
                    split_almost_equally(flattened_size, num_parts=len(data_pgs)),
                )):
            outstanding_work.append(
                td.irecv(
                    tensor=flattened_dst[slice_],
                    src=self.server_rank,
                    group=pg,
                    tag=idx,
                ))
        for w in outstanding_work:
            w.wait()
    end_t = time.monotonic()

    if self.log_stats:
        stats_size = dst.numel() * dst.element_size()
        stats_time = end_t - start_t
        logger.debug(
            f"Received tensor {key} from server {self.server_rank}: "
            f"{stats_size:,} bytes "
            f"in {stats_time:,g} seconds "
            f"=> {stats_size / stats_time:,.0f} B/s")
    return dst
def sendParameters(network, rank, broadcast=False, workGroup=workGroup):
    global VERBOSE
    if VERBOSE:
        print("Server -> Worker", rank)
    for param in network.parameters():
        if broadcast:
            dist.broadcast(param.data, src=dist.get_rank())
        else:
            dist.send(param.data, dst=rank, tag=0)
def join(self) -> None:
    """All clients should call join at the end, which will allow the server
    to exit.
    """
    cmd_rpc = torch.tensor([JOIN_CMD, 0, 0, 0, 0, 0], dtype=torch.long)
    td.send(cmd_rpc, self.server_rank)
    ack = torch.empty((1,))
    td.recv(ack, src=self.server_rank)
def send_message(self, message_code, payload, dst=0):
    """Sends a message to a destination.

    Concatenates the sender's rank and the message code with the quantized
    payload into a single tensor and sends that as one tensor.
    """
    _LOGGER.info("SENDING MESSAGE: {} RANK: {}".format(
        message_code, dist.get_rank()))
    m_parameter = quantize_tensor(payload, self.quantize_num_bits)
    meta = torch.Tensor([dist.get_rank(), message_code]).to(torch.int16)
    m_parameter = torch.cat((meta, m_parameter))
    dist.send(tensor=m_parameter, dst=dst)
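# The receiver has to peel the two int16 metadata slots off the front before
# dequantizing. A hedged sketch of that unpacking (the function name, the
# `payload_numel` parameter, and the `dequantize_tensor` helper are
# assumptions, not part of the original snippet):
def receive_message_sketch(payload_numel, quantize_num_bits):
    import torch
    import torch.distributed as dist
    m_parameter = torch.zeros(2 + payload_numel, dtype=torch.int16)
    dist.recv(tensor=m_parameter)  # no src: accept from any sender
    sender_rank = int(m_parameter[0])
    message_code = int(m_parameter[1])
    payload = dequantize_tensor(m_parameter[2:], quantize_num_bits)  # assumed helper
    return sender_rank, message_code, payload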
def run(rank, size):
    tensor = torch.zeros(1)
    if rank == 0:
        tensor += 1
        # Send the tensor to every other process
        for i in range(1, size):
            dist.send(tensor=tensor, dst=i)
    else:
        # Receive tensor from process 0
        dist.recv(tensor=tensor, src=0)
    print('Rank ', rank, ' has data ', tensor[0])
def run(rank, size):
    tensor = torch.zeros(1)
    if rank == 0:
        tensor += 1
        # Send the tensor to process 1
        dist.send(tensor=tensor, dst=1)
    else:
        # Receive tensor from process 0
        dist.recv(tensor=tensor, src=0)
    print('Rank ', rank, ' has data ', tensor[0])
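# A way to drive the two-process example above, patterned on the PyTorch
# distributed tutorial; the address and port are placeholder values:
import os
import torch.distributed as dist
import torch.multiprocessing as mp

def init_process(rank, size, fn, backend='gloo'):
    # Rendezvous information shared by both processes.
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size)

if __name__ == "__main__":
    size = 2
    processes = []
    for rank in range(size):
        p = mp.Process(target=init_process, args=(rank, size, run))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()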
def test_send_recv(self):
    rank = dist.get_rank()
    tensor = _build_tensor(rank + 1)
    for dest in range(0, dist.get_world_size()):
        if dest == rank:
            continue
        dist.send(tensor, dest)

    for src in range(0, dist.get_world_size()):
        if src == rank:
            continue
        tensor = _build_tensor(src + 1, value=-1)
        expected_tensor = _build_tensor(src + 1)
        dist.recv(tensor, src)
        self.assertEqual(tensor, expected_tensor)

    self._barrier()
def test_send_recv_any_source(self):
    rank = dist.get_rank()
    tensor = _build_tensor(10, rank)
    for dest in range(0, dist.get_world_size()):
        if dest == rank:
            continue
        dist.send(tensor, dest)

    recv_ranks = set()
    for src in range(0, dist.get_world_size()):
        if src == rank:
            continue
        tensor = _build_tensor(10, value=-1)
        dist.recv(tensor)
        recv_ranks.add(tensor.resize_(1)[0])

    self.assertEqual(len(recv_ranks), dist.get_world_size() - 1)
    self._barrier()
def test_irecv(self):
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    if rank == 0:
        expected_tensors = [_build_tensor(src, -1) for src in range(1, world_size)]
        requests = [
            dist.irecv(expected_tensors[src - 1], src)
            for src in range(1, world_size)
        ]

        for src in range(1, world_size):
            requests[src - 1].wait()
            self.assertTrue(requests[src - 1].is_completed())
            self.assertEqual(expected_tensors[src - 1], _build_tensor(src, 10))
    else:
        tensor = _build_tensor(rank, 10)
        dist.send(tensor, 0)

    self._barrier()
def send(self, var):
    dist.send(tensor=var, dst=self.other)
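# A matching receive-side wrapper, assuming the peer exposes the same
# `self.other` rank attribute (an illustrative counterpart, not from the source):
def recv(self, var):
    dist.recv(tensor=var, src=self.other)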
else:
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            for i in range(0, num_tensors):
                dist.broadcast(tensor, 0)

dist.barrier()

if rank == 0:
    print_header("send from 0 to 1")
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            start = timer()
            for i in range(0, num_tensors):
                dist.send(tensor, 1)
            end = timer()
            print_stats(bytes, num_tensors, end - start)
    print()
elif rank == 1:
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            for i in range(0, num_tensors):
                dist.recv(tensor, 0)

dist.barrier()

if rank == 0:
    print_header("reduce")
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)