def slave(self):  #{{{
    """Run the slave event loop: serve MPI messages and compute states.

    Registers persistent receives for STATUS and ABORT (both from rank 0)
    and for STATE (from any source), lets ``setupGoalMessageHandler`` and
    ``termination_detection.slave_init`` do their own setup, then loops
    while ``self.running`` is true.  Each iteration polls the control
    requests, polls the computation requests, and performs one
    ``self.mcReachability.compute()`` step.
    """
    #setup prerequests
    #messages.STATUS {{{
    self.status_reqs = array(0, dtype=int)
    self.status_prequest = comm.Recv_init([self.status_reqs, MPI.INT],
                                          source=0,
                                          tag=messages.STATUS)
    self.control_reqlist.append(self.status_prequest)
    self.control_reqhandlers.append(self.status)
    #}}}
    #messages.ABORT {{{
    self.abort_reqs = array(0, dtype=int)
    self.abort_prequest = comm.Recv_init([self.abort_reqs, MPI.INT],
                                         source=0,
                                         tag=messages.ABORT)
    self.control_reqlist.append(self.abort_prequest)
    self.control_reqhandlers.append(self.abort)
    #}}}
    #messages.STATE {{{
    self.state_req = array([0] * 300, dtype='uint8')  #XXX, calculate exact state size
    self.state_prequest = comm.Recv_init([self.state_req, MPI.BYTE],
                                         source=MPI.ANY_SOURCE,
                                         tag=messages.STATE)
    self.computation_reqlist.append(self.state_prequest)
    self.computation_reqhandlers.append(self.state_received)
    #}}}
    self.setupGoalMessageHandler()
    self.termination_detection.slave_init()

    # Combined view used when blocking on "anything at all"; handlers are
    # looked up by the same index as their request.
    self.reqlist = self.control_reqlist + self.computation_reqlist
    self.reqhandlers = self.control_reqhandlers + self.computation_reqhandlers

    #MPI buffer send space
    MPI.Attach_buffer(MPI.Alloc_mem(1024 * 1024))  #1Mb
    MPI.Prequest.Startall(self.reqlist)

    # One formatted string prints the same text under Python 2 and 3.
    print("Slave %s awaiting your command" % (rank,))

    self.running = True
    while self.running:
        #Do control comm, if any
        (i, comm_todo) = MPI.Prequest.Testany(self.control_reqlist)
        if comm_todo:
            self.control_reqhandlers[i]()
            # Re-arm the persistent receive for the next message.
            self.control_reqlist[i].Start()

        #Do computation comm, if any
        (i, comm_todo) = MPI.Prequest.Testany(self.computation_reqlist)
        if comm_todo:
            self.computation_reqhandlers[i]()
            self.computation_reqlist[i].Start()

        #Do a computation
        try:
            self.mcReachability.compute()
        except NoMoreStatesException:
            #No more work...
            self.termination_detection.noMoreWork()
            #... wait for something to arrive
            i = MPI.Prequest.Waitany(self.reqlist)
            self.reqhandlers[i]()
            self.reqlist[i].Start()
        except GoalFoundException:
            #goal state found: tell rank 0 (empty buffered message)
            comm.Bsend([None, MPI.INT], dest=0, tag=messages.GOAL_REACHED)
all_compounds = [] all_com_beta = [] all_com_gap = [] all_com_lumo = [] wave_compounds = [] com_gap = [] com_lumo = [] depth = [] result = [] """ start distributing jobs to all ranks """ #comm = MPI.COMM_WORLD #rank = comm.Get_rank() mem = np.zeros(8192) MPI.Attach_buffer(mem) num_cores = 12 hsm = HashTable(num_cores) q1 = Queue() num_job = 3 * num_cores #print ("check node:",rootnode.state) print("root rank:", hsm.hashing(root)) #if rank==0: #if rank==2: print("f**k the code") if rank == 0: for i in range(num_job): print("root rank:", hsm.hashing(root)) comm.bsend([root, reward, child_char], dest=int(hsm.hashing(root)),
# Smoke test for MPI.Attach_buffer / MPI.Detach_buffer: attach a buffer,
# detach it, re-attach whatever was handed back, detach again, and check
# that the returned buffer still has the requested length.
from mpi4py import MPI

try:
    from numpy import empty
except ImportError:
    from array import array

    def empty(size, dtype):
        """Stand-in for numpy.empty: a zero-filled ``array`` of ``size`` items."""
        zeros = [0] * size
        return array(dtype, zeros)

# --------------------------------------------------------------------

# Payload room plus the per-message bookkeeping MPI needs for Bsend.
BUFSISE = MPI.BSEND_OVERHEAD + 10000

buff = empty(BUFSISE, dtype='b')

# First round trip: attach our buffer and take it back.
MPI.Attach_buffer(buff)
buff2 = MPI.Detach_buffer()

# Second round trip: the object returned by Detach_buffer must itself be
# attachable.
MPI.Attach_buffer(buff2)
MPI.Detach_buffer()

# --------------------------------------------------------------------

assert len(buff2) == BUFSISE

# --------------------------------------------------------------------
def testPersistent(self):
    """Exercise persistent requests for every send mode.

    For each array implementation, typecode, message length ``s`` and
    receive-buffer slack ``xs``, a message of ``s`` items with value ``s``
    is sent around the ring (to ``rank + 1``, from ``rank - 1``) using
    Send_init, Send_init with Startall/Waitany, Ssend_init and Bsend_init,
    and finally to this rank itself using Rsend_init.  After every
    exchange the requests are freed and the receive buffer is checked:
    the first ``s`` items must equal ``s`` and the slack must still be -1.
    """
    comm = self.COMM
    size = comm.Get_size()
    rank = comm.Get_rank()
    dest = (rank + 1) % size
    source = (rank - 1) % size

    def run_pair(first, second):
        # Start both requests, then wait for both, in the given order.
        first.Start()
        second.Start()
        first.Wait()
        second.Wait()

    def free_pair(sendreq, recvreq):
        # A completed persistent request stays valid until it is freed.
        self.assertNotEqual(sendreq, MPI.REQUEST_NULL)
        self.assertNotEqual(recvreq, MPI.REQUEST_NULL)
        sendreq.Free()
        recvreq.Free()
        self.assertEqual(sendreq, MPI.REQUEST_NULL)
        self.assertEqual(recvreq, MPI.REQUEST_NULL)

    def check_payload(rbuf, count):
        # Received items carry the value ``count``; the slack is untouched.
        for value in rbuf[:count]:
            self.assertEqual(value, count)
        for value in rbuf[count:]:
            self.assertEqual(value, -1)

    for array in arrayimpl.ArrayTypes:
        for typecode in arrayimpl.TypeMap:
            for s in range(size):
                for xs in range(3):
                    # -- standard mode, explicit Start/Wait
                    sbuf = array(s, typecode, s)
                    rbuf = array(-1, typecode, s + xs)
                    sendreq = comm.Send_init(sbuf.as_mpi(), dest, 0)
                    recvreq = comm.Recv_init(rbuf.as_mpi(), source, 0)
                    run_pair(sendreq, recvreq)
                    free_pair(sendreq, recvreq)
                    check_payload(rbuf, s)

                    # -- standard mode, Startall + Waitany
                    sbuf = array(s, typecode, s)
                    rbuf = array(-1, typecode, s + xs)
                    sendreq = comm.Send_init(sbuf.as_mpi(), dest, 0)
                    recvreq = comm.Recv_init(rbuf.as_mpi(), source, 0)
                    reqlist = [sendreq, recvreq]
                    MPI.Prequest.Startall(reqlist)
                    index1 = MPI.Prequest.Waitany(reqlist)
                    self.assertTrue(index1 in [0, 1])
                    self.assertNotEqual(reqlist[index1], MPI.REQUEST_NULL)
                    index2 = MPI.Prequest.Waitany(reqlist)
                    self.assertTrue(index2 in [0, 1])
                    self.assertNotEqual(reqlist[index2], MPI.REQUEST_NULL)
                    self.assertTrue(index1 != index2)
                    # Nothing is active any more, so Waitany reports UNDEFINED.
                    index3 = MPI.Prequest.Waitany(reqlist)
                    self.assertEqual(index3, MPI.UNDEFINED)
                    for preq in reqlist:
                        self.assertNotEqual(preq, MPI.REQUEST_NULL)
                        preq.Free()
                        self.assertEqual(preq, MPI.REQUEST_NULL)
                    check_payload(rbuf, s)

                    # -- synchronous mode
                    sbuf = array(s, typecode, s)
                    rbuf = array(-1, typecode, s + xs)
                    sendreq = comm.Ssend_init(sbuf.as_mpi(), dest, 0)
                    recvreq = comm.Recv_init(rbuf.as_mpi(), source, 0)
                    run_pair(sendreq, recvreq)
                    free_pair(sendreq, recvreq)
                    check_payload(rbuf, s)

                    # -- buffered mode
                    mem = array(0, typecode, s + MPI.BSEND_OVERHEAD).as_raw()
                    sbuf = array(s, typecode, s)
                    rbuf = array(-1, typecode, s + xs)
                    MPI.Attach_buffer(mem)
                    try:
                        sendreq = comm.Bsend_init(sbuf.as_mpi(), dest, 0)
                        recvreq = comm.Recv_init(rbuf.as_mpi(), source, 0)
                        run_pair(sendreq, recvreq)
                    finally:
                        # Always detach so a failure here cannot leave a
                        # buffer attached for the next Attach_buffer call.
                        MPI.Detach_buffer()
                    free_pair(sendreq, recvreq)
                    check_payload(rbuf, s)

                    # -- ready mode, sent to ourselves; the receive is
                    #    started before the send.
                    sbuf = array(s, typecode, s)
                    rbuf = array(-1, typecode, s + xs)
                    recvreq = comm.Recv_init(rbuf.as_mpi(), rank, 0)
                    sendreq = comm.Rsend_init(sbuf.as_mpi(), rank, 0)
                    run_pair(recvreq, sendreq)
                    free_pair(sendreq, recvreq)
                    check_payload(rbuf, s)
mpi_thread_safe_info_msg = "Provided MPI implementation (%s) is not thread safe configured, " % ( mpi_vendor_str) mpi_thread_safe_info_msg = mpi_thread_safe_info_msg + "maximum thread safe level supported is: %s" % ( mpi_thread_safe_level_str) mpi_thread_safe_info_msg = mpi_thread_safe_info_msg + "\nNOTE: In most MPI implementations thread-safety " mpi_thread_safe_info_msg = mpi_thread_safe_info_msg + "can be enabled at pre-compile, " mpi_thread_safe_info_msg = mpi_thread_safe_info_msg + "by setting explicit thread-safe configuration options, " mpi_thread_safe_info_msg = mpi_thread_safe_info_msg + "\n e.g. (MPI 1.6.5) --enable-mpi-thread-multiple" # Allocate memory for buffered sends if mpi_initialized and mpi_world_size > 1 and is_mpi_thread_safe: try: mpi_buffer_size_in_mb = 100 __mpi_buffer = __mpi_factory.Alloc_mem(mpi_buffer_size_in_mb * 1024 * 1024) __mpi_factory.Attach_buffer(__mpi_buffer) mpi_buffer_allocated = True except Exception, instance: mpi_buffer_allocated = False mpi_buffer_allocation_error_msg = traceback.format_exc() # Check if MPI is effectively enabled if mpi_initialized and mpi_world_size > 1 and is_mpi_thread_safe and mpi_buffer_allocated: is_mpi_enabled = True else: is_mpi_enabled = False if not mpi_initialized: mpi_error_msg = mpi_initialization_error_msg elif mpi_world_size < 2: mpi_error_msg = "Only 1 MPI process found" elif not is_mpi_thread_safe:
comm.Barrier() comm.Reduce(opt_local_obj, opt_global_obj, op=MPI.SUM, root=0) opt_global_obj /= size #initial state: x0, y0 local_x = np.random.normal(0, 4, (fea_size, )) grad = lr_model.compute_grad(local_x) last_grad = np.zeros((fea_size, )) local_y = grad iter_num = 300 lr = 8e-3 buf = np.empty(((fea_size) * 2 + 3) * out_deg, dtype=np.float) MPI.Attach_buffer(buf) sync = 1 for iter in range(iter_num): #send local x and y to out neighbors for i, outp in enumerate(out_peers): send_buff = np.stack((local_x - lr * local_y, local_y / (out_deg + 1)), axis=1) comm.Bsend([send_buff, MPI.F_FLOAT], dest=outp) #receive local x and y from in neighbors #clear receive buffer and flag x = np.zeros((fea_size, size)) y = np.zeros((fea_size, size)) recv_flag = np.zeros((size, ), dtype=int) if sync == 1:
# attach_detach_buf.py import numpy as np from mpi4py import MPI comm = MPI.COMM_WORLD rank = comm.Get_rank() max_msg_size = 2**10 BUFSISE = 32 * max_msg_size mpi_buf = bytearray(BUFSISE) # Attach a big user-provided buffer for sending in buffered mode MPI.Attach_buffer(mpi_buf) recv_buf = np.empty((max_msg_size,), np.float64) if rank == 0: print('-' * 80) print('With an attached big buffer:') print() msg_size = 1 tag = 0 while msg_size <= max_msg_size: msg = np.random.random((msg_size,)) if rank == 0: print('Trying with size: ', msg_size) comm.Bsend(msg, (rank+1)%2, tag)
def _cycle_batches(loader):
    """Yield items from *loader* forever, re-iterating it when exhausted.

    Stops (so ``next()`` raises StopIteration) if a full pass yields nothing.
    """
    while True:
        produced = False
        for item in loader:
            produced = True
            yield item
        if not produced:
            return


def worker(comm, whole_comm, args):
    """Run one rank of the decentralised MNIST training loop.

    Every rank keeps a flattened parameter vector ``local_x`` and a
    companion vector ``local_y`` initialised from the first gradient.  In
    each iteration it sends ``(local_x - lr * local_y, local_y / (out_deg + 1))``
    to its out-neighbours, sums what it receives from in-neighbours, mixes
    that with its own values, writes the result into the network and
    recomputes the gradient.

    Args:
        comm: communicator connecting the workers.
        whole_comm: communicator whose last rank receives the results.
        args: namespace providing ``no_cuda``, ``seed``, ``batch_size``,
            ``lr``, ``iter_num`` and ``asyn``.

    Side effects:
        Attaches an MPI send buffer, sends a tag-1 message to every other
        worker when done, and sends the (time, loss) series followed by the
        final ``local_x`` to rank ``whole_comm.Get_size() - 1``.
    """
    #distributed setting
    rank = comm.Get_rank()
    size = comm.Get_size()

    #assign out/in neighbors
    out_peers = [(rank + 1 + i) % size for i in range(3)]
    in_peers = [(rank - 1 - i + size) % size for i in range(3)]
    print(rank, out_peers, in_peers)
    in_deg = len(in_peers)
    out_deg = len(out_peers)

    #load data and model
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    # device_count() is 0 on CPU-only hosts; avoid the modulo-by-zero there.
    gpu_count = torch.cuda.device_count()
    gpu_ind = rank % gpu_count if gpu_count > 0 else 0
    print(rank, "gpu", gpu_ind)
    device = torch.device("cuda:" + str(gpu_ind) if use_cuda else "cpu")

    #prepare training dataset
    global_dataset = datasets.MNIST('./data',
                                    train=True,
                                    download=True,
                                    transform=transforms.Compose([
                                        transforms.ToTensor(),
                                        transforms.Normalize((0.1307, ),
                                                             (0.3081, ))
                                    ]))

    # Contiguous shard per rank; the first ``rmd`` ranks get one extra sample.
    data_size = len(global_dataset)
    seg, rmd = divmod(data_size, size)
    if rank < rmd:
        seg += 1
        local_a = rank * seg
    else:
        local_a = rank * seg + rmd
    ind = np.arange(local_a, local_a + seg, dtype=int)
    local_dataset = torch.utils.data.Subset(global_dataset, ind)

    #move training data to gpu
    samples = [local_dataset[k] for k in range(len(local_dataset))]
    data_tensor = torch.stack([data for data, _ in samples]).to(device)
    target_tensor = torch.stack(
        [torch.tensor(label) for _, label in samples]).to(device)
    tensor_local_dataset = torch.utils.data.TensorDataset(
        data_tensor, target_tensor)
    train_loader = torch.utils.data.DataLoader(tensor_local_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True)

    #sample a batch
    batches = _cycle_batches(train_loader)
    batch, target = next(batches)

    #start time
    start_t = time.time()

    #initial state: x0, y0
    Net = Model().to(device)
    Net.train()
    local_x = torch.cat(
        [param.data.view(-1).clone() for param in Net.parameters()]).detach()
    net_size = local_x.size()[0]

    optimizer = optim.SGD(Net.parameters(), lr=args.lr)
    obj = worker_grad(Net, device, batch, target, optimizer)
    grad = torch.cat(
        [param.grad.view(-1).clone() for param in Net.parameters()]).detach()
    local_y = grad.clone().detach()
    last_grad = torch.zeros_like(local_y, device=device, requires_grad=False)

    iter_num = args.iter_num
    lr = args.lr
    decay = 1

    #time series
    t_seq = np.zeros([iter_num], dtype=np.float32)
    loss_seq = np.zeros([iter_num], dtype=np.float32)

    #the number of accumulated received iterates
    acc_recv = 0

    buf = np.empty(10 * out_deg * (net_size * 2 + 10), dtype=np.float64)
    MPI.Attach_buffer(buf)

    #record if an neighbor has exited
    stop_flags = np.zeros([size], dtype=int)

    echo_interval = 100
    asyn = args.asyn
    send_req = []
    recv_deg = out_deg
    buf_size = [2, net_size]
    waiting_time = 5  # seconds to wait for a message in asynchronous mode

    for i in range(iter_num):
        if not asyn:
            comm.Barrier()

        #send local x and y to out neighbors
        send_complete = MPI.Request.Testall(send_req)
        if recv_deg > 0 and send_complete:
            # Every earlier send has completed, so the handles can go.
            send_req = []
            send_x = local_x - lr * local_y
            send_y = local_y / (out_deg + 1)
            # The payload is identical for every peer; build it once.
            send_buf = torch.stack((send_x, send_y), dim=0).cpu().numpy()
            for outp in out_peers:
                send_req.append(
                    comm.Isend([send_buf, MPI.FLOAT], dest=outp, tag=0))

        #receive local x and y from in neighbors
        #clear receive buffer and flag
        recv_flag = np.zeros((size, ), dtype=int)
        buf_x = torch.zeros(local_x.size(), requires_grad=False)
        buf_y = torch.zeros(local_y.size(), requires_grad=False)
        recv_deg = 0

        if asyn:
            s_time = time.time()
            while in_deg > 0 and recv_deg == 0:
                info = MPI.Status()
                while comm.Iprobe(source=MPI.ANY_SOURCE, status=info):
                    recv_rank = info.source
                    recv_tag = info.tag
                    recvbuf = np.zeros(buf_size, dtype=np.float32)
                    comm.Recv([recvbuf, MPI.FLOAT], source=recv_rank)
                    if recv_tag == 1:
                        # Tag 1 is the message a worker sends once its own
                        # loop has finished: drop it from both peer lists.
                        stop_flags[recv_rank] = 1
                        if recv_rank in out_peers:
                            out_peers.remove(recv_rank)
                            out_deg -= 1
                        if recv_rank in in_peers:
                            in_peers.remove(recv_rank)
                            in_deg -= 1
                    else:
                        # NOTE(review): the counters are assumed to belong
                        # to this data-message branch only -- confirm.
                        buf_x += torch.from_numpy(recvbuf[0, :])
                        buf_y += torch.from_numpy(recvbuf[1, :])
                        recv_flag[recv_rank] += 1
                        recv_deg += 1
                        acc_recv += 1
                    info = MPI.Status()
                    if recv_deg > in_deg:
                        break
                if time.time() - s_time > waiting_time:
                    break
        else:
            for inp in in_peers:
                recvbuf = np.zeros(buf_size, dtype=np.float32)
                comm.Recv([recvbuf, MPI.FLOAT], source=inp)
                buf_x += torch.from_numpy(recvbuf[0, :])
                buf_y += torch.from_numpy(recvbuf[1, :])
                recv_deg += 1
                recv_flag[inp] += 1

        buf_x = buf_x.to(device)
        buf_y = buf_y.to(device)

        # NOTE(review): the whole update below is assumed to be guarded by
        # ``recv_deg > 0`` (no update when nothing was received) -- confirm
        # against the intended algorithm.
        if recv_deg > 0:
            # local update
            local_x -= lr * local_y
            local_y /= (out_deg + 1)

            #average consensus and update local x
            buf_x += local_x
            local_x = buf_x / (recv_deg + 1)
            buf_y += local_y

            #update net parameters
            assign_array_to_net(local_x, Net)

            #compute gradient and update local y
            last_grad.copy_(grad)
            batch, target = next(batches)
            obj = worker_grad(Net, device, batch, target, optimizer)
            grad = torch.cat([
                param.grad.view(-1).clone() for param in Net.parameters()
            ]).detach()
            local_y = buf_y + grad - last_grad
            lr *= decay

        t_seq[i] = time.time() - start_t
        loss_seq[i] = obj
        if i % echo_interval == 0:
            print('rank', rank, 'iter: ', i, 'time', t_seq[i], 'local obj',
                  obj)

    # Tell every other worker that this rank has finished (tag 1).
    stop_msg = np.zeros([1])
    for node in range(size):
        if node != rank:
            comm.Send(stop_msg, dest=node, tag=1)

    # NOTE: iterates still in flight from neighbours are not drained here.

    collector = whole_comm.Get_size() - 1
    whole_comm.send(np.stack((t_seq, loss_seq), axis=1).tolist(),
                    dest=collector)
    whole_comm.Send([local_x.cpu().numpy(), MPI.FLOAT], dest=collector)