def _handle_writelist(self, writelist):
    for write_socket in writelist:
        removal = []
        with self.socket_to_request_lock:
            request_list = self.socket_to_request[write_socket]

        for request in request_list:
            if request.status == "cancelled":
                removal.append(request)
            elif request.status == "new":
                # Send the data on the socket
                try:
                    if request.multi:
                        utils.robust_send(write_socket, request.header)
                        utils.robust_send_multi(write_socket, request.data)
                    else:
                        utils.robust_send_multi(write_socket, [request.header] + request.data)
                except socket.error as e:
                    Logger().error("got:%s for socket:%s with data:%s" % (e, write_socket, request.data))
                    # TODO: Make sure we really want to continue here, instead of reacting.
                    # Send went wrong, do not update, but hope for better luck next time.
                    continue
                except Exception as e:
                    Logger().error("Other exception got:%s for socket:%s with header:%s payload:%s" % (e, write_socket, request.header, request.data))
                    # Send went wrong in an unexpected way; propagate the error.
                    raise e

                removal.append(request)

                if request.acknowledge:
                    request.update("unacked")  # update status to wait for acknowledgement
                else:
                    request.update("ready")  # update status and signal anyone waiting on this request
            # Requests in other states (e.g. "unacked") are left untouched.

        # Remove the cancelled and completed requests from the socket's queue.
        if removal:
            with self.socket_to_request_lock:
                for request in removal:
                    self.socket_to_request[write_socket].remove(request)
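# A hedged sketch (not pupyMPI code) of the request lifecycle that
# _handle_writelist drives: update("ready") must wake anyone blocked on the
# request, while "unacked" keeps them waiting for an acknowledgement. The
# Request internals below are assumptions for illustration only.
import threading

class SketchRequest(object):
    def __init__(self, acknowledge=False):
        self.status = "new"
        self.acknowledge = acknowledge
        self._completed = threading.Event()

    def update(self, status):
        self.status = status
        # "ready" and "cancelled" are terminal states; release any waiters.
        if status in ("ready", "cancelled"):
            self._completed.set()

    def wait(self):
        # Called by user code; blocks until the send has fully finished.
        self._completed.wait()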
def run(self, *args, **kwargs):
    if not self.do_run:
        self.finished.set()
        self.data = "Cancelled due to availability check on the remote host"
        return

    try:
        # Note: create_connection() raises on failure rather than returning a
        # falsy value, so connect errors are handled by the except clause below.
        connection = socket.create_connection((self.hostname, self.portno), 4.0)
        if not connection:
            self.error = "Could not connect to rank %d. We received a connect timeout" % self.rank
        else:
            # Send the message
            header, payloads = utils.prepare_message((self.security_component, self.send_data), -1, cmd=self.cmd_id, comm_id=-1)
            utils.robust_send_multi(connection, [header] + payloads)

            # Test if we should also receive a message back from the rank. If so,
            # we wait for that message for a specific timeout. If we haven't
            # received the message by then, the command was not a success.
            # Otherwise it was.
            if self.pong:
                incoming, _, errors = select.select([connection], [], [connection], self.timeout or 5)
                if incoming:
                    # We read the message from the connection and set that as
                    # the result data.
                    rank, cmd, tag, ack, comm_id, _, data = get_raw_message(incoming[0])
                    data = pickle.loads(data)
                    self.data = data
                else:
                    self.error = "Connection timeout (%s seconds)" % (self.timeout or 5)
    except Exception as e:
        self.error = "Error in connecting to rank %d: %s" % (self.rank, str(e))
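# The pong branch above is an instance of the wait-for-reply pattern:
# select() with a timeout decides whether the remote rank answered in time.
# A minimal, generic sketch (the names are illustrative, not pupyMPI API):
import select

def reply_ready(conn, timeout=5):
    readable, _, errored = select.select([conn], [], [conn], timeout)
    if errored:
        raise IOError("connection entered an error state")
    return bool(readable)  # True iff a reply can be read without blocking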
def run(self, *args, **kwargs):
    import time
    timeout = 1
    errors = 0
    connection = None  # stays None if all five attempts fail

    while errors < 5:
        try:
            connection = socket.create_connection((self.hostname, self.portno), 4.0)
            break
        except Exception:
            # Back off exponentially before the next connection attempt.
            time.sleep(timeout)
            timeout *= 2
            errors += 1

    if not connection:
        sys.exit(1)

    from mpi.network import utils as mpi_utils
    header, payloads = mpi_utils.prepare_message(self.data, -1, cmd=constants.CMD_MIGRATE_PACK)
    mpi_utils.robust_send_multi(connection, [header] + payloads)
    self.event.set()
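# The loop above is connect-with-exponential-backoff: waits of 1, 2, 4, 8 and
# 16 seconds between the five attempts. A self-contained sketch of the same
# idea (the function name and defaults are assumptions, not pupyMPI API):
import socket
import time

def connect_with_backoff(address, attempts=5, first_delay=1, conn_timeout=4.0):
    delay = first_delay
    for attempt in range(attempts):
        try:
            return socket.create_connection(address, conn_timeout)
        except socket.error:
            if attempt == attempts - 1:
                raise  # give up after the final attempt
            time.sleep(delay)
            delay *= 2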
def execute_commands(mpi):
    """
    Execute the actual system commands pending on the MPI instance. Commands
    that must be handled elsewhere (currently CMD_MIGRATE_PACK) are left on
    the pending list.
    """
    from mpi.network.utils import robust_send_multi, prepare_message
    rest_list = []
    for obj in mpi.pending_systems_commands:
        cmd, connection, user_data = obj
        # We need to access the rank like this, through comm_group. Calling
        # rank() on the communicator would activate this function again.
        # Should we apply some locking?
        rank = mpi.MPI_COMM_WORLD.comm_group.rank()

        # Handle the message in a big if-statement. When / if the number of
        # commands escalates, we should consider moving them away (see the
        # dispatch-table sketch below).
        if cmd == constants.CMD_ABORT:
            sys.exit(1)

        elif cmd == constants.CMD_PING:
            header, payloads = prepare_message("PONG", rank)
            robust_send_multi(connection, [header] + payloads)

        elif cmd == constants.CMD_READ_REGISTER:
            # Send our registers. We just send everything and let the
            # client filter.
            header, payloads = prepare_message(mpi.user_register, rank)
            robust_send_multi(connection, [header] + payloads)

        elif cmd == constants.CMD_MIGRATE_PACK:
            # This branch is just here so people know it is not missing. The
            # command is handled in a different way, so keep it pending.
            rest_list.append(obj)

        elif cmd == constants.CMD_CONFIG:
            res = mpi.set_configuration(user_data)
            # Send the result back
            header, payloads = prepare_message(res, rank)
            robust_send_multi(connection, [header] + payloads)

    mpi.pending_systems_commands = rest_list
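# Dispatch-table sketch referenced in the comment above: if the command set
# grows, the if-chain could become a dict from command id to handler. The
# handler below is a hypothetical stand-in, assuming the module-level
# constants import used by execute_commands.
def _handle_ping(mpi, connection, user_data, rank):
    from mpi.network.utils import robust_send_multi, prepare_message
    header, payloads = prepare_message("PONG", rank)
    robust_send_multi(connection, [header] + payloads)

COMMAND_HANDLERS = {
    constants.CMD_PING: _handle_ping,
}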
def _handshake(self, mpirun_hostname, mpirun_port, internal_rank):
    """
    This method creates the MPI_COMM_WORLD communicator by receiving
    (hostname, port, rank) for all the processes started by mpirun. For
    mpirun to have this information, we first bind a socket and then send
    the data for our own process.

    The data sent back to mpirun.py is a tuple containing the following
    elements:

        * hostname : Our hostname.
        * port : Our port number. Together with hostname, this information
          makes it possible to connect to this instance.
        * rank : Our rank. The mpirun.py process will distribute this
          information among the other processes.
        * security_component : A SHA1 hash used for "security". Scripts
          communicating with the MPI environment must supply this, as a
          simple way to disallow users other than the one who started the
          job.
        * availability : Information about the availability of each system
          command on this host.

    mpirun.py sends a tuple back containing:

        * all_procs : Prior to release 0.8.0 this was the only data sent
          (and it was not in a tuple).
        * state : The state of the program when the job was packed. This is
          only used when we are resuming a packed job.
    """
    sec_comp = self.mpi.generate_security_component()
    avail = syscommands.availablity()

    # Packing the data
    data = (self.hostname, self.port, self.unix_socket_filename, internal_rank, sec_comp, avail)

    # Connection to the mpirun process
    s_conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    recipient = (mpirun_hostname, mpirun_port)
    s_conn.connect(recipient)

    # Pack the data with our special format
    header, payloads = prepare_message(data, internal_rank, comm_id=-1, tag=constants.TAG_INITIALIZING)
    utils.robust_send_multi(s_conn, [header] + payloads)

    # Receiving data about the communicator, by unpacking the header etc.
    # The first _ is the rank.
    from mpi import dill
    _, _, _, _, _, _, data = get_raw_message(s_conn)
    all_procs, state = dill.loads(data)

    if state:
        self.mpi.resume = True
        self.mpi.resume_state = state

    s_conn.close()

    self.all_procs = {}
    for (host, port, unx_filename, global_rank) in all_procs:
        # Check if this rank lives on the same host as we do. If so, use the
        # unix socket instead of the TCP information.
        if host == self.hostname and self.options.unixsockets:
            connection_info = unx_filename
            connection_type = "local"
        else:
            connection_info = (host, port)
            connection_type = "tcp"

        self.all_procs[global_rank] = {
            'connection_info': connection_info,
            'connection_type': connection_type,
            'global_rank': global_rank,
        }
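# Illustrative example (made-up values, not a real host) of the two tuples
# exchanged in the handshake above; the field order must match what
# mpirun.py packs and unpacks.
example_sent = ("node0", 14001, "/tmp/pupy-node0.sock", 0, "a94a8fe5deadbeef", {})
#               hostname, port, unix_socket_filename, rank,
#               security_component, availability
example_received = ([("node0", 14001, "/tmp/pupy-node0.sock", 0)], None)
#                   all_procs: list of (host, port, unx_filename, global_rank)
#                   state: None unless we are resuming a packed job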
def close_all_connections(self):
    from mpi.network import utils as mpi_utils
    import select

    rank = self.mpi.MPI_COMM_WORLD.comm_group.rank()

    # Find all connections in the socket pool.
    write_connections = [s for s in self.pool.sockets]
    read_connections = [s for s in write_connections]

    # Try to find a socket to -1 (the admin). We don't want to close that one.
    with self.pool.sockets_lock:
        admin_conn = self.pool._get_socket_for_rank(-1)

    try:
        write_connections.remove(admin_conn)
    except ValueError:
        pass

    try:
        read_connections.remove(admin_conn)
    except ValueError:
        pass

    # A list to contain the received objects (not CMD_CONN_CLOSE).
    received_messages = []

    errors_left = 5

    while write_connections or read_connections:
        all_connections = write_connections + read_connections

        rlist, wlist, err_list = select.select(read_connections, write_connections, all_connections, 10)

        if err_list:
            Logger().warning("Received an error list with %d elements: %s" % (len(err_list), err_list))

        # Handle the writes.
        for wsocket in wlist:
            header, payloads = mpi_utils.prepare_message("", rank, cmd=constants.CMD_CONN_CLOSE)
            mpi_utils.robust_send_multi(wsocket, [header] + payloads)
            write_connections.remove(wsocket)

        # Handle the reads.
        for rsocket in rlist:
            try:
                # Use a separate name for the sender's rank so we do not
                # clobber our own rank used in prepare_message above.
                sender_rank, cmd, tag, ack, comm_id, _, data = mpi_utils.get_raw_message(rsocket)
                if cmd == constants.CMD_CONN_CLOSE:
                    read_connections.remove(rsocket)
                else:
                    # This message is important. We need to add it to the MPI
                    # environment.
                    Logger().info("received important information while closing the sockets.")
                    received_messages.append((sender_rank, cmd, tag, ack, comm_id, data))
            except Exception:
                errors_left -= 1

        # Check at loop level so we actually leave the while loop, not just
        # the read loop, once the error budget is spent.
        if errors_left <= 0:
            break
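# Hedged sketch of the drain protocol above, from a single socket's point of
# view: each side sends one CMD_CONN_CLOSE and keeps reading until it sees
# the peer's, so no in-flight message is lost. The callables are assumed
# parameters, not pupyMPI API.
def drain_socket(sock, send_close, recv, CMD_CONN_CLOSE):
    send_close(sock)           # announce that we will write nothing more
    kept = []
    while True:
        cmd, message = recv(sock)
        if cmd == CMD_CONN_CLOSE:
            return kept        # peer is finished too; socket can be closed
        kept.append(message)   # a real message raced the shutdown; keep it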
class MigratePack(object):
    def __init__(self, mpi, script_hostinfo):
        self.mpi = mpi

        # The bypassed function is the one decorated with
        # handle_system_commands. When the environment is
        # unpacked again, this needs to be executed. The format
        # of the variable is a tuple with 3 elements:
        # (function, args, kwargs)
        self.script_hostinfo = script_hostinfo

        self.network = self.mpi.network
        self.t_in = self.network.t_in
        self.t_out = self.network.t_out
        self.pool = self.network.socket_pool

        # Start migration.
        self.pack()

    def pack(self):
        # Find the network threads, as we need direct access to them
        # for extracting state and pause commands.
        if self.t_in.type == "combo":
            self.network_type = "combo"
        else:
            self.network_type = "normal"

        self.rank = self.mpi.MPI_COMM_WORLD.comm_group.rank()

        # This dict will be sent to the admin caller, who will
        # serialize it (probably with data from other ranks).
        self.data = {
            'rank': self.rank,
            'meta': {
                'pack_start': datetime.now(),
            },
            'settings': {
                'network_type': self.network_type,
            },
        }

        # Pause the threads. Note that shutdown_event.set() might be called
        # two times on one network thread, if we are using the combo version.
        self.mpi.shutdown_event.set()
        self.mpi.has_work_event.set()
        self.network.finalize()

        # Make the MPI thread run out of its main loop. We keep it around so
        # the network threads can add messages to it (there might be some on
        # the network layer while we are shutting down).
        self.mpi.shutdown_event.set()  # FIXME: why are we calling this again?
        ###self.mpi.queues_flushed.wait()

        # Make the network send CONN_CLOSE on every socket connection. This
        # way we are sure not to miss messages "on the wire".
        self.close_all_connections()

        # Serialize other data
        self.data['mpi'] = self.mpi.get_state()

        # Remove stuff we can't pickle.
        self.clear_unpickable_objects()

        # Dump the session into a file and embed the raw bytes in the dict,
        # so the receiver can restore the session later.
        import tempfile
        _, filename = tempfile.mkstemp(prefix="pupy")

        try:
            dill.dump_session(filename=filename)
            self.data['session'] = open(filename, "rb").read()
        except Exception as e:
            print "Can't serialize the current session. Traceback and information follows:"
            print "\t Error:", e
            import __main__
            print "\t Main session:", __main__.__dict__

        # Send the data + file on the connection.
        from mpi.network.utils import robust_send_multi, prepare_message
        connection = socket.create_connection(self.script_hostinfo, 4.0)
        header, payloads = prepare_message(dill.dumps(self.data), self.rank, is_serialized=True)
        robust_send_multi(connection, [header] + payloads)
        sys.exit(0)
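# Sketch of the receiving side of the migration blob sent by pack() above:
# dill.loads reverses the dill.dumps call, once the raw payload has been
# read off the connection. The function name is an assumption, not part of
# pupyMPI.
def unpack_migrate_blob(raw_payload):
    from mpi import dill
    state = dill.loads(raw_payload)
    # 'session' holds the raw dump_session bytes; 'mpi' the MPI state dict.
    return state['rank'], state['settings']['network_type'], state['session']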