Exemplo n.º 1
0
    def _handle_writelist(self, writelist):
        for write_socket in writelist:
            removal = []
            with self.socket_to_request_lock:
                #request_list = self.socket_to_request[write_socket]
                try:
                    request_list = self.socket_to_request[write_socket]
                except Exception as e:
                    #Logger().debug("rank:%i trying to find %s on socket_to_request:%s" % (self.rank, write_socket, self.socket_to_request ) )
                    raise e
            for request in request_list:
                if request.status == "cancelled":
                    removal.append(request)
                elif request.status == "new":
                    # Send the data on the socket
                    try:
                        if request.multi:
                            utils.robust_send(write_socket, request.header)
                            utils.robust_send_multi(write_socket, request.data)
                        else:
                            utils.robust_send_multi(
                                write_socket, [request.header] + request.data)
                    except socket.error, e:
                        Logger().error("got:%s for socket:%s with data:%s" %
                                       (e, write_socket, request.data))
                        # TODO: Make sure we really want to continue here, instead of reacting
                        # Send went wrong, do not update, but hope for better luck next time
                        continue
                        #raise e
                    except Exception, e:
                        Logger().error(
                            "Other exception got:%s for socket:%s with header:%s payload:%s"
                            % (e, write_socket, request.header, request.data))
                        # Send went wrong, do not update, but hope for better luck next time
                        raise e

                    removal.append(request)

                    if request.acknowledge:
                        request.update(
                            "unacked"
                        )  # update status to wait for acknowledgement
                    else:
                        request.update(
                            "ready"
                        )  # update status and signal anyone waiting on this request
                else:
                    pass
Exemplo n.º 2
0
    def run(self, *args, **kwargs):
        """
        Send the command to the remote rank and, if requested, wait for a
        reply.

        Nothing is returned; the outcome is published on the instance:

            * ``self.data``  : the unpickled reply, or a cancellation
                               notice when ``self.do_run`` is false.
            * ``self.error`` : a description of the failure, if any.

        ``self.finished`` is set on every exit path, after the result
        attributes have been written.
        """
        if not self.do_run:
            # Publish the result before signalling, so a waiter never sees
            # the event set without the data.
            self.data = "Cancelled due to availablity check on the remote host"
            self.finished.set()
            return

        connection = None
        try:
            # create_connection() raises on failure (including a connect
            # timeout); it never returns a false value.
            connection = socket.create_connection((self.hostname, self.portno),
                                                  4.0)

            # Send the message
            header, payloads = utils.prepare_message(
                (self.security_component, self.send_data),
                -1,
                cmd=self.cmd_id,
                comm_id=-1)
            utils.robust_send_multi(connection, [header] + payloads)

            # Test if we should also receive a message back from the rank. If so, we wait
            # for that message for a specific timeout. If we haven't received the message
            # by then, the command was not a success. Otherwise it was.
            if self.pong:
                wait = self.timeout or 5
                incomming, _, _ = select.select([connection], [],
                                                [connection], wait)
                if incomming:
                    # Only the payload of the raw message is used.
                    _, _, _, _, _, _, data = get_raw_message(incomming[0])
                    # NOTE(review): this unpickles bytes read from the
                    # network; confirm the peer is trusted.
                    self.data = pickle.loads(data)
                else:
                    # Report the timeout that was actually used.
                    self.error = "Connection timeout (%s seconds)" % wait
        except Exception as e:
            self.error = "Error in connecting to rank %d: %s" % (self.rank,
                                                                 str(e))
        finally:
            if connection is not None:
                connection.close()
            # Signal completion for the normal and the failing path too;
            # previously only the cancelled path did so.
            self.finished.set()
Exemplo n.º 3
0
    def run(self, *args, **kwargs):
        """
        Connect to ``(self.hostname, self.portno)`` and send a
        CMD_MIGRATE_PACK message carrying ``self.data``.

        Up to five connection attempts are made, sleeping 1, 2, 4, 8 and 16
        seconds after successive failures. If no attempt succeeds,
        ``sys.exit(1)`` is called. ``self.event`` is set once the message
        has been sent.
        """
        import time
        from mpi.network import utils as mpi_utils

        # Must be bound before the loop: if every attempt fails, the check
        # below would otherwise raise UnboundLocalError instead of exiting.
        connection = None
        delay = 1
        for _ in range(5):
            try:
                connection = socket.create_connection(
                    (self.hostname, self.portno), 4.0)
                break
            except Exception:
                time.sleep(delay)
                delay *= 2

        if connection is None:
            sys.exit(1)

        try:
            header, payloads = mpi_utils.prepare_message(
                self.data, -1, cmd=constants.CMD_MIGRATE_PACK)
            mpi_utils.robust_send_multi(connection, [header] + payloads)
        finally:
            connection.close()

        self.event.set()
Exemplo n.º 4
0
def execute_commands(mpi):
    """
    Run every system command queued in ``mpi.pending_systems_commands``.

    Each queued entry is a ``(cmd, connection, user_data)`` tuple:

        * CMD_ABORT         : calls ``sys.exit(1)``.
        * CMD_PING          : replies "PONG" on the connection.
        * CMD_READ_REGISTER : replies with ``mpi.user_register``.
        * CMD_CONFIG        : replies with the result of
                              ``mpi.set_configuration(user_data)``.
        * CMD_MIGRATE_PACK  : is not handled here; the entry is kept.

    Afterwards ``mpi.pending_systems_commands`` is replaced by the list of
    kept entries. Entries with any other command are dropped. Nothing is
    returned.
    """
    from mpi.network.utils import robust_send_multi, prepare_message

    def reply(conn, payload, sender):
        # Pack `payload` and send it back on `conn`.
        header, payloads = prepare_message(payload, sender)
        robust_send_multi(conn, [header] + payloads)

    deferred = []
    for entry in mpi.pending_systems_commands:
        cmd, connection, user_data = entry

        # We need to access the rank like this. Calling rank() on the
        # communicator will activate this function again. Should we
        # apply some locking?
        rank = mpi.MPI_COMM_WORLD.comm_group.rank()

        if cmd == constants.CMD_ABORT:
            sys.exit(1)

        if cmd == constants.CMD_PING:
            reply(connection, "PONG", rank)
            continue

        if cmd == constants.CMD_READ_REGISTER:
            # Send our registers. We just send everything and let the
            # client filter.
            reply(connection, mpi.user_register, rank)
            continue

        if cmd == constants.CMD_MIGRATE_PACK:
            # Listed so readers know it is not missing; this command is
            # handled in a different way.
            deferred.append(entry)
            continue

        if cmd == constants.CMD_CONFIG:
            # Apply the configuration and send the result back.
            reply(connection, mpi.set_configuration(user_data), rank)

    mpi.pending_systems_commands = deferred
Exemplo n.º 5
0
    def _handshake(self, mpirun_hostname, mpirun_port, internal_rank):
        """
        Exchange connection details with mpirun and fill ``self.all_procs``.

        We first send the data for our own process to mpirun, then receive
        the details for all processes.

        The data sent to mpirun is a tuple containing the following
        elements:

            * hostname             : Our hostname
            * port                 : Our port number. Together with
                                     hostname, this makes it possible to
                                     connect with this instance.
            * unix_socket_filename : Our unix socket file name.
            * rank                 : ``internal_rank`` as passed in.
            * security_component   : The value returned by
                                     ``self.mpi.generate_security_component()``.
            * availability         : The value returned by
                                     ``syscommands.availablity()``.

        The reply is unpacked as a two-element tuple:

            * all_procs : An iterable of
                          ``(host, port, unix_socket_filename, global_rank)``.
            * state     : When true, it is stored in
                          ``self.mpi.resume_state`` and ``self.mpi.resume``
                          is set to True.

        ``self.all_procs`` maps each global rank to a dict with the keys
        'connection_info', 'connection_type' and 'global_rank'. A rank on
        our own host uses its unix socket ("local") when
        ``self.options.unixsockets`` is set; every other rank uses
        ``(host, port)`` ("tcp").
        """
        from mpi import dill

        sec_comp = self.mpi.generate_security_component()
        avail = syscommands.availablity()

        # Packing the data
        data = (self.hostname, self.port, self.unix_socket_filename,
                internal_rank, sec_comp, avail)

        # Connection to the mpirun process. The socket is closed in
        # `finally`, so a failed connect/send/receive does not leak it.
        s_conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            s_conn.connect((mpirun_hostname, mpirun_port))

            # Pack the data with our special format
            header, payloads = prepare_message(data,
                                               internal_rank,
                                               comm_id=-1,
                                               tag=constants.TAG_INITIALIZING)
            utils.robust_send_multi(s_conn, [header] + payloads)

            # Receiving data about the communicator; only the payload of
            # the raw message is used.
            _, _, _, _, _, _, data = get_raw_message(s_conn)
        finally:
            s_conn.close()

        # NOTE(review): this deserializes bytes received over the network;
        # confirm the peer is trusted.
        all_procs, state = dill.loads(data)

        if state:
            self.mpi.resume = True
            self.mpi.resume_state = state

        self.all_procs = {}

        for (host, port, unx_filename, global_rank) in all_procs:

            # Check if this rank lives on the same host as we do. If so use the
            # unix socket instead of the TCP information.
            if host == self.hostname and self.options.unixsockets:
                connection_info = unx_filename
                connection_type = "local"
            else:
                connection_info = (host, port)
                connection_type = "tcp"
            self.all_procs[global_rank] = {
                'connection_info': connection_info,
                'connection_type': connection_type,
                'global_rank': global_rank
            }
Exemplo n.º 6
0
    def close_all_connections(self):
        """
        Exchange CMD_CONN_CLOSE with the peer on every pooled socket.

        Every socket in ``self.pool.sockets`` except the one registered
        for rank -1 gets a CMD_CONN_CLOSE message from us. Each is then
        read until a CMD_CONN_CLOSE arrives from the other side. The loop
        stops when both lists are empty or after five failed reads.

        NOTE(review): a select() timeout, or sockets that only show up in
        the error list, do not count against the error budget, so the loop
        keeps polling while peers stay silent.
        """
        from mpi.network import utils as mpi_utils
        import select

        rank = self.mpi.MPI_COMM_WORLD.comm_group.rank()

        # Find all connections in the socket pool.
        write_connections = list(self.pool.sockets)
        read_connections = list(write_connections)

        # Try to find a socket to -1 (the admin). We don't want to close that one
        with self.pool.sockets_lock:
            admin_conn = self.pool._get_socket_for_rank(-1)

        for connections in (write_connections, read_connections):
            if admin_conn in connections:
                connections.remove(admin_conn)

        errors_left = 5

        while (write_connections or read_connections) and errors_left > 0:
            all_connections = write_connections + read_connections
            rlist, wlist, err_list = select.select(read_connections,
                                                   write_connections,
                                                   all_connections, 10)

            if err_list:
                Logger().warning(
                    "Received an error list with %d elements: %s" %
                    (len(err_list), err_list))

            # Handle the writes.
            for wsocket in wlist:
                header, payloads = mpi_utils.prepare_message(
                    "", rank, cmd=constants.CMD_CONN_CLOSE)
                mpi_utils.robust_send_multi(wsocket, [header] + payloads)
                write_connections.remove(wsocket)

            # Handle the reads.
            for rsocket in rlist:
                try:
                    # Only the command is needed. The sender's rank must
                    # not be unpacked into `rank`, or later CONN_CLOSE
                    # messages would be sent with a peer's rank.
                    _, cmd, _, _, _, _, _ = mpi_utils.get_raw_message(
                        rsocket)

                    if cmd == constants.CMD_CONN_CLOSE:
                        read_connections.remove(rsocket)
                    else:
                        # TODO: This message is important. We need to add
                        # it to the MPI environment; for now it is dropped.
                        Logger().info(
                            "received important information while closing the sockets."
                        )
                except Exception as e:
                    errors_left -= 1
                    Logger().warning(
                        "Failed to read from %s while closing connections: %s"
                        % (rsocket, e))
Exemplo n.º 7
0
class MigratePack(object):
    """
    Pack the running MPI environment and send it to the migration script.

    Creating an instance performs the whole migration: ``__init__`` stores
    references to the network layer and then calls ``pack()``, which ends
    with ``sys.exit(0)``.
    """

    def __init__(self, mpi, script_hostinfo):
        """
        Store the environment references and start packing immediately.

        mpi             : the MPI environment; its ``network`` attribute
                          provides ``t_in``, ``t_out`` and ``socket_pool``.
        script_hostinfo : address passed to ``socket.create_connection()``
                          in ``pack()``; the packed state is sent there.
        """
        self.mpi = mpi

        # NOTE(review): the previous comment here described a
        # (function, args, kwargs) tuple for the bypassed function. This
        # value is only used as a socket address in pack().
        self.script_hostinfo = script_hostinfo

        self.network = self.mpi.network
        self.t_in = self.network.t_in
        self.t_out = self.network.t_out
        self.pool = self.network.socket_pool

        # Start migration.
        self.pack()

    def pack(self):
        """
        Shut the network down, serialize the state and send it away.

        Builds ``self.data`` (rank, meta, settings, mpi state and, when it
        can be serialized, the dill session), sends it to
        ``self.script_hostinfo`` and calls ``sys.exit(0)``. A session that
        cannot be serialized is reported on stdout and left out.
        """
        import os
        import tempfile
        from mpi.network.utils import robust_send_multi, prepare_message

        # Find the network threads, as we need direct access to them
        # for extracting state and pause commands.
        self.network_type = "combo" if self.t_in.type == "combo" else "normal"

        self.rank = self.mpi.MPI_COMM_WORLD.comm_group.rank()

        # This dict will be sent to the admin caller, who will
        # serialize it (probably with data from other ranks).
        self.data = {
            'rank': self.rank,
            'meta': {
                'pack_start': datetime.now(),
            },
            'settings': {
                'network_type': self.network_type
            },
        }

        # TODO: Fix this comment - which pause is meant here?
        # Pause the threads. Note that pause.set() might be called two times on
        # one network thread, if we are using the combo version.
        self.mpi.shutdown_event.set()
        self.mpi.has_work_event.set()

        self.network.finalize()

        # Make the MPI thread run out of its main loop. We keep it around so
        # the network threads can add messages to it (there might be some on
        # the network layer while we are shutting down).
        # (A wait on self.mpi.queues_flushed used to follow and is disabled.)
        self.mpi.shutdown_event.set()  # FIXME: why are we calling this again?

        # Make the network send CONN_CLOSE on every socket connection. This way
        # we are sure not to miss messages "on the wire".
        self.close_all_connections()

        # Serialize other data
        self.data['mpi'] = self.mpi.get_state()

        # Remove stuff we can't pickle.
        self.clear_unpickable_objects()

        # Dump the session into a file. mkstemp() hands back an open
        # descriptor; close it right away since dill reopens by name.
        fd, filename = tempfile.mkstemp(prefix="pupy")
        os.close(fd)

        try:
            dill.dump_session(filename=filename)
            # Load the session data into the dict so we can send it.
            # Pickle data is binary, so read it in binary mode.
            with open(filename, "rb") as session_file:
                self.data['session'] = dill.load(session_file)
        except Exception as e:
            print("Cant serialize the current session. Traceback and information follows:")
            print("\t Error: %s" % (e,))
            import __main__
            print("\t Main session: %s" % (__main__.__dict__,))
        finally:
            # The file was only a staging area; removing it is best effort.
            try:
                os.remove(filename)
            except OSError:
                pass

        # Send the data+file on the connection.
        connection = socket.create_connection(self.script_hostinfo, 4.0)
        try:
            header, payloads = prepare_message(dill.dumps(self.data),
                                               self.rank,
                                               is_serialized=True)
            robust_send_multi(connection, [header] + payloads)
        finally:
            connection.close()

        sys.exit(0)