def get_request(self, tag, *args, **kwargs):
    """
    Find the first suitable request class for the given tag, queue the
    resulting request and signal the MPI thread.

    The candidate classes are looked up in ``self.cls_mapping[tag]`` and
    tried in order; the first one whose ``accept`` returns a true value
    wins. There is no safety net: if the candidates are non-exhaustive in
    their combined accept pattern, parameters that nobody accepts will not
    produce a request.

    Returns the accepted request object, or None when the tag is unknown or
    no candidate accepted the parameters (a warning is logged in both cases).
    """
    try:
        req_class_list = self.cls_mapping[tag]
    except LookupError:
        Logger().warning(
            "Unable to find collective list in the cls_mapping for tag %s" % tag)
        # Without a candidate list there is nothing to try. Falling through
        # would only fail on the unbound req_class_list below.
        return None

    for req_class in req_class_list:
        obj = req_class.accept(self.communicator,
                               self.communicator.mpi.settings,
                               self.cache, *args, **kwargs)
        if not obj:
            continue

        # Set the tag on the object.
        obj.tag = tag

        # Add the object to the MPI environment and send the start signal.
        with self.mpi.unstarted_collective_requests_lock:
            self.mpi.unstarted_collective_requests.append(obj)

            # Signal
            self.mpi.unstarted_collective_requests_has_work.set()
            self.mpi.has_work_event.set()
        return obj

    # Note: If we define a safety net we could select the first / last class
    # and initialize that.
    Logger().warning(
        "Unable to initialize the collective request for tag %s. I suspect failure from this point" % tag)
    return None
def _version_check(self):
    """
    Check that the required Python version is installed.

    Logs an error and exits the process with status 1 on interpreters older
    than 2.6. Logs a warning on every other version that is not exactly 2.6.
    """
    version = tuple(sys.version_info[:2])
    if version < (2, 6):
        Logger().error("pupyMPI requires Python 2.6 (you may have to kill processes manually)")
        sys.exit(1)
    elif version != (2, 6):
        # Compare by value: identity checks on integers ("is not 6") are not
        # reliable, and looking at the minor number alone misses e.g. 3.6.
        Logger().warn("pupyMPI is only certified to run on Python 2.6")
def _handle_writelist(self, writelist): for write_socket in writelist: removal = [] with self.socket_to_request_lock: #request_list = self.socket_to_request[write_socket] try: request_list = self.socket_to_request[write_socket] except Exception as e: #Logger().debug("rank:%i trying to find %s on socket_to_request:%s" % (self.rank, write_socket, self.socket_to_request ) ) raise e for request in request_list: if request.status == "cancelled": removal.append(request) elif request.status == "new": # Send the data on the socket try: if request.multi: utils.robust_send(write_socket, request.header) utils.robust_send_multi(write_socket, request.data) else: utils.robust_send_multi( write_socket, [request.header] + request.data) except socket.error, e: Logger().error("got:%s for socket:%s with data:%s" % (e, write_socket, request.data)) # TODO: Make sure we really want to continue here, instead of reacting # Send went wrong, do not update, but hope for better luck next time continue #raise e except Exception, e: Logger().error( "Other exception got:%s for socket:%s with header:%s payload:%s" % (e, write_socket, request.header, request.data)) # Send went wrong, do not update, but hope for better luck next time raise e removal.append(request) if request.acknowledge: request.update( "unacked" ) # update status to wait for acknowledgement else: request.update( "ready" ) # update status and signal anyone waiting on this request else: pass
def handle_system_message(self, rank, command, raw_data, connection):
    """
    Handle a system message.

    We define a list of read only commands and all others are considered
    writeable. The raw data contains a security component we need to check
    in the case of a write command.

    The unpickled payload is either the security component alone or a
    ``(security_component, user_data)`` tuple. Accepted commands are
    appended to ``self.pending_systems_commands`` as
    ``(command, connection, user_data)``.

    Returns True when the command was queued, and False when the security
    check failed or the command is unknown.
    """
    read_only = (constants.CMD_PING, constants.CMD_READ_REGISTER)
    commands = (constants.CMD_CONFIG, constants.CMD_ABORT, constants.CMD_PING,
                constants.CMD_MIGRATE_PACK, constants.CMD_READ_REGISTER,
                constants.CMD_CONN_CLOSE)

    # NOTE(review): the payload is unpickled before the security component
    # can be checked. Unpickling data from an untrusted peer can execute
    # arbitrary code - confirm who is able to reach this code path.
    data = utils.pickle.loads(raw_data)
    if isinstance(data, tuple):
        security_component, user_data = data
    else:
        security_component, user_data = data, None

    # Security check. Only enforced for write commands from negative ranks.
    if command not in read_only:
        if security_component != self.get_security_component() and rank < 0:
            Logger().warning("Failed security check in system command. Expected security component was %s but received %s for command %s" % (self.get_security_component(), raw_data, command))
            return False

    # Check we have a system command
    if command not in commands:
        print("Error: Unknown system command")
        return False

    with self.pending_systems_commands_lock:
        self.pending_systems_commands.append((command, connection, user_data))
    return True
def add_out_request(self, request):
    """
    Put a requested out operation (eg. send) on the out list.

    Looks up the connection details of the recipient, fetches a socket for
    it from the socket pool (registering a newly created socket with both
    network threads) and appends the request to that socket's queue.
    """
    # Find the connection details of the recipient process
    recipient = self.network.all_procs[request.global_rank]
    connection_info = recipient['connection_info']
    connection_type = recipient['connection_type']

    # TODO: This call should be extended to allow asking for a persistent connection
    client_socket, newly_created = self.socket_pool.get_socket(
        request.global_rank, connection_info, connection_type)

    # A new connection is added to the socket lists of the respective thread(s)
    if newly_created:
        self.network.t_in.add_in_socket(client_socket)
        self.network.t_out.add_out_socket(client_socket)

    with self.socket_to_request_lock:
        try:
            # The socket is already known, just add another request to its list
            queue = self.network.t_out.socket_to_request[client_socket]
            queue.append(request)
            self.outbound_requests += 1
        except Exception as e:
            # This should not happen
            Logger().error(
                "Network-thread (%s) got error: %s of type: %s, socket_to_request was: %s"
                % (self.type, e, type(e), self.network.t_out.socket_to_request))
def ssh(host, arguments, x_forward, process_io, logdir, rank):
    """
    Process starter using ssh through subprocess. No loadbalancing yet.

    Builds an ssh command line that runs ``arguments`` on ``host`` with a
    PYTHONPATH made of this package's parent directory and the local
    ``sys.path``, starts it, records the process in ``process_list`` and
    returns the ``subprocess.Popen`` object.

    ``process_io`` selects where the output of the remote process goes:
    'none', 'direct' and 'remotefile' attach nothing on the mpirun side,
    'asyncdirect' uses a pipe, and 'localfile' writes to
    ``<logdir>/mpi.rank<rank>.log`` (the open file is recorded in
    ``io_target_list``).

    Raises MPIException for an unsupported ``process_io`` value or when the
    local log file cannot be opened.
    """
    # We join the sys.path here to allow user modifications to PYTHONPATH to
    # take effect remotely
    python_path = os.path.dirname(
        os.path.abspath(__file__)) + "/../" + ":" + ":".join(sys.path)

    # NOTE(review): host and arguments are interpolated into a string that
    # is run with shell=True, so they must come from a trusted source.
    sshexec_str = "ssh %s%s \"PYTHONPATH=%s %s\"" % (
        ("-XY " if x_forward else ""), host, python_path, ' '.join(arguments))

    if process_io in ('none', 'direct', 'remotefile'):
        # Nothing is displayed or written on the mpirun side. If
        # remotefile, a file is created on the remote process machine only.
        target = None
    elif process_io == 'asyncdirect':
        # uses io forwarder and prints to console
        target = subprocess.PIPE
    elif process_io == 'localfile':
        # writes to a file on the mpirun machine only
        try:
            target = open(os.path.join(logdir, "mpi.rank%s.log" % rank), "w")
        except (IOError, OSError, TypeError):
            # Report the directory we were actually given.
            raise MPIException(
                "Local directory not writeable - check that this path exists and is writeable:\n%s" % logdir)
        io_target_list.append(target)
    else:
        raise MPIException("Unsupported I/O type: '%s'" % process_io)

    # Execute the ssh command in a subprocess
    p = subprocess.Popen(sshexec_str, shell=True, stdout=target, stderr=target)
    process_list.append(p)
    return p
def add_accepted_socket(self, socket_connection, global_rank):
    """
    Add a socket connection to the pool, where the connection is the value
    returned from a socket.accept - that is, we are at the receiving end of
    a connection attempt.

    Raises an Exception when the pool is in readonly mode and the rank is
    non-negative. A connection that is already registered for the rank is
    reported as an error and not added again.
    """
    if global_rank >= 0 and self.readonly:
        raise Exception("Can't add accepted socket. We're in readonly mode")

    with self.sockets_lock:
        known_socket = self._get_socket_for_rank(global_rank)

        if known_socket == socket_connection:
            Logger().error("SocketPool.add_accepted_socket: Trying to add a socket_connection that is already in the pool?!")
            return

        # When two procs send to each other simultaneously there can
        # already be a different socket for this rank. The accepted one is
        # added regardless.
        self._add(global_rank, socket_connection, False)
def create_random_socket(min=10000, max=30000):
    """
    A simple helper method for creating a socket, binding it to a random
    free port within the specified (inclusive) range.

    Returns a ``(socket, hostname, port_no)`` tuple. Raises ``socket.error``
    when every port in the range has been tried without success, instead of
    retrying forever.
    """
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

    # Enable TCP_NODELAY (i.e. disable Nagle's algorithm) to improve the
    # performance of sending one-off packets.
    # XXX: If you remove this, remember to do so in socketpool as well.
    sock.setsockopt(socket.SOL_TCP, socket.TCP_NODELAY, 1)

    hostname = socket.gethostname()

    # Ports that could not be bound (in use, no permission or whatever).
    rejected = set()
    candidates = max - min + 1

    while len(rejected) < candidates:
        port_no = random.randint(min, max)
        if port_no in rejected:
            continue
        try:
            sock.bind((hostname, port_no))
        except socket.error:
            rejected.add(port_no)
            continue
        return sock, hostname, port_no

    # Every port in the range failed. Give up rather than spin forever.
    sock.close()
    raise socket.error(
        "Unable to bind to any port in the range %d-%d" % (min, max))
def select_out(self):
    """
    Run select() on the outgoing sockets with a timeout of one second.

    The outgoing sockets are watched for both writability and errors.
    Returns the 3-tuple from ``select.select``. When the call raises, the
    error is logged and None is returned.
    """
    try:
        candidates = self.sockets_out
        outcome = select.select([], candidates, candidates, 1)
    except Exception as e:
        Logger().error(
            "Network-thread (%s) Got exception: %s of type: %s" % (self.type, e, type(e)))
        outcome = None
    return outcome
def close_all_sockets(self):
    """
    Close all sockets in the socketpool.

    A socket that fails to close is logged at debug level and does not stop
    the remaining sockets from being closed.
    """
    for pooled_socket in self.sockets:
        try:
            pooled_socket.close()
        except Exception as e:
            Logger().debug("Got error when closing socket: %s" % e)
def get_communicator_class(socket_poll_method=False):
    """
    Pick the communication handler class for a socket poll method.

    ``socket_poll_method`` may be "epoll", "poll" or "select". A method the
    local ``select`` module does not provide, or one that is not recognised,
    is reported with a warning and replaced by automatic selection, which
    prefers epoll, then poll, then select.
    """
    requested = socket_poll_method

    # A method the select module does not have cannot be honoured.
    if requested and not getattr(select, requested, None):
        Logger().warn(
            "Socket poll method '%s' is not supported on this system - falling back to automatic selection." % requested)
        requested = False

    if requested == "epoll":
        return CommunicationHandlerEpoll
    if requested == "poll":
        return CommunicationHandlerPoll
    if requested == "select":
        return CommunicationHandlerSelect

    if requested:
        Logger().warn(
            "Unknown socket poll method '%s' - falling back to automatic selection." % requested)

    # Automatic selection
    if getattr(select, "epoll", None):
        return CommunicationHandlerEpoll
    if getattr(select, "poll", None):
        return CommunicationHandlerPoll
    return CommunicationHandlerSelect
def _handle_readlist(self, readlist): for read_socket in readlist: add_to_pool = False if read_socket in (self.main_receive_socket, self.unix_socket): try: # _ is sender_address (conn, _) = read_socket.accept() self.network.t_in.add_in_socket(conn) self.network.t_out.add_out_socket(conn) add_to_pool = True except socket.error, e: # We try to accept but if accept fails maybe it just data coming in? Logger().error( "accept() threw: %s on the main recv socket:%s" % (e, read_socket)) continue except Exception, e: Logger().error( "_handle_readlist: Unknown error. Error was: %s" % e) continue
def accept_msg(self, rank, raw_data, msg_type):
    """
    Offer an incoming message to this request.

    In the "up" phase a message is taken only from a child we are still
    waiting for; once all children have delivered, our own data is added,
    the data is reduced (for a partial reduce) and sent to the parent. In
    the "down" phase a message is taken only from the parent and passed on
    to the children.

    Returns True when the message was consumed and False otherwise.
    """
    # Do not do anything if the request is completed.
    if self._finished.is_set():
        return False

    # Deserialize data
    data = utils.deserialize_message(raw_data, msg_type)

    if self.phase == "up":
        if rank not in self.missing_children:
            return False

        # This child has now delivered.
        self.missing_children.remove(rank)

        if self.partial:
            # A partial reduce delivers just the reduced data, not a dict
            self.received_data[rank] = data
        else:
            self.received_data.update(data)

        # With no children missing we have received from everybody, so the
        # data can be reduced and sent to the parent.
        if not self.missing_children:
            # Add our own data element
            self.received_data[self.rank] = self.data

            if self.partial:
                self.data = reduce_elementwise(self.received_data.values(), self.operation)
            else:
                self.data = self.received_data

            self.to_parent()
        return True

    if self.phase == "down":
        if rank != self.parent:
            return False
        self.data = data
        self.to_children()
        return True

    Logger().warning("Accept_msg in unknown phase: %s" % self.phase)
    return False
def accept_msg(self, rank, raw_data, msg_type):
    """
    Offer an incoming message to this request.

    In the "up" phase a message is taken only from a child we are still
    waiting for. Its payload holds one chunk of ``self.chunksize`` per rank
    in the sender's subtree, ordered by rank, and each chunk is stored in
    ``self.data_list`` under its rank. Once no child is missing the data is
    sent to the parent. In the "down" phase a message is taken only from the
    parent, stored boxed in a list, and passed on to the children.

    Returns True when the message was consumed and False otherwise.
    """
    # Do not do anything if the request is completed.
    if self._finished.is_set():
        return False

    if self.phase == "up":
        if rank not in self.missing_children:
            return False

        # This child has now delivered.
        self.missing_children.remove(rank)

        # All the ranks having a payload in this message, kept sorted
        ranks = sorted([rank] + self.topology.descendants(rank))

        # Store payloads in proper positions
        for position, payload_rank in enumerate(ranks):
            begin = position * self.chunksize
            self.data_list[payload_rank] = raw_data[begin:begin + self.chunksize]

        # With no children missing we have received from everybody and can
        # forward to the parent.
        if not self.missing_children:
            self.to_parent()
        return True

    if self.phase == "down":
        if rank != self.parent:
            return False

        # FIXME: just fix!
        # NOTE: Unless we are sure that no reduce operation can change the
        # msg_type or chunksize we should really store them again here
        self.data_list = [raw_data]  # boxing
        self.to_children()
        return True

    Logger().warning("Accept_msg in unknown phase: %s" % self.phase)
    return False
def find_mapper(module_or_func):
    """
    Find a host mapper function by name.

    ``module_or_func`` is first looked up among the mappers in
    ``mpi.lib.hostfile.mappers``. If it is not found there it must be a
    dotted ``module.function`` path, which is imported and resolved.

    Returns the mapper, or None when the custom module could not be imported
    or does not define the function (an import failure is logged). Raises an
    Exception when the name is neither a built-in mapper nor a dotted path.
    """
    mod = __import__("mpi.lib.hostfile.mappers", fromlist=["mappers"])
    mapper = getattr(mod, module_or_func, None)
    if mapper:
        return mapper

    # Not a built-in mapper, so try to import the module.
    if "." not in module_or_func:
        raise Exception("Cant import a custom hostmapper. Maybe you supplied something in a bad format")

    module_name, func_name = module_or_func.rsplit(".", 1)
    try:
        # A non-empty fromlist makes __import__ return the leaf module
        # rather than the top level package.
        mod = __import__(module_name, fromlist=[func_name])
        mapper = getattr(mod, func_name, None)
    except Exception as e:
        Logger().warn("Cant import the custom module. The exception raised is %s" % e)
        mapper = None

    return mapper
def match_pending(self, request):
    """
    Tries to match a pending request with something in the received data.

    The first matching element updates the request to status "ready" with
    the message as data and is removed from ``self.received_data``. If the
    element asks for an acknowledgement, an acknowledge message is sent back
    to the sender first.

    Returns True when a match was found and False otherwise, so the caller
    can remove the request from its own list.
    """
    with self.received_data_lock:
        for element in self.received_data:
            (sender, tag, acknowledge, communicator_id, message) = element

            # For a message to match
            # 1) it must be within the same communicator
            if request.communicator.id != communicator_id:
                continue

            # 2) participant must match or any rank have been specified
            if request.participant not in (sender, constants.MPI_SOURCE_ANY):
                continue

            # 3) tag must match OR if any-tag has been specified the message
            #    should just be any user tag
            # NOTE(review): the original comment says user tags are the
            # non-negative ones, but the test excludes tag 0 - confirm.
            tag_matches = (request.tag == tag) or (
                request.tag == constants.MPI_TAG_ANY and tag > 0)
            if not tag_matches:
                continue

            # Incoming synchronized communication requires acknowledgement
            if acknowledge:
                Logger().debug("SSEND RECEIVED request: %s" % request)
                # Generate an acknowledge message as an isend
                # NOTE: Consider using an empty message string, to save (a little) resources
                self.communicators[communicator_id]._isend(
                    "ACKNOWLEDGEMENT", sender, constants.TAG_ACK)

            request.update(status="ready", data=message)

            # We can only find matching data for one request, and we have
            self.received_data.remove(element)
            return True

    return False
def generate_settings(self, settings):
    """
    Build ``self.settings`` from the packaged defaults plus user modules.

    We first import our normal settings packed with the mpi environment.
    These make a good base for all the functionality here. ``settings`` is
    an optional comma separated list of module names (a trailing ".py" is
    tolerated); every module that can be imported overrides the base
    settings with its own attributes. Modules that cannot be imported are
    silently skipped, and any other failure is logged.
    """
    from mpi import settings as base_settings
    self.settings = base_settings

    if not settings:
        return

    for module in settings.strip().strip(", ").split(","):
        # Help people a bit: accept "name.py" for the module "name". Only
        # the suffix is removed. str.strip(".py") would also eat leading and
        # trailing '.', 'p' and 'y' characters of the name itself.
        module = module.strip()
        if module.endswith(".py"):
            module = module[:-3]
        if not module:
            continue

        try:
            mod = __import__(module)
            self.settings.__dict__.update(mod.__dict__)
        except ImportError:
            # A settings module that can not be imported is ignored
            pass
        except Exception as e:
            Logger().error("Something very wrong happened with your settings module: %s" % e)
def resume_packed_state(self):
    """
    Unpack ``self.resume_state`` and re-import the user module it names.

    Everything from the user module is imported, which is important as the
    user might have defined objects / classes etc. that were deleted as part
    of the pickle process. Names missing from ``__main__`` are copied there
    and the user module is then updated with everything in ``__main__``. A
    failure is reported with a warning and the exception is printed.
    """
    from mpi import dill

    # NOTE(review): dill.loads unpickles arbitrary objects, so the resume
    # state must come from a trusted source.
    obj = dill.loads(self.resume_state)
    session_data = obj['session']

    user_module = obj['mpi']['user_module']
    try:
        user_module = __import__(user_module)
        import __main__

        main_namespace = __main__.__dict__
        for name, value in user_module.__dict__.items():
            main_namespace.setdefault(name, value)
        user_module.__dict__.update(main_namespace)
    except Exception as e:
        Logger().warning("Can't import the user module: %s. This might not be a problem, but it is better to restore the script with your script in your PYTHONPATH." % user_module)
        print(e)
def round_robin(hosts, total_cpu, max_cpu, np=1, overmapping=True):
    """
    Map ``np`` ranks onto the hosts in round robin order.

    Each pass over ``hosts`` places at most one more rank on every host that
    has not yet reached its own 'max_cpu'. When ``generate_localhost_data``
    returns a mapping (no hostfile), that mapping is used instead.

    Returns a list of ``(hostname, rank)`` tuples.

    Raises HostfileMapException when ``np`` exceeds ``total_cpu`` and
    overmapping is disabled or ``np`` exceeds ``max_cpu``, and when the
    hosts run out of capacity before ``np`` ranks are placed.
    """
    l = generate_localhost_data(hosts, np)
    if l:
        return l

    if np > total_cpu:
        # Overmapping.
        if not overmapping or np > max_cpu:
            raise HostfileMapException("Number of processes exceeds the maximum allowed CPUs")
        Logger().warning("Not enough hosts. Overmapping in effect. ")

    mapped_hosts = []
    host_count = {}

    while len(mapped_hosts) < np:
        placed_in_pass = False

        for host in hosts:
            hostname = host['host']
            in_use = host_count.get(hostname, 0)

            # Just check that this host allows more virtual CPUs on it
            if in_use >= host['max_cpu']:
                continue

            host_count[hostname] = in_use + 1
            mapped_hosts.append((hostname, len(mapped_hosts)))
            placed_in_pass = True

            if len(mapped_hosts) == np:
                break

        if not placed_in_pass:
            # Every host is full, so another pass would never terminate.
            raise HostfileMapException(
                "Unable to map %d processes: the hosts only have room for %d" % (np, len(mapped_hosts)))

    return mapped_hosts
def robust_send_multi(socket, messages):
    """
    Experimental cousin of robust_send: send every message in ``messages``
    on ``socket``, repeating ``socket.send`` until each message has been
    transmitted in full.

    If we can agree that the overhead of always considering messages a list
    is negligible this can be folded into regular robust_send.

    TODO: Check (eg. with wireshark) if every send produces a tcp packet or
    if several messages can be packed into one tcp packet (which we hope is
    what happens)

    Any exception from the socket is logged and re-raised.
    """
    for message in messages:
        target = len(message)  # how many bytes to send
        transmitted_bytes = 0
        remaining = message
        try:
            while transmitted_bytes < target:
                delta = socket.send(remaining)
                transmitted_bytes += delta
                if transmitted_bytes < target:
                    # Rare case, therefore only sliced when needed. Drop
                    # just the bytes of THIS send: the buffer has already
                    # lost the earlier ones, so slicing by the running total
                    # would skip data.
                    remaining = remaining[delta:]
        except Exception as e:
            Logger().error("BAD multisend caller:%s msg type%s len:%s of %i in all - msg:%s error:%s" % (whosdaddy(), type(message), target, len(messages), message, e))
            raise
def wait_for_shutdown(process_list):
    """
    Go through list of processes and make sure they all have terminated.

    The processes are polled once a second and every process that has
    exited is removed from ``process_list``. Afterwards every file in
    ``io_target_list`` is closed.

    Returns the exit codes in the order the processes were seen to finish.
    """
    logger = Logger()
    exit_codes = []

    while process_list:
        # Finished processes are removed after the pass, just to be safe
        # while iterating over the list.
        remove = []

        for p in process_list:
            returncode = p.poll()
            if returncode is None:
                # still alive
                continue

            exit_codes.append(returncode)
            remove.append(p)
            left = len(process_list) - len(remove)

            if returncode == 0:
                # exited correctly
                logger.debug("A process exited with a status of 0. And we have %i left." % left)
            else:
                # error code
                logger.debug("A process exited with return code %d. And we have %i left." % (returncode, left))

        for p in remove:
            process_list.remove(p)

        time.sleep(1)

    # Target list is empty unless the option process_io=localfile is
    # specified, in which case we close the filedescriptors of all the log
    # files made
    for t in io_target_list:
        t.close()

    return exit_codes
def parse_hostfile(filepath="hostfile", limit_to=None):
    """
    Parse a hostfile into a list of host descriptions.

    Every section with a "nodes" option describes one or more hosts (a
    comma separated list). The optional [Defaults] section supplies default
    'cpu' and 'max_cpu' values that a host section can override. If an
    [ActiveNodes] section with an "active" option exists, only the sections
    named there are used. ``limit_to`` optionally restricts the sections
    further.

    Returns a tuple ``(hosts, sum_cpu, sum_maxcpu)`` where ``hosts`` is a
    list of dicts with the keys 'host', 'cpu' and 'max_cpu'.

    Raises an Exception when [ActiveNodes] names a section that does not
    exist.
    """
    config = ConfigParser.SafeConfigParser()
    config.read(filepath)

    # The returned data will be a tuple containing the hosts, the sum of
    # cpus and the sum of allowed overmapped cpus
    sum_cpu = 0
    sum_maxcpu = 0
    hosts = []

    # As a default, we use all the sections defining hosts. If - on the
    # other hand - the user defined a section called [ActiveNodes], only
    # the sections mentioned there will be read.
    sections = config.sections()

    if "ActiveNodes" in sections:
        sections.remove("ActiveNodes")
        try:
            active_sections = config.get("ActiveNodes", "active")
        except ConfigParser.NoOptionError:
            pass
        else:
            active_sections = [s.strip() for s in active_sections.split(",")]

            # Test if there is global overlap
            if not all(s in sections for s in active_sections):
                raise Exception("There were sections defined in the ActiveNodes that does not exist")
            sections = active_sections

    if limit_to:
        # Build a new list. Removing from a list while iterating over it
        # skips elements.
        sections = [s for s in sections if s in limit_to]

    defaults = {'cpu': 0, 'max_cpu': 0}
    if "Defaults" in config.sections():
        # Fetch the default keys that are actually present.
        for key in defaults:
            try:
                defaults[key] = config.getint("Defaults", key)
            except ConfigParser.NoOptionError:
                pass

    # We are now ready to parse the remaining sections
    for section in sections:
        try:
            nodes = config.get(section, "nodes").split(",")
        except ConfigParser.NoOptionError:
            # [Defaults] normally has no nodes. That is not worth a warning.
            if section != "Defaults":
                Logger().warning("Found section %s in hostfile, but it does not include any nodes. This section will not contribture anything to the later process to host mapping." % section)
            # Skip the section, otherwise the nodes of the previous section
            # would be added once more.
            continue

        for node in nodes:
            node = node.strip()

            # Use the defaults as defaults (wauw). Override them after.
            s = copy.copy(defaults)
            s["host"] = node

            for key in defaults:
                try:
                    s[key] = config.getint(section, key)
                except ConfigParser.NoOptionError:
                    pass

            # Aggregate some key information
            sum_maxcpu += s["max_cpu"]
            sum_cpu += s["cpu"]

            hosts.append(s)

    if sum_cpu > sum_maxcpu:
        Logger().warn("Hostfile parser detected that the hostfile specifies more actual CPUs than 'virtual")

    return hosts, sum_cpu, sum_maxcpu
def parse(parser):
    """
    Parses the above parser (and more maybe) and handle other elements like
    creating Loggers with the proper parameter etc. This method should be
    used to avoid a lot of duplicate code.

    Besides parsing the command line this validates the data file, finds a
    default tag mapper next to it, normalizes ``options.test_filter`` to a
    list of lower case strings, normalizes ``options.raw_filters`` to a list
    of ``(name, "EQ", values)`` tuples and initialises the Logger singleton.

    Returns ``(options, args)``.
    """
    options, args = parser.parse_args()

    if len(args) != 1:
        parser.error("No data file supplied!")

    handle_file = args[0]
    if not Handle.valid_handle_file(handle_file):
        parser.error("Invalid data file")

    # Check that the supplied tag mapper is actually a file
    if options.tag_mapper and not os.path.isfile(options.tag_mapper):
        parser.error("No such tag mapper file: %s" % options.tag_mapper)

    if not options.tag_mapper:
        # Look for a default tag mapper
        potential_mapper = handle_file + ".tagmapper"
        if os.path.isfile(potential_mapper):
            options.tag_mapper = potential_mapper

    # Clean the filter test.
    cleaned_tests = [s.strip().lower() for s in options.test_filter.split(",")]
    options.test_filter = [s for s in cleaned_tests if s]

    # Create a negative test case if we try to use datasize. This is not
    # supported by barrier.
    if "datasize" in (options.x_data, options.y_data):
        options.test_filter.append(":barrier")

    # Setup a logger based on the arguments above. This might seem stupid
    # as it is not returned from the call, but as the Logger is a singleton
    # it is possible to do a simple Logger() call later.
    from mpi.logger import Logger

    logfile = "pupyplot.log"
    if options.logfile:
        logfile = options.logfile

    verbosity = 1
    if options.debug:
        verbosity = 3
    elif options.verbose:
        verbosity = 2

    Logger(logfile, "pupyplot", options.debug, verbosity, not options.verbose)

    # Normalize the raw filters.
    raw_filters = []
    for f in options.raw_filters.split(";"):
        f = f.strip()
        if not f:
            continue

        # For now we only have one filter type (equal). We identify this by
        # a simple string. Parser people would probably not like this
        t = f.split(":")
        if len(t) != 2:
            # Not of the form "name:values". Check this before touching
            # t[1], which does not exist for a filter without a colon.
            continue

        stripped_vals = [v.strip() for v in t[1].split(",")]
        vals = [v for v in stripped_vals if v]
        raw_filters.append((t[0], "EQ", vals))

    options.raw_filters = raw_filters

    return options, args
def terminate_children():
    """ Terminate every process in ``process_list``, logging each one first """
    for child in process_list:
        Logger().debug("Killing %s" % child)
        child.terminate()
'avg_time' : 'time', 'min_time' : 'time', 'max_time' : 'time', 'throughput' :'throughput', 'nodes' : 'number', } if __name__ == "__main__": # Receive the parser and groups so we can add further elements # if we want parser, groups = plot_parser() # Parse it. This will setup logging and other things. options, args = parse(parser) Logger().debug("Command line arguments parsed.") # Object creation, used to keep, filter, aggregate, validate data. handle = Handle(args[0]) tag_mapper = {} if options.tag_mapper: tag_mapper = get_tag_mapping(options.tag_mapper) # to extract and filter the data. ds = DataSupplier(handle.getdata()) ds.set_raw_filters(options.raw_filters) # It should be possible to limit the tests to one single test. how should # this be one. rt = 0
def close_all_sockets(self):
    """
    Close every incoming and outgoing socket known to this object.

    A socket that fails to close is logged as an error and does not stop
    the remaining sockets from being closed.
    """
    every_socket = self.sockets_in + self.sockets_out
    for open_socket in every_socket:
        try:
            open_socket.close()
        except Exception as e:
            Logger().error("Got error when closing socket: %s" % e)
def generate_localhost_data(hosts, np):
    """
    Map all ``np`` ranks onto localhost when no hosts are available.

    Returns a list of ``("localhost", rank)`` tuples (and logs a warning)
    when ``hosts`` is empty, and None when there are hosts to map onto.
    """
    if hosts:
        return None

    Logger().warning("No hostfile. Overmapping on localhost. Unless you are developing right now, this might not be what you want.")
    return [("localhost", rank) for rank in range(np)]
try: rank, msg_type, tag, ack, comm_id, coll_class_id, raw_data = get_raw_message( conn, self.network.mpi.settings.SOCKET_RECEIVE_BYTECOUNT) except MPIException, e: # Broken connection is ok when shutdown is going on if self.shutdown_event.is_set(): break # We don't care about incoming during shutdown else: # TODO: We should check for a specific Exception thrown from get_raw_message to signify when other side has closed connection # We have no way of knowing whether other party has reached shutdown or this was indeed an error # so we just try listening to next socket continue except Exception, e: Logger().error( "_handle_readlist: Unexpected error thrown from get_raw_message. Error was: %s" % e) continue # Now that we know the rank of sender we can add the socket to the pool if add_to_pool: self.network.socket_pool.add_accepted_socket(conn, rank) # user messages have a cmd field larger than CMD_RAWTYPE if msg_type >= constants.CMD_RAWTYPE: try: with self.network.mpi.raw_data_lock: self.network.mpi.raw_data_queue.append( (rank, msg_type, tag, ack, comm_id, coll_class_id, raw_data)) self.network.mpi.raw_data_has_work.set()
def main():
    """
    Ask a set of running MPI processes to pack themselves for migration and
    save what they send back.

    A listening socket is created for the replies and a Receiver thread is
    started on it. Every participant in ``options.hostinfo`` then gets a
    migrate-pack command carrying its security component and our address.
    Unless ``options.bypass`` is set, each participant is first checked with
    ``avail_or_error``; a failed check exits with status 1. When all replies
    are in, the collected data is dumped to a temporary file whose name is
    printed, and the process exits with status 0.
    """
    import os
    import tempfile

    Logger("migrate", "migrate", True, True, True)

    options, args = parse_extended_args()

    ranks = options.ranks
    hostinfo = options.hostinfo
    bypass = options.bypass

    # Create a socket we can receive results from.
    sock, hostname, port_no = create_random_socket()
    sock.listen(len(ranks))

    all_data = {
        'procs': {},
        'mpirun_args': options.mpirun_args,
    }

    # Start a thread for receiving.
    receiver = Receiver(sock, len(ranks), all_data)
    receiver.start()
    receiver.start_event.wait()

    senders = []
    for participant in hostinfo:
        remote_host, remote_port, rank, security_component, avail = participant

        succ = True
        if not bypass:
            succ = avail_or_error(avail, rank, constants.CMD_MIGRATE_PACK)

        if not succ:
            sys.exit(1)

        # Data to send is a tuple with the security component, and then
        # command specific data
        data = (security_component, (hostname, port_no))

        sender = Sender(remote_host, remote_port, data)
        sender.start()
        senders.append(sender)

    # Join all the sender threads.
    for s in senders:
        s.wait()
        s.join()

    # Wait until everybody sent back.
    receiver.wait()
    receiver.join()

    # Write the final data to a file. mkstemp hands back an already open
    # descriptor, so wrap that one instead of opening the file a second time
    # and leaking the first descriptor.
    fd, filename = tempfile.mkstemp(prefix="pupy")
    fh = os.fdopen(fd, "wb")
    try:
        dill.dump(all_data, fh)
    finally:
        fh.close()

    print("Halted system saved to file:  %s" % filename)
    sys.exit(0)
def __init__(self):
    """
    Initializes the MPI environment. This will give each process a separate
    rank in the MPI_COMM_WORLD communicator along with the total number of
    processes in the communicator. Both attributes can be read just after
    startup::

        from mpi import MPI

        mpi = MPI()
        rank = mpi.MPI_COMM_WORLD.rank()
        size = mpi.MPI_COMM_WORLD.size()

        print "Proc %d of %d started" % (rank, size)

        mpi.finalize()
    """
    # Imported once at the top of the function. An import inside a single
    # branch makes "os" a local name for the whole function, so every other
    # branch that touches os.path would fail with UnboundLocalError.
    import os

    Thread.__init__(self)

    self.name = "MPI"  # Thread name

    # Startup time. Used in Wtime() implementation.
    self.startup_timestamp = time.time()

    # Event for handling thread packing.
    self.packing = threading.Event()

    # Data structures for jobs.
    # The locks are for guarding the data structures
    # The events are for signalling change in data structures

    # Pending requests are receive requests where the data may or may not have arrived
    self.pending_requests = []
    self.pending_requests_lock = threading.Lock()
    self.pending_requests_has_work = threading.Event()

    # Raw data are messages that have arrived but not been unpickled yet
    self.raw_data_queue = []
    self.raw_data_lock = threading.Lock()
    self.raw_data_has_work = threading.Event()

    # Received data are messages that have arrived and are unpickled
    # (ie. ready for matching with a posted recv request)
    # There are no events as this is handled through the "pending_request_" event.
    self.received_data = []
    self.received_data_lock = threading.Lock()

    # General event to wake up main mpi thread
    self.has_work_event = threading.Event()

    # Shutdown signals
    self.shutdown_event = threading.Event()  # MPI finalize has been called, shutdown in progress

    # Lock and counter for enumerating request ids
    self.current_request_id_lock = threading.Lock()
    self.current_request_id = 0

    # Pending system commands. These will be executed at first chance we have (we
    # need access to the user code). We also have a lock around the list, to ensure
    # proper access.
    self.pending_systems_commands = []
    self.pending_systems_commands_lock = threading.Lock()

    # Unstarted collective requests.
    self.unstarted_collective_requests = []
    self.unstarted_collective_requests_lock = threading.Lock()
    self.unstarted_collective_requests_has_work = threading.Event()

    # When the collective requests are started they are moved to this queue until
    # they are finished.
    self.pending_collective_requests = []

    self.received_collective_data_lock = threading.Lock()
    self.received_collective_data = []

    self.pending_collective_requests_has_work = threading.Event()

    # The settings module. This will be handled properly by the
    # function ``generate_settings``.
    self.settings = None
    self.config_callbacks = []

    # Append callbacks
    from mpi.settings import standard_callbacks
    self.config_callbacks.extend(standard_callbacks)

    options = self.parse_options()

    # TODO: See if logger initialisations below here shouldn't be refactored into one
    # Decide how to deal with I/O
    if options.process_io == "remotefile":
        # Initialise the logger
        logger = Logger(os.path.join(options.logdir, "remotelog"), "proc-%d" % options.rank, options.debug, options.verbosity, True)
        filename = constants.DEFAULT_LOGDIR+'mpi.local.rank%s.log' % options.rank

        logger.debug("Opening file for I/O: %s" % filename)
        try:
            output = open(filename, "w")
        except (IOError, OSError):
            raise MPIException("File for I/O not writeable - check that this path exists and is writeable:\n%s" % constants.DEFAULT_LOGDIR)

        sys.stdout = output
        sys.stderr = output
    elif options.process_io == "none":
        # Initialise the logger
        logger = Logger(options.logdir+"mpi", "proc-%d" % options.rank, options.debug, options.verbosity, True)
        logger.debug("Closing stdout")
        sys.stdout = None
    else:
        # Initialise the logger
        logger = Logger(options.logdir+"mpi", "proc-%d" % options.rank, options.debug, options.verbosity, options.quiet)

    # TODO: Put this info under settings when they start to work properly
    #       Also we should check that the path here is accessible and valid
    # If the path starts with something else than / it is a relative path
    # and we assume it relative to the pupympi dir
    if not options.logdir.startswith('/'):
        _BASE = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        self.logdir = os.path.join(_BASE, options.logdir)
    else:
        self.logdir = options.logdir

    # Parse and save settings.
    self.generate_settings(options.settings)

    # Attributes for the security component.
    self.disable_utilities = options.disable_utilities
    self.security_component = None

    # First check for required Python version
    self._version_check()

    # Check for yappi support
    self._yappi_enabled = False
    if options.yappi:
        try:
            import yappi
            self._yappi_enabled = True
            self._yappi_sorttype = yappi.SORTTYPE_NCALL

            if options.yappi_sorttype:
                if options.yappi_sorttype == 'name':
                    self._yappi_sorttype = yappi.SORTTYPE_NAME
                elif options.yappi_sorttype == 'ncall':
                    self._yappi_sorttype = yappi.SORTTYPE_NCALL
                elif options.yappi_sorttype == 'ttotal':
                    self._yappi_sorttype = yappi.SORTTYPE_TTOTAL
                elif options.yappi_sorttype == 'tsub':
                    self._yappi_sorttype = yappi.SORTTYPE_TSUB
                elif options.yappi_sorttype == 'tavg':
                    self._yappi_sorttype = yappi.SORTTYPE_TAVG
                else:
                    logger.warn("Unknown yappi sorttype '%s' - defaulting to ncall." % options.yappi_sorttype)

        except ImportError:
            logger.warn("Yappi is not supported on this system. Statistics will not be logged.")
            self._yappi_enabled = False

    # Start built-in profiling facility
    self._profiler_enabled = False
    if options.enable_profiling:
        if self._yappi_enabled:
            logger.warn("Running yappi and pupyprof simultaneously is unpossible. Pupyprof has been disabled.")
        else:
            try:
                import pupyprof
                self._profiler_enabled = True
            except ImportError:
                logger.warn("Pupyprof is not supported on this system. Tracefile will not be generated")
                self._profiler_enabled = False

    # Set a resume parameter indicating if we are resuming a packed job.
    # This will be changed (maybe) in the network startup.
    self.resume = False

    # Enable a register for the users to put values in. This register can be read
    # with the readregister.py script found in bin/utils/
    self.user_register = {}

    # Place to keep functions needed when packing / unpacking the running MPI
    # instance. The best place to start is migrate.py
    self.migrate_onpack = None

    self.network = Network(self, options)

    # Create the initial global Group, and assign the network all_procs as members
    world_Group = Group(options.rank)
    world_Group.members = self.network.all_procs

    # Create the initial communicator MPI_COMM_WORLD. It is initialized with
    # the rank of the process that holds it and size.
    # The members are filled out after the network is initialized.
    self.communicators = {}
    self.MPI_COMM_WORLD = Communicator(self, options.rank, options.size, self.network, world_Group, comm_root=None)

    # Tell the network about the global MPI_COMM_WORLD, and let it start to
    # listen on the corresponding network channels
    self.network.MPI_COMM_WORLD = self.MPI_COMM_WORLD

    # Change the contents of sys.argv runtime, so the user processes
    # can't see all the mpi specific parameters we start with.
    user_options = [sys.argv[0], ]
    user_options.extend(sys.argv[sys.argv.index("--")+1:])
    sys.argv = user_options

    # Set up the global mpi constants
    constants.MPI_GROUP_EMPTY = Group()

    self.daemon = True

    resumer = None
    if self.resume:
        resumer = self.resume_packed_state()

    self.start()

    # Make every node connect to each other if settings specify it
    if not options.disable_full_network_startup:
        self.network.start_full_network()

    self.initinfo = (self.MPI_COMM_WORLD, self.MPI_COMM_WORLD.rank(), self.MPI_COMM_WORLD.size())

    # Set a static attribute on the class so we know it is initialised.
    self.__class__._initialized = True

    if self._profiler_enabled:
        pupyprof.start()

    if self.resume and resumer:
        resumer(self)