Example No. 1
    def stage_in(self, operation):
        """ Stage the current operation """
        super(MulticoreBackend, self).stage_in(operation)
        self.pool = multiprocessing.Pool(processes=self.pool_size)

        # Set up progress bar
        widgets = ['Operation progress: ', Percentage(), ' ', Bar(), ' ', ETA()]
        self.progress_bar = ProgressBar(
            widgets=widgets, maxval=self.current_operation.number_processes)
        self.progress_bar.start()

        self._log("Operation - staged")
        self.state = "staged"
Example No. 2
    def stage_in(self, operation):
        """ Stage the current operation """
        super(LoadLevelerBackend, self).stage_in(operation)
        # set up queue
        self.result_handlers = multiprocessing.Queue(200)
        # Set up progress bar
        widgets = [
            'Operation progress: ',
            Percentage(), ' ',
            Bar(), ' ',
            ETA()
        ]
        self.progress_bar = ProgressBar(
            widgets=widgets, maxval=self.current_operation.number_processes)
        self.progress_bar.start()

        self._log("Operation - staged")
        self.state = "staged"
Example No. 3
    def stage_in(self, operation):
        """ Stage the current operation """
        super(MulticoreBackend, self).stage_in(operation)
        self.pool = multiprocessing.Pool(processes=self.pool_size)

        # Set up progress bar
        widgets = ["Operation progress: ", Percentage(), " ", Bar(), " ", ETA()]
        self.progress_bar = ProgressBar(widgets=widgets, maxval=self.current_operation.number_processes)
        self.progress_bar.start()

        self._log("Operation - staged")
        self.state = "staged"
Example No. 4
    def stage_in(self, operation):
        """ Stage the current operation """
        super(LoadLevelerBackend, self).stage_in(operation)
        # set up queue
        self.result_handlers = multiprocessing.Queue(200)
        # Set up progress bar
        widgets = ['Operation progress: ', Percentage(), ' ', Bar(), ' ', ETA()]
        self.progress_bar = ProgressBar(
            widgets=widgets, maxval=self.current_operation.number_processes)
        self.progress_bar.start()

        self._log("Operation - staged")
        self.state = "staged"
Example No. 5
    def stage_in(self, operation):
        """
        Stage the current operation
        """
        super(SerialBackend, self).stage_in(operation)

        # Set up progress bar
        widgets = ['Operation progress: ', Percentage(), ' ', Bar(), ' ', ETA()]
        self.progress_bar = ProgressBar(
            widgets=widgets, maxval=self.current_operation.number_processes)
        self.progress_bar.start()

        self._log("Operation - staged")
        self.state = "staged"
Example No. 6
class SerialBackend(Backend):
    """ A backend that allows for easy debugging since the program flow
    is not threaded or distributed over several OS processes.
    """
    
    def __init__(self):
        super(SerialBackend, self).__init__()
              
        self.state = "idling"  
        self.current_process = 0
        
        
    def stage_in(self, operation):
        """
        Stage the current operation
        """
        super(SerialBackend, self).stage_in(operation)
        
        # Set up progress bar
        widgets = ['Operation progress: ', Percentage(), ' ', Bar(), ' ', ETA()]
        self.progress_bar = ProgressBar(widgets = widgets, 
                                        maxval = self.current_operation.number_processes)
        self.progress_bar.start()
        
        self._log("Operation - staged")
        self.state = "staged"
        
    def execute(self):
        """
        Executes all processes specified in the currently staged
        operation.
        """
        assert(self.state == "staged")
        
        self.state = "executing" 
        self._log("Operation - executing")
        
        # The handler that is used remotely for logging
        handler_class = logging.handlers.SocketHandler
        handler_args = {"host" : self.host, "port" : self.port}
        
        try:
            process = self.current_operation.processes.get()
        except KeyboardInterrupt:
            self._log(traceback.format_exc(), level=logging.CRITICAL)
            process = False
        # while there are Processes in the queue ...
        while process is not False:
            process.prepare(pySPACE.configuration, handler_class, handler_args)
            # Execute process, update progress bar and get next queue-element
            try:
                process()
            # if an exception is raised somewhere in the code, we may still
            # want to try the remaining processes
            except Exception:
                self._log(traceback.format_exc(), level=logging.CRITICAL)
                process.post_benchmarking()
                process = False
            # if ctrl+c is pressed we want to immediately stop everything
            except KeyboardInterrupt:
                self._log(traceback.format_exc(), level=logging.CRITICAL)
                process.post_benchmarking()
                process = False
            else:    
                self.current_process += 1
                self.progress_bar.update(self.current_process)
                process = self.current_operation.processes.get()

    def check_status(self):
        """
        Returns a description of the current state of the operations
        execution.
        
        .. todo:: do we really need this method???
        """
        # Returns the percentage of processes of the current operation
        # that have already finished
        return float(self.current_process)/self.current_operation.number_processes
    
    def retrieve(self):
        """
        Returns the result of the operation.
        
        This is trivial in the Debug-Backend since execute blocks.
        """
        assert(self.state == "executing")
        
        self._log("Operation - retrieved")
        
        self.current_operation.processes.close()
        # if process creation runs in a separate thread, wait for it to finish
        if hasattr(self.current_operation, "create_process") \
                        and self.current_operation.create_process is not None:
            self.current_operation.create_process.join()
            
        # Change the state to retrieved
        self.state = "retrieved"
    
    def consolidate(self):
        """
        Consolidates the results of the single processes into a consistent result of the whole
        operation
        """
        assert(self.state == "retrieved")
        
        try:
            self.current_operation.consolidate()
        except Exception:
            self._log(traceback.format_exc(), level=logging.CRITICAL)
        
        self._log("Operation - consolidated")
        self.state = "consolidated"
    
    def cleanup(self):
        """
        Remove the current operation and all potential results that
        have been stored in this object
        """
        self.state = "idling"
        
        self._log("Operation - cleaned up")
        self._log("Idling...")

        # Remove the file logger for this operation
        logging.getLogger('').removeHandler(self.file_handler)
        # close listener socket
        self.sock.close()
        
        self.current_operation = None
        self.current_process = 0      
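
Taken together, SerialBackend shows the state machine that all of these backends share: idling, staged, executing, retrieved, consolidated, and back to idling. A hypothetical driver, sketched here only to illustrate the calling order (the operation object and the surrounding pySPACE launch code are assumed, not shown in these examples):

# Hypothetical driver illustrating the common backend life cycle;
# `operation` is assumed to be created by pySPACE's own machinery.
backend = SerialBackend()
backend.stage_in(operation)    # state: "staged"
backend.execute()              # state: "executing", runs every process
backend.retrieve()             # state: "retrieved"
backend.consolidate()          # state: "consolidated", merges results
backend.cleanup()              # state: "idling" again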
Example No. 7
class LoadLevelerBackend(Backend):
    """ Commits every process to LoadLeveler cluster, which resumes parallel execution
    
    Each process corresponds to one combination of input data set and
    parameter choice. The process objects are first pickled.
    The path to the pickled object together with a helper script is then
    submitted to LoadLeveler. There the object is unpickled, called and 
    the backend is informed when the results are stored.
    
    Communication between the independent processes and the backend is
    done via TCP socket connection (see 
    :class:`~pySPACE.environments.backends.ll_backend.LoadLevelerComHandler` for detailed 
    information).
    
    :Author: Anett Seeland ([email protected])
    :Created: 2011/06/08
    :LastChange: 2012/09/06 Add communication to SubflowHandler
    """
    def __init__(self):
        super(LoadLevelerBackend, self).__init__()

        self.state = "idling"
        # create command file template for Loadleveler
        # TODO: maybe have different template on the disc?
        if not hasattr(pySPACE.configuration,"job_class") or \
                                              pySPACE.configuration.job_class == "":
            pySPACE.configuration.job_class = "general"
        if not hasattr(pySPACE.configuration,"consumable_memory") or \
                                      pySPACE.configuration.consumable_memory == "":
            pySPACE.configuration.consumable_memory = "3250mb"
        if not hasattr(pySPACE.configuration,"consumable_cpus") or \
                                        pySPACE.configuration.consumable_cpus == "":
            pySPACE.configuration.consumable_cpus = 1

        assert (
            pySPACE.configuration.job_class in [
                'critical', 'critical_forking', 'general', 'general_forking',
                'longterm', 'longterm_forking', 'test'
            ]
        ), "LL_Backend:: Job class not existing! Check your pySPACE config file!"


        self.LL_COMMAND_FILE_TEMPLATE = \
           "# @ job_type = serial \n"+ \
           "# @ notification = never \n"+ \
           "# @ class = "+ pySPACE.configuration.job_class +" \n"+ \
           "# @ resources = " \
               "ConsumableMemory("+ pySPACE.configuration.consumable_memory + ") " \
               "ConsumableCPUs(" + str(pySPACE.configuration.consumable_cpus) +") \n"
        if hasattr(pySPACE.configuration,
                   "anodes") and pySPACE.configuration.anodes != "":
            self.LL_COMMAND_FILE_TEMPLATE += \
                "# @ requirements = " + pySPACE.configuration.anodes + " \n"
        self.LL_COMMAND_FILE_TEMPLATE += \
           "# @ executable = /usr/bin/python \n"+ \
           "# @ arguments = "+ os.path.join(pySPACE.configuration.root_dir,
                                           "environments","backends","ll_runner.py")+ \
                            " %(process_file_path)s " + self.SERVER_IP + \
                            " %(server_port)d \n" + \
           "# @ output = %(op_result_dir)s/log/pySPACE_$(jobid).out \n"+ \
           "# @ error = %(op_result_dir)s/log/pySPACE_$(jobid).err \n"+ \
           "# @ queue"

        # queue for execution
        self.result_handlers = None
        # to label message end when communicating via socket connection
        self.end_token = "!END!"

        self._log("Created LoadLeveler Backend.")

    def stage_in(self, operation):
        """ Stage the current operation """
        super(LoadLevelerBackend, self).stage_in(operation)
        # set up queue
        self.result_handlers = multiprocessing.Queue(200)
        # Set up progress bar
        widgets = [
            'Operation progress: ',
            Percentage(), ' ',
            Bar(), ' ',
            ETA()
        ]
        self.progress_bar = ProgressBar(
            widgets=widgets, maxval=self.current_operation.number_processes)
        self.progress_bar.start()

        self._log("Operation - staged")
        self.state = "staged"

    def execute(self):
        """ Execute all processes specified in the currently staged operation """
        assert (self.state == "staged")

        self._log("Operation - executing")
        self.state = "executing"

        # The handler that is used remotely for logging
        handler_class = logging.handlers.SocketHandler
        handler_args = {"host": self.host, "port": self.port}
        # the communication properties to talk to LoadLevelerComHandler
        backend_com = (self.SERVER_IP, self.SERVER_PORT)
        print('--> Loadleveler Communication : \n\t\t host:%s, port:%s' %
              (self.host, self.port))
        # Prepare the directory where processes are stored before submitted
        # to LoadLeveler
        self.process_dir = os.sep.join(
            [self.current_operation.result_directory, ".processes"])
        if not os.path.exists(self.process_dir):
            os.mkdir(self.process_dir)
        # create and start server socket thread
        self.listener = LoadLevelerComHandler(
            self.sock,
            self.result_handlers,
            self.progress_bar,
            self.LL_COMMAND_FILE_TEMPLATE,
            operation_dir=self.current_operation.result_directory)
        self.listener.start()
        # create a client socket to talk to server socket thread
        send_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        send_socket.connect((self.SERVER_IP, self.SERVER_PORT))

        # get first process from creation queue
        process = self.current_operation.processes.get()
        process_counter = 0

        # While processes are still being created, prepare each process
        # from the queue for remote execution and execute it
        while process is not False:
            process.prepare(pySPACE.configuration, handler_class, handler_args,
                            backend_com)
            # since preparing a process can be much faster than executing it,
            # we need another queue from which processes are removed once they
            # have finished execution
            self.result_handlers.put(1)
            # pickle the process object
            proc_file_name = os.sep.join(
                [self.process_dir,
                 "process_%d.pickle" % process_counter])
            proc_file = open(proc_file_name, "w")
            cPickle.dump(process, proc_file, cPickle.HIGHEST_PROTOCOL)
            proc_file.close()
            # fill out LoadLeveler template
            llfile = self.LL_COMMAND_FILE_TEMPLATE % \
                      {"process_file_path": proc_file_name,
                       "server_port": self.SERVER_PORT,
                       "op_result_dir": self.current_operation.result_directory}
            llfilepath = os.path.join(self.current_operation.result_directory,
                                      "ll_call.cmd")
            f = open(llfilepath, 'w')
            f.write(llfile)
            f.close()
            # submit to LoadLeveler
            while 1:
                outlog, errlog = sub.Popen(["llsubmit", llfilepath],
                                           stdout=sub.PIPE,
                                           stderr=sub.PIPE).communicate()
                if errlog == "":
                    break
                else:
                    self._log("Warning: Job submission to LoadLeveler failed"\
                              " with %s. Job will be resubmitted." % errlog,
                              logging.WARNING)
                    time.sleep(1)
            # parse job_id for monitoring
            loadl_id = outlog.split("\"")[1].split(".")[-1]
            # inform listener that we successfully submitted the job
            send_socket = inform('submitted;%d;%s%s' % \
                                 (process_counter, loadl_id, self.end_token),
                                send_socket, (self.SERVER_IP,self.SERVER_PORT))
            # get next process and update process_counter
            process = self.current_operation.processes.get()
            process_counter += 1

        # send message 'creation finished' to listener
        send_socket = inform('creation finished' + self.end_token, send_socket,
                             (self.SERVER_IP, self.SERVER_PORT))
        # give socket chance to process message
        # time.sleep(0.001)
        self.listener.creation_finished = True
        send_socket.shutdown(socket.SHUT_RDWR)
        send_socket.close()

    def check_status(self):
        """ Return a description of the current state of the operations execution
        
        .. todo:: do we really need this method???
        """
        # Returns the current state of the operation
        return self.state

    def retrieve(self):
        """ Wait for all results of the operation
        
        This call blocks until all processes are finished.
        """
        assert (self.state == "executing")
        self._log("All processes submitted. Waiting for finishing.")
        # since self.current_operation.number_processes is not reliable (it may
        # be too high), we wait until the listener thread has terminated
        self.listener.finished.wait()
        self._log("Worker processes have exited gracefully")

        self.current_operation.processes.close()

        # if process creation runs in a separate thread, wait for it to finish
        if self.current_operation.create_process is not None:
            self.current_operation.create_process.join()
        self.result_handlers.close()
        # join also listener thread
        self.listener.join()
        # Change the state to retrieved
        self._log("Operation - retrieved")
        self.state = "retrieved"

    def consolidate(self):
        """ Consolidate the single processes' results into a consistent result of the whole operation """
        assert (self.state == "retrieved")

        self.current_operation.consolidate()

        self._log("Operation - consolidated")

        # collect all log files
        def _merge_files(file_list, delete=True):
            result_str = ""
            for filename in file_list:
                tmp_str = ""
                try:
                    if os.path.getsize(filename) != 0:
                        tmp_str += filename.split(os.sep)[-1] + "\n" + \
                                      len(filename.split(os.sep)[-1])*"-" + "\n"
                        f = open(filename, 'r')
                        tmp_str += f.read()
                        f.close()
                        tmp_str += 80 * "-" + "\n"
                    if delete:
                        os.remove(filename)
                except Exception, e:
                    warnings.warn("Problems with file %s: %s." %
                                  (filename, str(e)))
                result_str += tmp_str
            return result_str

        outlist = glob.glob(self.current_operation.result_directory +
                            "/log/pySPACE*.out")
        out = _merge_files(outlist)
        errlist = glob.glob(self.current_operation.result_directory +
                            "/log/pySPACE*.err")
        err = _merge_files(errlist)

        merged_out = open(
            self.current_operation.result_directory + "/pySPACE.out", 'w')
        merged_out.write(out)
        merged_out.close()

        merged_err = open(
            self.current_operation.result_directory + "/pySPACE.err", 'w')
        merged_err.write(err)
        merged_err.close()

        try:
            outlist = glob.glob(self.current_operation.result_directory +
                                "/sub_log/pySPACE*.out")
            out = _merge_files(outlist)
            errlist = glob.glob(self.current_operation.result_directory +
                                "/sub_log/pySPACE*.err")
            err = _merge_files(errlist)

            merged_out = open(
                self.current_operation.result_directory + "/pySPACE_sub.out",
                'w')
            merged_out.write(out)
            merged_out.close()

            merged_err = open(
                self.current_operation.result_directory + "/pySPACE_sub.err",
                'w')
            merged_err.write(err)
            merged_err.close()
        except:
            pass

        self._log("Process Logging - consolidated")

        self.state = "consolidated"
Example No. 8
class LoadLevelerBackend(Backend):
    """ Commits every process to LoadLeveler cluster, which resumes parallel execution
    
    Each process corresponds to one combination of input data set and
    parameter choice. The process objects are first pickled.
    The path to the pickled object together with a helper script is then
    submitted to LoadLeveler. There the object is unpickled, called and 
    the backend is informed when the results are stored.
    
    Communication between the independent processes and the backend is
    done via TCP socket connection (see 
    :class:`~pySPACE.environments.backends.ll_backend.LoadLevelerComHandler` for detailed 
    information).
    
    :Author: Anett Seeland ([email protected])
    :Created: 2011/06/08
    :LastChange: 2012/09/06 Add communication to SubflowHandler
    """
    
    def __init__(self):
        super(LoadLevelerBackend, self).__init__()
        
        self.state = "idling"
        # create command file template for Loadleveler
        # TODO: maybe have different template on the disc?
        if not hasattr(pySPACE.configuration,"job_class") or \
                                              pySPACE.configuration.job_class == "":
            pySPACE.configuration.job_class="general"
        if not hasattr(pySPACE.configuration,"consumable_memory") or \
                                      pySPACE.configuration.consumable_memory == "":
            pySPACE.configuration.consumable_memory="3250mb"
        if not hasattr(pySPACE.configuration,"consumable_cpus") or \
                                        pySPACE.configuration.consumable_cpus == "":
            pySPACE.configuration.consumable_cpus=1
        
        assert (pySPACE.configuration.job_class in ['critical', 'critical_forking', 
                                                    'general', 'general_forking', 
                                                    'longterm', 'longterm_forking',
                                                    'test']), "LL_Backend:: Job class not existing! Check your pySPACE config file!" 
                                             
        
        self.LL_COMMAND_FILE_TEMPLATE = \
           "# @ job_type = serial \n"+ \
           "# @ notification = never \n"+ \
           "# @ class = "+ pySPACE.configuration.job_class +" \n"+ \
           "# @ resources = " \
               "ConsumableMemory("+ pySPACE.configuration.consumable_memory + ") " \
               "ConsumableCPUs(" + str(pySPACE.configuration.consumable_cpus) +") \n"
        if hasattr(pySPACE.configuration,"anodes") and pySPACE.configuration.anodes!="":
            self.LL_COMMAND_FILE_TEMPLATE += \
                "# @ requirements = " + pySPACE.configuration.anodes + " \n"
        self.LL_COMMAND_FILE_TEMPLATE += \
           "# @ executable = /usr/bin/python \n"+ \
           "# @ arguments = "+ os.path.join(pySPACE.configuration.root_dir,
                                           "environments","backends","ll_runner.py")+ \
                            " %(process_file_path)s " + self.SERVER_IP + \
                            " %(server_port)d \n" + \
           "# @ output = %(op_result_dir)s/log/pySPACE_$(jobid).out \n"+ \
           "# @ error = %(op_result_dir)s/log/pySPACE_$(jobid).err \n"+ \
           "# @ queue"
              
        # queue for execution
        self.result_handlers = None
        # to label message end when communicating via socket connection
        self.end_token = "!END!"
                
        self._log("Created LoadLeveler Backend.")
        
    def stage_in(self, operation):
        """ Stage the current operation """
        super(LoadLevelerBackend, self).stage_in(operation)
        # set up queue
        self.result_handlers = multiprocessing.Queue(200)
        # Set up progress bar
        widgets = ['Operation progress: ', Percentage(), ' ', Bar(), ' ', ETA()]
        self.progress_bar = ProgressBar(widgets = widgets, 
                               maxval = self.current_operation.number_processes)
        self.progress_bar.start()
        
        self._log("Operation - staged")
        self.state = "staged"
        
    def execute(self):
        """ Execute all processes specified in the currently staged operation """
        assert(self.state == "staged")
        
        self._log("Operation - executing")
        self.state = "executing" 
        
        # The handler that is used remotely for logging
        handler_class = logging.handlers.SocketHandler
        handler_args = {"host" : self.host, "port" : self.port}
        # the communication properties to talk to LoadLevelerComHandler
        backend_com = (self.SERVER_IP, self.SERVER_PORT)
        print('--> Loadleveler Communication : \n\t\t host:%s, port:%s' % \
                                            (self.SERVER_IP, self.SERVER_PORT))
        # Prepare the directory where processes are stored before submitted
        # to LoadLeveler
        self.process_dir = os.sep.join([self.current_operation.result_directory,
                                   ".processes"])
        if not os.path.exists(self.process_dir):
            os.mkdir(self.process_dir)
        # create and start server socket thread 
        self.listener = LoadLevelerComHandler(self.sock, self.result_handlers,
                                              self.progress_bar, 
                                              self.LL_COMMAND_FILE_TEMPLATE,
                                              operation_dir=self.current_operation.result_directory)
        self.listener.start()
        # create a client socket to talk to server socket thread
        send_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        send_socket.connect((self.SERVER_IP,self.SERVER_PORT))
        
        # get first process from creation queue
        process = self.current_operation.processes.get()
        process_counter = 0
        
        # While processes are still being created, prepare each process
        # from the queue for remote execution and execute it
        while process is not False:
            process.prepare(pySPACE.configuration, handler_class, handler_args,
                            backend_com)
            # since preparing a process can be much faster than executing it,
            # we need another queue from which processes are removed once they
            # have finished execution
            self.result_handlers.put(1)
            # pickle the process object
            proc_file_name = os.sep.join([self.process_dir,
                                         "process_%d.pickle" % process_counter])
            proc_file = open(proc_file_name, "w")
            cPickle.dump(process, proc_file, cPickle.HIGHEST_PROTOCOL)
            proc_file.close()
            # fill out LoadLeveler template
            llfile = self.LL_COMMAND_FILE_TEMPLATE % \
                      {"process_file_path": proc_file_name,
                       "server_port": self.SERVER_PORT,
                       "op_result_dir": self.current_operation.result_directory}
            llfilepath = os.path.join(self.current_operation.result_directory,
                                        "ll_call.cmd")
            f=open(llfilepath,'w')
            f.write(llfile)
            f.close()
            # submit to LoadLeveler
            while 1:
                outlog, errlog = sub.Popen(["llsubmit", llfilepath], 
                                stdout=sub.PIPE, stderr=sub.PIPE).communicate()
                if errlog == "":
                    break
                else:
                    self._log("Warning: Job submission to LoadLeveler failed"\
                              " with %s. Job will be resubmitted." % errlog,
                              logging.WARNING)
                    time.sleep(1)
            # parse job_id for monitoring
            loadl_id = outlog.split("\"")[1].split(".")[-1]
            # inform listener that we successfully submitted the job
            send_socket = inform('submitted;%d;%s%s' % \
                                 (process_counter, loadl_id, self.end_token), 
                                send_socket, (self.SERVER_IP,self.SERVER_PORT))
            # get next process and update process_counter
            process = self.current_operation.processes.get()
            process_counter+=1
            
        # send message 'creation finished' to listener
        send_socket = inform('creation finished'+self.end_token, send_socket,
                                              (self.SERVER_IP,self.SERVER_PORT))
        # give socket chance to process message
        # time.sleep(0.001)
        self.listener.creation_finished = True
        send_socket.shutdown(socket.SHUT_RDWR)
        send_socket.close()
 
    def check_status(self):
        """ Return a description of the current state of the operations execution
        
        .. todo:: do we really need this method???
        """
        # Returns the current state of the operation
        return self.state
    
    def retrieve(self):
        """ Wait for all results of the operation
        
        This call blocks until all processes are finished.
        """
        assert(self.state == "executing")
        self._log("All processes submitted. Waiting for finishing.")
        # since self.current_operation.number_processes is not reliable (it may
        # be too high), we wait until the listener thread has terminated
        self.listener.finished.wait()
        self._log("Worker processes have exited gracefully")      

        self.current_operation.processes.close()

        # if process creation runs in a separate thread, wait for it to finish
        if self.current_operation.create_process is not None:
            self.current_operation.create_process.join()
        self.result_handlers.close()
        # join also listener thread
        self.listener.join()
        # Change the state to retrieved
        self._log("Operation - retrieved")
        self.state = "retrieved"
    
    def consolidate(self):
        """ Consolidate the single processes' results into a consistent result of the whole operation """
        assert(self.state == "retrieved")
        
        self.current_operation.consolidate()
        
        self._log("Operation - consolidated")
        
        # collect all log files
        def _merge_files(file_list, delete=True):
            result_str = ""
            for filename in file_list:
                tmp_str=""
                try:
                    if os.path.getsize(filename)!=0:
                        tmp_str += filename.split(os.sep)[-1] + "\n" + \
                                      len(filename.split(os.sep)[-1])*"-" + "\n"
                        f=open(filename,'r')
                        tmp_str += f.read()
                        f.close()
                        tmp_str += 80*"-" + "\n"
                    if delete:
                        os.remove(filename)
                except Exception, e:
                    warnings.warn("Problems with file %s: %s." %(filename,str(e)))
                result_str += tmp_str
            return result_str
        
        outlist = glob.glob(self.current_operation.result_directory+"/log/pySPACE*.out")
        out = _merge_files(outlist)
        errlist = glob.glob(self.current_operation.result_directory+"/log/pySPACE*.err")
        err = _merge_files(errlist)
        
        merged_out = open(self.current_operation.result_directory+"/pySPACE.out",'w')
        merged_out.write(out)
        merged_out.close()
        
        merged_err = open(self.current_operation.result_directory+"/pySPACE.err",'w')
        merged_err.write(err)
        merged_err.close()

        try:
            outlist = glob.glob(self.current_operation.result_directory+"/sub_log/pySPACE*.out")
            out = _merge_files(outlist)
            errlist = glob.glob(self.current_operation.result_directory+"/sub_log/pySPACE*.err")
            err = _merge_files(errlist)

            merged_out = open(self.current_operation.result_directory+"/pySPACE_sub.out",'w')
            merged_out.write(out)
            merged_out.close()

            merged_err = open(self.current_operation.result_directory+"/pySPACE_sub.err",'w')
            merged_err.write(err)
            merged_err.close()
        except:
            pass

        self._log("Process Logging - consolidated")
        
        self.state = "consolidated"
Example No. 9
class MulticoreBackend(Backend):
    """ Execute as many processes in parallel as there are (logical) CPUs on the local machine
    
    This backend is based on the multiprocessing package and should work on every
    multicore system without additional settings even on virtual machines.
    Each process corresponds to one combination of input data set and
    parameter choice.
    
    :Author: Anett Seeland ([email protected])
    :LastChange: 2012/09/24
    
    """
    def __init__(self, pool_size=None):
        super(MulticoreBackend, self).__init__()

        # Set the number of processes in the pool
        # per default to the number of CPUs
        if pool_size is None:
            pool_size = MulticoreBackend.detect_CPUs()

        self.pool_size = pool_size

        self.state = "idling"

        # queue for execution
        self.result_handlers = multiprocessing.Queue(pool_size + 2)

        self.pool = None
        self.current_process = 0

        self._log("Created MulticoreBackend with pool size %s" % pool_size)

    def reset_queue(self):
        """ Resets the execution queue"""
        self.result_handlers = multiprocessing.Queue(self.pool_size + 2)

    def stage_in(self, operation):
        """ Stage the current operation """
        super(MulticoreBackend, self).stage_in(operation)
        self.pool = multiprocessing.Pool(processes=self.pool_size)

        # Set up progress bar
        widgets = [
            'Operation progress: ',
            Percentage(), ' ',
            Bar(), ' ',
            ETA()
        ]
        self.progress_bar = ProgressBar(
            widgets=widgets, maxval=self.current_operation.number_processes)
        self.progress_bar.start()

        self._log("Operation - staged")
        self.state = "staged"

    def execute(self):
        """ Execute all processes specified in the currently staged operation """
        assert (self.state == "staged")

        self._log("Operation - executing")
        self.state = "executing"

        # The handler that is used remotely for logging
        handler_class = logging.handlers.SocketHandler
        handler_args = {"host": self.host, "port": self.port}
        backend_com = (self.SERVER_IP, self.SERVER_PORT)

        # A socket communication thread to handle e.g. subflows
        self.listener = LocalComHandler(self.sock)
        self.listener.start()

        try:
            process = self.current_operation.processes.get()
        except KeyboardInterrupt:
            process = False
        # While processes are still being created, prepare each process
        # from the queue for remote execution and execute it
        while process is not False:
            process.prepare(pySPACE.configuration, handler_class, handler_args,
                            backend_com)
            # since preparing a process can be much faster than executing it,
            # we need another queue from which processes are removed once they
            # have finished execution
            self.result_handlers.put(1)
            # Execute all functions in the process pool but return immediately
            self.pool.apply_async(process, callback=self.dequeue_process)
            process = self.current_operation.processes.get()
            time.sleep(0.1)

    def dequeue_process(self, result):
        """ Callback function for finished processes """
        self.current_process += 1
        self.result_handlers.get()
        self.progress_bar.update(self.current_process)

    def check_status(self):
        """ Return a description of the current state of the operations execution
        
        .. todo:: do we really need this method???
        """
        # Returns the percentage of processes of the current operation
        # that have already finished
        return float(
            self.current_process) / self.current_operation.number_processes

    def retrieve(self):
        """ Wait for all results of the operation
        
        This call blocks until all processes are finished.
        """
        assert (self.state == "executing")

        # Prevent any other processes from being submitted to the pool
        # (necessary for join)
        self.pool.close()
        self._log("Closing pool", level=logging.DEBUG)

        self._log("Operation - retrieved")
        self.current_operation.processes.close()
        # if process creation runs in a separate thread, wait for it to finish
        if hasattr(self.current_operation, "create_process") \
            and self.current_operation.create_process is not None:
            self.current_operation.create_process.join()
        self.pool.join()  # Wait for worker processes to exit
        self._log("Worker processes have exited gracefully")
        self.result_handlers.close()
        # inform the listener that it's time to die
        self.listener.operation_finished = True
        time.sleep(1)
        self.listener.join()
        # Change the state to retrieved
        self.state = "retrieved"

    def consolidate(self):
        """ Consolidate the single processes' results into a consistent result of the whole operation """
        assert (self.state == "retrieved")

        try:
            self.current_operation.consolidate()
        except Exception:
            import traceback
            self._log(traceback.format_exc(), level=logging.ERROR)

        self._log("Operation - consolidated")

        self.state = "consolidated"

    def cleanup(self):
        """ Remove the current operation and all potential results that have been stored in this object """
        self.state = "idling"

        self._log("Operation - cleaned up")
        self._log("Idling...")

        # Remove the file logger for this operation
        logging.getLogger('').removeHandler(self.file_handler)
        # close listener socket
        self.sock.close()

        self.current_operation = None
        self.current_process = 0

    @classmethod
    def detect_CPUs(cls):
        """ Detects the number of CPUs on a system. Cribbed from pp.
        
        :from: http://codeliberates.blogspot.com/2008/05/detecting-cpuscores-in-python.html
        """
        ncpus = None
        # Linux, Unix and MacOS:
        if hasattr(os, "sysconf"):
            if os.sysconf_names.has_key("SC_NPROCESSORS_ONLN"):
                # Linux & Unix:
                ncpus = os.sysconf("SC_NPROCESSORS_ONLN")
            if isinstance(ncpus, int) and ncpus > 0:
                return ncpus
            else:  # OSX:
                return int(os.popen2("sysctl -n hw.ncpu")[1].read())
        # Windows:
        if os.environ.has_key("NUMBER_OF_PROCESSORS"):
            ncpus = int(os.environ["NUMBER_OF_PROCESSORS"])
            if ncpus > 0:
                return ncpus
        return 1  # Default
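
The same information is also available from multiprocessing.cpu_count() (Python 2.6 and later), so a simplified variant of detect_CPUs could look like the following sketch; this is not the pySPACE implementation, just an alternative under that assumption:

# Sketch: the standard library already exposes the logical CPU count.
import multiprocessing

def detect_cpus():
    try:
        return multiprocessing.cpu_count()
    except NotImplementedError:
        return 1  # conservative default, mirroring the original fallback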
Example No. 10
    def stage_in(self, operation):
        """
        Stage the current operation
        """
        super(MpiBackend, self).stage_in(operation)
        # init of process lists, because backend is only initialized once
        self.process_args_list = []
        self.IndexCopyStart = 0
        self.ProcessingSuccessful = True
        self.TotalProcessesFinished = 0
        self.CrashedProcesses = []
        # Set up progress bar
        widgets = [
            'Operation progress: ',
            Percentage(), ' ',
            Bar(), ' ',
            ETA()
        ]
        self.progress_bar = ProgressBar(
            widgets=widgets, maxval=self.current_operation.number_processes)
        self.progress_bar.start()

        # The handler that is used remotely for logging
        handler_class = logging.handlers.SocketHandler
        handler_args = {"host": self.host, "port": self.port}

        # Set up stage in directory
        stagein_dir = os.sep.join(
            [self.current_operation.result_directory, ".stagein"])
        # Check if the hosts file has been created in the right directory
        HostfileCreated = pySPACE.configuration.root_dir + "/" + 'hostsfile'
        if (not os.path.isfile(HostfileCreated)):
            print "***************************************************************************************************"
            print "hostsfile not created !"
            print "Please create the hosts file with a filename 'hostsfile' under ", pySPACE.configuration.root_dir
            print "***************************************************************************************************"
            raise UserWarning('Missing hostsfile.')
        if not os.path.exists(stagein_dir):
            os.mkdir(stagein_dir)

        process = self.current_operation.processes.get()
        print "Preparing processes. This might take a few minutes...."
        # While processes are still being created, prepare each process
        # from the queue for remote execution and execute it
        i = 0
        while process is not False:
            process.prepare(pySPACE.configuration, handler_class, handler_args)
            # since preparing a process can be much faster than executing it,
            # we need another queue from which processes are removed once they
            # have finished execution
            #self.result_handlers.put(1)
            # Execute all functions in the process pool but return immediately
            #self.pool.apply_async(process, callback=self.dequeue_process)
            proc_file_name = os.sep.join(
                [stagein_dir, "process_%d.pickle" % i])
            proc_file = open(proc_file_name, "w")
            cPickle.dump(process, proc_file)
            proc_file.close()
            # Add task to job specification
            self.process_args_list.append(proc_file_name)
            # Get the next process
            process = self.current_operation.processes.get()
            i += 1

        self._log("Operation - staged")
        self.state = "staged"
Example No. 11
class MpiBackend(Backend):
    """ 
    A message passing interface (mpi) backend to pySPACE
    
    In order to use this backend, you need a working MPI distribution and mpi4py. 
    You can download mpi4py from http://code.google.com/p/mpi4py/. mpi4py is 
    compatible with Python 2.3 to 2.7 or 3.0 to 3.1. 

    This backend assumes a global file system that is seen by all nodes running 
    the processes. 
    
    **Parameters**
        :pool_size: Define how many MPI processes should be started in parallel.
                    This should not exceed the amount of available processors.
                    (or the number of mpi slots defined in the hostsfile)
        
            (*recommended, default: 156*)
 
    """
    def __init__(self, pool_size=156):
        super(MpiBackend, self).__init__()
        #self.COMMAND_MPI = '/usr/lib64/openmpi/bin/mpirun'
        self.COMMAND_MPI = 'mpirun'
        self.COMMAND_PYTHON = sys.executable
        self.runner_script = os.sep.join([
            pySPACE.configuration.root_dir, "environments", "backends",
            "mpi_runner.py"
        ])
        # start as many processes as the total number of processors
        # available
        self.NumberOfProcessesToRunAtBeginning = pool_size
        self.NumberOfProcessesToRunLater = pool_size  #39

    def __del__(self):
        pass

    def stage_in(self, operation):
        """
        Stage the current operation
        """
        super(MpiBackend, self).stage_in(operation)
        # init of process lists, because backend is only initialized once
        self.process_args_list = []
        self.IndexCopyStart = 0
        self.ProcessingSuccessful = True
        self.TotalProcessesFinished = 0
        self.CrashedProcesses = []
        # Set up progress bar
        widgets = [
            'Operation progress: ',
            Percentage(), ' ',
            Bar(), ' ',
            ETA()
        ]
        self.progress_bar = ProgressBar(
            widgets=widgets, maxval=self.current_operation.number_processes)
        self.progress_bar.start()

        # The handler that is used remotely for logging
        handler_class = logging.handlers.SocketHandler
        handler_args = {"host": self.host, "port": self.port}

        # Set up stage in directory
        stagein_dir = os.sep.join(
            [self.current_operation.result_directory, ".stagein"])
        # Check if the hosts file has been created in the right directory
        HostfileCreated = pySPACE.configuration.root_dir + "/" + 'hostsfile'
        if (not os.path.isfile(HostfileCreated)):
            print "***************************************************************************************************"
            print "hostsfile not created !"
            print "Please create the hosts file with a filename 'hostsfile' under ", pySPACE.configuration.root_dir
            print "***************************************************************************************************"
            raise UserWarning('Missing hostsfile.')
        if not os.path.exists(stagein_dir):
            os.mkdir(stagein_dir)

        process = self.current_operation.processes.get()
        print "Preparing processes. This might take a few minutes...."
        # While processes are still being created, prepare each process
        # from the queue for remote execution and execute it
        i = 0
        while process is not False:
            process.prepare(pySPACE.configuration, handler_class, handler_args)
            # since preparing a process can be much faster than executing it,
            # we need another queue from which processes are removed once they
            # have finished execution
            #self.result_handlers.put(1)
            # Execute all functions in the process pool but return immediately
            #self.pool.apply_async(process, callback=self.dequeue_process)
            proc_file_name = os.sep.join(
                [stagein_dir, "process_%d.pickle" % i])
            proc_file = open(proc_file_name, "w")
            cPickle.dump(process, proc_file)
            proc_file.close()
            # Add task to job specification
            self.process_args_list.append(proc_file_name)
            # Get the next process
            process = self.current_operation.processes.get()
            i += 1

        self._log("Operation - staged")
        self.state = "staged"

    def execute(self, timeout=1e6):
        """
        Executes all processes specified in the currently staged
        operation.
        """
        assert (self.state == "staged")

    def check_status(self):
        """
        Returns a description of the current state of the operations
        execution. 
        """
        #self.progress_bar.update(float(self.current_job.info()["percentDone"]))
        #return float(self.current_job.info()["percentDone"]) / 100.0
        #return float(self.current_process) / self.current_operation.number_processes
        return 1.0

    def not_xor(self, a, b):
        return not ((a or b) and not (a and b))

    def retrieve(self, timeout=1e6):
        """
        Returns the result of the operation.
        """

        self.state = "executing"
        self._log("Operation - executing")
        if (self.NumberOfProcessesToRunAtBeginning > len(
                self.process_args_list)):
            args = ([self.COMMAND_MPI] + ['--loadbalance'] + ['--nolocal'] +
                    ['--hostfile'] +
                    [pySPACE.configuration.root_dir + "/" + 'hostsfile'] +
                    ['-n', str(len(self.process_args_list))] +
                    [self.COMMAND_PYTHON] + [self.runner_script] +
                    self.process_args_list)
            # Start the processes.
            self._log("mpi-parameters: %s" % args, level=logging.DEBUG)
            self._log("mpi-parameters-joined: %s" % os.path.join(args),
                      level=logging.DEBUG)
            p = subprocess.Popen(args)
            #self.pids.append(p)
            self.IndexCopyStart += self.NumberOfProcessesToRunAtBeginning
            #print args
        else:
            #copy the arguments of the processes to run
            sub_process_args_list = (
                self.process_args_list[self.IndexCopyStart:self.
                                       NumberOfProcessesToRunAtBeginning])
            args = ([self.COMMAND_MPI] + ['--loadbalance'] + ['--nolocal'] +
                    ['--hostfile'] +
                    [pySPACE.configuration.root_dir + "/" + 'hostsfile'] +
                    ['-n', str(len(sub_process_args_list))] +
                    [self.COMMAND_PYTHON] + [self.runner_script] +
                    sub_process_args_list)
            # Start the processes.
            p = subprocess.Popen(args)
            #self.pids.append(p) # TODO: call p.poll() for p in self.pids after all processes have exited
            self.IndexCopyStart += self.NumberOfProcessesToRunAtBeginning
            #print args

        # Create a list of booleans marking which processes have finished.
        # Initially no process has started, so every element is set to False.
        FinishedProcesses = [False for i in range(len(self.process_args_list))]

        # Wait until all processes finish and start new processes
        # when old ones finish

        print "Waiting for the processes to finish...."

        # Counter for the processes which are finished. It will be reset
        # after 'NumberOfProcessesToRunLater' processes are finished
        CounterProcessesFinished = 0
        processes_Finished = False

        while not processes_Finished:
            try:
                processes_Finished = True
                for LoopCounter, process_args in enumerate(
                        self.process_args_list):
                    if (self.not_xor(
                            os.path.isfile(process_args + "_Finished"),
                            os.path.isfile(process_args + "_Crashed"))):
                        processes_Finished = False
                    else:
                        if (FinishedProcesses[LoopCounter] == False):
                            # Record that the process is finished
                            FinishedProcesses[LoopCounter] = True
                            # If the process is crashed take note of that
                            if (os.path.isfile(process_args + "_Crashed")):
                                self.CrashedProcesses.append(process_args)
                            # Increment the counter for the number of processes finished
                            # by one
                            CounterProcessesFinished += 1
                            self.TotalProcessesFinished += 1
                            # update the progress bar
                            self.progress_bar.update(
                                self.TotalProcessesFinished)
                            if (CounterProcessesFinished ==
                                    self.NumberOfProcessesToRunLater):
                                # Define a variable for a subset of processes to run
                                sub_process_args_list = []
                                if (self.IndexCopyStart == len(
                                        self.process_args_list)):
                                    break
                                elif ((self.IndexCopyStart +
                                       self.NumberOfProcessesToRunLater) < len(
                                           self.process_args_list)):
                                    sub_process_args_list = (
                                        self.process_args_list[
                                            self.IndexCopyStart:self.
                                            IndexCopyStart +
                                            self.NumberOfProcessesToRunLater])
                                else:
                                    sub_process_args_list = self.process_args_list[
                                        self.IndexCopyStart:len(
                                            self.process_args_list)]
                                args = (
                                    [self.COMMAND_MPI] + ['--loadbalance'] +
                                    ['--nolocal'] + ['--hostfile'] + [
                                        pySPACE.configuration.root_dir + "/" +
                                        'hostsfile'
                                    ] +
                                    ['-n',
                                     str(len(sub_process_args_list))] +
                                    [self.COMMAND_PYTHON] +
                                    [self.runner_script] +
                                    sub_process_args_list)
                                # Start the processes
                                if (len(sub_process_args_list) > 0):
                                    p = subprocess.Popen(args)
                                #print args
                                # Adjust the start index
                                self.IndexCopyStart += self.NumberOfProcessesToRunLater
                                # Reset the counter for processes finished
                                CounterProcessesFinished = 0
                # sleep for one second
                time.sleep(1)
            except (KeyboardInterrupt,
                    SystemExit):  # if processes hang forever
                self.ProcessingSuccessful = False
                print "*********************************************************************************************************"
                print "pySPACE forced to stop ..."
                print "Please wait until mpi_backend is finished with consolidating the results generated and with clean up ..."
                print "**********************************************************************************************************"
                from pySPACE.resources.dataset_defs.performance_result import PerformanceResultSummary
                # merge the remaining files
                print "***************************************************************************************************"
                print "Starting merging . . ."
                PerformanceResultSummary.merge_performance_results(
                    self.current_operation.result_directory)
                print "Merging complete . . ."
                print "***************************************************************************************************"
                break  #The while loop will break

        self._log("Operation - processing finished")

        # Change the state to retrieved
        self.state = "retrieved"

        return None

    def consolidate(self):
        """
        Consolidates the results of the single processes into a consistent result of the whole
        operation
        """
        assert (self.state == "retrieved")

        if ((self.ProcessingSuccessful == True)
                and (len(self.CrashedProcesses) == 0)):
            self.current_operation.consolidate()

        if ((self.ProcessingSuccessful == True)
                and (len(self.CrashedProcesses) != 0)):
            from pySPACE.resources.dataset_defs.performance_result import PerformanceResultSummary
            # merge the remaining files
            print "***************************************************************************************************"
            print "Starting merging . . ."
            PerformanceResultSummary.merge_performance_results(
                self.current_operation.result_directory)
            print "Merging complete . . ."
            print "***************************************************************************************************"

        self._log("Operation - consolidated")

        self.state = "consolidated"

    def cleanup(self):
        """
        Remove the current operation and all potential results that
        have been stored in this object
        """
        self.state = "idling"

        # Cleaning up...
        stagein_dir = os.sep.join(
            [self.current_operation.result_directory, ".stagein"])
        if ((self.ProcessingSuccessful == True)
                and (len(self.CrashedProcesses) == 0)):
            deleted = False

            while not deleted:
                try:
                    os.chdir("..")
                    shutil.rmtree(stagein_dir)
                    deleted = True
                except OSError, e:
                    if e.errno == 66:
                        self._log(
                            "Could not remove .stagein dir "
                            ", waiting for NFS lock",
                            level=logging.WARNING)
                    time.sleep(5)

        self._log("Operation - cleaned up")
        self._log("Idling...")

        # Remove the file logger for this operation
        logging.getLogger('').removeHandler(self.file_handler)
        # close listener socket
        self.sock.close()
        self.current_operation = None
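
The cleanup step above keeps retrying shutil.rmtree because, on NFS mounts, the .stagein directory can temporarily refuse to be deleted while stale lock files are still present. The following is a minimal, standalone sketch of that retry idea; the function name, the errno values checked, and the retry limit are illustrative assumptions rather than part of the pySPACE backend.

import errno
import shutil
import time

def remove_dir_with_retries(path, attempts=10, delay=5):
    """ Remove a directory tree, retrying while the OS refuses to delete it

    On NFS, recently closed files can leave temporary lock files behind,
    so the first rmtree call may fail; waiting and retrying usually helps.
    The attempt count and delay are illustrative values.
    """
    for _ in range(attempts):
        try:
            shutil.rmtree(path)
            return True
        except OSError as e:
            # a directory that still contains stale NFS lock files
            # typically raises ENOTEMPTY or EBUSY
            if e.errno not in (errno.ENOTEMPTY, errno.EBUSY):
                raise
            time.sleep(delay)
    return False

# Usage sketch (the path is hypothetical):
# remove_dir_with_retries("/tmp/operation_results/.stagein")
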
Exemplo n.º 12
0
class MulticoreBackend(Backend):
    """ Execute as many processes in parallel as there are (logical) CPUs on the local machine
    
    This backend is based on the multiprocessing package and should work on every
    multicore system without additional settings even on virtual machines.
    Each process corresponds to one combination of input data set and
    parameter choice.
    
    :Author: Anett Seeland ([email protected])
    :LastChange: 2012/09/24
    
    """

    def __init__(self, pool_size=None):
        super(MulticoreBackend, self).__init__()

        # Set the number of processes in the pool
        # per default to the number of CPUs
        if pool_size is None:
            pool_size = MulticoreBackend.detect_CPUs()
        self.pool_size = pool_size
        self.state = "idling"

        # queue for execution
        self.result_handlers = []
        self.pool = None
        self.current_process = 0
        self._log("Created MulticoreBackend with pool size %s" % pool_size)

    def reset_queue(self):
        """ Resets the execution queue"""
        self.result_handlers = []

    def stage_in(self, operation):
        """ Stage the current operation """
        super(MulticoreBackend, self).stage_in(operation)
        self.pool = multiprocessing.Pool(processes=self.pool_size)

        # Set up progress bar
        widgets = ["Operation progress: ", Percentage(), " ", Bar(), " ", ETA()]
        self.progress_bar = ProgressBar(widgets=widgets, maxval=self.current_operation.number_processes)
        self.progress_bar.start()

        self._log("Operation - staged")
        self.state = "staged"

    def execute(self, timeout=None):
        """ Execute all processes specified in the currently staged operation """
        # This blocks until all results are available, hence this call is synchronous
        assert self.state == "staged"

        self._log("Operation - executing")
        self.state = "executing"

        # The handler that is used remotely for logging
        handler_class = logging.handlers.SocketHandler
        handler_args = {"host": self.host, "port": self.port}
        backend_com = (self.SERVER_IP, self.SERVER_PORT)

        # A socket communication thread to handle e.g. subflows
        self.listener = LocalComHandler(self.sock)
        self.listener.start()

        try:
            process = self.current_operation.processes.get(timeout=timeout)
        except KeyboardInterrupt:
            process = False
        # As long as the queue yields new processes, prepare each of them
        # for execution and submit it to the worker pool
        while process is not False:
            process.prepare(pySPACE.configuration, handler_class, handler_args, backend_com)
            # Execute all functions in the process pool but return immediately
            self.pool.apply_async(process, callback=self.dequeue_process)
            process = self.current_operation.processes.get(timeout=timeout)
            time.sleep(0.1)

    def dequeue_process(self, result):
        """ Callback function for finished processes """
        self.current_process += 1
        self.progress_bar.update(self.current_process)

    def check_status(self):
        """ Return a description of the current state of the operations execution
        
        .. todo:: do we really need this method???
        """
        # Returns which percentage of processes of the current operation
        # is already finished
        return float(self.current_process) / self.current_operation.number_processes

    def retrieve(self, timeout=0):
        """ Wait for all results of the operation
        
        This call blocks until all processes are finished.
        """
        assert self.state == "executing"

        # Prevent any other processes from being submitted to the pool
        # (necessary for join)
        self.pool.close()
        self._log("Closing pool", level=logging.DEBUG)

        self._log("Operation - retrieved")
        self.current_operation.processes.close()
        # if process creation has another thread
        if hasattr(self.current_operation, "create_process") and self.current_operation.create_process is not None:
            self.current_operation.create_process.join()
        # Close the result handler and wait for every process
        # to terminate
        try:
            for result in self.result_handlers:
                result.wait(timeout=timeout)
        except multiprocessing.TimeoutError:
            # A timeout occurred, terminate the pool
            self._log("Timeout occurred, terminating worker processes")
            self.pool.terminate()
            return False
        finally:
            self.pool.join()  # Wait for worker processes to exit
            # inform listener that its time to die
            self.listener.operation_finished = True
            time.sleep(1)
            self.listener.join()
            # Change the state to finished
            self.state = "retrieved"
        self._log("Worker processes have exited gracefully")
        return True

    def consolidate(self):
        """ Consolidate the single processes' results into a consistent result of the whole operation """
        assert self.state == "retrieved"
        try:
            self.current_operation.consolidate()
        except Exception:
            import traceback

            self._log(traceback.format_exc(), level=logging.ERROR)
        self._log("Operation - consolidated")
        self.state = "consolidated"

    def cleanup(self):
        """ Remove the current operation and all potential results that have been stored in this object """
        self.state = "idling"
        self._log("Operation - cleaned up")
        self._log("Idling...")
        # Remove the file logger for this operation
        logging.getLogger("").removeHandler(self.file_handler)
        # close listener socket
        self.sock.close()
        self.current_operation = None
        self.current_process = 0

    @classmethod
    def detect_CPUs(cls):
        """ Detects the number of CPUs on a system. Cribbed from pp.
        
        :from: http://codeliberates.blogspot.com/2008/05/detecting-cpuscores-in-python.html
        """
        ncpus = None
        # Linux, Unix and MacOS:
        if hasattr(os, "sysconf"):
            if "SC_NPROCESSORS_ONLN" in os.sysconf_names:
                # Linux & Unix:
                ncpus = os.sysconf("SC_NPROCESSORS_ONLN")
            if isinstance(ncpus, int) and ncpus > 0:
                return ncpus
            else:  # OSX:
                return int(os.popen2("sysctl -n hw.ncpu")[1].read())
        # Windows:
        if "NUMBER_OF_PROCESSORS" in os.environ:
            ncpus = int(os.environ["NUMBER_OF_PROCESSORS"])
            if ncpus > 0:
                return ncpus
        return 1  # Default
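
detect_CPUs() above reimplements CPU counting for portability across old Python versions and operating systems. On Python 2.6 and later the standard library already exposes this through multiprocessing.cpu_count(), so the pool size could also be chosen as in the following sketch; default_pool_size is a hypothetical helper, not part of the backend.

import multiprocessing

def default_pool_size():
    """ One worker per logical CPU, with a safe fallback """
    try:
        return multiprocessing.cpu_count()
    except NotImplementedError:
        return 1

# Usage sketch with the backend above:
# backend = MulticoreBackend(pool_size=default_pool_size())
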
Exemplo n.º 13
0
class LoadLevelerBackend(Backend):
    """ Commits every process to LoadLeveler cluster, which resumes parallel execution

    Each process corresponds to one combination of input data set and
    parameter choice. The process objects are first pickled.
    The path to the pickled object together with a helper script is then
    submitted to LoadLeveler. There the object is unpickled, called and
    the backend is informed when the results are stored.

    Communication between the independent processes and the backend is
    done via TCP socket connection (see
    :class:`~pySPACE.environments.backends.ll_backend.LoadLevelerComHandler` for detailed
    information).

    :Author: Anett Seeland ([email protected])
    :Created: 2011/06/08
    :LastChange: 2012/09/06 Add communication to SubflowHandler
    """
    LL_COMMAND_FILE_TEMPLATE = """
# @ job_type = serial
# @ notification = never
# @ class = {job_class}
# @ resources = ConsumableMemory({memory}) ConsumableCPUs({CPUs})
# @ requirements = {requirements}
# @ executable = {executable}
# @ arguments = {arguments}
# @ output = %(op_result_dir)s/log/pySPACE_$(jobid).out
# @ error = %(op_result_dir)s/log/pySPACE_$(jobid).err
# @ queue"""

    def __init__(self):
        super(LoadLevelerBackend, self).__init__()

        self.state = "idling"
        # create command file template for Loadleveler
        if "job_class" not in pySPACE.configuration or not pySPACE.configuration["job_class"]:
            pySPACE.configuration["job_class"] = "general"
        if "consumable_memory" not in pySPACE.configuration or not pySPACE.configuration["consumable_memory"]:
            pySPACE.configuration["consumable_memory"] = "3250mb"
        if "consumable_cpus" not in pySPACE.configuration or not pySPACE.configuration["consumable_cpus"]:
            pySPACE.configuration["consumable_cpus"] = 1
        if "anode" not in pySPACE.configuration:
            pySPACE.configuration["anode"] = ""

        assert (pySPACE.configuration["job_class"] in ['critical', 'critical_forking',
                                                       'general', 'general_forking',
                                                       'longterm', 'longterm_forking',
                                                       'test']),\
            "LL_Backend:: Job class not existing! Check your pySPACE config file!"

        self.template_file = LoadLevelerBackend.LL_COMMAND_FILE_TEMPLATE.format(
            executable=sys.executable,
            arguments=" ".join([os.path.join(pySPACE.configuration.root_dir,
                                             "environments", "backends", "ll_runner.py"),
                                "%(process_file_path)s", self.SERVER_IP, "%(server_port)d"]),
            job_class=pySPACE.configuration["job_class"],
            memory=pySPACE.configuration["consumable_memory"],
            CPUs=pySPACE.configuration["consumable_cpus"],
            requirements=pySPACE.configuration["anode"])

        self._log("Using '%s' as template", logging.DEBUG)

        # queue for execution
        self.result_handlers = None
        self.progress_bar = None
        self.process_dir = ""
        self._log("Created LoadLeveler Backend.")

    def stage_in(self, operation):
        """
        Stage the given operation.

        :param operation: The operation to stage.
        :type operation: Operation
        """
        super(LoadLevelerBackend, self).stage_in(operation)
        # set up queue
        self.result_handlers = multiprocessing.Queue(200)
        # Set up progress bar
        widgets = ['Operation progress: ', Percentage(), ' ', Bar(), ' ', ETA()]
        self.progress_bar = ProgressBar(widgets=widgets,
                                        maxval=self.current_operation.number_processes)
        self.progress_bar.start()

        self._log("Operation - staged")
        self.state = "staged"

    def execute(self, timeout=1e6):
        """ Execute all processes specified in the currently staged operation """
        assert (self.state == "staged")

        self._log("Operation - executing")
        self.state = "executing"

        # The handler that is used remotely for logging
        handler_class = logging.handlers.SocketHandler
        handler_args = {"host": self.host, "port": self.port}
        # the communication properties to talk to LoadLevelerComHandler
        backend_com = (self.SERVER_IP, self.SERVER_PORT)
        self._log('--> Loadleveler Communication : \n\t\t host:%s, port:%s' % (self.SERVER_IP, self.SERVER_PORT))
        # Prepare the directory where processes are stored before submitted
        # to LoadLeveler
        self.process_dir = os.sep.join([self.current_operation.result_directory, ".processes"])
        if not os.path.exists(self.process_dir):
            os.mkdir(self.process_dir)

        process_counter = 0

        # create and start server socket thread
        self.listener = LoadLevelerComHandler(self.sock, self.result_handlers,
                                              self.progress_bar,
                                              self.template_file,
                                              log_func=self._log,
                                              operation_dir=self.current_operation.result_directory)
        self.listener.start()
        # create a client socket to talk to server socket thread
        send_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        send_socket.connect((self.SERVER_IP, self.SERVER_PORT))
        try:
            # As long as the queue yields new processes, prepare each of them
            # for remote execution and submit it to LoadLeveler
            get_process = partial(self.current_operation.processes.get, timeout=timeout)
            for process in iter(get_process, False):
                process.prepare(pySPACE.configuration, handler_class, handler_args,
                                backend_com)
                # since preparing a process can be much faster than executing it,
                # we need another queue from which processes are removed only
                # when they have finished execution
                self.result_handlers.put(process)
                # pickle the process object
                proc_file_name = os.sep.join([self.process_dir,
                                              "process_%d.pickle" % process_counter])
                with open(proc_file_name, "wb") as proc_file:
                    pickle.dump(process, proc_file, pickle.HIGHEST_PROTOCOL)

                # fill out LoadLeveler template
                llfile = self.template_file % {
                    "process_file_path": proc_file_name,
                    "server_port": self.SERVER_PORT,
                    "op_result_dir": self.current_operation.result_directory}

                llfilepath = os.path.join(self.current_operation.result_directory, "ll_call.cmd")
                with open(llfilepath, 'w') as f:
                    f.write(llfile)

                # submit to LoadLeveler
                error_counter = 0
                while True:
                    outlog, errlog = sub.Popen(["llsubmit", llfilepath],
                                               stdout=sub.PIPE, stderr=sub.PIPE).communicate()
                    if errlog == "":
                        break
                    elif error_counter < 100:
                        self._log("Warning: Job submission to LoadLeveler failed"
                                  " with %s. Job will be resubmitted." % errlog,
                                  logging.WARNING)
                        time.sleep(1)
                        error_counter += 1
                    else:
                        self._log("Warning: Job submission to LoadLeveler failed %d times"
                                  " with %s. skipping job" % (error_counter, errlog),
                                  logging.WARNING)
                        break

                # parse job_id for monitoring
                loadl_id = outlog.split("\"")[1].split(".")[-1]
                # inform listener that we successfully submitted the job
                # noinspection PyTypeChecker
                send_socket = LoadLevelerComHandler.send_message(send_socket, self.SERVER_IP, self.SERVER_PORT,
                                                                 LoadLevelerComHandler.MESSAGES.SUBMITTED,
                                                                 process_counter, loadl_id)
                # update process_counter
                process_counter += 1

            # send message 'creation finished' to listener
            # noinspection PyTypeChecker
            send_socket = LoadLevelerComHandler.send_message(send_socket, self.SERVER_IP, self.SERVER_PORT,
                                                             LoadLevelerComHandler.MESSAGES.CREATION_FINISHED)
        finally:
            self.listener.creation_finished = True
            send_socket.shutdown(socket.SHUT_RDWR)
            send_socket.close()

    def check_status(self):
        """ Return a description of the current state of the operations execution

        .. todo:: do we really need this method???
        """
        # Returns the current state of the operation
        return self.state

    def retrieve(self, timeout=1e10):
        """
        Wait for all results of the operation

        This call blocks until all processes are finished
        or the given timeout is reached. If the timeout is zero,
        the timeout is disabled.

        :param timeout: The time to wait until a job is considered as "finished"
                        and will be stopped.
        :type timeout: int
        """
        assert (self.state == "executing")
        self._log("All processes submitted. Waiting for finishing.")
        # since self.current_operation.number_processes is not reliable (it may
        # be too high) we wait until the listener thread has terminated
        self.listener.finished.wait(timeout=timeout)
        self._log("Worker processes have exited gracefully")

        self.current_operation.processes.close()

        # if process creation has another thread
        if self.current_operation.create_process is not None:
            self.current_operation.create_process.join(timeout=timeout)
        self.result_handlers.close()
        # join also listener thread
        self.listener.join(timeout=timeout)
        # Change the state to finished
        self._log("Operation - retrieved")
        self.state = "retrieved"
        return True

    def consolidate(self):
        """ Consolidate the single processes' results into a consistent result of the whole operation """
        assert (self.state == "retrieved")

        self.current_operation.consolidate()

        self._log("Operation - consolidated")

        # collect all log files
        def _merge_files(file_list, delete=True):
            result_str = ""
            for filename in file_list:
                tmp_str = ""
                try:
                    if os.path.getsize(filename) != 0:
                        tmp_str += filename.split(os.sep)[-1] + "\n" + len(filename.split(os.sep)[-1]) * "-" + "\n"
                        with open(filename, 'r') as f:
                            tmp_str += f.read()
                        tmp_str += 80 * "-" + "\n"
                    if delete:
                        os.remove(filename)
                except (IOError, OSError), e:
                    self._log("Problems with file %s: %s." % (filename, e), logging.WARNING)
                result_str += tmp_str
            return result_str

        outlist = glob.glob(self.current_operation.result_directory + "/log/pySPACE*.out")
        out = _merge_files(outlist)
        errlist = glob.glob(self.current_operation.result_directory + "/log/pySPACE*.err")
        err = _merge_files(errlist)

        with open(self.current_operation.result_directory + "/pySPACE.out", 'w') as merged_out:
            merged_out.write(out)

        with open(self.current_operation.result_directory + "/pySPACE.err", 'w') as merged_err:
            merged_err.write(err)

        try:
            outlist = glob.glob(self.current_operation.result_directory + "/sub_log/pySPACE*.out")
            out = _merge_files(outlist)
            errlist = glob.glob(self.current_operation.result_directory + "/sub_log/pySPACE*.err")
            err = _merge_files(errlist)

            with open(self.current_operation.result_directory + "/pySPACE_sub.out", 'w') as merged_out:
                merged_out.write(out)

            with open(self.current_operation.result_directory + "/pySPACE_sub.err", 'w') as merged_err:
                merged_err.write(err)
        except IOError:
            pass

        self._log("Process Logging - consolidated")
        self.state = "consolidated"
Exemplo n.º 14
0
class MpiBackend(Backend):
    """ 
    A message passing interface (MPI) backend for pySPACE
    
    In order to use this backend, you need a working MPI distribution and mpi4py.
    You can download mpi4py from http://code.google.com/p/mpi4py/. mpi4py is
    compatible with Python 2.3 to 2.7 or 3.0 to 3.1.

    This backend assumes a global file system that is seen by all nodes running 
    the processes. 
 
    """
    
    def __init__(self):
        super(MpiBackend, self).__init__()
        
        self.COMMAND_MPI = '/usr/lib64/openmpi/bin/mpirun'
        self.COMMAND_PYTHON = sys.executable
        self.runner_script = os.sep.join([pySPACE.configuration.root_dir,
                             "app",
                             "backends",
                             "mpi_runner.py"])
        # start as many processes as the total number of processors
        # available
        self.NumberOfProcessesToRunAtBeginning = 156
        self.NumberOfProcessesToRunLater = 39
    def __del__(self):
        pass
        

    def stage_in(self, operation):
        """
        Stage the current operation
        """
        super(MpiBackend, self).stage_in(operation)
        # init of process lists, because backend is only initialized once        
        self.process_args_list = []
        self.IndexCopyStart = 0
        self.ProcessingSuccessful = True
        self.TotalProcessesFinished = 0
        self.CrashedProcesses = []
        # Set up progress bar
        widgets = ['Operation progress: ', Percentage(), ' ', Bar(), ' ', ETA()]
        self.progress_bar = ProgressBar(widgets = widgets, 
                                       maxval = self.current_operation.number_processes)
        self.progress_bar.start()

        # The handler that is used remotely for logging
        handler_class = logging.handlers.SocketHandler
        handler_args = {"host" : self.host, "port" : self.port}
        
        # Set up stage in directory
        stagein_dir = os.sep.join([self.current_operation.result_directory,
                                   ".stagein"])
        # Check if the hosts file exists in the right directory
        HostfileCreated = pySPACE.configuration.root_dir+ "/" +'hostsfile'
        if (not os.path.isfile(HostfileCreated)):
            print "***************************************************************************************************"
            print "hostsfile not created !"
            print "Please create the hosts file with a filename 'hostsfile' under ", pySPACE.configuration.root_dir
            print "***************************************************************************************************"
            raise UserWarning('Missing hostsfile.')
        if not os.path.exists(stagein_dir):
            os.mkdir(stagein_dir)   

        process = self.current_operation.processes.get()
        print "Preparing processes. This might take a few minutes...."
        # As long as the queue yields new processes, prepare each of them
        # for remote execution and pickle it to the stage-in directory
        i = 0
        while process is not False:
            process.prepare(pySPACE.configuration, handler_class, handler_args)
            # since preparing a process can be much faster than executing it,
            # we need another queue from which processes are removed only
            # when they have finished execution
            #self.result_handlers.put(1)
            # Execute all functions in the process pool but return immediately
            #self.pool.apply_async(process, callback=self.dequeue_process)
            proc_file_name = os.sep.join([stagein_dir,
                                          "process_%d.pickle" % i])
            proc_file = open(proc_file_name, "w")
            cPickle.dump(process, proc_file)
            proc_file.close()
            # Add task to job specification
            self.process_args_list.append(proc_file_name)
            # Get the next process
            process = self.current_operation.processes.get()
            i+=1

        self._log("Operation - staged")
        self.state = "staged"        
        
    def execute(self):
        """
        Executes all processes specified in the currently staged
        operation.
        """
        assert(self.state == "staged")
        
        
    def check_status(self):
        """
        Returns a description of the current state of the operations
        execution. 
        """
        #self.progress_bar.update(float(self.current_job.info()["percentDone"]))
        #return float(self.current_job.info()["percentDone"]) / 100.0
        #return float(self.current_process) / self.current_operation.number_processes
        return 1.0

    def not_xor(self, a, b):
        return not((a or b) and not (a and b))
    
    def retrieve(self):
        """
        Returns the result of the operation.
        """
        
        self.state = "executing" 
        self._log("Operation - executing") 
        if (self.NumberOfProcessesToRunAtBeginning > len(self.process_args_list)):
            args = ([self.COMMAND_MPI] +
                ['--loadbalance']+
                ['--nolocal']+
                ['--hostfile'] +
                [pySPACE.configuration.root_dir+ "/" +'hostsfile'] +
                ['-n', str(len(self.process_args_list))] +
                [self.COMMAND_PYTHON] +  
                [self.runner_script] + 
                self.process_args_list)
            # Start the processes. 
            p =subprocess.Popen(args)
            #self.pids.append(p)
            self.IndexCopyStart += self.NumberOfProcessesToRunAtBeginning
            #print args
        else:
            #copy the arguments of the processes to run
            sub_process_args_list = (self.process_args_list[self.IndexCopyStart: 
                                     self.NumberOfProcessesToRunAtBeginning])
            args = ([self.COMMAND_MPI] +
                ['--loadbalance']+
                ['--nolocal']+
                ['--hostfile'] +
                [pySPACE.configuration.root_dir+ "/" +'hostsfile'] +
                ['-n', str(len(sub_process_args_list))] +
                [self.COMMAND_PYTHON] +  
                [self.runner_script] + 
                sub_process_args_list)
            # Start the processes. 
            p = subprocess.Popen(args)
            #self.pids.append(p) # TODO: call p.poll() for p in self.pids after all processes have exited
            self.IndexCopyStart += self.NumberOfProcessesToRunAtBeginning
            #print args

        # Create a list of booleans recording which processes have finished.
        # Initially every entry is set to False because no process has
        # finished yet
        FinishedProcesses=[False for i in range(len(self.process_args_list))] 
        
        # Wait until all processes finish and start new processes
        # when old ones finish

        print "Waiting for the processes to finish...."

        # Counter for the processes which are finished. It will be reset
        # after 'NumberOfProcessesToRunLater' processes are finished
        CounterProcessesFinished = 0
        processes_Finished = False

        while not processes_Finished:
          try:
             processes_Finished = True
             for LoopCounter, process_args in enumerate(self.process_args_list):
                 if (self.not_xor (os.path.isfile(process_args+"_Finished"), 
                               os.path.isfile(process_args+"_Crashed"))):
                    processes_Finished = False
                 else:
                    if (FinishedProcesses[LoopCounter] == False):
                       # Record that the process is finished                       
                       FinishedProcesses[LoopCounter] = True
                       # If the process is crashed take note of that
                       if (os.path.isfile(process_args+"_Crashed")):
                           self.CrashedProcesses.append(process_args)
                       # Increment the counter for the number of processes finished
                       # by one
                       CounterProcessesFinished += 1
                       self.TotalProcessesFinished += 1 
                       # update the progress bar
                       self.progress_bar.update(self.TotalProcessesFinished)
                       if (CounterProcessesFinished == self.NumberOfProcessesToRunLater):
                          # Define a variable for a subset of processes to run
                          sub_process_args_list = []
                          if (self.IndexCopyStart==len(self.process_args_list)):
                              break
                          elif ((self.IndexCopyStart+self.NumberOfProcessesToRunLater)< len(self.process_args_list)):
                              sub_process_args_list = (self.process_args_list[self.IndexCopyStart:
                                                       self.IndexCopyStart +self.NumberOfProcessesToRunLater])
                          else:
                              sub_process_args_list = self.process_args_list[self.IndexCopyStart:len(self.process_args_list)]
                          args = ([self.COMMAND_MPI] +
                                 ['--loadbalance']+
                                 ['--nolocal']+
                                 ['--hostfile'] +
                                 [pySPACE.configuration.root_dir+ "/" +'hostsfile'] +
                                 ['-n', str(len(sub_process_args_list))] +
                                 [self.COMMAND_PYTHON] +  
                                 [self.runner_script] + 
                                 sub_process_args_list)
                          # Start the processes
                          if (len(sub_process_args_list) > 0):
                             p = subprocess.Popen(args)
                          #print args                          
                          # Adjust the start index
                          self.IndexCopyStart += self.NumberOfProcessesToRunLater
                          # Reset the counter for processes finished
                          CounterProcessesFinished = 0
             # sleep for one second                
             time.sleep(1)
          except (KeyboardInterrupt, SystemExit): # if processes hang forever
            self.ProcessingSuccessful = False
            print "*********************************************************************************************************"
            print "pySPACE forced to stop ..."
            print "Please wait until mpi_backend is finished with consolidating the results generated and with clean up ..."
            print "**********************************************************************************************************"
            import pySPACE.resources.dataset_defs.performance_result.PerformanceResultSummary as PerformanceResultSummary
            # merge the remaining files
            print "***************************************************************************************************"
            print "Starting merging . . ."
            PerformanceResultSummary.merge_performance_results(self.current_operation.result_directory)
            print "Merging complete . . ."
            print "***************************************************************************************************"
            break #The while loop will break

        self._log("Operation - processing finished")
        
        # Change the state to retrieved
        self.state = "retrieved"
        
        return None


    def consolidate(self):
        """
        Consolidates the results of the single processes into a consistent result of the whole
        operation
        """
        assert(self.state == "retrieved")
        
        if ((self.ProcessingSuccessful == True) and (len(self.CrashedProcesses) == 0)):
            self.current_operation.consolidate()
                 
        if ((self.ProcessingSuccessful == True) and (len(self.CrashedProcesses) != 0)):
            import pySPACE.resources.dataset_defs.performance_result.PerformanceResultSummary as PerformanceResultSummary
            # merge the remaining files
            print "***************************************************************************************************"
            print "Starting merging . . ."
            PerformanceResultSummary.merge_performance_results(self.current_operation.result_directory)
            print "Merging complete . . ."
            print "***************************************************************************************************"

        self._log("Operation - consolidated")
        
        self.state = "consolidated"
        
        
    def cleanup(self):
        """
        Remove the current operation and all potential results that
        have been stored in this object
        """
        self.state = "idling"

        # Cleaning up...
        stagein_dir = os.sep.join([self.current_operation.result_directory,
                                   ".stagein"])
        if ((self.ProcessingSuccessful == True) and (len(self.CrashedProcesses) == 0)):
           deleted = False

           while not deleted:
               try:
                  os.chdir("..")
                  shutil.rmtree(stagein_dir)
                  deleted = True
               except OSError, e:
                  if e.errno == 66:
                     self._log("Could not remove .stagein dir " 
                             ", waiting for NFS lock",
                              level=logging.WARNING)
                  time.sleep(5)
               
        self._log("Operation - cleaned up")
        self._log("Idling...")
        
        # Remove the file logger for this operation
        logging.getLogger('').removeHandler(self.file_handler)       
        # close listener socket
        self.sock.close()
        self.current_operation = None 
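
retrieve() in the MpiBackend above assembles an mpirun command line for each batch of pickled processes: load-balanced, excluding the submitting node, using the mandatory hostsfile, and starting one MPI rank per pickle file via mpi_runner.py. The sketch below spells out that argument list for a small batch so the resulting command is easier to read; all paths and file names are illustrative placeholders.

import subprocess
import sys

mpirun = '/usr/lib64/openmpi/bin/mpirun'              # COMMAND_MPI in the backend
runner = '/opt/pySPACE/app/backends/mpi_runner.py'    # hypothetical path
hostsfile = '/opt/pySPACE/hostsfile'                  # hypothetical path
batch = ['process_0.pickle', 'process_1.pickle', 'process_2.pickle']

args = ([mpirun] +
        ['--loadbalance'] +        # spread ranks evenly over the hosts
        ['--nolocal'] +            # do not run ranks on the submitting node
        ['--hostfile', hostsfile] +
        ['-n', str(len(batch))] +  # one MPI rank per pickled process
        [sys.executable, runner] +
        batch)                     # each rank processes one pickle file

# subprocess.Popen(args) would start the batch without blocking, exactly as
# retrieve() does for the first NumberOfProcessesToRunAtBeginning processes
# and then for every following batch of NumberOfProcessesToRunLater processes.
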
Exemplo n.º 15
0
class SerialBackend(Backend):
    """ A backend that allows for easy debugging since the program flow
    is not threaded or distributed over several OS processes.
    """
    def __init__(self):
        super(SerialBackend, self).__init__()

        self.state = "idling"
        self.current_process = 0

    def stage_in(self, operation):
        """
        Stage the current operation
        """
        super(SerialBackend, self).stage_in(operation)

        # Set up progress bar
        widgets = [
            'Operation progress: ',
            Percentage(), ' ',
            Bar(), ' ',
            ETA()
        ]
        self.progress_bar = ProgressBar(
            widgets=widgets, maxval=self.current_operation.number_processes)
        self.progress_bar.start()

        self._log("Operation - staged")
        self.state = "staged"

    def execute(self, timeout=1e6):
        """
        Executes all processes specified in the currently staged
        operation.
        """
        assert (self.state == "staged")

        self.state = "executing"
        self._log("Operation - executing")

        # The handler that is used remotely for logging
        handler_class = logging.handlers.SocketHandler
        handler_args = {"host": self.host, "port": self.port}

        get_process = partial(self.current_operation.processes.get,
                              timeout=timeout)
        for process in iter(get_process, False):
            process.prepare(pySPACE.configuration, handler_class, handler_args)
            # Execute process, update progress bar and get next queue-element
            try:
                process()
            # if an exception is raised somewhere in the code we maybe want to
            # further try other processes
            except Exception:
                self._log(traceback.format_exc(), level=logging.CRITICAL)
                process.post_benchmarking()
                process = False
            # if ctrl+c is pressed we want to immediately stop everything
            except KeyboardInterrupt:
                self._log(traceback.format_exc(), level=logging.CRITICAL)
                process.post_benchmarking()
                process = False
            else:
                self.current_process += 1
                self.progress_bar.update(self.current_process)

    def check_status(self):
        """
        Returns a description of the current state of the operations
        execution.
        
        .. todo:: do we really need this method???
        """
        # Returns which percentage of processes of the current operation
        # is already finished
        return float(
            self.current_process) / self.current_operation.number_processes

    def retrieve(self, timeout=1e6):
        """
        Returns the result of the operation.
        
        This is trivial in the Debug-Backend since execute blocks.
        """
        assert (self.state == "executing")

        self._log("Operation - retrieved")

        self.current_operation.processes.close()
        # if process creation has another thread
        if hasattr(self.current_operation, "create_process") \
                        and self.current_operation.create_process != None:
            self.current_operation.create_process.join(timeout=1e6)

        # Change the state to retrieved
        self.state = "retrieved"

    def consolidate(self):
        """
        Consolidates the results of the single processes into a consistent result of the whole
        operation
        """
        assert (self.state == "retrieved")

        try:
            self.current_operation.consolidate()
        except Exception:
            self._log(traceback.format_exc(), level=logging.CRITICAL)

        self._log("Operation - consolidated")
        self.state = "consolidated"

    def cleanup(self):
        """
        Remove the current operation and all potential results that
        have been stored in this object
        """
        self.state = "idling"

        self._log("Operation - cleaned up")
        self._log("Idling...")

        # Remove the file logger for this operation
        logging.getLogger('').removeHandler(self.file_handler)
        # close listener socket
        self.sock.close()

        self.current_operation = None
        self.current_process = 0
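
SerialBackend.execute() above drains the process queue with iter(get_process, False): functools.partial binds the timeout to the queue's get method, and iter() with a sentinel keeps calling it until it returns False, the same end-of-queue marker the other backends test against. The following self-contained snippet illustrates that pattern with a plain queue; the queue contents are made up.

from functools import partial
import Queue  # named "queue" on Python 3

work = Queue.Queue()
for item in ["job_a", "job_b", "job_c"]:
    work.put(item)
work.put(False)  # the sentinel used to signal "no more processes"

get_item = partial(work.get, timeout=5)
for job in iter(get_item, False):
    # iter() keeps calling get_item() and stops as soon as it returns False
    print job
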
Exemplo n.º 16
0
    def stage_in(self, operation):
        """
        Stage the current operation
        """
        super(MpiBackend, self).stage_in(operation)
        # init of process lists, because backend is only initialized once        
        self.process_args_list = []
        self.IndexCopyStart = 0
        self.ProcessingSuccessful = True
        self.TotalProcessesFinished = 0
        self.CrashedProcesses = []
        # Set up progress bar
        widgets = ['Operation progress: ', Percentage(), ' ', Bar(), ' ', ETA()]
        self.progress_bar = ProgressBar(widgets = widgets, 
                                       maxval = self.current_operation.number_processes)
        self.progress_bar.start()

        # The handler that is used remotely for logging
        handler_class = logging.handlers.SocketHandler
        handler_args = {"host" : self.host, "port" : self.port}
        
        # Set up stage in directory
        stagein_dir = os.sep.join([self.current_operation.result_directory,
                                   ".stagein"])
        # Check if the hosts file exists in the right directory
        HostfileCreated = pySPACE.configuration.root_dir+ "/" +'hostsfile'
        if (not os.path.isfile(HostfileCreated)):
            print "***************************************************************************************************"
            print "hostsfile not created !"
            print "Please create the hosts file with a filename 'hostsfile' under ", pySPACE.configuration.root_dir
            print "***************************************************************************************************"
            raise UserWarning('Missing hostsfile.')
        if not os.path.exists(stagein_dir):
            os.mkdir(stagein_dir)   

        process = self.current_operation.processes.get()
        print "Preparing processes. This might take a few minutes...."
        # As long as the queue yields new processes, prepare each of them
        # for remote execution and pickle it to the stage-in directory
        i = 0
        while process is not False:
            process.prepare(pySPACE.configuration, handler_class, handler_args)
            # since preparing a process can be much faster than executing it,
            # we need another queue from which processes are removed only
            # when they have finished execution
            #self.result_handlers.put(1)
            # Execute all functions in the process pool but return immediately
            #self.pool.apply_async(process, callback=self.dequeue_process)
            proc_file_name = os.sep.join([stagein_dir,
                                          "process_%d.pickle" % i])
            proc_file = open(proc_file_name, "w")
            cPickle.dump(process, proc_file)
            proc_file.close()
            # Add task to job specification
            self.process_args_list.append(proc_file_name)
            # Get the next process
            process = self.current_operation.processes.get()
            i+=1

        self._log("Operation - staged")
        self.state = "staged"        
Exemplo n.º 17
0
class LoadLevelerBackend(Backend):
    """ Commits every process to LoadLeveler cluster, which resumes parallel execution

    Each process corresponds to one combination of input data set and
    parameter choice. The process objects are first pickled.
    The path to the pickled object together with a helper script is then
    submitted to LoadLeveler. There the object is unpickled, called and
    the backend is informed when the results are stored.

    Communication between the independent processes and the backend is
    done via TCP socket connection (see
    :class:`~pySPACE.environments.backends.ll_backend.LoadLevelerComHandler` for detailed
    information).

    :Author: Anett Seeland ([email protected])
    :Created: 2011/06/08
    :LastChange: 2012/09/06 Add communication to SubflowHandler
    """
    LL_COMMAND_FILE_TEMPLATE = """
# @ job_type = serial
# @ notification = never
# @ class = {job_class}
# @ resources = ConsumableMemory({memory}) ConsumableCPUs({CPUs})
# @ requirements = {requirements}
# @ executable = {executable}
# @ arguments = {arguments}
# @ output = %(op_result_dir)s/log/pySPACE_$(jobid).out
# @ error = %(op_result_dir)s/log/pySPACE_$(jobid).err
# @ queue"""

    def __init__(self):
        super(LoadLevelerBackend, self).__init__()

        self.state = "idling"
        # create command file template for Loadleveler
        if "job_class" not in pySPACE.configuration or not pySPACE.configuration[
                "job_class"]:
            pySPACE.configuration["job_class"] = "general"
        if "consumable_memory" not in pySPACE.configuration or not pySPACE.configuration[
                "consumable_memory"]:
            pySPACE.configuration["consumable_memory"] = "3250mb"
        if "consumable_cpus" not in pySPACE.configuration or not pySPACE.configuration[
                "consumable_cpus"]:
            pySPACE.configuration["consumable_cpus"] = 1
        if "anode" not in pySPACE.configuration:
            pySPACE.configuration["anode"] = ""

        assert (pySPACE.configuration["job_class"] in ['critical', 'critical_forking',
                                                       'general', 'general_forking',
                                                       'longterm', 'longterm_forking',
                                                       'test']),\
            "LL_Backend:: Job class not existing! Check your pySPACE config file!"

        self.template_file = LoadLevelerBackend.LL_COMMAND_FILE_TEMPLATE.format(
            executable=sys.executable,
            arguments=" ".join([
                os.path.join(pySPACE.configuration.root_dir, "environments",
                             "backends", "ll_runner.py"),
                "%(process_file_path)s", self.SERVER_IP, "%(server_port)d"
            ]),
            job_class=pySPACE.configuration["job_class"],
            memory=pySPACE.configuration["consumable_memory"],
            CPUs=pySPACE.configuration["consumable_cpus"],
            requirements=pySPACE.configuration["anode"])

        self._log("Using '%s' as template", logging.DEBUG)

        # queue for execution
        self.result_handlers = None
        self.progress_bar = None
        self.process_dir = ""
        self._log("Created LoadLeveler Backend.")

    def stage_in(self, operation):
        """
        Stage the given operation.

        :param operation: The operation to stage.
        :type operation: Operation
        """
        super(LoadLevelerBackend, self).stage_in(operation)
        # set up queue
        self.result_handlers = multiprocessing.Queue(200)
        # Set up progress bar
        widgets = [
            'Operation progress: ',
            Percentage(), ' ',
            Bar(), ' ',
            ETA()
        ]
        self.progress_bar = ProgressBar(
            widgets=widgets, maxval=self.current_operation.number_processes)
        self.progress_bar.start()

        self._log("Operation - staged")
        self.state = "staged"

    def execute(self, timeout=1e6):
        """ Execute all processes specified in the currently staged operation """
        assert (self.state == "staged")

        self._log("Operation - executing")
        self.state = "executing"

        # The handler that is used remotely for logging
        handler_class = logging.handlers.SocketHandler
        handler_args = {"host": self.host, "port": self.port}
        # the communication properties to talk to LoadLevelerComHandler
        backend_com = (self.SERVER_IP, self.SERVER_PORT)
        self._log('--> Loadleveler Communication : \n\t\t host:%s, port:%s' %
                  (self.SERVER_IP, self.SERVER_PORT))
        # Prepare the directory where processes are stored before submitted
        # to LoadLeveler
        self.process_dir = os.sep.join(
            [self.current_operation.result_directory, ".processes"])
        if not os.path.exists(self.process_dir):
            os.mkdir(self.process_dir)

        process_counter = 0

        # create and start server socket thread
        self.listener = LoadLevelerComHandler(
            self.sock,
            self.result_handlers,
            self.progress_bar,
            self.template_file,
            log_func=self._log,
            operation_dir=self.current_operation.result_directory)
        self.listener.start()
        # create a client socket to talk to server socket thread
        send_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        send_socket.connect((self.SERVER_IP, self.SERVER_PORT))
        try:
            # As long as the queue yields new processes, prepare each of them
            # for remote execution and submit it to LoadLeveler
            get_process = partial(self.current_operation.processes.get,
                                  timeout=timeout)
            for process in iter(get_process, False):
                process.prepare(pySPACE.configuration, handler_class,
                                handler_args, backend_com)
                # since preparing a process can be much faster than executing it,
                # we need another queue from which processes are removed only
                # when they have finished execution
                self.result_handlers.put(process)
                # pickle the process object
                proc_file_name = os.sep.join(
                    [self.process_dir,
                     "process_%d.pickle" % process_counter])
                with open(proc_file_name, "wb") as proc_file:
                    pickle.dump(process, proc_file, pickle.HIGHEST_PROTOCOL)

                # fill out LoadLeveler template
                llfile = self.template_file % {
                    "process_file_path": proc_file_name,
                    "server_port": self.SERVER_PORT,
                    "op_result_dir": self.current_operation.result_directory
                }

                llfilepath = os.path.join(
                    self.current_operation.result_directory, "ll_call.cmd")
                with open(llfilepath, 'w') as f:
                    f.write(llfile)

                # submit to LoadLeveler
                error_counter = 0
                while True:
                    outlog, errlog = sub.Popen(["llsubmit", llfilepath],
                                               stdout=sub.PIPE,
                                               stderr=sub.PIPE).communicate()
                    if errlog == "":
                        break
                    elif error_counter < 100:
                        self._log(
                            "Warning: Job submission to LoadLeveler failed"
                            " with %s. Job will be resubmitted." % errlog,
                            logging.WARNING)
                        time.sleep(1)
                        error_counter += 1
                    else:
                        self._log(
                            "Warning: Job submission to LoadLeveler failed %d times"
                            " with %s. skipping job" % (error_counter, errlog),
                            logging.WARNING)
                        break

                # parse job_id for monitoring
                loadl_id = outlog.split("\"")[1].split(".")[-1]
                # inform listener that we successfully submitted the job
                # noinspection PyTypeChecker
                send_socket = LoadLevelerComHandler.send_message(
                    send_socket, self.SERVER_IP, self.SERVER_PORT,
                    LoadLevelerComHandler.MESSAGES.SUBMITTED, process_counter,
                    loadl_id)
                # update process_counter
                process_counter += 1

            # send message 'creation finished' to listener
            # noinspection PyTypeChecker
            send_socket = LoadLevelerComHandler.send_message(
                send_socket, self.SERVER_IP, self.SERVER_PORT,
                LoadLevelerComHandler.MESSAGES.CREATION_FINISHED)
        finally:
            self.listener.creation_finished = True
            send_socket.shutdown(socket.SHUT_RDWR)
            send_socket.close()

    def check_status(self):
        """ Return a description of the current state of the operations execution

        .. todo:: do we really need this method???
        """
        # Returns the current state of the operation
        return self.state

    def retrieve(self, timeout=1e6):
        """
        Wait for all results of the operation

        This call blocks until all processes are finished
        or the given timeout is reached. If the timeout is zero,
        the timeout is disabled.

        :param timeout: The time to wait until a job is considered as "finished"
                        and will be stopped.
        :type timeout: int
        """
        assert (self.state == "executing")
        self._log("All processes submitted. Waiting for finishing.")
        # since self.current_operation.number_processes is not reliable (it may
        # be too high) we wait until the listener thread has terminated
        self.listener.finished.wait(timeout=timeout)
        self._log("Worker processes have exited gracefully")

        self.current_operation.processes.close()

        # if process creation has another thread
        if self.current_operation.create_process is not None:
            self.current_operation.create_process.join(timeout=timeout)
        self.result_handlers.close()
        # join also listener thread
        self.listener.join(timeout=timeout)
        # Change the state to finished
        self._log("Operation - retrieved")
        self.state = "retrieved"
        return True

    def consolidate(self):
        """ Consolidate the single processes' results into a consistent result of the whole operation """
        assert (self.state == "retrieved")

        self.current_operation.consolidate()

        self._log("Operation - consolidated")

        # collect all log files
        def _merge_files(file_list, delete=True):
            result_str = ""
            for filename in file_list:
                tmp_str = ""
                try:
                    if os.path.getsize(filename) != 0:
                        tmp_str += filename.split(os.sep)[-1] + "\n" + len(
                            filename.split(os.sep)[-1]) * "-" + "\n"
                        with open(filename, 'r') as f:
                            tmp_str += f.read()
                        tmp_str += 80 * "-" + "\n"
                    if delete:
                        os.remove(filename)
                except (IOError, OSError), e:
                    self._log("Problems with file %s: %s." % (filename, e),
                              logging.WARNING)
                result_str += tmp_str
            return result_str

        outlist = glob.glob(self.current_operation.result_directory +
                            "/log/pySPACE*.out")
        out = _merge_files(outlist)
        errlist = glob.glob(self.current_operation.result_directory +
                            "/log/pySPACE*.err")
        err = _merge_files(errlist)

        with open(self.current_operation.result_directory + "/pySPACE.out",
                  'w') as merged_out:
            merged_out.write(out)

        with open(self.current_operation.result_directory + "/pySPACE.err",
                  'w') as merged_err:
            merged_err.write(err)

        try:
            outlist = glob.glob(self.current_operation.result_directory +
                                "/sub_log/pySPACE*.out")
            out = _merge_files(outlist)
            errlist = glob.glob(self.current_operation.result_directory +
                                "/sub_log/pySPACE*.err")
            err = _merge_files(errlist)

            with open(
                    self.current_operation.result_directory +
                    "/pySPACE_sub.out", 'w') as merged_out:
                merged_out.write(out)

            with open(
                    self.current_operation.result_directory +
                    "/pySPACE_sub.err", 'w') as merged_err:
                merged_err.write(err)
        except IOError:
            pass

        self._log("Process Logging - consolidated")
        self.state = "consolidated"