예제 #1
0
    def start_pilot_job(self,
                 lrms_url,
                 bigjob_agent_executable=None,
                 number_nodes=1,
                 queue=None,
                 project=None,
                 working_directory=None,
                 userproxy=None,
                 walltime=None,
                 processes_per_node=1,
                 filetransfers=None):
        """ Start a batch job (using SAGA Job API) at a resource manager.

        Currently, the following resource managers are supported:
            fork://localhost/ (Default Job Adaptor)
            gram://qb1.loni.org/jobmanager-pbs (Globus Adaptor)
            pbspro://localhost (PBS Pro Adaptor)
        The URL schemes "condorg", "ssh" and "pbs-ssh" are special-cased
        in the body as well.

        Parameters:
            lrms_url: URL of the resource manager. Its scheme selects the
                adaptor specific handling; its host becomes part of
                self.pilot_url and of the staging target URL.
            bigjob_agent_executable: not used by this method.
            number_nodes: stored as int in self.number_nodes and written to
                the job description. For "pbs-ssh" it is divided by
                processes_per_node (rounded up) before it is handed to pbsssh.
            queue: if not None, set as jd.queue.
            project: if not None, set as jd.job_project.
            working_directory: working directory of the job; falls back to
                the expanded "~". Created locally when it is missing and the
                scheme is "fork".
            userproxy: if non-empty, exported as X509_USER_PROXY and attached
                to the SAGA session as x509 context.
            walltime: if not None, set as jd.wall_time_limit. For "pbs-ssh"
                it is required and interpreted as minutes.
            processes_per_node: written to the job description / handed to
                pbsssh.
            filetransfers: passed unchanged to the file staging helper.

        Raises:
            BigJobError: if self.job is already set.
        """

        if self.job is not None:
            raise BigJobError("One BigJob already active. Please stop BigJob first.")

        ##############################################################################
        # initialization of coordination and communication subsystem
        lrms_saga_url = saga.url(lrms_url)
        self.pilot_url = self.app_url + ":" + lrms_saga_url.host
        pilot_url_dict[self.pilot_url] = self

        logger.debug("create pilot job entry on backend server: " + self.pilot_url)
        self.coordination.set_pilot_state(self.pilot_url, str(Unknown), False)

        logger.debug("set pilot state to: " + str(Unknown))
        ##############################################################################

        self.number_nodes = int(number_nodes)

        # create job description
        jd = saga.job.description()

        scheme = lrms_saga_url.scheme
        logger.debug("Adaptor specific modifications: " + str(scheme))
        if scheme == "condorg":
            coordination_address = self.coordination.get_address()
            jd.arguments = ["-a", coordination_address, "-b", self.pilot_url]
            # The message needs one placeholder per argument; without them
            # the logging module fails to format the record.
            logger.debug("\n\n-a %s -b %s", coordination_address, self.pilot_url)
            # NOTE(review): the bootstrap script is located relative to the
            # current working directory of the calling process.
            agent_exe = os.path.abspath(os.path.join(os.getcwd(), "..", "bootstrap", "bigjob-condor-bootstrap.py"))
            logger.debug(agent_exe)
            jd.executable = agent_exe

        else:
            bootstrap_script = self.generate_bootstrap_script(self.coordination.get_address(), self.pilot_url)
            if scheme == "gram":
                bootstrap_script = self.escape_rsl(bootstrap_script)
            elif scheme == "pbspro":
                bootstrap_script = self.escape_pbs(bootstrap_script)
            elif scheme == "ssh":
                bootstrap_script = self.escape_ssh(bootstrap_script)
            ############ submit pbs script which launches bigjob agent using ssh adaptors##########
            elif scheme == "pbs-ssh":
                bootstrap_script = self.escape_ssh(bootstrap_script)
                ### convert walltime in minutes to PBS representation of time ###
                # divmod() keeps the integer semantics independent of the
                # interpreter's division mode; int(None) raises TypeError,
                # i.e. a walltime is mandatory for this scheme.
                hrs, minu = divmod(int(walltime), 60)
                walltimepbs = str(hrs) + ":" + str(minu) + ":00"
                # Round the node count up so that a partially filled node is
                # still requested.
                full_nodes, remainder = divmod(int(number_nodes), int(processes_per_node))
                if remainder == 0:
                    number_nodes = full_nodes
                else:
                    number_nodes = full_nodes + 1
                pbssshj = pbsssh(bootstrap_script, lrms_saga_url, walltimepbs, number_nodes,
                                 processes_per_node, userproxy, working_directory)
                self.job = pbssshj
                self.job.run()
                # The pbs-ssh plugin submits on its own; no SAGA job service
                # and no file staging is involved on this path.
                return
            elif is_bliss:
                bootstrap_script = self.escape_bliss(bootstrap_script)

            #logger.debug(bootstrap_script)
            if is_bliss == False:
                jd.number_of_processes = str(number_nodes)
                jd.processes_per_host = str(processes_per_node)
            else:
                jd.TotalCPUCount = str(int(number_nodes) * int(processes_per_node))

            jd.spmd_variation = "single"
            # The agent is started by feeding the bootstrap script to the
            # Python interpreter via "-c".
            jd.arguments = ["-c", bootstrap_script]
            jd.executable = "python"
            if queue is not None:
                jd.queue = queue
            if project is not None:
                jd.job_project = [project]
            if walltime is not None:
                jd.wall_time_limit = str(walltime)

            # XXX Isn't the working directory about the remote site?
            if working_directory is not None:
                # Only the fork adaptor runs locally, so only then the
                # directory can be created from here.
                if not os.path.isdir(working_directory) and scheme == "fork":
                    os.mkdir(working_directory)
                self.working_directory = working_directory
            else:
                self.working_directory = os.path.expanduser("~")

            jd.working_directory = self.working_directory

            logger.debug("Working directory: " + jd.working_directory)
            jd.output = os.path.join(self.__get_bigjob_working_dir(), "stdout-bigjob_agent.txt")
            jd.error = os.path.join(self.__get_bigjob_working_dir(), "stderr-bigjob_agent.txt")

        # Stage BJ Input files
        # build target url
        bigjob_working_directory_url = "ssh://" + lrms_saga_url.host + self.__get_bigjob_working_dir()
        self.__stage_files(filetransfers, bigjob_working_directory_url)

        # Submit job
        js = None
        if userproxy is not None and userproxy != '':
            s = saga.session()
            os.environ["X509_USER_PROXY"] = userproxy
            ctx = saga.context("x509")
            ctx.set_attribute("UserProxy", userproxy)
            s.add_context(ctx)
            logger.debug("use proxy: " + userproxy)
            js = saga.job.service(s, lrms_saga_url)
        else:
            logger.debug("use standard proxy")
            js = saga.job.service(lrms_saga_url)

        self.job = js.create_job(jd)
        logger.debug("Submit pilot job to: " + str(lrms_saga_url))
        self.job.run()
예제 #2
0
    def start_pilot_job(self,
                 lrms_url,
                 bigjob_agent_executable=None,
                 number_nodes=1,
                 queue=None,
                 project=None,
                 working_directory=None,
                 userproxy=None,
                 walltime=None,
                 processes_per_node=1,
                 filetransfers=None):
        """ Start a batch job (using SAGA Job API) at a resource manager.

        Currently, the following resource managers are supported:
            fork://localhost/ (Default Job Adaptor)
            gram://qb1.loni.org/jobmanager-pbs (Globus Adaptor)
            pbspro://localhost (PBS Pro Adaptor)
        The URL schemes "condorg", "xt5torque", "torque", "ssh", "pbs-ssh"
        and "sge-ssh" are special-cased in the body as well.

        Parameters:
            lrms_url: URL of the resource manager. Its scheme selects the
                adaptor specific handling; its host (and username, if set)
                is used for self.pilot_url and the staging target URL.
            bigjob_agent_executable: not used by this method.
            number_nodes: stored as int in self.number_nodes and written to
                the job description / handed to the pbs-ssh and sge-ssh
                plugins.
            queue: if not None, set as jd.queue (handed to sgessh for
                "sge-ssh").
            project: if not None, set as jd.job_project (handed to sgessh
                for "sge-ssh").
            working_directory: requested working directory; falls back to
                the expanded "~". Created locally when it is missing and the
                scheme is "fork". Replaced by the BigJob specific directory
                when the remote directory could be created.
            userproxy: if non-empty, exported as X509_USER_PROXY and attached
                to the SAGA session as x509 context.
            walltime: if not None, set as jd.wall_time_limit (handed through
                unchanged to the pbs-ssh and sge-ssh plugins).
            processes_per_node: written to the job description / handed to
                the plugins.
            filetransfers: passed unchanged to the file staging helper.

        Returns:
            self.pilot_url, on every submission path.

        Raises:
            BigJobError: if self.job is already set.
        """

        if self.job is not None:
            raise BigJobError("One BigJob already active. Please stop BigJob first.")

        ##############################################################################
        # initialization of coordination and communication subsystem
        lrms_saga_url = saga.url(lrms_url)
        self.pilot_url = self.app_url + ":" + lrms_saga_url.host
        pilot_url_dict[self.pilot_url] = self

        logger.debug("create pilot job entry on backend server: " + self.pilot_url)
        self.coordination.set_pilot_state(self.pilot_url, str(Unknown), False)

        logger.debug("set pilot state to: " + str(Unknown))
        ##############################################################################

        self.number_nodes = int(number_nodes)

        # create job description
        jd = saga.job.description()

        scheme = lrms_saga_url.scheme

        # XXX Isn't the working directory about the remote site?
        # Yes, it is: the directory is only created from here when the fork
        # adaptor is used, i.e. when the job runs on the local machine.
        if working_directory is not None:
            if not os.path.isdir(working_directory) and scheme == "fork":
                os.mkdir(working_directory)
            self.working_directory = working_directory
        else:
            # if no working dir is set assume use home directory
            # will fail if home directory is not the same on remote machine
            # but this is just a guess to avoid failing
            self.working_directory = os.path.expanduser("~")

        # Stage BJ Input files
        # build target url
        # this will also create the remote directory for the BJ
        username = lrms_saga_url.username
        if username is not None and username != "":
            bigjob_working_directory_url = "ssh://" + username + "@" + lrms_saga_url.host + self.__get_bigjob_working_dir()
        else:
            bigjob_working_directory_url = "ssh://" + lrms_saga_url.host + self.__get_bigjob_working_dir()

        # determine working directory of bigjob
        # if a remote sandbox can be created via ssh => create a own dir for each bj job id
        # otherwise use specified working directory
        if self.__create_remote_directory(bigjob_working_directory_url) == True:
            self.working_directory = self.__get_bigjob_working_dir()
            self.__stage_files(filetransfers, bigjob_working_directory_url)
        else:
            logger.warning("For file staging. SSH (incl. password-less authentication is required.")

        logger.debug("BJ Working Directory: %s", self.working_directory)
        logger.debug("Adaptor specific modifications: " + str(scheme))
        if scheme == "condorg":
            jd.arguments = [self.coordination.get_address(), self.pilot_url]
            agent_exe = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "bootstrap", "bigjob-condor-bootstrap.py"))
            # The message needs a placeholder for the argument; without it
            # the logging module fails to format the record.
            logger.debug("agent_exe: %s", agent_exe)
            jd.executable = agent_exe
        else:
            bootstrap_script = self.generate_bootstrap_script(self.coordination.get_address(), self.pilot_url)
            if scheme == "gram":
                bootstrap_script = self.escape_rsl(bootstrap_script)
            elif scheme in ("pbspro", "xt5torque", "torque"):
                bootstrap_script = self.escape_pbs(bootstrap_script)
            elif scheme == "ssh":
                bootstrap_script = self.escape_ssh(bootstrap_script)
            ############ submit pbs script which launches bigjob agent using ssh adaptors##########
            elif scheme == "pbs-ssh":
                bootstrap_script = self.escape_ssh(bootstrap_script)
                # PBS specific BJ plugin
                pbssshj = pbsssh(bootstrap_script, lrms_saga_url, walltime, number_nodes,
                                 processes_per_node, userproxy, self.working_directory, self.working_directory)
                self.job = pbssshj
                self.job.run()
                # The plugin submits on its own; no SAGA job service is used.
                return self.pilot_url
            ############ submit sge script which launches bigjob agent using ssh adaptors##########
            elif scheme == "sge-ssh":
                bootstrap_script = self.escape_ssh(bootstrap_script)
                # SGE specific BJ plugin
                sgesshj = sgessh(bootstrap_script, lrms_saga_url, walltime, number_nodes,
                                 processes_per_node, userproxy, project, queue, self.working_directory, self.working_directory)
                self.job = sgesshj
                self.job.run()
                # The plugin submits on its own; no SAGA job service is used.
                return self.pilot_url
            elif is_bliss:
                bootstrap_script = self.escape_bliss(bootstrap_script)

            #logger.debug(bootstrap_script)
            if is_bliss == False:
                jd.number_of_processes = str(number_nodes)
                jd.processes_per_host = str(processes_per_node)
            else:
                jd.TotalCPUCount = str(int(number_nodes) * int(processes_per_node))

            jd.spmd_variation = "single"
            # The agent is started through /usr/bin/env so that the python
            # interpreter is looked up on the PATH of the target machine.
            jd.arguments = ["python", "-c", bootstrap_script]
            jd.executable = "/usr/bin/env"
            if queue is not None:
                jd.queue = queue
            if project is not None:
                jd.job_project = [project]
            if walltime is not None:
                jd.wall_time_limit = str(walltime)

            jd.working_directory = self.working_directory

            logger.debug("Working directory: " + jd.working_directory)
            jd.output = os.path.join(self.working_directory, "stdout-bigjob_agent.txt")
            jd.error = os.path.join(self.working_directory, "stderr-bigjob_agent.txt")

        # Submit job
        js = None
        if userproxy is not None and userproxy != '':
            s = saga.session()
            os.environ["X509_USER_PROXY"] = userproxy
            ctx = saga.context("x509")
            ctx.set_attribute("UserProxy", userproxy)
            s.add_context(ctx)
            logger.debug("use proxy: " + userproxy)
            js = saga.job.service(s, lrms_saga_url)
        else:
            logger.debug("use standard proxy")
            js = saga.job.service(lrms_saga_url)

        logger.debug("Creating pilot job with description: %s", str(jd))

        self.job = js.create_job(jd)
        logger.debug("Submit pilot job to: " + str(lrms_saga_url))
        self.job.run()
        return self.pilot_url