Example #1
    def run(self):
        jd = saga.job.description()
        jd.arguments = ["-c", self.bootstrap_script]
        jd.executable = "python"
        jd.working_directory = self.working_directory
        jd.set_attribute("Interactive", "True")
        # Submit job
        js = None
        if self.userproxy != None and self.userproxy != '':
            s = saga.session()
            os.environ["X509_USER_PROXY"] = self.userproxy
            ctx = saga.context("x509")
            ctx.set_attribute("UserProxy", self.userproxy)
            s.add_context(ctx)
            print "use proxy: " + self.userproxy
            js = saga.job.service(s, self.lrms_saga_url)
        else:
            print "use standard proxy"
            js = saga.job.service(self.lrms_saga_url)
        sgesshjob = js.create_job(jd)
        print "Submit pilot job to: " + str(self.lrms_saga_url)
        sgesshjob.run()
        sgesshjob.wait()
        outstr = sgesshjob.get_stdout().read().rstrip()
        errstr = sgesshjob.get_stderr().read()
        print "Output - \n" + str(outstr)
        # the last line of stdout carries the SGE job id
        self.job_id = outstr.split("\n")[-1]
        print "SGE JobID: " + str(self.job_id)
        if self.job_id == None or self.job_id == "":
            raise Exception("BigJob submission via sge-ssh:// failed: %s %s" % (outstr, errstr))
Example #2
    def test_context_and_session(self):
        """
        Test for the saga.context and saga.session API calls
        """
        try:
            ctx1 = saga.context(saga.attributes.context_server)
            ctx2 = saga.context(saga.attributes.SSH)

            s = saga.session()
            s.add_context(ctx1)
            s.add_context(ctx2)
            l = s.list_contexts()
    
            print len(l)
            print l[0].type
            print l[1].type

            print ctx1.get_ctype()
            print ctx2.get_ctype()
    
        except saga.exception, e: 
            self.fail(e)
Example #3
    def run(self):
        jd = saga.job.description()
        jd.arguments = ["-c", self.bootstrap_script]
        jd.executable = "python"
        jd.working_directory = self.working_directory
        jd.set_attribute("Interactive", "True")
        # Submit job
        js = None
        if self.userproxy != None and self.userproxy != '':
            s = saga.session()
            os.environ["X509_USER_PROXY"] = self.userproxy
            ctx = saga.context("x509")
            ctx.set_attribute("UserProxy", self.userproxy)
            s.add_context(ctx)
            print "use proxy: " + self.userproxy
            js = saga.job.service(s, self.lrms_saga_url)
        else:
            print "use standard proxy"
            js = saga.job.service(self.lrms_saga_url)
        pbssshjob = js.create_job(jd)
        print "Submit pilot job to: " + str(self.lrms_saga_url)
        pbssshjob.run()
        joboutput = pbssshjob.get_stdout()
        # PBS job ids look like "12345.server"; keep only the numeric part
        self.job_id = joboutput.read().split(".")[0]
Example #4
import saga

try: 
  c1 = saga.context("x509")
  c1.set_attribute("UserProxy", "/tmp/x509up_u500_cern")
  
  c2 = saga.context("x509")
  c2.set_attribute("UserProxy", "/tmp/x509up_u500_ncsa")
  
  s = saga.session()
  s.add_context(c1)
  s.add_context(c2)
  
  js = saga.job.service(s, saga.url("gram://qb1.loni.org"))
  

except saga.exception, e:
  print e.get_full_message()
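Holding two x509 contexts in one session lets the adaptor pick whichever proxy matches the target resource (here the gram:// endpoint). A quick way to check what the session carries, reusing the calls from Example #2 (get_attribute as the assumed counterpart of set_attribute):

for ctx in s.list_contexts():
    print ctx.type, ctx.get_attribute("UserProxy")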
Example #5
    def start_pilot_job(self, 
                 lrms_url, 
                 bigjob_agent_executable=None,
                 number_nodes=1,
                 queue=None,
                 project=None,
                 working_directory=None,
                 userproxy=None,
                 walltime=None,
                 processes_per_node=1,
                 filetransfers=None):
        """ Start a batch job (using SAGA Job API) at resource manager. Currently, the following resource manager are supported:
            fork://localhost/ (Default Job Adaptor
            gram://qb1.loni.org/jobmanager-pbs (Globus Adaptor)
            pbspro://localhost (PBS Prop Adaptor)
        
        """
         
        if self.job != None:
            raise BigJobError("One BigJob is already active. Please stop it first.")

        ##############################################################################
        # initialization of coordination and communication subsystem
        # Communication & Coordination initialization
        lrms_saga_url = saga.url(lrms_url)
        self.pilot_url = self.app_url + ":" + lrms_saga_url.host
        pilot_url_dict[self.pilot_url]=self
        
        logger.debug("create pilot job entry on backend server: " + self.pilot_url)
        self.coordination.set_pilot_state(self.pilot_url, str(Unknown), False)
                
        logger.debug("set pilot state to: " + str(Unknown))
        ##############################################################################
        
        self.number_nodes=int(number_nodes)        
        
        # create job description
        jd = saga.job.description()
        
        
        logger.debug("Adaptor specific modifications: "  + str(lrms_saga_url.scheme))
        if lrms_saga_url.scheme == "condorg":
            jd.arguments = [ "-a", self.coordination.get_address(), "-b",self.pilot_url]
            logger.debug("\n\n-a", self.coordination.get_address(),"-b", self.pilot_url)
            agent_exe = os.path.abspath(os.path.join(os.getcwd(),"..","bootstrap","bigjob-condor-bootstrap.py"))
            logger.debug(agent_exe) 
            jd.executable = agent_exe
            
        else:
            bootstrap_script = self.generate_bootstrap_script(self.coordination.get_address(), self.pilot_url)
            if lrms_saga_url.scheme == "gram":
                bootstrap_script = self.escape_rsl(bootstrap_script)
            elif lrms_saga_url.scheme == "pbspro":                
                bootstrap_script = self.escape_pbs(bootstrap_script)
            elif lrms_saga_url.scheme == "ssh":
                bootstrap_script = self.escape_ssh(bootstrap_script)
            ############ submit pbs script which launches bigjob agent using ssh adaptors########## 
            elif lrms_saga_url.scheme == "pbs-ssh":
                # change the url scheme ssh to use ssh adaptors to launch job
                bootstrap_script = self.escape_ssh(bootstrap_script)
                ### convert walltime in minutes to the PBS HH:MM:SS representation ###
                hrs = walltime / 60
                minu = walltime % 60
                walltimepbs = "%02d:%02d:00" % (hrs, minu)
                # round the requested process count up to whole hosts
                if number_nodes % processes_per_node == 0:
                    number_nodes = number_nodes / processes_per_node
                else:
                    number_nodes = (number_nodes / processes_per_node) + 1
                pbssshj = pbsssh(bootstrap_script,lrms_saga_url, walltimepbs,number_nodes,processes_per_node,userproxy,working_directory)
                self.job = pbssshj
                self.job.run()
                return
            elif is_bliss:
                bootstrap_script = self.escape_bliss(bootstrap_script)

            #logger.debug(bootstrap_script)
            if is_bliss==False:
                jd.number_of_processes = str(number_nodes)
                jd.processes_per_host=str(processes_per_node)
            else:
                jd.TotalCPUCount=str(int(number_nodes)*int(processes_per_node))
                
            jd.spmd_variation = "single"
            #jd.arguments = [bigjob_agent_executable, self.coordination.get_address(), self.pilot_url]
            jd.arguments = ["-c", bootstrap_script]
            jd.executable = "python"
            if queue != None:
                jd.queue = queue
            if project != None:
                jd.job_project = [project]
            if walltime != None:
                jd.wall_time_limit = str(walltime)
        
            # XXX Isn't the working directory about the remote site?
            if working_directory != None:
                if not os.path.isdir(working_directory) and lrms_saga_url.scheme=="fork":
                    os.mkdir(working_directory)
                self.working_directory = working_directory
            else:
                self.working_directory = os.path.expanduser("~")
    
            jd.working_directory = self.working_directory
    
            logger.debug("Working directory: " + jd.working_directory)
            jd.output = os.path.join(self.__get_bigjob_working_dir(), "stdout-bigjob_agent.txt")
            jd.error = os.path.join(self.__get_bigjob_working_dir(),"stderr-bigjob_agent.txt")
         
        # Stage BJ Input files
        # build target url
        bigjob_working_directory_url = "ssh://" + lrms_saga_url.host + self.__get_bigjob_working_dir()
        self.__stage_files(filetransfers, bigjob_working_directory_url)
           
        # Submit job
        js = None    
        if userproxy != None and userproxy != '':
            s = saga.session()
            os.environ["X509_USER_PROXY"]=userproxy
            ctx = saga.context("x509")
            ctx.set_attribute ("UserProxy", userproxy)
            s.add_context(ctx)
            logger.debug("use proxy: " + userproxy)
            js = saga.job.service(s, lrms_saga_url)
        else:
            logger.debug("use standard proxy")
            js = saga.job.service(lrms_saga_url)

        self.job = js.create_job(jd)
        logger.debug("Submit pilot job to: " + str(lrms_saga_url))
        self.job.run()
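The pbs-ssh branch above converts a walltime given in minutes into the PBS HH:MM:SS form and rounds the requested process count up to whole hosts. Factored out as stand-alone helpers (Python 2 integer division), the arithmetic is:

def to_pbs_walltime(minutes):
    # e.g. 125 minutes -> "02:05:00"
    return "%02d:%02d:00" % (minutes / 60, minutes % 60)

def hosts_needed(processes, processes_per_node):
    # ceiling division: 9 processes at 4 per host need 3 hosts
    return (processes + processes_per_node - 1) / processes_per_node

print to_pbs_walltime(125)    # 02:05:00
print hosts_needed(9, 4)      # 3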
Example #6
    def start_pilot_job(self, 
                 lrms_url, 
                 bigjob_agent_executable,
                 number_nodes,
                 queue,
                 project,
                 working_directory,
                 userproxy,
                 walltime,
                 processes_per_node=1):
        
        
        if self.job != None:
            raise BigJobError("One BigJob is already active. Please stop it first.")


        #register advert entry
        lrms_saga_url = saga.url(lrms_url)
        self.pilot_url = self.app_url.get_string() + "/" + lrms_saga_url.host
        print "create advert entry: " + self.pilot_url
        self.pilot_dir = saga.advert.directory(saga.url(self.pilot_url), saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
        # application level state since globus adaptor does not support state detail
        self.pilot_dir.set_attribute("state", str(saga.job.Unknown)) 
        self.pilot_dir.set_attribute("stopped", "false")
        logging.debug("set pilot state to: " + self.pilot_dir.get_attribute("state"))
        self.number_nodes=int(number_nodes)        
 
        # create job description
        jd = saga.job.description()
        jd.number_of_processes = str(number_nodes)
        jd.processes_per_host=str(processes_per_node)
        jd.spmd_variation = "single"
        jd.arguments = [bigjob_agent_executable, self.database_host, self.pilot_url]
        jd.executable = "/bin/bash"
        #jd.executable = bigjob_agent_executable
        if queue != None:
            jd.queue = queue
        if project != None:
            jd.job_project = [project]
        if walltime != None:
            jd.wall_time_limit = str(walltime)

        # XXX Isn't the working directory about the remote site?
        if working_directory != None:
            if not os.path.isdir(working_directory) and lrms_saga_url.scheme=="fork":
                os.mkdir(working_directory)
            jd.working_directory = working_directory
        else:
            jd.working_directory = "$(HOME)"
            
        print "Working directory: " + jd.working_directory
        
        jd.output = "stdout-bigjob_agent-" + str(self.uuid) + ".txt"
        jd.error = "stderr-bigjob_agent-" + str(self.uuid) + ".txt"
           
        # Submit job
        js = None
        if userproxy != None and userproxy != '':
            s = saga.session()
            os.environ["X509_USER_PROXY"]=userproxy
            ctx = saga.context("x509")
            ctx.set_attribute ("UserProxy", userproxy)
            s.add_context(ctx)
            print "use proxy: " + userproxy
            js = saga.job.service(s, lrms_saga_url)
        else:
            print "use standard proxy"
            js = saga.job.service(lrms_saga_url)

        self.job = js.create_job(jd)
        print "Submit pilot job to: " + str(lrms_saga_url)
        self.job.run()
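Because the pilot's state is published to the advert service (the comment above notes that the Globus adaptor lacks state detail), any process that knows the pilot URL can poll it. A minimal sketch, assuming the same saga.advert API and a hypothetical advert URL:

import saga

pilot_url = "advert://advertserver.example.org/bigjob/my-pilot"   # hypothetical
pilot_dir = saga.advert.directory(saga.url(pilot_url), saga.advert.ReadWrite)
print "state:   " + pilot_dir.get_attribute("state")
print "stopped: " + pilot_dir.get_attribute("stopped")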
Example #7
    def start_pilot_job(self, 
                 lrms_url=None,                     # in a future version one could specify a URL for a cloud (ec2:// vs. nimbus:// vs. eu://)
                 bigjob_agent_executable=None,      # n/a
                 number_nodes=1,                    # number of images requested    
                 queue=None,                        # n/a
                 project=None,                      # n/a
                 working_directory=None,            # working directory
                 userproxy=None,                    # optional: path to user credential (X509 cert or proxy cert)
                 walltime=None,                     # optional: walltime
                 cloud_type=None,                   # optional: EC2, EUCA, or NIMBUS
                 image_name=None):                  # optional: name of the VM image to boot
        """ The start_pilot_job method will initialize the requested number of images """           

        print "Working directory: " + working_directory
        if not os.path.isdir(working_directory):
            os.mkdir(working_directory)
        self.working_directory=working_directory    
        self.walltime=walltime       
        self.nodes = [] 
        self.free_nodes = []
        self.busynodes = []
        self.subjobs = {}
        
        self.job_service_cache={}
        #EC2 environment
        self.env_dict={}
        self.cloud_type = cloud_type
        self.image_name = image_name
        self.number_requested_nodes=number_nodes
        self.init_thread=None
         
        # for locking 
        self.resource_lock = threading.RLock()       
        
        # spawn Cloud images
        start=time.time()
        host=socket.gethostname()
        if cloud_type == "EC2":
            self.pilot_url="ec2://"+host
            
            # SSH Context
            self.ssh_context = saga.context("ssh")
            self.ssh_context.set_attribute("UserKey", EC2_SSH_PRIVATE_KEY_FILE)
            self.session = saga.session()
            self.session.add_context(self.ssh_context)  
            self.instance_type = EC2_INSTANCE_TYPE
            self.key_name = EC2_KEYNAME
            
            #setup environment
            self.env_dict=self.read_ec2_environments(EC2_ENV_FILE)   
            self.start_ec2_images_in_background(number_nodes)
                        
        elif cloud_type == "EUCA":
            self.pilot_url="euca://"+host
            
            self.ssh_context = saga.context("ssh")
            self.ssh_context.set_attribute("UserKey", EUCA_SSH_PRIVATE_KEY_FILE)
            self.session = saga.session()
            self.session.add_context(self.ssh_context)   
            self.instance_type = EUCA_INSTANCE_TYPE
            self.key_name = EUCA_KEYNAME
            
            # setup environment
            self.env_dict=self.read_ec2_environments(EUCA_ENV_FILE)   
            self.start_ec2_images_in_background(number_nodes)
            
        elif cloud_type ==  "NIMBUS":
            self.pilot_url="nimbus://"+host
            self.ssh_context = saga.context("ssh")
            self.ssh_context.set_attribute("UserKey", NIMBUS_SSH_PRIVATE_KEY_FILE)
            self.session = saga.session() # use default
            self.session.add_context(self.ssh_context)   
            
            self.start_nimbus_images_in_background(number_nodes)
        else:
            raise UnsupportedCloudType("Cloud Type not supported")
        
        # for fast debugging     
        #self.nodes=[{"hostname":"149.165.228.103", "vmid":"i-48F80882", "private_hostname":"192.168.8.2", "cpu_count":1},
        #            {"hostname":"149.165.228.108", "vmid":"i-40820878", "private_hostname":"192.168.8.3", "cpu_count":1},
        #            ]
#       self.nodes = [{"hostname":"tp-x001.ci.uchicago.edu", "vmid":"vm-049", "cpu_count":2},
#                      {"hostname":"tp-x002.ci.uchicago.edu", "vmid":"vm-050", "cpu_count":2},
#                      {"hostname":"tp-x004.ci.uchicago.edu", "vmid":"vm-050", "cpu_count":2},      
#                      {"hostname":"tp-x005.ci.uchicago.edu", "vmid":"vm-050", "cpu_count":2}]
        
        print "Started " + str(len(self.nodes)) + " nodes in " + str(time.time()-start)      
        
        # check whether all requested nodes have been started
        #if len(self.nodes) < number_nodes:
        #    print "Not sufficent resources available: " + str(self.pilot_url)
        #    raise NoResourceAvailable("Not sufficient resources available.")  
        

                
        self.launcher_thread=threading.Thread(target=self.start_background_thread)
        self.launcher_thread.start()
        print "Finished launching of pilot jobs"
Example #8
    def start_pilot_job(self, 
                 lrms_url, 
                 bigjob_agent_executable=None,
                 number_nodes=1,
                 queue=None,
                 project=None,
                 working_directory=None,
                 userproxy=None,
                 walltime=None,
                 processes_per_node=1,
                 filetransfers=None):
        """ Start a batch job (using SAGA Job API) at resource manager. Currently, the following resource manager are supported:
            fork://localhost/ (Default Job Adaptor
            gram://qb1.loni.org/jobmanager-pbs (Globus Adaptor)
            pbspro://localhost (PBS Prop Adaptor)
        
        """
         
        if self.job != None:
            raise BigJobError("One BigJob is already active. Please stop it first.")

        ##############################################################################
        # initialization of coordination and communication subsystem
        # Communication & Coordination initialization
        lrms_saga_url = saga.url(lrms_url)
        self.pilot_url = self.app_url + ":" + lrms_saga_url.host
        pilot_url_dict[self.pilot_url]=self
        
        logger.debug("create pilot job entry on backend server: " + self.pilot_url)
        self.coordination.set_pilot_state(self.pilot_url, str(Unknown), False)
                
        logger.debug("set pilot state to: " + str(Unknown))
        ##############################################################################
        
        self.number_nodes=int(number_nodes)        
        
        # create job description
        jd = saga.job.description()
        
        # XXX Isn't the working directory about the remote site?
        # Yes, it is. The directory is only created locally when the fork
        # adaptor is used, i.e. when local and remote machine are the same.
        if working_directory != None:
            if not os.path.isdir(working_directory) and lrms_saga_url.scheme=="fork":
                os.mkdir(working_directory)
            self.working_directory = working_directory
        else:
            # if no working directory is set, fall back to the home directory;
            # this is only a guess and will fail if the home directory differs
            # on the remote machine
            self.working_directory = os.path.expanduser("~")
            
        # Stage BJ Input files
        # build target url
        # this will also create the remote directory for the BJ
        if lrms_saga_url.username!=None and lrms_saga_url.username!="":
            bigjob_working_directory_url = "ssh://" + lrms_saga_url.username + "@" + lrms_saga_url.host + self.__get_bigjob_working_dir()
        else:
            bigjob_working_directory_url = "ssh://" + lrms_saga_url.host + self.__get_bigjob_working_dir()
        
        # determine working directory of bigjob:
        # if a remote sandbox can be created via ssh, create a separate dir per bj job id;
        # otherwise use the specified working directory
        if self.__create_remote_directory(bigjob_working_directory_url)==True:
            self.working_directory = self.__get_bigjob_working_dir()
            self.__stage_files(filetransfers, bigjob_working_directory_url)
        else:
            logger.warn("File staging requires SSH (incl. password-less authentication).")
         
        logger.debug("BJ Working Directory: %s", self.working_directory)
        logger.debug("Adaptor specific modifications: "  + str(lrms_saga_url.scheme))
        if lrms_saga_url.scheme == "condorg":
            jd.arguments = [ self.coordination.get_address(), self.pilot_url]
            agent_exe = os.path.abspath(os.path.join(os.path.dirname(__file__),"..","bootstrap","bigjob-condor-bootstrap.py"))
            logger.debug("agent_exe",agent_exe)
            jd.executable = agent_exe             
        else:
            bootstrap_script = self.generate_bootstrap_script(self.coordination.get_address(), self.pilot_url)
            if lrms_saga_url.scheme == "gram":
                bootstrap_script = self.escape_rsl(bootstrap_script)
            elif lrms_saga_url.scheme == "pbspro" or lrms_saga_url.scheme=="xt5torque" or lrms_saga_url.scheme=="torque":                
                bootstrap_script = self.escape_pbs(bootstrap_script)
            elif lrms_saga_url.scheme == "ssh":
                bootstrap_script = self.escape_ssh(bootstrap_script)
            ############ submit pbs script which launches bigjob agent using ssh adaptors########## 
            elif lrms_saga_url.scheme == "pbs-ssh":
                bootstrap_script = self.escape_ssh(bootstrap_script)
                # PBS-specific BJ plugin
                pbssshj = pbsssh(bootstrap_script, lrms_saga_url, walltime, number_nodes, 
                                 processes_per_node, userproxy, self.working_directory, self.working_directory)
                self.job = pbssshj
                self.job.run()
                return
            ############ submit sge script which launches bigjob agent using ssh adaptors########## 
            elif lrms_saga_url.scheme == "sge-ssh":
                bootstrap_script = self.escape_ssh(bootstrap_script)
                # SGE-specific BJ plugin
                sgesshj = sgessh(bootstrap_script, lrms_saga_url, walltime, number_nodes, 
                                 processes_per_node, userproxy, project, queue, self.working_directory, self.working_directory)
                self.job = sgesshj
                self.job.run()
                return
            elif is_bliss:
                bootstrap_script = self.escape_bliss(bootstrap_script)

            #logger.debug(bootstrap_script)
            if is_bliss==False:
                jd.number_of_processes = str(number_nodes)
                jd.processes_per_host=str(processes_per_node)
            else:
                jd.TotalCPUCount=str(int(number_nodes)*int(processes_per_node))
                
            jd.spmd_variation = "single"
            #jd.arguments = [bigjob_agent_executable, self.coordination.get_address(), self.pilot_url]
            jd.arguments = ["python", "-c", bootstrap_script]
            jd.executable = "/usr/bin/env"
            if queue != None:
                jd.queue = queue
            if project != None:
                jd.job_project = [project]
            if walltime != None:
                jd.wall_time_limit = str(walltime)
        
            jd.working_directory = self.working_directory
    
            logger.debug("Working directory: " + jd.working_directory)
            jd.output = os.path.join(self.working_directory, "stdout-bigjob_agent.txt")
            jd.error = os.path.join(self.working_directory,"stderr-bigjob_agent.txt")
          
           
        # Submit job
        js = None    
        if userproxy != None and userproxy != '':
            s = saga.session()
            os.environ["X509_USER_PROXY"]=userproxy
            ctx = saga.context("x509")
            ctx.set_attribute ("UserProxy", userproxy)
            s.add_context(ctx)
            logger.debug("use proxy: " + userproxy)
            js = saga.job.service(s, lrms_saga_url)
        else:
            logger.debug("use standard proxy")
            js = saga.job.service(lrms_saga_url)

        logger.debug("Creating pilot job with description: %s" % str(jd))
              

        self.job = js.create_job(jd)
        logger.debug("Submit pilot job to: " + str(lrms_saga_url))
        self.job.run()
        return self.pilot_url
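Note the launch vector in this example: the agent runs as /usr/bin/env python -c <script>, so the remote side resolves whichever python comes first on the PATH instead of hard-coding an interpreter location. The same trick can be checked locally (Python 2.7+ for check_output):

import subprocess

out = subprocess.check_output(["/usr/bin/env", "python", "-c",
                               "import sys; print sys.version.split()[0]"])
print "env-resolved python: " + out.strip()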