Example #1
    def start_pilot_job(self, 
                 lrms_url, 
                 bigjob_agent_executable,
                 number_nodes,
                 queue,
                 project,
                 working_directory,
                 userproxy,
                 walltime):
        """ start advert_launcher on specified host """
        if userproxy != None and userproxy != '':
            os.environ["X509_USER_PROXY"]=userproxy
            print "use proxy: " + userproxy
        else:
            print "use standard proxy"

        #register advert entry
        lrms_saga_url = saga.url(lrms_url)
        self.pilot_url = self.app_url.get_string() + "/" + lrms_saga_url.host
        print "create advert entry: " + self.pilot_url
        self.pilot_dir = saga.advert.directory(saga.url(self.pilot_url), saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
        # application level state since globus adaptor does not support state detail
        self.pilot_dir.set_attribute("state", str(saga.job.Unknown)) 
        logging.debug("set pilot state to: " + self.pilot_dir.get_attribute("state"))
        self.number_nodes=int(number_nodes)        
 
        # create job description
        jd = saga.job.description()
        jd.number_of_processes = str(number_nodes)
        jd.spmd_variation = "single"
        jd.arguments = [bigjob_agent_executable, self.database_host, self.pilot_url]
        jd.executable = "/bin/bash"
        #jd.executable = bigjob_agent_executable
        if queue != None:
            jd.queue = queue
        if project !=None:
            jd.job_project = [project]
        if walltime!=None:
            jd.wall_time_limit=str(walltime)

        if working_directory != None:
            jd.working_directory = working_directory
        else:
            jd.working_directory = "$(HOME)"
            
            
      #  if not os.path.isdir(jd.working_directory):
       #     os.mkdir(jd.working_directory)
            
        print "Working directory: " + jd.working_directory
        
        jd.output = "stdout-bigjob_agent-" + str(self.uuid) + ".txt"
        jd.error = "stderr-bigjob_agent-" + str(self.uuid) + ".txt"
           
        # Submit job
        js = saga.job.service(lrms_saga_url)
        self.job = js.create_job(jd)
        print "Submit pilot job to: " + str(lrms_saga_url)
        self.job.run()
        return self.job
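A minimal usage sketch (not part of the original example): it assumes the enclosing class is importable as bigjob and is constructed with an advert database host as in the __init__ examples further down; the GRAM endpoint, queue, project, paths, and walltime are placeholder values.

# Hypothetical call of start_pilot_job(); every value below is a placeholder.
bj = bigjob(database_host="advert.example.org")
pilot_job = bj.start_pilot_job(lrms_url="gram://qb1.example.org/jobmanager-pbs",
                               bigjob_agent_executable="$HOME/bigjob_agent_launcher.sh",
                               number_nodes=8,
                               queue="workq",
                               project="myproject",
                               working_directory="/work/myuser/bigjob",
                               userproxy=None,
                               walltime=60)
print "pilot job submitted, state: " + str(pilot_job.get_state())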
Example #2
 def copy_files_to_location(self, new_pilot_data_url):    
     new_file_locations = []
     for i in self.file_registry:
         logging.debug(i)
         try:
             dir = saga.filesystem.directory(saga.url(new_pilot_data_url), 
                                             saga.filesystem.Create |  saga.filesystem.ReadWrite)            
         except:                
             traceback.print_exc(file=sys.stdout)
             print "Could not create: " + str(new_pilot_data_url)
         
         # copy files
         try:
             source_url = saga.url(str(i))
             filename = os.path.split(source_url.path)[1]
             
             dest_url = saga.url(os.path.join(new_pilot_data_url, filename))
             print "copy file: " + str(i) + " to " + str(dest_url)
             sagafile = saga.filesystem.file(source_url)
             sagafile.copy(dest_url, saga.filesystem.Overwrite)
             new_file_locations.append(dest_url)                                
         except saga.exception, e:
             traceback.print_exc(file=sys.stdout)
             error_msg = "file %s failed to be copied to %s" % (i, new_pilot_data_url)
             logging.error(error_msg)
Example #3
    def dequeue_job(self, pilot_url):
        """ dequeue a new job of a certain pilot """
        self.resource_lock.acquire()
        #pilot_url = self.get_url(pilot_url)
        jobs = []
        new_job_dir_url = self.get_url(pilot_url + "/new/")
        new_job_dir = saga.advert.directory(
            saga.url(new_job_dir_url), saga.advert.Create
            | saga.advert.CreateParents | saga.advert.ReadWrite)
        new_jobs = new_job_dir.list()
        logger.debug("Pilot Job base dir: " + new_job_dir_url +
                     " #new jobs: " + str(len(new_jobs)) + " jobs: " +
                     str(new_jobs))
        if len(new_jobs) >= 1:
            job_entry = new_jobs[0]
            job_dir_url = self.get_url(pilot_url + "/new/" + "/" +
                                       job_entry.get_string())
            logger.debug("Open job at " + str(job_dir_url))
            job_dir = saga.advert.directory(
                saga.url(job_dir_url), saga.advert.Create
                | saga.advert.CreateParents | saga.advert.ReadWrite)

            #new_job_dir.open_dir(job_entry)
            job_url = job_dir.get_attribute("joburl")
            #remove old job entry
            job_dir.remove(self.__remove_dbtype(job_dir_url),
                           saga.name_space.Recursive)

            logger.debug("Dequeued new job: " + str(job_url))
            self.resource_lock.release()
            return self.__remove_dbtype(job_url)
        else:
            self.resource_lock.release()
            time.sleep(1)
            return
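A hedged sketch of how an agent-side loop might drain this queue; the coordination object and pilot_url are hypothetical stand-ins for whatever instance owns dequeue_job and get_job (see the get_job examples further down).

# Hypothetical polling loop; dequeue_job() returns None (after sleeping 1 s)
# when no new job is queued. A real agent would also check a shutdown flag.
while True:
    job_url = coordination.dequeue_job(pilot_url)
    if job_url == None:
        continue
    job_dict = coordination.get_job(job_url)
    print "dequeued job description: " + str(job_dict)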
Example #4
    def file_stage_in_with_saga(self, input_file_list_with_path, remote_url_prefix, remote_dir):
        cwd = os.getcwd()
        for ifile in input_file_list_with_path:
            # destination url
            dest_url = saga.url(remote_url_prefix + "/")
            ifile_basename = os.path.basename(ifile)

            try:
                dest_dir = saga.url(remote_url_prefix)
                dest_dir.path = remote_dir
                saga.file.directory(dest_dir, saga.file.Create |  saga.file.ReadWrite)
            except:
                print "Could not create: " + dest_dir.get_string()

            dest_url.path = os.path.join(remote_dir, ifile_basename)

            # source url
            source_url = saga.url('file://' + os.path.join(cwd, ifile))

            if not os.path.isfile(ifile):
                error_msg = "Input file %s does not exist in %s"%(ifile_basename, os.path.dirname(ifile))
                print(error_msg)
            else:
                try:
                    print "stage file: " + source_url.get_string() + " to " + dest_url.get_string()
                    sagafile = saga.filesystem.file(source_url)
                    sagafile.copy(dest_url)
                except saga.exception, e:
                    error_msg = "Input file %s failed to be staged in"%(ifile_basename)
                    print(error_msg)
Example #5
 def dequeue_job(self, pilot_url):
     """ dequeue a new job of a certain pilot """
     self.resource_lock.acquire()
     #pilot_url = self.get_url(pilot_url)
     jobs = []        
     new_job_dir_url = self.get_url(pilot_url + "/new/") 
     new_job_dir = saga.advert.directory(saga.url(new_job_dir_url), 
                                         saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
     new_jobs = new_job_dir.list()
     logger.debug("Pilot Job base dir: " + new_job_dir_url + " #new jobs: " + str(len(new_jobs))
                   + " jobs: " + str(new_jobs))
     if len(new_jobs)>=1:
         job_entry=new_jobs[0]     
         job_dir_url = self.get_url(pilot_url + "/new/" + "/" + job_entry.get_string())       
         logger.debug("Open job at " + str(job_dir_url))
         job_dir = saga.advert.directory(saga.url(job_dir_url), 
                                    saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
         
         #new_job_dir.open_dir(job_entry)                        
         job_url = job_dir.get_attribute("joburl")
         #remove old job entry
         job_dir.remove(self.__remove_dbtype(job_dir_url), saga.name_space.Recursive)
             
         logger.debug("Dequeued new job: " + str(job_url))
         self.resource_lock.release()
         return self.__remove_dbtype(job_url)
     else:
         self.resource_lock.release()
         time.sleep(1)
         return 
Example #6
    def file_stage_in_with_saga(self, input_file_list_with_path, remote_machine_ip, remote_dir):
        cwd = os.getcwd()
        for ifile in input_file_list_with_path:
            # destination url
            if remote_machine_ip.find("localhost") >= 0:
                dest_url_str = "file://"
            else:
                dest_url_str = "gridftp://" + remote_machine_ip + "/"
            ifile_basename = os.path.basename(ifile)
            try:
                dest_dir = dest_url_str + remote_dir
                saga.file.directory(saga.url(dest_dir), saga.file.Create | saga.file.ReadWrite)
            except:
                print "Could not create: " + dest_dir

            dest_url_str = dest_url_str + os.path.join(remote_dir, ifile_basename)
            # source url
            source_url_str = "file://" + os.path.join(cwd, ifile)

            if not os.path.isfile(ifile):
                error_msg = "Input file %s does not exist in %s" % (ifile_basename, os.path.dirname(ifile))
                logging.error(error_msg)
            else:
                try:
                    source_url = saga.url(source_url_str)
                    dest_url = saga.url(dest_url_str)
                    print "stage file: " + source_url_str + " to " + dest_url_str
                    sagafile = saga.file.file(source_url)
                    sagafile.copy(dest_url)
                    logging.info("Now Input file %s is staged into %s" % (ifile_basename, dest_url_str))
                except saga.exception, e:
                    error_msg = "Input file %s failed to be staged in" % (ifile_basename)
                    logging.error(error_msg)
Example #7
def remote_filecopy_with_saga(filename_with_local_path_from, machine_url_from, filename_with_local_path_to, machine_url_to):
    source_url = saga.url('file://' + machine_url_from + filename_with_local_path_from)
    dest_url = saga.url('gridftp://' + machine_url_to + filename_with_local_path_to)

    sagafile = saga.file.file(source_url)
    try:
        sagafile.copy(dest_url)
        print "\n(DEBUG)  remote file copy from %s of %s to %s of %s is attempted"%(filename_with_local_path_from, machine_url_from, filename_with_local_path_to, machine_url_to)
        
    except saga.exception, e:
        print "\n(WARNING) remote file copy from %s of %s to %s of %s is failed"%(filename_with_local_path_from, machine_url_from, filename_with_local_path_to, machine_url_to)
        return "Failed"
Example #8
 def file_stage_out_with_saga(self, file_list, local_dir, remote_url_prefix, remote_dir):
     for ifile in file_list:
         try:
             source_url = saga.url(remote_url_prefix)
             source_url.path= os.path.join(remote_dir, ifile)
             dest_url = saga.url("file:///" + local_dir + "/" + ifile)
             #dest_url.path = ifile
             print "(DEBUG) Staging out output.txt file at %s to %s"%(source_url.get_string(), dest_url.get_string())
             sagafile = saga.filesystem.file(source_url)
             sagafile.copy(dest_url)
         except saga.exception, e:
             error_msg = "File stage out failed: "+ source_url.get_string()
Example #9
def copy_with_saga(i):
    if i<RPB:
      os.system("cp "+ WORK_DIR + "/NPT.conf " + WORK_DIR + "agent/" + str(i) + "/NPT.conf")
     # source_url = saga.url('file://' + WORK_DIR + 'NPT.conf')
     # dest_url = saga.url('file://' + WORK_DIR + 'agent/' + str(i) + '/')
    elif (i>=RPB and i<(2*RPB)):
      source_url = saga.url('file://' + WORK_DIR + 'NPT.conf')
      dest_url = saga.url('gridftp://' + REMOTE1 + WORK_DIR+'agent/'+str(i)+'/')
      sagafile = saga.filesystem.file(source_url)
      try:
        sagafile.copy(dest_url)
      except saga.exception, e:
        print "\n(ERROR) remote ###NPT.CONF####file copy from %s to %s failed"%(HOST, REMOTE1)
Example #10
 def chunk_size(self,chunksize):
     self.__chunksize = chunksize
     chunk_list=[]
     group_chunks={}
     input=saga.url(self.__input_dir).path
     temp=saga.url(self.__tmp_dir).path
     dirList=os.listdir(input)
     for fname in dirList:
         os.system("cd " + temp + "; split -d -a 5 -b " + str(chunksize) + " " +  input + "/" + fname + " " + fname + "--" )
     dirList=os.listdir(temp)
     for fname in dirList:
         chunk_list.append([temp + "/" + fname])
     return chunk_list
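For reference, the split -d -a 5 -b <chunksize> call above writes numerically suffixed chunks into the temp directory; a small illustration of the naming scheme with a hypothetical input file:

# Names that "split -d -a 5 -b 1048576 reads.fastq reads.fastq--" would produce
# for an input of roughly 3 MB (hypothetical file name and size).
expected_chunks = ["reads.fastq--%05d" % n for n in range(3)]
print expected_chunks   # ['reads.fastq--00000', 'reads.fastq--00001', 'reads.fastq--00002']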
Example #11
def stage_ifiles(i):
   if not i%2:
     try:
        os.mkdir(WORK_DIR + 'agent/' + str(i))
     except OSError:
        pass
     for ifile in os.listdir(REPLICA_DIR):
        source_url = saga.url('file://' + REPLICA_DIR + ifile)
        dest_url = saga.url('file://' + WORK_DIR + 'agent/'+ str(i)+'/')
        sagafile = saga.filesystem.file(source_url) 
        try: 
           sagafile.copy(dest_url)
        except saga.exception, e:
           print str(e) + "\n(ERROR) local file ####STAGING### copy from %s to %s failed"%(REPLICA_DIR, HOST)
Example #12
def stage_ifiles(i):
    if not i % 2:
        try:
            os.mkdir(WORK_DIR + "agent/" + str(i))
        except OSError:
            pass
        for ifile in os.listdir(REPLICA_DIR):
            source_url = saga.url("file://" + REPLICA_DIR + ifile)
            dest_url = saga.url("file://" + WORK_DIR + "agent/" + str(i) + "/")
            sagafile = saga.filesystem.file(source_url)
            try:
                sagafile.copy(dest_url)
            except saga.exception, e:
                print str(e) + "\n(ERROR) local file ####STAGING### copy from %s to %s failed" % (REPLICA_DIR, HOST)
Example #13
def file_stage_in_with_saga(input_file_list_with_path, remote_machine_ip,
                            remote_dir, RE_info):
    userproxy = None
    try:
        userproxy = RE_info.userproxy[RE_info.remote_hosts.index(
            remote_machine_ip)]
    except:
        try:
            userproxy = RE_info.userproxy[RE_info.gridftp_hosts.index(
                remote_machine_ip)]
        except:
            pass
    if userproxy != None and userproxy != "":
        os.environ["X509_USER_PROXY"] = userproxy
        print "use proxy: " + userproxy
    else:
        print "use standard proxy"
    for ifile in input_file_list_with_path:

        if remote_machine_ip.find('localhost') >= 0:
            dest_url_str = 'file://'
        else:
            dest_url_str = 'gridftp://' + remote_machine_ip + "/"
        source_url_str = 'file://'
        print "stage file: " + ifile + " to " + dest_url_str

        ifile_basename = os.path.basename(ifile)
        if not os.path.isfile(ifile):
            error_msg = "Input file %s does not exist in %s" % (
                ifile_basename, os.path.dirname(ifile))
            logging.error(error_msg)
        else:

            try:
                source_url_str = source_url_str + ifile
                dest_url_str = dest_url_str + os.path.join(
                    remote_dir, ifile_basename)
                source_url = saga.url(source_url_str)
                dest_url = saga.url(dest_url_str)
                print "stage file: " + source_url_str + " to " + dest_url_str

                sagafile = saga.file.file(source_url)
                sagafile.copy(dest_url)
                logging.info("Now Input file %s is staged into %s" %
                             (ifile_basename, dest_url_str))
            except saga.exception, e:
                error_msg = "Input file %s failed to be staged in" % (
                    ifile_basename)
                logging.error(error_msg)
Example #14
def copy_with_saga(i):
    print "####################start time(npt.conf copy)" + time.asctime(time.localtime(time.time())) + "##################"
    start = time.time()
    if i<RPB:
      os.system("cp "+ WORK_DIR + "/NPT.conf " + WORK_DIR + "agent/" + str(i) + "/NPT.conf")
     # source_url = saga.url('file://' + WORK_DIR + 'NPT.conf')
     # dest_url = saga.url('file://' + WORK_DIR + 'agent/' + str(i) + '/')
    elif (i>=RPB and i<(2*RPB)):
      source_url = saga.url('file://' + WORK_DIR + 'NPT.conf')
      dest_url = saga.url('gridftp://' + REMOTE1 + WORK_DIR+'agent/'+str(i)+'/')
      sagafile = saga.filesystem.file(source_url)
      try:
        sagafile.copy(dest_url)
      except saga.exception, e:
        print "\n(ERROR) remote ###NPT.CONF####file copy from %s to %s failed"%(HOST, REMOTE1)
Example #15
 def test_file_move(self):
     """
     Test for the saga.filesystem.move() API call 
     http://saga.cct.lsu.edu/python/apidoc/saga.filesystem._file.html#file-move
     """
     try:
         source_url = saga.url("file:///"+self.filename_a)
         target_url = saga.url("file:///"+self.filename_d)
 
         my_file = saga.filesystem.file(source_url)
         my_file.move(target_url)
         my_file.close()
             
     except saga.exception, e: 
         self.fail(e.get_full_message())
Example #16
def copy_with_saga(i):
   # print "####################start time(npt.conf copy)" + time.asctime(time.localtime(time.time())) + "##################"
    start = time.time()
    if i<RPB:
      os.system("cp "+ WORK_DIR +"agent/"+ str(replica_id)+ "/NPT.conf " + WORK_DIR + "agent/" + str(i) + "/NPT.conf")
     # source_url = saga.url('file://' + WORK_DIR + 'NPT.conf')
     # dest_url = saga.url('file://' + WORK_DIR + 'agent/' + str(i) + '/')
    elif (i>=RPB and i<(2*RPB)):
      source_url = saga.url('file://' + WORK_DIR + 'NPT.conf')
      dest_url = saga.url('gridftp://' +"gridftp.ranger.tacc.teragrid.org:2811" + WORK_DIR1+'agent/'+str(i)+'/')
      sagafile = saga.filesystem.file(source_url)
      try:
        sagafile.copy(dest_url)
      except saga.exception, e:
        print "\n(ERROR) remote ###NPT.CONF####file copy from %s to %s failed"%(HOST, REMOTE1)
Example #17
 def __parse_url(self, url):
     try:
         surl = saga.url(url)
         host = surl.host
         port = surl.port
         username = surl.username
         password = surl.password
         query = surl.query
         scheme = "%s://"%surl.scheme
     except:
         """ Fallback URL parser based on Python urlparse library """
          logger.error("URL %s could not be parsed" % url)
         traceback.print_exc(file=sys.stderr)
         result = urlparse.urlparse(url)
         host = result.hostname
         port = result.port
         username = result.username
         password = result.password
         if url.find("?")>0:
             query = url[url.find("?")+1:]
         else:
             query = None
         scheme = "%s://"%result.scheme
         
     return scheme, username, password, host, port, query     
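The urlparse fallback can be exercised on its own; a minimal sketch with a hypothetical advert URL, mirroring the manual query extraction above (Python 2's urlparse does not split the query for unknown schemes, which is why the code slices it out by hand):

import urlparse  # Python 2 module used by the fallback branch above

url = "advert://myuser:mypass@advert.example.org:8080/bigjob?dbtype=mydbtype"  # hypothetical
result = urlparse.urlparse(url)
if url.find("?") > 0:
    query = url[url.find("?") + 1:]
else:
    query = None
print result.scheme, result.username, result.password, result.hostname, result.port, query
# prints: advert myuser mypass advert.example.org 8080 dbtype=mydbtype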
Example #18
 def delete_du(cls, du_url):
     du_url = cls.__get_url(du_url)
     du_dir = saga.advert.directory(saga.url(du_url), 
                                     saga.advert.Create | 
                                     saga.advert.CreateParents | 
                                     saga.advert.ReadWrite)
     du_dir.remove(du_url, saga.name_space.Recursive)  
Example #19
 def setup_charmpp_nodefile(self, allocated_nodes):
     """ Set up the charm++ nodefile used for executing NAMD.
         HACK!! Method violates layering principle.
         A file $HOME/machinefile in charm++ nodefile format is written to the first node in the list.
     """
     # Nodelist format:
     # 
     # host tp-x001 ++cpus 2 ++shell ssh 
     # host tp-x002 ++cpus 2 ++shell ssh
     
     nodefile_string=""
     for i in allocated_nodes:
         
         
         if i.has_key("private_hostname"):
             nodefile_string=nodefile_string + "host "+ i["private_hostname"] + " ++cpus " + str(i["cpu_count"]) + " ++shell ssh\n"
         else:
             nodefile_string=nodefile_string + "host "+ i["hostname"] + " ++cpus " + str(i["cpu_count"]) + " ++shell ssh\n"
         
     # copy nodefile to rank 0 node
     jd = saga.job.description()
     jd.executable = "echo"
     jd.number_of_processes = "1"
     jd.spmd_variation = "single"
     # ssh [email protected] "cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys"
     jd.arguments = ["\""+nodefile_string+"\"", ">", "machinefile"]
     jd.output = "stdout.txt"
     jd.error = "stderr.txt"
     
     job_service_url = saga.url("ssh://root@"+allocated_nodes[0]["hostname"])
     job_service = saga.job.service(self.session, job_service_url)
     job = job_service.create_job(jd)
     job.run()
     job.wait()
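The string assembled above follows the charm++ nodelist format quoted in the comment; a standalone illustration with two hypothetical nodes:

# Hypothetical node dictionaries in the shape setup_charmpp_nodefile() expects.
allocated_nodes = [{"private_hostname": "tp-x001", "cpu_count": 2},
                   {"hostname": "tp-x002", "cpu_count": 2}]
nodefile_string = ""
for i in allocated_nodes:
    if i.has_key("private_hostname"):
        name = i["private_hostname"]
    else:
        name = i["hostname"]
    nodefile_string = nodefile_string + "host " + name + " ++cpus " + str(i["cpu_count"]) + " ++shell ssh\n"
print nodefile_string
# host tp-x001 ++cpus 2 ++shell ssh
# host tp-x002 ++cpus 2 ++shell ssh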
Example #20
 def set_job_state(self, job_url, new_state):   
     self.resource_lock.acquire()     
     job_url = self.get_url(job_url)
     logger.debug("Set state of job: " + str(job_url) + " to: " + str(new_state))
     job_dir = saga.advert.directory(saga.url(job_url), saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
     job_dir.set_attribute("state", str(new_state))
     self.resource_lock.release()
Example #21
    def submit_job(self, glidin_url, jd):
        """ submit job via advert service to NAMD-Launcher 
            dest_url - url reference to advert job or host on which the advert job is going to run"""
        print "submit job: " + str(glidin_url)
        if self.job_url==None:
            self.job_url=self.get_job_url(glidin_url)

        for i in range(0,3):
            try:
                print "create job entry "
                self.job_dir = saga.advert.directory(saga.url(self.job_url), 
                                             saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
                print "initialized advert directory for job: " + self.job_url
                # put job description attributes to advert
                attributes = jd.list_attributes()                
                for i in attributes:          
                        if jd.attribute_is_vector(i):
                            self.job_dir.set_vector_attribute(i, jd.get_vector_attribute(i))
                        else:
                            logging.debug("Add attribute: " + str(i) + " Value: " + jd.get_attribute(i))
                            self.job_dir.set_attribute(i, jd.get_attribute(i))

                self.job_dir.set_attribute("state", str(saga.job.Unknown))
                # return self object for get_state() query    
                #logging.debug("Submission time (time to create advert entries): " + str(time.time()-start) + " s")
                return self    
            except:
                traceback.print_exc(file=sys.stdout)
Example #22
 def __init__(self, database_host):        
     self.database_host = database_host
     print "init advert service session at host: " + database_host
     self.uuid = uuid.uuid1()
     self.app_url = saga.url("advert://" + database_host + "/"+APPLICATION_NAME + "-" + str(self.uuid) + "/")
     self.app_dir = saga.advert.directory(self.app_url, saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
     print "created advert directory for application: " + self.app_url.get_string()
Example #23
 def get_jobs_of_pilot(self, pilot_url):
     """ returns array of job_url that are associated with a pilot """
     pilot_url = self.get_url(pilot_url + "/jobs")
     pilot_dir = saga.advert.directory(saga.url(pilot_url), saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
     jobs = pilot_dir.list()   
     j = [self.__remove_dbtype(pilot_url) + "/" + i.get_string() for i in jobs]
     return j
Example #24
    def submit_job(self, pilot_url, jd, rid):
        """ submit job via advert service to NAMD-Launcher 
            dest_url - url reference to advert job or host on which the advert job is going to run"""
        print "submit job: " + str(pilot_url)
        if self.job_url == None:
            self.job_url = self.get_job_url(pilot_url)

        for i in range(0, 3):
            try:
                print "create job entry "
                self.job_dir = saga.advert.directory(
                    saga.url(self.job_url), saga.advert.Create
                    | saga.advert.CreateParents | saga.advert.ReadWrite)
                print "initialized advert directory for job: " + self.job_url
                # put job description attributes to advert
                attributes = jd.list_attributes()
                for i in attributes:
                    if jd.attribute_is_vector(i):
                        self.job_dir.set_vector_attribute(
                            i, jd.get_vector_attribute(i))
                    else:
                        logging.debug("Add attribute: " + str(i) + " Value: " +
                                      jd.get_attribute(i))
                        self.job_dir.set_attribute(i, jd.get_attribute(i))

                self.job_dir.set_attribute("state", str(saga.job.Unknown))
                self.job_dir.set_attribute("energy", "unknown energy")
                self.job_dir.set_attribute("temp", "unknown temp")
                self.job_dir.set_attribute("replica_id", rid)
                # return self object for get_state() query
                #logging.debug("Submission time (time to create advert entries): " + str(time.time()-start) + " s")
                return self
            except:
                traceback.print_exc(file=sys.stdout)
Example #25
 def setup_charmpp_nodefile(self, allocated_nodes):
     """ Set up the charm++ nodefile used for executing NAMD.
         HACK!! Method violates layering principle.
         A file $HOME/machinefile in charm++ nodefile format is written to the first node in the list.
     """
     # Nodelist format:
     # 
     # host tp-x001 ++cpus 2 ++shell ssh 
     # host tp-x002 ++cpus 2 ++shell ssh
     
     nodefile_string=""
     for i in allocated_nodes:
         if i.has_key("private_hostname"):
             nodefile_string=nodefile_string + "host "+ i["private_hostname"] + " ++cpus " + str(i["cpu_count"]) + " ++shell ssh\n"
         else:
             nodefile_string=nodefile_string + "host "+ i["hostname"] + " ++cpus " + str(i["cpu_count"]) + " ++shell ssh\n"
         
     # copy nodefile to rank 0 node
     jd = saga.job.description()
     jd.executable = "echo"
     jd.number_of_processes = "1"
     jd.spmd_variation = "single"
     # ssh [email protected] "cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys"
     jd.arguments = ["\""+nodefile_string+"\"", ">", "machinefile"]
     jd.output = "stdout.txt"
     jd.error = "stderr.txt"
     
     job_service_url = saga.url("ssh://root@"+allocated_nodes[0]["hostname"])
     job_service = saga.job.service(self.session, job_service_url)
     job = job_service.create_job(jd)
     job.run()
     job.wait()
Example #26
 def setup_image(self, hostname):
     """ ensure ssh keys are properly set up (works for Nimbus, Eucalyptus and EC2) """
     jd = saga.job.description()
     jd.executable = "/usr/bin/cat"
     jd.number_of_processes = "1"
     jd.spmd_variation = "single"
     # ssh [email protected] "cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys"
     jd.arguments = ["~/.ssh/id_rsa.pub", ">>", "~/.ssh/authorized_keys" ]
     jd.output = "stdout.txt"
     jd.error = "stderr.txt"
     
     for i in range (0, 3):
         try:
             job_service_url = saga.url("ssh://root@" + hostname)
             job_service = saga.job.service(self.session, job_service_url)
             job = job_service.create_job(jd)
             job.run()
             
             # Cache job service object for later usage
             self.job_service_cache[job_service_url] =job_service
             
             # wait for completion of job
             job.wait()
             return
         except:
             pass
         time.sleep(30)
Example #27
 def set_pilot_state(self, pilot_url, new_state, stopped=False):   
     pilot_url = self.get_url(pilot_url)
     logger.debug("create advert entry: " + pilot_url)
     pilot_dir = saga.advert.directory(saga.url(pilot_url), saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
     logger.debug("update state of pilot job to: " + str(new_state) + " Stopped: " + str(stopped))
     pilot_dir.set_attribute("state", str(new_state)) 
     pilot_dir.set_attribute("stopped", str(stopped))
Example #28
 def delete_pds(cls, pds_url):
     pds_url = cls.__get_url(pds_url)
     pds_dir = saga.advert.directory(saga.url(pds_url), 
                                     saga.advert.Create | 
                                     saga.advert.CreateParents | 
                                     saga.advert.ReadWrite)
     pds_dir.remove(pds_url, saga.name_space.Recursive)  
Example #29
def m_dirs():
    global i_fsize , c_size , s_out, e_time, nbr_maps, nbr_reduces, app_url, app_dir, qtimes,b_uuid
    global database_host
    
    for m,q in machine.iteritems():    
        pilot_url = saga.url("advert://" + database_host + "/"+APPLICATION_NAME + "-" + str(b_uuid) + "/" + m + "/" )
        pilot_dir = saga.advert.directory(pilot_url, saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
        pilot_dir.set_attribute("state", "Unknown")
        print " created pilot directory " + str(pilot_url) + " with state Unknown"
    
        new_url = saga.url("advert://" + database_host + "/"+APPLICATION_NAME + "-" + str(b_uuid) + "/" + m + "/" + "new")
        new_dir = saga.advert.directory(new_url, saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
        print " created  new directory " + str(new_url) 
    for i in range(int(q)):
        pilot_dir.set_attribute("state", "Unknown")
    pilot_dir.set_attribute("state", "Running")
Example #30
 def allocate_nodes(self, number_of_nodes):
     """ allocate nodes - remove nodes from free nodes list
         return SAGA-URL to resource ssh://tx.domain.org
     """
     allocated_nodes = []
     self.resource_lock.acquire()
     if (len(self.free_nodes)>=number_of_nodes): 
         for i in self.free_nodes[:]:
             number = i["cpu_count"]
             print "Pilot: " + self.pilot_url + " Allocate: " + i["hostname"] + " number cores: " + str(number)
             if(number_of_nodes > 0):
                     allocated_nodes.append(i)
                     self.free_nodes.remove(i)                
                     self.busynodes.append(i)
                     number_of_nodes = number_of_nodes - 1
             else:
                     break
     
         self.resource_lock.release()
         self.setup_charmpp_nodefile(allocated_nodes)
         return saga.url("ssh://root@" + allocated_nodes[0]["hostname"]), allocated_nodes
     else:
             print "BigJob: " + str(self.pilot_url) + ": Not sufficient resources for job."
             self.resource_lock.release()
             return "", []
Example #31
 def __init__(self, database_host):        
     self.database_host = database_host
     print "init advert service session at host: " + database_host
     self.uuid = uuid.uuid1()
     self.app_url = saga.url("advert://" + database_host + "/"+APPLICATION_NAME + "-" + str(self.uuid) + "/")
     self.app_dir = saga.advert.directory(self.app_url, saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
     print "created advert directory for application: " + self.app_url.get_string()
Example #32
 def delete_job(self):
     print "delete job and close dirs: " + self.job_url
     try:
         self.job_dir.change_dir("..")
         self.job_dir.remove(saga.url(self.job_url), saga.name_space.Recursive)
         self.job_dir.close()
     except:
         pass
Example #33
 def from_advert(ps_url):
     logging.debug("Open pilot store at: " + ps_url.get_string())     
     ps_dir = saga.advert.directory(ps_url, saga.advert.Create | 
                                            saga.advert.CreateParents | 
                                            saga.advert.ReadWrite)
     ps = pilot_store()        
     ps.base_dir=saga.url(ps_dir.get_attribute("base_dir"))
     ps.name=ps_dir.get_attribute("name")
     ps.uuid=ps_dir.get_attribute("uuid")
     ps.weight=ps_dir.get_attribute("weight")
     ps.pd_url=saga.url(ps_dir.get_attribute("pd_url"))
     ps.number_of_chunks=int(ps_dir.get_attribute("number_of_chunks"))
     if (ps_dir.attribute_exists("file_registry") == True):
         ps.file_registry = [saga.url(x) for x in ps_dir.get_vector_attribute("file_registry")]
     else:
         ps.file_registry=[]
     return ps
Example #34
 def cancel(self):
     print "delete job and close dirs: " + self.job_url
     try:
         self.job_dir.change_dir("..")
         self.job_dir.remove(saga.url(self.job_url), saga.name_space.Recursive)
         self.job_dir.close()
     except:
         pass
Example #35
 def get_job(self, job_url):
     #job_dir = saga.advert.directory(saga.url(job_url),
     #                                saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
     job_url = self.get_url(job_url + "/job-description")
     logger.debug("Get job description from: %s" % (job_url))
     job_desc_entry = saga.advert.entry(saga.url(job_url), saga.advert.Read)
     job_dict = json.loads(job_desc_entry.retrieve_string())
     return job_dict
Example #36
 def __init__(self, database_host):
     self.database_host = database_host
     #print "init advert service session at host: " + database_host
     self.uuid = get_uuid()
     self.app_url = saga.url("advert://" + database_host + "/"+APPLICATION_NAME + "-" + str(self.uuid) + "/")
     self.app_dir = saga.advert.directory(self.app_url, saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
     self.state=saga.job.Unknown
     self.pilot_url=""
Example #37
    def __store_entry(cls, entry_url, content):
        entry_url = cls.__get_url(entry_url)

        # directory is recursively created
        entry = saga.advert.entry(
            saga.url(entry_url), saga.advert.Create | saga.advert.CreateParents
            | saga.advert.ReadWrite)
        entry.store_string(json.dumps(content))
Example #38
 def __store_entry(cls, entry_url, content):
     entry_url = cls.__get_url(entry_url)
     
     # directory is recursively created
     entry = saga.advert.entry(saga.url(entry_url),
                                        saga.advert.Create | 
                                        saga.advert.CreateParents | saga.advert.ReadWrite)
     entry.store_string(json.dumps(content))
Example #39
    def __init__(self, args):
        
        self.database_host = args[1]
        # objects to store running jobs and processes
        self.jobs = []
        self.processes = {}
        self.freenodes = []
        self.busynodes = []
        self.restarted = {}

        # read config file
        conf_file = os.path.dirname(args[0]) + "/" + CONFIG_FILE
        config = ConfigParser.ConfigParser()
        print ("read configfile: " + conf_file)
        config.read(conf_file)
        default_dict = config.defaults()
        self.CPR = default_dict["cpr"]
        self.SHELL=default_dict["shell"]
        self.MPIRUN=default_dict["mpirun"]
        print "cpr: " + self.CPR + " mpi: " + self.MPIRUN + " shell: " + self.SHELL
        
        # init cpr monitoring
        self.init_cpr()
        # init rms (SGE/PBS)
        self.init_rms()

        self.failed_polls = 0
        # open advert service base url
        hostname = socket.gethostname()
        self.base_url = args[2]
        print "Open advert: " + self.base_url
        try:
            self.base_dir = saga.advert.directory(saga.url(self.base_url), saga.advert.Create | saga.advert.ReadWrite)
            self.new_job_dir = saga.advert.directory(saga.url(self.base_url+"/new/"), saga.advert.Create| saga.advert.CreateParents | saga.advert.ReadWrite)
        except:
            print "No advert entry found at specified url: " + self.base_url
            traceback.print_exc(file=sys.stderr)
            

        # update state of glidin job to running
        self.update_glidin_state()
        # start background thread for polling new jobs and monitoring current jobs
        self.resource_lock=threading.Lock()
        self.launcher_thread=threading.Thread(target=self.start_background_thread)
        self.launcher_thread.start()
Example #40
 def get_job(self, job_url):
     #job_dir = saga.advert.directory(saga.url(job_url), 
     #                                saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
     job_url = self.get_url(job_url+"/job-description")
     logger.debug("Get job description from: %s"%(job_url))
     job_desc_entry = saga.advert.entry(saga.url(job_url),
                                        saga.advert.Read)
     job_dict = json.loads(job_desc_entry.retrieve_string())
     return job_dict    
Example #41
    def __init__(self,
                 server=ADVERT_SERVER,
                 server_port=ADVERT_SERVER_PORT,
                 server_connect_url=None,
                 username=None,
                 password=None,
                 dbtype=None,
                 url_prefix=None):
        '''
        Constructor
        '''

        #pdb.set_trace()
        if url_prefix == None:
            url_prefix = ADVERT_URL_SCHEME

        if username != None and username != "":
            url_prefix = url_prefix + username
            if password != None:
                url_prefix = url_prefix + ":" + password
            url_prefix = url_prefix + "@"
        if server_connect_url != None:
            self.address = server_connect_url
        elif server_port != None:
            self.address = url_prefix + "%s:%i" % (server, server_port)
        elif server != None:
            self.address = url_prefix + "%s" % (server)

        self.username = ""
        self.password = ""
        self.dbtype = ""
        surl = saga.url(self.address)
        if server_connect_url == None:  # Manager
            if username != None:
                surl.username = username
                self.username = username
            if password != None:
                surl.password = password
                self.password = password
            if dbtype != None:
                #surl.query = dbtype
                self.dbtype = dbtype
        else:  # Agent
            if surl.query != None:
                self.dbtype = surl.query
                surl.query = ""

        self.address = str(surl)
        self.pilot_url = self.address
        logger.debug("Server: " + str(server) + " Port " + str(server_port) +
                     " Url prefix: " + str(url_prefix) + " Address: " +
                     str(self.get_address()) + " server_connect_url: " +
                     str(server_connect_url))
        logger.debug("Initialized Coordination to: %s (DB: %s)" %
                     (self.address, self.dbtype))
        self.resource_lock = threading.RLock()
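A sketch of the address string this constructor assembles on the manager side; ADVERT_URL_SCHEME and all connection values below are hypothetical placeholders.

# Mirrors the url_prefix / address assembly above with placeholder values.
ADVERT_URL_SCHEME = "advert://"
username, password = "myuser", "mypass"
server, server_port = "advert.example.org", 8080
url_prefix = ADVERT_URL_SCHEME + username + ":" + password + "@"
address = url_prefix + "%s:%i" % (server, server_port)
print address   # advert://myuser:mypass@advert.example.org:8080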
Example #42
    def map_job_submit(self):
        ##########################################################################################
        print " >>> Starting Mapping ..................... \n"
        jobs = []
        job_start_times = {}
        job_states = {}

        for u in self.__chunk_list:
            k = u.replace('//', '/').split('/')
            uname = (os.path.split(u))[1]
            temp_abs_path = "/" + "/".join(k[2:len(k) - 1]) + "/" + uname
            print " >>> chunk path/name to be submitted to map subjob  " + temp_abs_path + " >>> " + uname

            # create job description
            try:

                jd = saga.job.description()
                jd.executable = self.__mapper
                jd.number_of_processes = self.workers
                jd.spmd_variation = "single"
                jd.arguments = [temp_abs_path, str(self.__nbr_reduces)]
                jd.working_directory = saga.url(self.__tmp_dir).path
                jd.output = self.workingdirectory + "/stdout-" + uname + ".txt"
                jd.error = self.workingdirectory + "/stderr-" + uname + ".txt"
                js = saga.job.service(saga.url(self.resource_url))
                job = js.create_job(jd)
                print "Submitted sub-job " + self.resource_url + "."
                job.run()
                jobs.append(job)
                job_start_times[job] = time.time()
                job_states[job] = job.get_state()
            except:
                #traceback.print_exc(file=sys.stdout)
                print " Map Job failed. Cancelling framework......"
                sys.exit(0)

        print "************************ All Jobs submitted ************************"
        print " No of map subjobs created - " + str(len(jobs))
        # Wait for task completion of map tasks - synchronization

        ############################################################################################
        # Wait for task completion of map tasks - synchronization
        wait_for_all_jobs(jobs, job_start_times, job_states, 5)
Example #43
 def get_pilot_state(self, pilot_url):
     pilot_url = self.get_url(pilot_url)
     pilot_dir = saga.advert.directory(saga.url(pilot_url),
                                       saga.advert.Read)
     state = pilot_dir.get_attribute("state")
     stopped = pilot_dir.get_attribute("stopped")
     if stopped == "false" or stopped == "False":
         return {"state": state, "stopped": False}
     else:
         return {"state": state, "stopped": True}
Example #44
 def set_pilot_state(self, pilot_url, new_state, stopped=False):
     pilot_url = self.get_url(pilot_url)
     logger.debug("create advert entry: " + pilot_url)
     pilot_dir = saga.advert.directory(
         saga.url(pilot_url), saga.advert.Create | saga.advert.CreateParents
         | saga.advert.ReadWrite)
     logger.debug("update state of pilot job to: " + str(new_state) +
                  " Stopped: " + str(stopped))
     pilot_dir.set_attribute("state", str(new_state))
     pilot_dir.set_attribute("stopped", str(stopped))
Example #45
    def get_job_url(self, pilot_url):
        self.saga_pilot_url = saga.url(pilot_url)
        if(self.saga_pilot_url.scheme=="advert"): #
            pass

        else: # any other url, try to guess pilot job url
            host=""
            try:
                host = self.saga_pilot_url.host
            except:
                pass
            if host =="":
                host=socket.gethostname()
            # create dir for destination url
            self.saga_pilot_url = saga.url("advert://" +  self.database_host + "/"+APPLICATION_NAME + "/" + host)

        # create dir for job
        self.job_url = self.saga_pilot_url.get_string() + "/" + str(self.uuid)
        return self.job_url
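A sketch of the advert URL the fallback branch above produces when the pilot URL is not an advert URL; the database host, APPLICATION_NAME, and UUID are hypothetical placeholders.

# Placeholder values illustrating the guessed job URL layout.
database_host = "advert.example.org"
APPLICATION_NAME = "bigjob"
host = "qb1.example.org"   # host taken from the pilot URL, or the local hostname
job_uuid = "0f6c2b34-1e7d-11e0-a4a5-0026b9562d5b"
job_url = "advert://" + database_host + "/" + APPLICATION_NAME + "/" + host + "/" + job_uuid
print job_url
# advert://advert.example.org/bigjob/qb1.example.org/0f6c2b34-1e7d-11e0-a4a5-0026b9562d5b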
Example #46
 def fastq_chunk(self,lines):
     chunk_list=[]
     group_chunks={}
     input=saga.url(self.__input_dir).path
     temp=saga.url(self.__tmp_dir).path
     dirList=os.listdir(input)
     for fname in dirList:
         os.system("cd " + temp + "; split -d -a 5 -l " + str(lines) + " " +  input + "/" + fname + " " + fname + "--" )
     dirList=os.listdir(temp)
     for fname in sorted(dirList):
         chunk_list.append(temp + "/" + fname)
     for chunk in chunk_list:
         seq=chunk.split("--")[1]
         if group_chunks.has_key(seq):
            group_chunks[seq].append(chunk)
         else:
            group_chunks[seq] = chunk.split()
     self.__chunk_list = group_chunks.values()   
     return self.__chunk_list
Example #47
 def submit_job_cpr(self, dest_url_string, jd, checkpt_files):
     error_string = ""
     start = time.time()
     js = saga.cpr.service(saga.url(dest_url_string))
     jd_start = jd
     jd_restart = jd
     new_cpr_job = js.create_job(jd_start, jd_restart)
     new_cpr_job.run()
     print "job state: " + str(new_cpr_job.get_state())
     print "spawning time " + "%d" % (time.time() - start) + " s"
     return error_string, new_cpr_job
Example #48
 def __retrieve_entry(cls, entry_url):
     entry_url = cls.__get_url(entry_url)
     #logger.debug("Retrieve Advert entry at: " + entry_url)
     # directory is recursively created
     entry = saga.advert.entry(
         saga.url(entry_url), saga.advert.Create | saga.advert.CreateParents
         | saga.advert.ReadWrite)
     content = json.loads(entry.retrieve_string())
     #logger.debug("Retrieve Advert entry at: " + entry_url
     #              + " Content: " + str(json.dumps(content)))
     return content
Example #49
    def get_job_url(self, glidin_url):
        self.saga_glidin_url = saga.url(glidin_url)
        if(self.saga_glidin_url.scheme=="advert"): #
            pass

        else: # any other url, try to guess glidin job url
            host=""
            try:
                host = self.saga_glidin_url.host
            except:
                pass
            if host =="":
                host=socket.gethostname()
            # create dir for destination url
            self.saga_glidin_url = saga.url("advert://" +  self.database_host + "/"+APPLICATION_NAME + "/" + host)
            #self.glidin_dir = saga.advert.directory(self.saga_glidin_url, 
            #                                        saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
        # create dir for job
        self.job_url = self.saga_glidin_url.get_string() + "/" + str(self.uuid)
        return self.job_url
Example #50
 def queue_job(self, pilot_url, job_url):
     """ queue new job to pilot """
     self.resource_lock.acquire()
     #pilot_url = self.get_url(pilot_url)
     job_url = self.get_url(job_url)
     new_job_url = self.get_url(pilot_url + "/new/" + str(uuid.uuid1()))
     logger.debug("Job URL: %s Create new job entry at: %s" %
                  (job_url, new_job_url))
     new_job_dir = saga.advert.directory(
         saga.url(new_job_url), saga.advert.Create
         | saga.advert.CreateParents | saga.advert.ReadWrite)
     new_job_dir.set_attribute("joburl", job_url)
     self.resource_lock.release()
Example #51
    def file_stage_in_with_saga(self, input_file_list_with_path,
                                remote_machine_ip, remote_dir):
        cwd = os.getcwd()
        for ifile in input_file_list_with_path:
            # destination url
            if remote_machine_ip.find('localhost') >= 0:
                dest_url_str = 'file://'
            else:
                dest_url_str = 'gridftp://' + remote_machine_ip + "/"
            ifile_basename = os.path.basename(ifile)
            try:
                dest_dir = dest_url_str + remote_dir
                saga.file.directory(saga.url(dest_dir),
                                    saga.file.Create | saga.file.ReadWrite)
            except:
                print "Could not create: " + dest_dir

            dest_url_str = dest_url_str + os.path.join(remote_dir,
                                                       ifile_basename)
            # source url
            source_url_str = 'file://' + os.path.join(cwd, ifile)

            if not os.path.isfile(ifile):
                error_msg = "Input file %s does not exist in %s" % (
                    ifile_basename, os.path.dirname(ifile))
                logging.error(error_msg)
            else:
                try:
                    source_url = saga.url(source_url_str)
                    dest_url = saga.url(dest_url_str)
                    print "stage file: " + source_url_str + " to " + dest_url_str
                    sagafile = saga.file.file(source_url)
                    sagafile.copy(dest_url)
                    logging.info("Now Input file %s is staged into %s" %
                                 (ifile_basename, dest_url_str))
                except saga.exception, e:
                    error_msg = "Input file %s failed to be staged in" % (
                        ifile_basename)
                    logging.error(error_msg)
Example #52
 def add_pd(cls, pds_url, pd):
     pds_url = cls.__remove_dbtype(pds_url)
     pd_url = pds_url + "/" + pd.id
     pd_description_url = cls.__get_url(pd_url + "/description")
     logger.debug("PDS URL: %s, PD Description URL: %s" %
                  (pds_url, pd_description_url))
     # directory is recursively created
     pd_desc_entry = saga.advert.entry(
         saga.url(pd_description_url), saga.advert.Create
         | saga.advert.CreateParents | saga.advert.ReadWrite)
     logger.debug("initialized advert entry for pds: " + pd_description_url)
     pd_desc_entry.store_string(json.dumps(pd.data_unit_description))
     return pd_url
Example #53
 def set_job(self, job_url, job_dict):
     job_dir_url = self.get_url(job_url)
     job_description_url = self.get_url(job_url + "/job-description")
     logger.debug("Job URL: %s, Job Description URL: %s" %
                  (job_dir_url, job_description_url))
     #job_dir = saga.advert.directory(saga.url(job_dir_url),
     #                                saga.advert.Create | saga.advert.CreateParents | saga.advert.ReadWrite)
     # directory is recursively created
     job_desc_entry = saga.advert.entry(
         saga.url(job_description_url), saga.advert.Create
         | saga.advert.CreateParents | saga.advert.ReadWrite)
     logger.debug("initialized advert entry for job: " + job_dir_url)
     job_desc_entry.store_string(json.dumps(job_dict))
     self.set_job_state(job_url, str(saga.job.Unknown))
Example #54
 def get_jobs_of_pilot(self, pilot_url):
     """ returns array of job_url that are associated with a pilot """
     pilot_url = self.get_url(pilot_url + "/jobs")
     pilot_dir = saga.advert.directory(
         saga.url(pilot_url), saga.advert.Create | saga.advert.CreateParents
         | saga.advert.ReadWrite)
     jobs = pilot_dir.list()
     job_urls = [
         self.__get_colon_url(
             self.__remove_dbtype(pilot_url) + "/" + i.get_string())
         for i in jobs
     ]
     if self.dbtype != None:
         job_urls = [i + "?" + self.dbtype for i in job_urls]
     return job_urls
Example #55
 def monitor_checkpoints(self):
     """ parses all job working directories and registers files with Migol via SAGA/CPR """
     #get current files from AIS
     url = saga.url("advert_launcher_checkpoint")
     checkpoint = saga.cpr.checkpoint(url)
     files = checkpoint.list_files()
     for i in files:
         print i
     dir_listing = os.listdir(os.getcwd())
     for i in dir_listing:
         filename = os.getcwd() + "/" + i
         if (os.path.isfile(filename)):
             if (check_file(files, filename) == False):
                 url = self.build_url(filename)
                 print str(self.build_url(filename))