def submit_subjob(self, replica_id, jd):
    """Submit a sub-job via the pilot job responsible for this replica.

    The replica-id space is partitioned into contiguous chunks of
    ``self.RPB`` replicas; chunk ``p`` is served by pilot ``self.bjs[p]``.
    Ids at or beyond ``3 * self.RPB`` all fall back to pilot 3, mirroring
    the original if/elif/else cascade (which this replaces — the four
    branches differed only in the pilot index).

    Records the submission time and the initial state of the new sub-job,
    then returns the subjob instance.
    """
    # Pilot index 0..3.  max(..., 0) reproduces the original behavior for
    # (unexpected) negative ids, which fell into the first branch (k=0);
    # min(..., 3) clamps everything past the third chunk onto pilot 3.
    k = min(max(replica_id, 0) // self.RPB, 3)
    sj = subjob()
    sj.submit_job(self.bjs[k].pilot_url, jd)
    self.job_start_times[sj] = time.time()
    self.job_states[sj] = sj.get_state()
    return sj
def submit_subjob(self, replica_id, jd):
    """Submit a sub-job through the pilot that owns this replica id.

    Replicas are assigned to pilots in blocks of ``self.RPB``: ids
    ``[0, RPB)`` go to pilot 0, ``[RPB, 2*RPB)`` to pilot 1, and so on;
    ids from ``3*RPB`` upward all use pilot 3.  This collapses the
    original four near-identical if/elif branches into one index
    computation while preserving their behavior exactly.

    Side effects: stores the start time and initial state of the sub-job
    in ``self.job_start_times`` / ``self.job_states``.
    Returns the created subjob.
    """
    # max(..., 0): the original's first branch (i < RPB) also caught
    # negative ids and used pilot 0.  min(..., 3): the original's else
    # branch routed everything past the third chunk to pilot 3.
    k = min(max(replica_id, 0) // self.RPB, 3)
    sj = subjob()
    sj.submit_job(self.bjs[k].pilot_url, jd)
    self.job_start_times[sj] = time.time()
    self.job_states[sj] = sj.get_state()
    return sj
def main(): # Start BigJob ########################################################################################## # Edit parameters for BigJob queue=None # if None default queue is used project=None # if None default allocation is used walltime=10 processes_per_node=1 number_of_processes = 1 workingdirectory="." # working directory for agent userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) #lrms_url = "ec2+ssh://localhost" # resource url to run on GCE lrms_url = "gce+ssh://locahost" ########################################################################################## print "Start Pilot Job/BigJob at: " + lrms_url bj = bigjob(COORDINATION_URL) bj.start_pilot_job( lrms_url, number_of_processes, queue, project, workingdirectory, userproxy, walltime, processes_per_node) print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) ########################################################################################## # Submit SubJob through BigJob jd = description() jd.executable = "/bin/echo" #jd.executable = "$HOME/hello.sh" jd.number_of_processes = "1" jd.arguments = ["$HELLOWORLD"] jd.environment = ['HELLOWORLD=hello_world'] jd.input_data = ["hi", "ho"] # specify an optinal working directory if sub-job should be executed outside of bigjob sandbox #jd.working_directory = "/tmp" jd.output = "stdout.txt" jd.error = "stderr.txt" sj = subjob() sj.submit_job(bj.pilot_url, jd) ######################################### # busy wait for completion while 1: state = str(sj.get_state()) print "state: " + state if(state=="Failed" or state=="Done"): break time.sleep(2) ########################################################################################## # Cleanup - stop BigJob bj.cancel()
def _submit_cu(self, compute_unit):
    """Hand a compute unit to the underlying BigJob pilot as a sub-job.

    Wraps the unit's sub-job description in a fresh subjob, submits it
    to the pilot's URL, tracks the subjob internally, links it back to
    the compute unit, and returns the (updated) compute unit.
    """
    logger.debug("Submit CU to big-job")
    new_subjob = subjob()
    new_subjob.submit_job(self.__bigjob.pilot_url,
                          compute_unit.subjob_description)
    self.__subjobs.append(new_subjob)
    compute_unit._update_subjob(new_subjob)
    return compute_unit
def main(): # Start BigJob ########################################################################################## # Edit parameters for BigJob queue = None # if None default queue is used project = None # if None default allocation is used walltime = 10 processes_per_node = 8 number_nodes = 24 workingdirectory = os.getcwd() # working directory for agent userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) """ URL of the SAGA Job Service that is used to dispatch the pilot job. The following URLs are accepted: lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI) lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines) lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system. lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine. lrms_url = "xt5torque://localhost" # torque resource url. 
Please ensure that the respective SAGA adaptor is installed and working """ lrms_url = "pbs://localhost" # resource url to run the jobs on localhost ########################################################################################## print "Start Pilot Job/BigJob at: " + lrms_url bj = bigjob(COORDINATION_URL) bj.start_pilot_job(lrms_url, number_nodes, queue, project, workingdirectory, userproxy, walltime, processes_per_node) print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str( bj.get_state()) ########################################################################################## # Submit SubJob through BigJob jd = description() jd.executable = "/bin/hostname" jd.number_of_processes = "2" jd.spmd_variation = "single" jd.arguments = [""] #jd.working_directory = "/tmp" jd.output = "stdout.txt" jd.error = "stderr.txt" for i in range(0, 12): sj = subjob() sj.submit_job(bj.pilot_url, jd) ########################################################################################## # Cleanup - stop BigJob bj.wait() bj.cancel()
def main(): # Start BigJob ########################################################################################## # Edit parameters for BigJob queue = None # if None default queue is used project = None # if None default allocation is used walltime = 10 processes_per_node = 1 number_of_processes = 1 workingdirectory = "." # working directory for agent userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) #lrms_url = "ec2+ssh://localhost" # resource url to run on GCE lrms_url = "gce+ssh://locahost" ########################################################################################## print "Start Pilot Job/BigJob at: " + lrms_url bj = bigjob(COORDINATION_URL) bj.start_pilot_job(lrms_url, number_of_processes, queue, project, workingdirectory, userproxy, walltime, processes_per_node) print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str( bj.get_state()) ########################################################################################## # Submit SubJob through BigJob jd = description() jd.executable = "/bin/echo" #jd.executable = "$HOME/hello.sh" jd.number_of_processes = "1" jd.arguments = ["$HELLOWORLD"] jd.environment = ['HELLOWORLD=hello_world'] jd.input_data = ["hi", "ho"] # specify an optinal working directory if sub-job should be executed outside of bigjob sandbox #jd.working_directory = "/tmp" jd.output = "stdout.txt" jd.error = "stderr.txt" sj = subjob() sj.submit_job(bj.pilot_url, jd) ######################################### # busy wait for completion while 1: state = str(sj.get_state()) print "state: " + state if (state == "Failed" or state == "Done"): break time.sleep(2) ########################################################################################## # Cleanup - stop BigJob bj.cancel()
def __submit_cu(self, compute_unit):
    """Push a compute unit into the coordination queue of the single PCS.

    Exactly one PilotComputeService must be registered; its coordination
    queue is cached on ``self.pcs_coordination_namespace`` and used as
    the submission target.  The created subjob is attached to the
    compute unit, which is then returned.
    """
    # a PilotComputeService must have been started beforehand
    if len(self.pilot_job_services) != 1:
        raise PilotError("No PilotComputeService found. Please start a PCS before submitting ComputeUnits.")
    queue_name = self.pilot_job_services[0].coordination_queue
    self.pcs_coordination_namespace = queue_name
    logger.debug("Submit CU to big-job via external queue: %s" % queue_name)
    new_sj = subjob()
    new_sj.submit_job(queue_name, compute_unit.subjob_description)
    compute_unit._update_subjob(new_sj)
    return compute_unit
def __init__(self, compute_unit_description=None, compute_data_service=None, cu_url=None):
    """Create a new compute unit, or re-attach to an existing one.

    Without ``cu_url`` a fresh CU is built: a unique id is generated,
    the CU description is translated into a BigJob sub-job description
    and the state is set to New.  With ``cu_url`` the CU simply wraps
    the existing subjob at that URL (no id/state/description fields are
    initialized, matching the original behavior).

    Idiom fix: ``== None`` / ``!= None`` replaced with identity checks.
    """
    if cu_url is None:
        self.id = self.CU_ID_PREFIX + str(uuid.uuid1())
        if compute_data_service is not None:
            # the CU url is scoped under its compute-data-service
            self.url = compute_data_service.url + "/" + self.id
            logger.debug("Created CU: %s" % self.url)
        self.state = State.New
        self.__subjob = None    # reference to BigJob Subjob
        self.compute_unit_description = compute_unit_description    # CU Description
        self.subjob_description = self.__translate_cu_sj_description(compute_unit_description)
    else:
        # reconnect to an already-submitted subjob
        self.__subjob = subjob(subjob_url=cu_url)
def __init__(self, compute_unit_description=None, compute_data_service=None, cu_url=None):
    """Initialize a compute unit.

    Two modes:
      * ``cu_url is None`` — build a new CU: generate an id, optionally
        derive a URL from the compute-data-service, set state to New and
        translate the CU description into a BigJob sub-job description.
      * ``cu_url`` given — wrap the existing subjob at that URL; the
        other fields are deliberately left unset (original behavior).

    Idiom fix: ``== None`` / ``!= None`` replaced with ``is None`` /
    ``is not None``.
    """
    if cu_url is None:
        self.id = self.CU_ID_PREFIX + str(uuid.uuid1())
        if compute_data_service is not None:
            self.url = compute_data_service.url + "/" + self.id
            logger.debug("Created CU: %s" % self.url)
        self.state = State.New
        self.__subjob = None    # reference to BigJob Subjob
        self.compute_unit_description = compute_unit_description    # CU Description
        self.subjob_description = self.__translate_cu_sj_description(compute_unit_description)
    else:
        self.__subjob = subjob(subjob_url=cu_url)
project, workingdirectory, userproxy, walltime, processes_per_node) print "Pilot Job/BigJob URL: " + bj.get_url() + " State: " + str(bj.get_state()) ########################################################################################## # Submit SubJob through BigJob jobs = [] job_start_times = {} job_states = {} for i in range(0, NUMBER_JOBS): jd = description() jd.executable = "/bin/date" jd.number_of_processes = "1" jd.spmd_variation = "single" jd.arguments = [""] jd.output = "sj-stdout-"+str(i)+".txt" jd.error = "sj-stderr-"+str(i)+".txt" sj = subjob() sj.submit_job(bj.pilot_url, jd) jobs.append(sj) job_start_times[sj]=time.time() job_states[sj] = sj.get_state() print "Terminating application. You can reconnect to BJ via the following URL: %s"%bj.get_url()
########################################################################################## # Submit SubJob through BigJob jobs = [] job_start_times = {} job_states = {} for i in range(0, NUMBER_JOBS): jd = saga.job.description() jd.executable = "/bin/date" jd.number_of_processes = "1" jd.spmd_variation = "single" jd.arguments = [""] jd.working_directory = os.getcwd() jd.output = "sj-stdout-" + str(i) + ".txt" jd.error = "sj-stderr" + str(i) + ".txt" sj = subjob(advert_host) sj.submit_job(bj.pilot_url, jd) jobs.append(sj) job_start_times[sj] = time.time() job_states[sj] = sj.get_state() # busy wait for completion while 1: finish_counter = 0 result_map = {} for i in range(0, NUMBER_JOBS): old_state = job_states[jobs[i]] state = jobs[i].get_state() if result_map.has_key(state) == False: result_map[state] = 1 else:
def main(): # Start BigJob ########################################################################################## # Edit parameters for BigJob queue=None # if None default queue is used project=None # if None default allocation is used walltime=10 processes_per_node=4 number_of_processes = 8 workingdirectory= os.path.join(os.getcwd(), "agent") userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) """ URL of the SAGA Job Service that is used to dispatch the pilot job. The following URLs are accepted: lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI) lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines) lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system. lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine. lrms_url = "xt5torque://localhost" # torque resource url. 
Please ensure that the respective SAGA adaptor is installed and working """ lrms_url = "condor://localhost" ########################################################################################## input_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "test.txt") bj_filetransfers = [input_file +" > test.txt"] print "Start Pilot Job/BigJob at: " + lrms_url bj = bigjob(COORDINATION_URL) bj.start_pilot_job( lrms_url, None, number_of_processes, queue, project, workingdirectory, userproxy, walltime, processes_per_node, bj_filetransfers) print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) ########################################################################################## # Submit SubJob through BigJob jd = description() jd.executable = "/bin/cat" jd.number_of_processes = "1" jd.spmd_variation = "single" jd.arguments = ["test.txt"] #jd.working_directory = "" jd.output = "sj-stdout.txt" jd.error = "sj-stderr.txt" sj = subjob() sj.submit_job(bj.pilot_url, jd) ######################################### # busy wait for completion while 1: state = str(sj.get_state()) bj_state = bj.get_state() print "bj state: " + str(bj_state) + " state: " + state if(state=="Failed" or state=="Done"): break time.sleep(2) ########################################################################################## # Cleanup - stop BigJob bj.cancel()
def main():
    """Stress-test pilot start/teardown over ssh (host "boskop")."""
    # Start BigJob
    ##########################################################################################
    # Edit parameters for BigJob
    queue = "normal"  # if None default queue is used
    project = None  # if None default allocation is used
    walltime = 10
    processes_per_node = 4
    number_of_processes = 8
    #workingdirectory=os.path.join(os.getcwd(), "agent") # working directory for agent
    workingdirectory = "agent"
    userproxy = None  # userproxy (not supported yet due to context issue w/ SAGA)
    """ URL of the SAGA Job Service that is used to dispatch the pilot job.
    The following URLs are accepted:

    lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI)
    lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines)
    lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system.
    lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine.
    lrms_url = "xt5torque://localhost" # torque resource url.

    Please ensure that the respective SAGA adaptor is installed and working
    """
    lrms_url = "ssh://boskop"
    #lrms_url = "sge://localhost"
    #lrms_url = "fork://localhost"
    ##########################################################################################
    # earlier stress variant, kept for reference:
    # for i in range(99999):
    #     js = saga.job.Service (lrms_url)
    #     j = js.run_job ("/bin/sleep 1000")
    #     print "%4d: %s" % (i, j.state)
    # NOTE(review): the whole pilot start/submit/teardown below appears to run
    # once per iteration of this 99999-iteration stress loop (mirroring the
    # commented-out variant above) -- confirm the loop-body extent against the
    # original file's indentation.
    for i in range(99999):
        print i
        print "Start Pilot Job/BigJob at: " + lrms_url
        bj = bigjob(COORDINATION_URL)
        bj.start_pilot_job(lrms_url,
                           number_of_processes,
                           queue,
                           project,
                           workingdirectory,
                           userproxy,
                           walltime,
                           processes_per_node)
        print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(
            bj.get_state())
        ##########################################################################################
        # Submit SubJob through BigJob
        jd = description()
        jd.executable = "/bin/echo"
        #jd.executable = "$HOME/hello.sh"
        jd.number_of_processes = "1"
        jd.arguments = ["$HELLOWORLD"]
        jd.environment = ['HELLOWORLD=hello_world']
        #jd.spmd_variation = "mpi"
        # specify an optional working directory if sub-job should be executed
        # outside of the bigjob sandbox
        #jd.working_directory = "/tmp"
        jd.output = "stdout.txt"
        jd.error = "stderr.txt"
        sj = subjob()
        sj.submit_job(bj.pilot_url, jd)
        #########################################
        # busy wait for completion
        while 1:
            state = str(sj.get_state())
            print "state: " + state
            if (state == "Failed" or state == "Done"):
                break
            time.sleep(2)
        ##########################################################################################
        # Cleanup - stop BigJob
        bj.cancel()
sjs = [] cpr = CPR for i in range(0, NUMBER_REPLICAS): stage_files(i) jd = saga.job.description() # jd.executable = "namd2" jd.number_of_processes = cpr jd.spmd_variation = "mpi" # jd.arguments = ["NPT.conf"] jd.working_directory = WORK_DIR + "agent/" + str(i) + "/" # os.system("cp NPT.conf NPT.conf") jd.arguments = ["NPT.conf"] jd.output = str(i) + "/stdout-" + str(i) + ".txt" jd.error = str(i) + "/stderr-" + str(i) + ".txt" jds.append(jd) sj = bigjob.subjob(advert_host) sjs.append(sj) # prepare config and scp other files to remote machine NAMD_config(i) if i < RPB: j = 0 jd.executable = EXE copy_with_saga(i) sjs[i].submit_job(bjs[j].pilot_url, jds[i], str(i)) elif i >= RPB and i < (2 * RPB): j = 1 jd.executable = EXE1 copy_with_saga(i) sjs[i].submit_job(bjs[j].pilot_url, jds[i], str(i)) elif i >= (2 * RPB) and i < (3 * RPB): j = 2
job_states = {} # Submit Jobs through BigJob # Here you can add any arguments to each SubJob, change the ouput and error filenames and so on # change this to your heart's content, but be careful for i in range(0, NUMBER_JOBS): jd = description() jd.executable = "/bin/echo" jd.number_of_processes = "4" jd.spmd_variation = "mpi" # for serial codes jd.spmd_variation="single" jd.arguments = ["$INFRASTRUCTURE"] jd.environment = ["INFRASTRUCTURE=FutureGrid"] jd.output = "sj-stdout-"+str(i)+".txt" jd.error = "sj-stderr-"+str(i)+".txt" sj = subjob() jobs.append(sj) sj.submit_job(bj.pilot_url, jd) job_start_times[sj]=time.time() job_states[sj] = sj.get_state() # busy wait for completion while 1: finish_counter=0 result_map = {} for i in range(0, NUMBER_JOBS): old_state = job_states[jobs[i]] state = jobs[i].get_state() if result_map.has_key(state)==False: result_map[state]=1
def main():
    """Run a BigJob pilot on Lonestar (SGE via ssh), staging files with
    Globus Online, then run one /bin/cat sub-job on the staged file."""
    # Start BigJob
    ##########################################################################################
    # Edit parameters for BigJob
    queue=None  # if None default queue is used
    project=None  # if None default allocation is used
    walltime=10
    processes_per_node=4
    number_of_processes = 8
    userproxy = None  # userproxy (not supported yet due to context issue w/ SAGA)
    """ URL of the SAGA Job Service that is used to dispatch the pilot job.
    The following URLs are accepted:

    lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI)
    lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines)
    lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system.
    lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine.
    lrms_url = "xt5torque://localhost" # torque resource url.

    Please ensure that the respective SAGA adaptor is installed and working
    """
    lrms_url = "sge-ssh://lonestar.tacc.teragrid.org"
    """ To use Globus Online the working directory must be specified using the following conventions """
    # go:// URL: GO credentials + endpoint (xsede#lonestar4) + remote path.
    # GLOBUS_ONLINE_USER / GLOBUS_ONLINE_PASSWORD are presumably module-level
    # constants defined elsewhere in this file -- confirm before running.
    workingdirectory="go://"+GLOBUS_ONLINE_USER+":"+GLOBUS_ONLINE_PASSWORD+"@globusonline.org?ep=xsede#lonestar4&path=~/bigjob/"
    ##########################################################################################
    print "Start Pilot Job/BigJob at: " + lrms_url
    bj = bigjob(COORDINATION_URL)
    # stage test.txt from the local GO endpoint (drelu#MacBook) into the
    # pilot's working directory ("> BIGJOB_WORK_DIR" is the target marker)
    bj_filetransfers = ["go://"+GLOBUS_ONLINE_USER+":"+GLOBUS_ONLINE_PASSWORD+"@globusonline.org?ep=drelu#MacBook&path=" + os.path.dirname(os.path.abspath(__file__)) + "/test.txt > BIGJOB_WORK_DIR"]
    bj.start_pilot_job( lrms_url,
                        None,
                        number_of_processes,
                        queue,
                        project,
                        workingdirectory,
                        userproxy,
                        walltime,
                        processes_per_node,
                        bj_filetransfers)
    print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state())

    ##########################################################################################
    # Submit SubJob through BigJob
    jd = description()
    jd.executable = "/bin/cat"
    jd.number_of_processes = "1"
    jd.spmd_variation = "single"
    jd.arguments = ["test.txt"]
    jd.output = "stdout.txt"
    jd.error = "stderr.txt"
    # per-sub-job staging: same source file, delivered into the sub-job's
    # own working directory (SUBJOB_WORK_DIR marker)
    jd.file_transfer = ["go://"+GLOBUS_ONLINE_USER+":"+GLOBUS_ONLINE_PASSWORD+"@globusonline.org?ep=drelu#MacBook&path=" + os.path.dirname(os.path.abspath(__file__)) + "/test.txt > SUBJOB_WORK_DIR"]
    sj = subjob()
    sj.submit_job(bj.pilot_url, jd)

    #########################################
    # busy wait for completion
    while 1:
        state = str(sj.get_state())
        print "state: " + state
        if(state=="Failed" or state=="Done"):
            break
        time.sleep(2)

    ##########################################################################################
    # Cleanup - stop BigJob
    bj.cancel()
def main(): # Start BigJob ########################################################################################## # Edit parameters for BigJob queue = None # if None default queue is used project = None # if None default allocation is used walltime = 10 processes_per_node = 4 number_of_processes = 8 workingdirectory = os.path.join(os.getcwd(), "agent") # working directory for agent userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) """ URL of the SAGA Job Service that is used to dispatch the pilot job. The following URLs are accepted: lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI) lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines) lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system. lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine. lrms_url = "xt5torque://localhost" # torque resource url. 
Please ensure that the respective SAGA adaptor is installed and working """ #lrms_url = "fork://localhost" # resource url to run the jobs on localhost lrms_url = "condorg://brgw1.renci.org:2119/jobmanager-pbs" #lrms_url = "ssh://[email protected]" ########################################################################################## print "Start Pilot Job/BigJob at: " + lrms_url bj = bigjob(COORDINATION_URL) bj.start_pilot_job(lrms_url, None, number_of_processes, queue, project, workingdirectory, userproxy, walltime, processes_per_node) print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str( bj.get_state()) ########################################################################################## # Submit SubJob through BigJob jd = description() jd.executable = "/bin/date" jd.number_of_processes = "1" jd.spmd_variation = "single" #jd.arguments = ["match -f bgr1.fa -A 0 -r reads_1.fastq -n 4 -T /tmp/ > bfast.matches.file.bgr.1.bmf"] jd.arguments = [""] #jd.working_directory = "" jd.output = "bfast-stdout.txt" jd.error = "bfast-stderr.txt" sj = subjob() sj.submit_job(bj.pilot_url, jd) ######################################### # busy wait for completion while 1: state = str(sj.get_state()) print "state: " + state if (state == "Failed" or state == "Done"): break time.sleep(2) ########################################################################################## # Cleanup - stop BigJob bj.cancel()
def main(): # Start BigJob ########################################################################################## # Edit parameters for BigJob queue="normal" # if None default queue is used project=None # if None default allocation is used walltime=10 processes_per_node=4 number_of_processes = 8 #workingdirectory=os.path.join(os.getcwd(), "agent") # working directory for agent workingdirectory="agent" userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) """ URL of the SAGA Job Service that is used to dispatch the pilot job. The following URLs are accepted: lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI) lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines) lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system. lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine. lrms_url = "xt5torque://localhost" # torque resource url. 
Please ensure that the respective SAGA adaptor is installed and working """ lrms_url = "fork://localhost" # resource url to run the jobs on localhost #lrms_url = "sge://localhost" # resource url to run the jobs on localhost #lrms_url = "ssh://localhost" # resource url to run the jobs on localhost ########################################################################################## print "Start Pilot Job/BigJob at: " + lrms_url bj = bigjob(COORDINATION_URL) bj.start_pilot_job( lrms_url, number_of_processes, queue, project, workingdirectory, userproxy, walltime, processes_per_node) print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) ########################################################################################## # Submit SubJob through BigJob jd = description() jd.executable = "/bin/echo" #jd.executable = "$HOME/hello.sh" jd.number_of_processes = "1" jd.arguments = ["$HELLOWORLD"] jd.environment = ['HELLOWORLD=hello_world'] #jd.spmd_variation = "mpi" # specify an optinal working directory if sub-job should be executed outside of bigjob sandbox #jd.working_directory = "/tmp" jd.output = "stdout.txt" jd.error = "stderr.txt" sj = subjob() sj.submit_job(bj.pilot_url, jd) ######################################### # busy wait for completion while 1: state = str(sj.get_state()) print "state: " + state if(state=="Failed" or state=="Done"): break time.sleep(2) ########################################################################################## # Cleanup - stop BigJob bj.cancel()
jds = [] sjs = [] for i in range(0, NUMBER_REPLICAS): stage_files(i) jd = saga.job.description() jd.executable = "namd2" jd.number_of_processes = "16" jd.spmd_variation = "mpi" # jd.arguments = ["NPT.conf"] jd.working_directory = WORK_DIR + "agent/" + str(i) + "/" #os.system("cp NPT.conf NPT.conf") jd.arguments = ["NPT.conf"] jd.output = str(i) + "/stdout-" + str(i) + ".txt" jd.error = str(i) + "/stderr-" + str(i) + ".txt" jds.append(jd) sj = bigjob.subjob(advert_host) sjs.append(sj) #prepare config and scp other files to remote machine NAMD_config(i) if i < RPB: j = 0 copy_with_saga(i) sjs[i].submit_job(bjs[j].pilot_url, jds[i], str(i)) elif (i >= RPB and i < (2 * RPB)): j = 1 copy_with_saga(i) sjs[i].submit_job(bjs[j].pilot_url, jds[i], str(i)) elif (i >= (2 * RPB) and i < (3 * RPB)): j = 2 copy_with_saga(i) sjs[i].submit_job(bjs[j].pilot_url, jds[i], str(i))
def main(): # Start BigJob ########################################################################################## # Edit parameters for BigJob queue = None # if None default queue is used project = None # if None default allocation is used walltime = 10 processes_per_node = 4 number_of_processes = 8 workingdirectory = os.path.join(os.getcwd(), "agent") userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) """ URL of the SAGA Job Service that is used to dispatch the pilot job. The following URLs are accepted: lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI) lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines) lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system. lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine. lrms_url = "xt5torque://localhost" # torque resource url. 
Please ensure that the respective SAGA adaptor is installed and working """ lrms_url = "condor://localhost" ########################################################################################## input_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "test.txt") bj_filetransfers = [input_file + " > test.txt"] print "Start Pilot Job/BigJob at: " + lrms_url bj = bigjob(COORDINATION_URL) bj.start_pilot_job(lrms_url, None, number_of_processes, queue, project, workingdirectory, userproxy, walltime, processes_per_node, bj_filetransfers) print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str( bj.get_state()) ########################################################################################## # Submit SubJob through BigJob jd = description() jd.executable = "/bin/cat" jd.number_of_processes = "1" jd.spmd_variation = "single" jd.arguments = ["test.txt"] #jd.working_directory = "" jd.output = "sj-stdout.txt" jd.error = "sj-stderr.txt" sj = subjob() sj.submit_job(bj.pilot_url, jd) ######################################### # busy wait for completion while 1: state = str(sj.get_state()) bj_state = bj.get_state() print "bj state: " + str(bj_state) + " state: " + state if (state == "Failed" or state == "Done"): break time.sleep(2) ########################################################################################## # Cleanup - stop BigJob bj.cancel()
sj_nimbus.submit_job(jd_nimbus) jobs_nimbus.append(sj_nimbus) number_started_jobs = number_started_jobs + 1 for i in range(0, NUMBER_JOBS_EC2): print "Start job no.: " +str(number_started_jobs + 1) + " on EC2" sj_ec2 = bigjob_cloud.subjob(bigjob=bj_ec2) jd_ec2.output = "stdout_ec2.txt."+str(number_started_jobs+1) jd_ec2.error = "stderr_ec2.txt."+str(number_started_jobs+1) sj_ec2.submit_job(jd_ec2) jobs_ec2.append(sj_ec2) number_started_jobs = number_started_jobs + 1 for i in range(0, NUMBER_JOBS_GRID): print "Start job no.: " +str(number_started_jobs + 1) + " on Grid" sj_tg = bigjob.subjob(advert_host) jd.output = "stdout_tg.txt."+str(number_started_jobs+1) jd.error = "stderr_tg.txt."+str(number_started_jobs+1) sj_tg.submit_job(bj_tg.pilot_url, jd) jobs_tg.append(sj_tg) number_started_jobs = number_started_jobs + 1 for i in range(0, NUMBER_JOBS_CONDOR): print "Start job no.: " +str(number_started_jobs + 1) + " on Condor" sj_condor = bigjob_condor.subjob(bigjob=bj_condor) jd_condor.output = "stdout_condor.txt."+str(number_started_jobs+1) jd_condor.error = "stderr_condor.txt."+str(number_started_jobs+1) sj_condor.submit_job(jd_condor) jobs_condor.append(sj_condor) number_started_jobs = number_started_jobs + 1