def __translate_cu_sj_description(self, compute_unit_description):
    """Translate a Pilot-API compute unit description into a subjob description.

    Copies the recognized keys from *compute_unit_description* (a dict) onto
    a fresh ``description`` object.  Missing optional keys get defaults:
    ``spmd_variation='single'`` and ``number_of_processes=1``.

    Fixes/improvements over the previous version:
    - ``dict.has_key`` (deprecated, removed in Python 3) replaced by ``in``.
    - ``environment`` may now also be a dict (converted to the 'old-style'
      ``KEY=VALUE`` list), consistent with the sibling translators in this
      code base.

    :param compute_unit_description: dict describing the compute unit
    :return: populated ``description`` object
    """
    jd = description()
    if "executable" in compute_unit_description:
        jd.executable = compute_unit_description["executable"]
    # default to a serial job unless an SPMD variation (e.g. 'mpi') is given
    if "spmd_variation" in compute_unit_description:
        jd.spmd_variation = compute_unit_description["spmd_variation"]
    else:
        jd.spmd_variation = "single"
    if "arguments" in compute_unit_description:
        jd.arguments = compute_unit_description["arguments"]
    if "environment" in compute_unit_description:
        env = compute_unit_description["environment"]
        if isinstance(env, dict):
            # convert to 'old-style' "KEY=VALUE" argument list
            jd.environment = ["%s=%s" % (key, val) for key, val in env.items()]
        else:
            jd.environment = env
    # handling number of processes: prefer the explicit key, then the
    # Pilot-API alias 'total_cpu_count', finally default to one process
    if "number_of_processes" in compute_unit_description:
        jd.number_of_processes = int(compute_unit_description["number_of_processes"])
    elif "total_cpu_count" in compute_unit_description:
        jd.number_of_processes = int(compute_unit_description["total_cpu_count"])
    else:
        jd.number_of_processes = 1
    if "working_directory" in compute_unit_description:
        jd.working_directory = compute_unit_description["working_directory"]
    if "output" in compute_unit_description:
        jd.output = compute_unit_description["output"]
    if "error" in compute_unit_description:
        jd.error = compute_unit_description["error"]
    if "file_transfer" in compute_unit_description:
        jd.file_transfer = compute_unit_description["file_transfer"]
    if "input_data" in compute_unit_description:
        jd.input_data = compute_unit_description["input_data"]
    if "output_data" in compute_unit_description:
        jd.output_data = compute_unit_description["output_data"]
    return jd
def main(): # Start BigJob ########################################################################################## # Edit parameters for BigJob queue=None # if None default queue is used project=None # if None default allocation is used walltime=10 processes_per_node=1 number_of_processes = 1 workingdirectory="." # working directory for agent userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) #lrms_url = "ec2+ssh://localhost" # resource url to run on GCE lrms_url = "gce+ssh://locahost" ########################################################################################## print "Start Pilot Job/BigJob at: " + lrms_url bj = bigjob(COORDINATION_URL) bj.start_pilot_job( lrms_url, number_of_processes, queue, project, workingdirectory, userproxy, walltime, processes_per_node) print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) ########################################################################################## # Submit SubJob through BigJob jd = description() jd.executable = "/bin/echo" #jd.executable = "$HOME/hello.sh" jd.number_of_processes = "1" jd.arguments = ["$HELLOWORLD"] jd.environment = ['HELLOWORLD=hello_world'] jd.input_data = ["hi", "ho"] # specify an optinal working directory if sub-job should be executed outside of bigjob sandbox #jd.working_directory = "/tmp" jd.output = "stdout.txt" jd.error = "stderr.txt" sj = subjob() sj.submit_job(bj.pilot_url, jd) ######################################### # busy wait for completion while 1: state = str(sj.get_state()) print "state: " + state if(state=="Failed" or state=="Done"): break time.sleep(2) ########################################################################################## # Cleanup - stop BigJob bj.cancel()
def main():
    """Start a BigJob pilot through the local PBS adaptor and submit twelve
    /bin/hostname subjobs, then wait for the pilot to drain and cancel it.

    Relies on module-level names ``bigjob``, ``subjob``, ``description`` and
    ``COORDINATION_URL`` imported elsewhere in this file.
    """
    # Start BigJob
    ##########################################################################################
    # Edit parameters for BigJob
    queue = None # if None default queue is used
    project = None # if None default allocation is used
    walltime = 10
    processes_per_node = 8
    number_nodes = 24
    workingdirectory = os.getcwd() # working directory for agent
    userproxy = None # userproxy (not supported yet due to context issue w/ SAGA)

    """ URL of the SAGA Job Service that is used to dispatch the pilot job.
        The following URLs are accepted:

        lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI)
        lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines)
        lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system.
        lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine.
        lrms_url = "xt5torque://localhost" # torque resource url.

        Please ensure that the respective SAGA adaptor is installed and working
    """
    lrms_url = "pbs://localhost" # resource url to run the jobs on localhost
    ##########################################################################################
    print "Start Pilot Job/BigJob at: " + lrms_url
    bj = bigjob(COORDINATION_URL)
    # second argument is the number of nodes requested for the pilot
    bj.start_pilot_job(lrms_url, number_nodes, queue, project, workingdirectory,
                       userproxy, walltime, processes_per_node)
    print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str( bj.get_state())

    ##########################################################################################
    # Submit SubJob through BigJob
    jd = description()
    jd.executable = "/bin/hostname"
    # NOTE(review): BigJob examples pass number_of_processes as a string --
    # presumably accepted by the description object; confirm before changing.
    jd.number_of_processes = "2"
    jd.spmd_variation = "single"
    jd.arguments = [""]
    #jd.working_directory = "/tmp"
    jd.output = "stdout.txt"
    jd.error = "stderr.txt"
    # submit twelve identical subjobs against the same pilot
    for i in range(0, 12):
        sj = subjob()
        sj.submit_job(bj.pilot_url, jd)

    ##########################################################################################
    # Cleanup - stop BigJob
    # wait() blocks until all submitted subjobs have finished
    bj.wait()
    bj.cancel()
def main(): # Start BigJob ########################################################################################## # Edit parameters for BigJob queue = None # if None default queue is used project = None # if None default allocation is used walltime = 10 processes_per_node = 1 number_of_processes = 1 workingdirectory = "." # working directory for agent userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) #lrms_url = "ec2+ssh://localhost" # resource url to run on GCE lrms_url = "gce+ssh://locahost" ########################################################################################## print "Start Pilot Job/BigJob at: " + lrms_url bj = bigjob(COORDINATION_URL) bj.start_pilot_job(lrms_url, number_of_processes, queue, project, workingdirectory, userproxy, walltime, processes_per_node) print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str( bj.get_state()) ########################################################################################## # Submit SubJob through BigJob jd = description() jd.executable = "/bin/echo" #jd.executable = "$HOME/hello.sh" jd.number_of_processes = "1" jd.arguments = ["$HELLOWORLD"] jd.environment = ['HELLOWORLD=hello_world'] jd.input_data = ["hi", "ho"] # specify an optinal working directory if sub-job should be executed outside of bigjob sandbox #jd.working_directory = "/tmp" jd.output = "stdout.txt" jd.error = "stderr.txt" sj = subjob() sj.submit_job(bj.pilot_url, jd) ######################################### # busy wait for completion while 1: state = str(sj.get_state()) print "state: " + state if (state == "Failed" or state == "Done"): break time.sleep(2) ########################################################################################## # Cleanup - stop BigJob bj.cancel()
def get_job_description(self, replica_id):
    """Assemble the NAMD job description for replica *replica_id*.

    Runs the per-replica ``namd2`` binary (four processes, serial SPMD
    variation) on ``NPT.conf`` inside that replica's directory, writing
    per-replica stdout/stderr files.
    """
    rid = str(replica_id)
    # every replica lives in its own subdirectory of the working directory
    replica_dir = self.working_directory + "sync_agent_16/" + rid + "/"
    job = description()
    job.executable = replica_dir + "namd2"
    job.number_of_processes = "4"
    job.spmd_variation = "single"
    job.arguments = ["NPT.conf"]
    job.working_directory = replica_dir
    job.output = "stdout-" + rid + ".txt"
    job.error = "stderr-" + rid + ".txt"
    return job
def get_job_description(self, replica_id):
    """Assemble the NAMD job description for replica *replica_id*.

    Runs the per-replica ``namd2`` binary (two processes, serial SPMD
    variation) on ``NPT.conf`` inside that replica's directory, writing
    per-replica stdout/stderr files.
    """
    rid = str(replica_id)
    # every replica lives in its own subdirectory of the working directory
    replica_dir = self.working_directory + "async_agent_4/" + rid + "/"
    job = description()
    job.executable = replica_dir + "namd2"
    job.number_of_processes = "2"
    job.spmd_variation = "single"
    job.arguments = ["NPT.conf"]
    job.working_directory = replica_dir
    job.output = "stdout-" + rid + ".txt"
    job.error = "stderr-" + rid + ".txt"
    return job
def __translate_cu_sj_description(self, compute_unit_description):
    """Translate a Pilot-API compute unit description into a subjob description.

    Copies the recognized keys from *compute_unit_description* (a dict) onto
    a fresh ``description`` object.  Missing optional keys get defaults:
    ``spmd_variation='single'`` and ``number_of_processes=1``.  A dict-valued
    ``environment`` is converted to the 'old-style' ``KEY=VALUE`` list.

    Fix: ``dict.has_key``/``dict.iteritems`` (deprecated, removed in
    Python 3) replaced by ``in`` / ``items()``.

    :param compute_unit_description: dict describing the compute unit
    :return: populated ``description`` object
    """
    jd = description()
    if "executable" in compute_unit_description:
        jd.executable = compute_unit_description["executable"]
    # default to a serial job unless an SPMD variation (e.g. 'mpi') is given
    if "spmd_variation" in compute_unit_description:
        jd.spmd_variation = compute_unit_description["spmd_variation"]
    else:
        jd.spmd_variation = "single"
    if "arguments" in compute_unit_description:
        jd.arguments = compute_unit_description["arguments"]
    if "environment" in compute_unit_description:
        env = compute_unit_description["environment"]
        if isinstance(env, dict):
            # convert to 'old-style' "KEY=VALUE" argument list
            jd.environment = ["%s=%s" % (key, val) for key, val in env.items()]
        else:
            jd.environment = env
    # handling number of processes: prefer the explicit key, then the
    # Pilot-API alias 'total_cpu_count', finally default to one process
    if "number_of_processes" in compute_unit_description:
        jd.number_of_processes = int(compute_unit_description["number_of_processes"])
    elif "total_cpu_count" in compute_unit_description:
        jd.number_of_processes = int(compute_unit_description["total_cpu_count"])
    else:
        jd.number_of_processes = 1
    if "working_directory" in compute_unit_description:
        jd.working_directory = compute_unit_description["working_directory"]
    if "output" in compute_unit_description:
        jd.output = compute_unit_description["output"]
    if "error" in compute_unit_description:
        jd.error = compute_unit_description["error"]
    if "file_transfer" in compute_unit_description:
        jd.file_transfer = compute_unit_description["file_transfer"]
    if "input_data" in compute_unit_description:
        jd.input_data = compute_unit_description["input_data"]
    if "output_data" in compute_unit_description:
        jd.output_data = compute_unit_description["output_data"]
    return jd
def __translate_cu_sj_description(self, compute_unit_description):
    """Translate a Pilot-API compute unit description into a subjob description.

    Copies the recognized keys from *compute_unit_description* (a dict) onto
    a fresh ``description`` object.  Missing optional keys get defaults:
    ``spmd_variation='single'`` and ``number_of_processes=1``.  A dict-valued
    ``environment`` is converted to the 'old-style' ``KEY=VALUE`` list.

    Fix: ``dict.has_key``/``dict.iteritems`` (deprecated, removed in
    Python 3) replaced by ``in`` / ``items()``.

    :param compute_unit_description: dict describing the compute unit
    :return: populated ``description`` object
    """
    jd = description()
    if "executable" in compute_unit_description:
        jd.executable = compute_unit_description["executable"]
    # default to a serial job unless an SPMD variation (e.g. 'mpi') is given
    if "spmd_variation" in compute_unit_description:
        jd.spmd_variation = compute_unit_description["spmd_variation"]
    else:
        jd.spmd_variation = "single"
    if "arguments" in compute_unit_description:
        jd.arguments = compute_unit_description["arguments"]
    if "environment" in compute_unit_description:
        env = compute_unit_description["environment"]
        if isinstance(env, dict):
            # convert to 'old-style' "KEY=VALUE" argument list
            jd.environment = ["%s=%s" % (key, val) for key, val in env.items()]
        else:
            jd.environment = env
    # handling number of processes: prefer the explicit key, then the
    # Pilot-API alias 'total_cpu_count', finally default to one process
    if "number_of_processes" in compute_unit_description:
        jd.number_of_processes = int(compute_unit_description["number_of_processes"])
    elif "total_cpu_count" in compute_unit_description:
        jd.number_of_processes = int(compute_unit_description["total_cpu_count"])
    else:
        jd.number_of_processes = 1
    if "working_directory" in compute_unit_description:
        jd.working_directory = compute_unit_description["working_directory"]
    if "output" in compute_unit_description:
        jd.output = compute_unit_description["output"]
    if "error" in compute_unit_description:
        jd.error = compute_unit_description["error"]
    if "file_transfer" in compute_unit_description:
        jd.file_transfer = compute_unit_description["file_transfer"]
    if "input_data" in compute_unit_description:
        jd.input_data = compute_unit_description["input_data"]
    if "output_data" in compute_unit_description:
        jd.output_data = compute_unit_description["output_data"]
    return jd
def submit_wu(self, wu):
    """Submit one work unit *wu* (a dict) as a subjob and start tracking it.

    Builds a job ``description`` from the work-unit fields, dispatches it to
    the job service selected by ``wu["resource"]`` (an index into
    ``self.mjs``), then records the subjob plus its start time and initial
    state in the instance-level tracking structures.

    NOTE(review): ``number_of_processes`` is hard-coded to "1" -- the value
    in ``wu["number_of_processes"]`` is deliberately ignored per the inline
    comment; confirm this is still intended.
    """
    jd = description()
    jd.executable = wu["executable"]
    jd.number_of_processes = "1" # wu["number_of_processes"]
    jd.spmd_variation = wu["spmd_variation"]
    # arguments arrive as a single string; environment as a comma-separated string
    jd.arguments = [wu["arguments"]]
    jd.environment = wu["environment"].split(",")
    jd.working_directory = wu["working_directory"]
    jd.output = wu["output"]
    jd.error = wu["error"]
    # wu["resource"] selects which managed job service receives the job
    subjob = self.mjs[int(wu["resource"])].create_job(jd)
    subjob.run()
    print "Submited sub-job " + "."
    # bookkeeping for later monitoring/accounting
    self.jobs.append(subjob)
    self.job_start_times[subjob] = time.time()
    self.job_states[subjob] = subjob.get_state()
    self.logger.info("jd.number_of_processes " + str(jd.number_of_processes))
    self.logger.info("jd exec " + jd.executable)
def __translate_cu_sj_description(self, compute_unit_description):
    """Translate a Pilot-API compute unit description into a subjob description.

    Copies the recognized keys from *compute_unit_description* (a dict) onto
    a fresh ``description`` object.  Missing optional keys get defaults:
    ``spmd_variation='single'`` and ``number_of_processes=1``.

    Fixes/improvements over the previous version:
    - ``dict.has_key`` (deprecated, removed in Python 3) replaced by ``in``.
    - ``environment`` may now also be a dict (converted to the 'old-style'
      ``KEY=VALUE`` list), consistent with the sibling translators in this
      code base.

    :param compute_unit_description: dict describing the compute unit
    :return: populated ``description`` object
    """
    jd = description()
    if "executable" in compute_unit_description:
        jd.executable = compute_unit_description["executable"]
    # default to a serial job unless an SPMD variation (e.g. 'mpi') is given
    if "spmd_variation" in compute_unit_description:
        jd.spmd_variation = compute_unit_description["spmd_variation"]
    else:
        jd.spmd_variation = "single"
    if "arguments" in compute_unit_description:
        jd.arguments = compute_unit_description["arguments"]
    if "environment" in compute_unit_description:
        env = compute_unit_description["environment"]
        if isinstance(env, dict):
            # convert to 'old-style' "KEY=VALUE" argument list
            jd.environment = ["%s=%s" % (key, val) for key, val in env.items()]
        else:
            jd.environment = env
    # handling number of processes: prefer the explicit key, then the
    # Pilot-API alias 'total_cpu_count', finally default to one process
    if "number_of_processes" in compute_unit_description:
        jd.number_of_processes = int(compute_unit_description["number_of_processes"])
    elif "total_cpu_count" in compute_unit_description:
        jd.number_of_processes = int(compute_unit_description["total_cpu_count"])
    else:
        jd.number_of_processes = 1
    if "working_directory" in compute_unit_description:
        jd.working_directory = compute_unit_description["working_directory"]
    if "output" in compute_unit_description:
        jd.output = compute_unit_description["output"]
    if "error" in compute_unit_description:
        jd.error = compute_unit_description["error"]
    if "file_transfer" in compute_unit_description:
        jd.file_transfer = compute_unit_description["file_transfer"]
    if "input_data" in compute_unit_description:
        jd.input_data = compute_unit_description["input_data"]
    if "output_data" in compute_unit_description:
        jd.output_data = compute_unit_description["output_data"]
    return jd
def main():
    """Start a BigJob pilot through Condor-G on a RENCI gateway and run a
    single /bin/date subjob, busy-waiting for completion before cancelling.

    Relies on module-level names ``bigjob``, ``subjob``, ``description`` and
    ``COORDINATION_URL`` imported elsewhere in this file.
    """
    # Start BigJob
    ##########################################################################################
    # Edit parameters for BigJob
    queue = None # if None default queue is used
    project = None # if None default allocation is used
    walltime = 10
    processes_per_node = 4
    number_of_processes = 8
    workingdirectory = os.path.join(os.getcwd(), "agent") # working directory for agent
    userproxy = None # userproxy (not supported yet due to context issue w/ SAGA)

    """ URL of the SAGA Job Service that is used to dispatch the pilot job.
        The following URLs are accepted:

        lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI)
        lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines)
        lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system.
        lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine.
        lrms_url = "xt5torque://localhost" # torque resource url.

        Please ensure that the respective SAGA adaptor is installed and working
    """
    #lrms_url = "fork://localhost" # resource url to run the jobs on localhost
    lrms_url = "condorg://brgw1.renci.org:2119/jobmanager-pbs"
    #lrms_url = "ssh://[email protected]"
    ##########################################################################################
    print "Start Pilot Job/BigJob at: " + lrms_url
    bj = bigjob(COORDINATION_URL)
    # NOTE(review): second positional argument (None) is presumably an unused
    # pilot-compute option in this start_pilot_job() overload -- confirm
    # against the bigjob API before changing.
    bj.start_pilot_job(lrms_url, None, number_of_processes, queue, project,
                       workingdirectory, userproxy, walltime, processes_per_node)
    print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str( bj.get_state())

    ##########################################################################################
    # Submit SubJob through BigJob
    jd = description()
    jd.executable = "/bin/date"
    jd.number_of_processes = "1"
    jd.spmd_variation = "single"
    #jd.arguments = ["match -f bgr1.fa -A 0 -r reads_1.fastq -n 4 -T /tmp/ > bfast.matches.file.bgr.1.bmf"]
    jd.arguments = [""]
    #jd.working_directory = ""
    jd.output = "bfast-stdout.txt"
    jd.error = "bfast-stderr.txt"
    sj = subjob()
    sj.submit_job(bj.pilot_url, jd)

    #########################################
    # busy wait for completion
    while 1:
        state = str(sj.get_state())
        print "state: " + state
        if (state == "Failed" or state == "Done"):
            break
        time.sleep(2)

    ##########################################################################################
    # Cleanup - stop BigJob
    bj.cancel()
def main():
    """Start a BigJob pilot via the local Condor adaptor, stage a test file
    into the pilot sandbox, and run one /bin/cat subjob over it, monitoring
    both pilot and subjob state until the subjob finishes.

    Relies on module-level names ``bigjob``, ``subjob``, ``description`` and
    ``COORDINATION_URL`` imported elsewhere in this file.
    """
    # Start BigJob
    ##########################################################################################
    # Edit parameters for BigJob
    queue = None # if None default queue is used
    project = None # if None default allocation is used
    walltime = 10
    processes_per_node = 4
    number_of_processes = 8
    workingdirectory = os.path.join(os.getcwd(), "agent")
    userproxy = None # userproxy (not supported yet due to context issue w/ SAGA)

    """ URL of the SAGA Job Service that is used to dispatch the pilot job.
        The following URLs are accepted:

        lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI)
        lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines)
        lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system.
        lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine.
        lrms_url = "xt5torque://localhost" # torque resource url.

        Please ensure that the respective SAGA adaptor is installed and working
    """
    lrms_url = "condor://localhost"
    ##########################################################################################
    # stage ../test.txt into the pilot working directory as "test.txt"
    input_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "test.txt")
    bj_filetransfers = [input_file + " > test.txt"]

    print "Start Pilot Job/BigJob at: " + lrms_url
    bj = bigjob(COORDINATION_URL)
    bj.start_pilot_job(lrms_url, None, number_of_processes, queue, project,
                       workingdirectory, userproxy, walltime, processes_per_node,
                       bj_filetransfers)
    print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str( bj.get_state())

    ##########################################################################################
    # Submit SubJob through BigJob
    jd = description()
    jd.executable = "/bin/cat"
    jd.number_of_processes = "1"
    jd.spmd_variation = "single"
    jd.arguments = ["test.txt"]
    #jd.working_directory = ""
    jd.output = "sj-stdout.txt"
    jd.error = "sj-stderr.txt"
    sj = subjob()
    sj.submit_job(bj.pilot_url, jd)

    #########################################
    # busy wait for completion
    while 1:
        state = str(sj.get_state())
        bj_state = bj.get_state()
        print "bj state: " + str(bj_state) + " state: " + state
        if (state == "Failed" or state == "Done"):
            break
        time.sleep(2)

    ##########################################################################################
    # Cleanup - stop BigJob
    bj.cancel()
def main():
    """Start a BigJob pilot on the local machine (fork adaptor) against the
    "normal" queue and run one /bin/echo subjob that expands an environment
    variable, busy-waiting for completion before cancelling.

    Relies on module-level names ``bigjob``, ``subjob``, ``description`` and
    ``COORDINATION_URL`` imported elsewhere in this file.
    """
    # Start BigJob
    ##########################################################################################
    # Edit parameters for BigJob
    queue="normal" # if None default queue is used
    project=None # if None default allocation is used
    walltime=10
    processes_per_node=4
    number_of_processes = 8
    #workingdirectory=os.path.join(os.getcwd(), "agent") # working directory for agent
    workingdirectory="agent"
    userproxy = None # userproxy (not supported yet due to context issue w/ SAGA)

    """ URL of the SAGA Job Service that is used to dispatch the pilot job.
        The following URLs are accepted:

        lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI)
        lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines)
        lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system.
        lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine.
        lrms_url = "xt5torque://localhost" # torque resource url.

        Please ensure that the respective SAGA adaptor is installed and working
    """
    lrms_url = "fork://localhost" # resource url to run the jobs on localhost
    #lrms_url = "sge://localhost" # resource url to run the jobs on localhost
    #lrms_url = "ssh://localhost" # resource url to run the jobs on localhost
    ##########################################################################################
    print "Start Pilot Job/BigJob at: " + lrms_url
    bj = bigjob(COORDINATION_URL)
    bj.start_pilot_job( lrms_url,
                        number_of_processes,
                        queue,
                        project,
                        workingdirectory,
                        userproxy,
                        walltime,
                        processes_per_node)
    print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state())

    ##########################################################################################
    # Submit SubJob through BigJob
    jd = description()
    jd.executable = "/bin/echo"
    #jd.executable = "$HOME/hello.sh"
    jd.number_of_processes = "1"
    jd.arguments = ["$HELLOWORLD"]
    jd.environment = ['HELLOWORLD=hello_world']
    #jd.spmd_variation = "mpi"
    # specify an optinal working directory if sub-job should be executed outside of bigjob sandbox
    #jd.working_directory = "/tmp"
    jd.output = "stdout.txt"
    jd.error = "stderr.txt"
    sj = subjob()
    sj.submit_job(bj.pilot_url, jd)

    #########################################
    # busy wait for completion
    while 1:
        state = str(sj.get_state())
        print "state: " + state
        if(state=="Failed" or state=="Done"):
            break
        time.sleep(2)

    ##########################################################################################
    # Cleanup - stop BigJob
    bj.cancel()
walltime, processes_per_node) print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) ########################################################################################## jobs = [] job_start_times = {} job_states = {} # Submit Jobs through BigJob # Here you can add any arguments to each SubJob, change the ouput and error filenames and so on # change this to your heart's content, but be careful for i in range(0, NUMBER_JOBS): jd = description() jd.executable = "/bin/echo" jd.number_of_processes = "4" jd.spmd_variation = "mpi" # for serial codes jd.spmd_variation="single" jd.arguments = ["$INFRASTRUCTURE"] jd.environment = ["INFRASTRUCTURE=FutureGrid"] jd.output = "sj-stdout-"+str(i)+".txt" jd.error = "sj-stderr-"+str(i)+".txt" sj = subjob() jobs.append(sj) sj.submit_job(bj.pilot_url, jd) job_start_times[sj]=time.time() job_states[sj] = sj.get_state() # busy wait for completion while 1:
queue, project, workingdirectory, userproxy, walltime, processes_per_node) print "Pilot Job/BigJob URL: " + bj.get_url() + " State: " + str(bj.get_state()) ########################################################################################## # Submit SubJob through BigJob jobs = [] job_start_times = {} job_states = {} for i in range(0, NUMBER_JOBS): jd = description() jd.executable = "/bin/date" jd.number_of_processes = "1" jd.spmd_variation = "single" jd.arguments = [""] jd.output = "sj-stdout-"+str(i)+".txt" jd.error = "sj-stderr-"+str(i)+".txt" sj = subjob() sj.submit_job(bj.pilot_url, jd) jobs.append(sj) job_start_times[sj]=time.time() job_states[sj] = sj.get_state() print "Terminating application. You can reconnect to BJ via the following URL: %s"%bj.get_url()
def main():
    """Start a BigJob pilot via the local Condor adaptor, stage a test file
    into the pilot sandbox, and run one /bin/cat subjob over it, monitoring
    both pilot and subjob state until the subjob finishes.

    Relies on module-level names ``bigjob``, ``subjob``, ``description`` and
    ``COORDINATION_URL`` imported elsewhere in this file.
    """
    # Start BigJob
    ##########################################################################################
    # Edit parameters for BigJob
    queue=None # if None default queue is used
    project=None # if None default allocation is used
    walltime=10
    processes_per_node=4
    number_of_processes = 8
    workingdirectory= os.path.join(os.getcwd(), "agent")
    userproxy = None # userproxy (not supported yet due to context issue w/ SAGA)

    """ URL of the SAGA Job Service that is used to dispatch the pilot job.
        The following URLs are accepted:

        lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI)
        lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines)
        lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system.
        lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine.
        lrms_url = "xt5torque://localhost" # torque resource url.

        Please ensure that the respective SAGA adaptor is installed and working
    """
    lrms_url = "condor://localhost"
    ##########################################################################################
    # stage ../test.txt into the pilot working directory as "test.txt"
    input_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "test.txt")
    bj_filetransfers = [input_file +" > test.txt"]

    print "Start Pilot Job/BigJob at: " + lrms_url
    bj = bigjob(COORDINATION_URL)
    bj.start_pilot_job( lrms_url,
                        None,
                        number_of_processes,
                        queue,
                        project,
                        workingdirectory,
                        userproxy,
                        walltime,
                        processes_per_node,
                        bj_filetransfers)
    print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state())

    ##########################################################################################
    # Submit SubJob through BigJob
    jd = description()
    jd.executable = "/bin/cat"
    jd.number_of_processes = "1"
    jd.spmd_variation = "single"
    jd.arguments = ["test.txt"]
    #jd.working_directory = ""
    jd.output = "sj-stdout.txt"
    jd.error = "sj-stderr.txt"
    sj = subjob()
    sj.submit_job(bj.pilot_url, jd)

    #########################################
    # busy wait for completion
    while 1:
        state = str(sj.get_state())
        bj_state = bj.get_state()
        print "bj state: " + str(bj_state) + " state: " + state
        if(state=="Failed" or state=="Done"):
            break
        time.sleep(2)

    ##########################################################################################
    # Cleanup - stop BigJob
    bj.cancel()
def main():
    """Repeatedly (99999 iterations) start a BigJob pilot over SSH on host
    "boskop", run one /bin/echo subjob and tear the pilot down again --
    apparently a stress/soak test of pilot startup and shutdown.

    NOTE(review): the loop nesting below was reconstructed from a collapsed
    source line -- confirm that the whole start/submit/cancel cycle is meant
    to sit inside the ``for i in range(99999)`` loop.
    """
    # Start BigJob
    ##########################################################################################
    # Edit parameters for BigJob
    queue = "normal" # if None default queue is used
    project = None # if None default allocation is used
    walltime = 10
    processes_per_node = 4
    number_of_processes = 8
    #workingdirectory=os.path.join(os.getcwd(), "agent") # working directory for agent
    workingdirectory = "agent"
    userproxy = None # userproxy (not supported yet due to context issue w/ SAGA)

    """ URL of the SAGA Job Service that is used to dispatch the pilot job.
        The following URLs are accepted:

        lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI)
        lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines)
        lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system.
        lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine.
        lrms_url = "xt5torque://localhost" # torque resource url.

        Please ensure that the respective SAGA adaptor is installed and working
    """
    lrms_url = "ssh://boskop"
    #lrms_url = "sge://localhost"
    #lrms_url = "fork://localhost"
    ##########################################################################################
    # for i in range(99999):
    #     js = saga.job.Service (lrms_url)
    #     j = js.run_job ("/bin/sleep 1000")
    #     print "%4d: %s" % (i, j.state)
    for i in range(99999):
        print i
        print "Start Pilot Job/BigJob at: " + lrms_url
        bj = bigjob(COORDINATION_URL)
        bj.start_pilot_job(lrms_url, number_of_processes, queue, project,
                           workingdirectory, userproxy, walltime, processes_per_node)
        print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str( bj.get_state())

        ##########################################################################################
        # Submit SubJob through BigJob
        jd = description()
        jd.executable = "/bin/echo"
        #jd.executable = "$HOME/hello.sh"
        jd.number_of_processes = "1"
        jd.arguments = ["$HELLOWORLD"]
        jd.environment = ['HELLOWORLD=hello_world']
        #jd.spmd_variation = "mpi"
        # specify an optinal working directory if sub-job should be executed outside of bigjob sandbox
        #jd.working_directory = "/tmp"
        jd.output = "stdout.txt"
        jd.error = "stderr.txt"
        sj = subjob()
        sj.submit_job(bj.pilot_url, jd)

        #########################################
        # busy wait for completion
        while 1:
            state = str(sj.get_state())
            print "state: " + state
            if (state == "Failed" or state == "Done"):
                break
            time.sleep(2)

        ##########################################################################################
        # Cleanup - stop BigJob
        bj.cancel()
def main(): # Start BigJob ########################################################################################## # Edit parameters for BigJob queue=None # if None default queue is used project=None # if None default allocation is used walltime=600 processes_per_node=12 number_of_processes=24 workingdirectory="/lustre/scratch/aluckow/agent" # working directory for agent userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) """ URL of the SAGA Job Service that is used to dispatch the pilot job. The following URLs are accepted: lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI) lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines) lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system. lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine. lrms_url = "xt5torque://localhost" # torque resource url. 
Please ensure that the respective SAGA adaptor is installed and working """ lrms_url = "xt5torque://localhost" # resource url to run the jobs on localhost ########################################################################################## print "Start Pilot Job/BigJob at: " + lrms_url bj = bigjob(COORDINATION_URL) bj.start_pilot_job( lrms_url, number_of_processes, queue, project, workingdirectory, userproxy, walltime, processes_per_node) print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) ########################################################################################## # Submit SubJob through BigJob jd = description() jd.executable = "/bin/hostname" jd.number_of_processes = "1" jd.spmd_variation = "single" jd.arguments = [""] #jd.working_directory = "/tmp" jd.output = "stdout.txt" jd.error = "stderr.txt" sjs = [] for i in range(0,24): sj = subjob() sj.submit_job(bj.pilot_url, jd) sjs.append(sj) ######################################### # busy wait for completion while 1: for idx, sj in enumerate(sjs): state = str(sj.get_state()) print "sj: %d state: %s"%(idx,state) if(state=="Failed" or state=="Done"): break time.sleep(2) ########################################################################################## # Cleanup - stop BigJob bj.cancel()
def main():
    """Start a BigJob pilot on TACC Lonestar via sge-ssh with a Globus
    Online (go://) working directory, stage a test file into both the pilot
    and subjob work directories via Globus Online, and run one /bin/cat
    subjob, busy-waiting for completion before cancelling.

    Relies on module-level names ``bigjob``, ``subjob``, ``description``,
    ``COORDINATION_URL``, ``GLOBUS_ONLINE_USER`` and ``GLOBUS_ONLINE_PASSWORD``
    defined elsewhere in this file.
    """
    # Start BigJob
    ##########################################################################################
    # Edit parameters for BigJob
    queue=None # if None default queue is used
    project=None # if None default allocation is used
    walltime=10
    processes_per_node=4
    number_of_processes = 8
    userproxy = None # userproxy (not supported yet due to context issue w/ SAGA)

    """ URL of the SAGA Job Service that is used to dispatch the pilot job.
        The following URLs are accepted:

        lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI)
        lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines)
        lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system.
        lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine.
        lrms_url = "xt5torque://localhost" # torque resource url.

        Please ensure that the respective SAGA adaptor is installed and working
    """
    lrms_url = "sge-ssh://lonestar.tacc.teragrid.org"

    """ To use Globus Online the working directory must be specified using the following conventions """
    workingdirectory="go://"+GLOBUS_ONLINE_USER+":"+GLOBUS_ONLINE_PASSWORD+"@globusonline.org?ep=xsede#lonestar4&path=~/bigjob/"
    ##########################################################################################
    print "Start Pilot Job/BigJob at: " + lrms_url
    bj = bigjob(COORDINATION_URL)
    # stage test.txt from the local GO endpoint into the BigJob work dir
    bj_filetransfers = ["go://"+GLOBUS_ONLINE_USER+":"+GLOBUS_ONLINE_PASSWORD+"@globusonline.org?ep=drelu#MacBook&path=" + os.path.dirname(os.path.abspath(__file__)) + "/test.txt > BIGJOB_WORK_DIR"]
    bj.start_pilot_job( lrms_url,
                        None,
                        number_of_processes,
                        queue,
                        project,
                        workingdirectory,
                        userproxy,
                        walltime,
                        processes_per_node,
                        bj_filetransfers)
    print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state())

    ##########################################################################################
    # Submit SubJob through BigJob
    jd = description()
    jd.executable = "/bin/cat"
    jd.number_of_processes = "1"
    jd.spmd_variation = "single"
    jd.arguments = ["test.txt"]
    jd.output = "stdout.txt"
    jd.error = "stderr.txt"
    # also stage test.txt into the subjob's own work dir via Globus Online
    jd.file_transfer = ["go://"+GLOBUS_ONLINE_USER+":"+GLOBUS_ONLINE_PASSWORD+"@globusonline.org?ep=drelu#MacBook&path=" + os.path.dirname(os.path.abspath(__file__)) + "/test.txt > SUBJOB_WORK_DIR"]
    sj = subjob()
    sj.submit_job(bj.pilot_url, jd)

    #########################################
    # busy wait for completion
    while 1:
        state = str(sj.get_state())
        print "state: " + state
        if(state=="Failed" or state=="Done"):
            break
        time.sleep(2)

    ##########################################################################################
    # Cleanup - stop BigJob
    bj.cancel()
def main(): # Start BigJob ########################################################################################## # Edit parameters for BigJob queue=None # if None default queue is used project=None # if None default allocation is used walltime=10 processes_per_node=4 number_of_processes = 8 workingdirectory=os.path.join(os.getcwd(), "agent") # working directory for agent userproxy = None # userproxy (not supported yet due to context issue w/ SAGA) """ URL of the SAGA Job Service that is used to dispatch the pilot job. The following URLs are accepted: lrms_url = "gram://oliver1.loni.org/jobmanager-pbs" # globus resource url used when globus is used. (LONI) lrms_url = "pbspro://louie1.loni.org" # pbspro resource url used when pbspro scheduling system is used.(Futuregrid or LSU Machines) lrms_url = "ssh://louie1.loni.org" # ssh resource url which launches jobs on target machine. Jobs not submitted to scheduling system. lrms_url = "pbs-ssh://louie1.loni.org" # Submit jobs to scheduling system of remote machine. lrms_url = "xt5torque://localhost" # torque resource url. 
Please ensure that the respective SAGA adaptor is installed and working """ #lrms_url = "fork://localhost" # resource url to run the jobs on localhost lrms_url = "condorg://brgw1.renci.org:2119/jobmanager-pbs" #lrms_url = "ssh://[email protected]" ########################################################################################## print "Start Pilot Job/BigJob at: " + lrms_url bj = bigjob(COORDINATION_URL) bj.start_pilot_job( lrms_url, None, number_of_processes, queue, project, workingdirectory, userproxy, walltime, processes_per_node) print "Pilot Job/BigJob URL: " + bj.pilot_url + " State: " + str(bj.get_state()) ########################################################################################## # Submit SubJob through BigJob jd = description() jd.executable = "/bin/date" jd.number_of_processes = "1" jd.spmd_variation = "single" #jd.arguments = ["match -f bgr1.fa -A 0 -r reads_1.fastq -n 4 -T /tmp/ > bfast.matches.file.bgr.1.bmf"] jd.arguments = [""] #jd.working_directory = "" jd.output = "bfast-stdout.txt" jd.error = "bfast-stderr.txt" sj = subjob() sj.submit_job(bj.pilot_url, jd) ######################################### # busy wait for completion while 1: state = str(sj.get_state()) print "state: " + state if(state=="Failed" or state=="Done"): break time.sleep(2) ########################################################################################## # Cleanup - stop BigJob bj.cancel()
def main(): try: print "ManyJob load test with " + str(NUMBER_JOBS) + " jobs." starttime=time.time() """ submit via mj abstraction resource_list.append( {"resource_url" : "gram://eric1.loni.org/jobmanager-pbs", "processes_per_node":"4", "number_of_processes" : "4", "allocation" : None, "queue" : "workq", "working_directory": (os.getcwd() + "/agent"), "walltime":10 }) """ resource_list = [] resource_dictionary = {"resource_url" : "fork://localhost/", "number_of_processes" : "32", "processes_per_node":"1", "allocation" : None, "queue" : None, "working_directory": (os.getcwd() + "/agent"), "walltime":3600 } resource_list.append(resource_dictionary) #Flags for controlling dynamic BigJob add_additional_resources=True remove_additional_resources=False print "Create Dynamic BigJob Service " mjs = many_job_service(resource_list, COORDINATION_URL) jobs = [] job_start_times = {} job_states = {} cwd = os.getcwd() for i in range(0, NUMBER_JOBS): # create job description jd = description() jd.executable = "/bin/date" jd.number_of_processes = "1" jd.spmd_variation = "single" jd.arguments = [""] jd.working_directory = os.getcwd(); jd.output = "stdout-" + str(i) + ".txt" jd.error = "stderr-" + str(i) + ".txt" subjob = mjs.create_job(jd) subjob.run() print "Submited sub-job " + "%d"%i + "." 
jobs.append(subjob) job_start_times[subjob]=time.time() job_states[subjob] = subjob.get_state() print "************************ All Jobs submitted ************************" while 1: finish_counter=0 result_map = {} for i in range(0, NUMBER_JOBS): old_state = job_states[jobs[i]] state = jobs[i].get_state() if result_map.has_key(state) == False: result_map[state]=0 result_map[state] = result_map[state]+1 #print "counter: " + str(i) + " job: " + str(jobs[i]) + " state: " + state if old_state != state: print "Job " + str(jobs[i]) + " changed from: " + old_state + " to " + state if old_state != state and has_finished(state)==True: print "Job: " + str(jobs[i]) + " Runtime: " + str(time.time()-job_start_times[jobs[i]]) + " s." if has_finished(state)==True: finish_counter = finish_counter + 1 job_states[jobs[i]]=state # Dynamic BigJob add resources at runtime # if more than 30 s - add additional resource if time.time()-starttime > 10 and add_additional_resources==True: print "***add additional resources***" mjs.add_resource(resource_dictionary) add_additional_resources=False # remove resources from dynamic bigjob if (time.time()-starttime > 15 and remove_additional_resources==True): bj_list = mjs.get_resources() if len(bj_list)>0: print "***remove resources: " + str(bj_list[0]) mjs.remove_resource(bj_list[0]) remove_additional_resources=False print "Current states: " + str(result_map) time.sleep(5) if finish_counter == NUMBER_JOBS: break mjs.cancel() runtime = time.time()-starttime print "Runtime: " + str(runtime) + " s; Runtime per Job: " + str(runtime/NUMBER_JOBS) except: traceback.print_exc(file=sys.stdout) try: mjs.cancel() except: pass
def main(): try: print "ManyJob load test with " + str(NUMBER_JOBS) + " jobs." starttime = time.time() """ submit via mj abstraction resource_list.append( {"resource_url" : "gram://eric1.loni.org/jobmanager-pbs", "processes_per_node":"4", "number_of_processes" : "4", "allocation" : None, "queue" : "workq", "working_directory": (os.getcwd() + "/agent"), "walltime":10 }) """ resource_list = [] resource_dictionary = { "resource_url": "fork://localhost/", "number_of_processes": "32", "processes_per_node": "1", "allocation": None, "queue": None, "working_directory": (os.getcwd() + "/agent"), "walltime": 3600 } resource_list.append(resource_dictionary) #Flags for controlling dynamic BigJob add_additional_resources = True remove_additional_resources = False print "Create Dynamic BigJob Service " mjs = many_job_service(resource_list, COORDINATION_URL) jobs = [] job_start_times = {} job_states = {} cwd = os.getcwd() for i in range(0, NUMBER_JOBS): # create job description jd = description() jd.executable = "/bin/date" jd.number_of_processes = "1" jd.spmd_variation = "single" jd.arguments = [""] jd.working_directory = os.getcwd() jd.output = "stdout-" + str(i) + ".txt" jd.error = "stderr-" + str(i) + ".txt" subjob = mjs.create_job(jd) subjob.run() print "Submited sub-job " + "%d" % i + "." 
jobs.append(subjob) job_start_times[subjob] = time.time() job_states[subjob] = subjob.get_state() print "************************ All Jobs submitted ************************" while 1: finish_counter = 0 result_map = {} for i in range(0, NUMBER_JOBS): old_state = job_states[jobs[i]] state = jobs[i].get_state() if result_map.has_key(state) == False: result_map[state] = 0 result_map[state] = result_map[state] + 1 #print "counter: " + str(i) + " job: " + str(jobs[i]) + " state: " + state if old_state != state: print "Job " + str( jobs[i] ) + " changed from: " + old_state + " to " + state if old_state != state and has_finished(state) == True: print "Job: " + str(jobs[i]) + " Runtime: " + str( time.time() - job_start_times[jobs[i]]) + " s." if has_finished(state) == True: finish_counter = finish_counter + 1 job_states[jobs[i]] = state # Dynamic BigJob add resources at runtime # if more than 30 s - add additional resource if time.time( ) - starttime > 10 and add_additional_resources == True: print "***add additional resources***" mjs.add_resource(resource_dictionary) add_additional_resources = False # remove resources from dynamic bigjob if (time.time() - starttime > 15 and remove_additional_resources == True): bj_list = mjs.get_resources() if len(bj_list) > 0: print "***remove resources: " + str(bj_list[0]) mjs.remove_resource(bj_list[0]) remove_additional_resources = False print "Current states: " + str(result_map) time.sleep(5) if finish_counter == NUMBER_JOBS: break mjs.cancel() runtime = time.time() - starttime print "Runtime: " + str(runtime) + " s; Runtime per Job: " + str( runtime / NUMBER_JOBS) except: traceback.print_exc(file=sys.stdout) try: mjs.cancel() except: pass