def generate_processing_script_container(self, processing):
    """Write a bash wrapper script that runs the payload (optionally via docker).

    Substitutes MAX_POINTS/NUM_POINTS/IN/OUT (and proxy placeholders, when a
    grid proxy is present) into the configured executable/arguments, writes the
    script into the processing's working directory and marks it executable.

    :param processing: processing object; used only for get_long_id/get_working_dir.
    :returns: absolute path of the generated script.
    """
    param_values = {'MAX_POINTS': self.max_points,
                    'NUM_POINTS': self.points_to_generate,
                    'IN': self.input_json,
                    'OUT': self.output_json}
    # Default proxy basename used inside the container when no proxy is set.
    proxy_filename = 'x509up'
    if 'X509_USER_PROXY' in os.environ and os.environ['X509_USER_PROXY']:
        proxy_filename = os.path.basename(os.environ['X509_USER_PROXY'])
        param_values['X509_USER_PROXY_FULLNAME'] = os.environ['X509_USER_PROXY']
        param_values['X509_USER_PROXY_BASENAME'] = proxy_filename
    executable = replace_parameters_with_values(self.executable, param_values)
    arguments = replace_parameters_with_values(self.arguments, param_values)

    script = "#!/bin/bash\n\n"
    script += "executable=%s\n" % str(executable)
    script += "arguments=%s\n" % str(arguments)
    script += "input_json=%s\n" % str(self.input_json)
    script += "output_json=%s\n" % str(self.output_json)
    script += "\n"
    # Dump the environment and identity for debugging on the worker node.
    script += "env\n"
    script += "echo $X509_USER_PROXY\n"
    script += "\n"
    script += "echo 'user id:'\n"
    script += "id\n"
    script += "\n"
    if self.sandbox and 'docker' in executable:
        # Wrap the payload arguments with 'docker run': mount the job dir and
        # /cvmfs, and point X509_USER_PROXY at the proxy inside the workdir.
        arguments = 'run --rm -v $(pwd):%s -v /cvmfs:/cvmfs -e X509_USER_PROXY=%s/%s %s ' % (self.container_workdir,
                                                                                             self.container_workdir,
                                                                                             proxy_filename,
                                                                                             self.sandbox) + arguments
    script += "echo '%s' '%s'\n" % (str(executable), str(arguments))
    script += '%s %s\n' % (str(executable), str(arguments))
    if self.sandbox and 'docker' in executable:
        # Clean the pulled image up after the run.
        script += 'docker image rm -f %s\n' % self.sandbox
    script += '\n'

    long_id = self.get_long_id(processing)
    script_name = 'processing_%s.sh' % long_id
    script_name = os.path.join(self.get_working_dir(processing), script_name)
    with open(script_name, 'w') as f:
        f.write(script)
    run_command("chmod +x %s" % script_name)
    return script_name
def generate_processing_script_sandbox(self, processing):
    """Write a bash wrapper that downloads the sandbox tarball and runs the payload.

    The script fetches ``self.sandbox`` with wget, unpacks it, and executes the
    parameter-substituted executable/arguments. The script is written into the
    processing's working directory and made executable.

    :param processing: processing object; used for get_long_id/get_working_dir.
    :returns: absolute path of the generated script.
    """
    param_values = {'MAX_POINTS': self.max_points,
                    'NUM_POINTS': self.points_to_generate,
                    'IN': self.input_json,
                    'OUT': self.output_json}
    proxy_path = os.environ.get('X509_USER_PROXY')
    if proxy_path:
        # Expose the grid proxy location for parameter substitution.
        param_values['X509_USER_PROXY_FULLNAME'] = proxy_path
        param_values['X509_USER_PROXY_BASENAME'] = os.path.basename(proxy_path)
    executable = replace_parameters_with_values(self.executable, param_values)
    arguments = replace_parameters_with_values(self.arguments, param_values)

    script_lines = [
        "#!/bin/bash",
        "",
        "sandbox=%s" % str(self.sandbox),
        "executable=%s" % str(executable),
        "arguments=%s" % str(arguments),
        "input_json=%s" % str(self.input_json),
        "output_json=%s" % str(self.output_json),
        "",
        # Debug aids: environment, proxy location and worker identity.
        "env",
        "echo $X509_USER_PROXY",
        "",
        "echo 'user id:'",
        "id",
        "",
        # Fetch and unpack the sandbox, then run the payload.
        "wget $sandbox",
        'base_sandbox="$(basename -- $sandbox)"',
        'tar xzf $base_sandbox',
        'chmod +x %s' % str(executable),
        "echo '%s' '%s'" % (str(executable), str(arguments)),
        '%s %s' % (str(executable), str(arguments)),
        "",
    ]
    script = "\n".join(script_lines) + "\n"

    long_id = self.get_long_id(processing)
    script_name = os.path.join(self.get_working_dir(processing),
                               'processing_%s.sh' % long_id)
    with open(script_name, 'w') as f:
        f.write(script)
    run_command("chmod +x %s" % script_name)
    return script_name
def poll_job_status(self, processing_id, job_id):
    """Poll HTCondor for the status of ``job_id``.

    Queries ``condor_q`` first; if the job has already left the queue (empty
    output), falls back to ``condor_history``. Verifies that the job belongs to
    ``processing_id`` before mapping its status.

    HTCondor JobStatus codes:
    0 Unexpanded(U), 1 Idle(I), 2 Running(R), 3 Removed(X),
    4 Completed(C), 5 Held(H), 6 Submission_err(E)

    :param processing_id: processing id the job is expected to carry.
    :param job_id: condor ClusterId to poll.
    :returns: tuple (ProcessingStatus, error string or None).
    """
    cmd = "condor_q -format '%s' ClusterId -format ' %s' Processing_id -format ' %s' JobStatus " + str(job_id)
    status, output, error = run_command(cmd)
    self.logger.debug("poll job status: %s" % cmd)
    self.logger.debug("status: %s, output: %s, error: %s" % (status, output, error))
    if status == 0 and len(output) == 0:
        # Job no longer in the queue; check the history instead.
        cmd = "condor_history -format '%s' ClusterId -format ' %s' Processing_id -format ' %s' JobStatus " + str(job_id)
        status, output, error = run_command(cmd)
        self.logger.debug("poll job status: %s" % cmd)
        self.logger.debug("status: %s, output: %s, error: %s" % (status, output, error))

    ret_err = None
    # Default when the query fails or the job cannot be found in the output;
    # the original code left final_job_status unbound in the "not found" case
    # (NameError), so initialize it to the query-failure default.
    final_job_status = ProcessingStatus.Submitted
    if status == 0:
        for line in output.split('\n'):
            fields = line.split(' ')
            if len(fields) != 3:
                # Skip blank/trailing or otherwise malformed lines, which would
                # previously raise ValueError on unpacking.
                continue
            c_job_id, c_processing_id, c_job_status = fields
            if str(c_job_id) != str(job_id):
                continue
            c_processing_id = int(c_processing_id)
            c_job_status = int(c_job_status)
            if c_processing_id != processing_id:
                final_job_status = ProcessingStatus.Failed
                ret_err = 'jobid and the processing_id mismatched'
            else:
                job_status = c_job_status
                if job_status < 2:
                    final_job_status = ProcessingStatus.Submitted
                elif job_status == 2:
                    final_job_status = ProcessingStatus.Running
                elif job_status == 3:
                    final_job_status = ProcessingStatus.Cancel
                elif job_status == 4:
                    final_job_status = ProcessingStatus.Finished
                else:
                    # Held / Submission_err / unknown codes count as failure.
                    final_job_status = ProcessingStatus.Failed
    return final_job_status, ret_err
def submit_processing_task():
    """Submit a PanDA prun task running the active-learning test script.

    Builds a unique output dataset name, runs prun from the test-code
    directory, and extracts the JEDI task id from the command output.

    :returns: the jediTaskID of the submitted task.
    :raises Exception: with the combined stdout/stderr if submission fails.
    """
    outDS = "user.wguan.altest%s" % str(int(time.time()))
    cmd = ("cd /afs/cern.ch/user/w/wguan/workdisk/iDDS/main/lib/idds/tests/activelearning_test_codes; "
           "prun --exec 'python simplescript.py 0.5 0.5 200 output.json' "
           "--outDS %s --outputs output.json --nJobs=10" % outDS)
    status, output, error = run_command(cmd)
    # Example of a successful submission (prun reports on stderr):
    #   status: 0
    #   output: (empty)
    #   error:
    #     INFO : gathering files under .../activelearning_test_codes
    #     INFO : upload source files
    #     INFO : submit user.wguan.altest1234/
    #     INFO : succeeded. new jediTaskID=23752996
    if status != 0:
        raise Exception(output + error)
    return get_task_id(output, error)
def generate_processing_script_nevergrad(self, processing):
    """Write a bash wrapper that runs the nevergrad optimizer payload.

    Takes executable/arguments from the agent attributes, substitutes the
    MAX_POINTS/NUM_POINTS/IN/OUT (and proxy) placeholders into the arguments,
    and writes the script into the processing's working directory.

    :param processing: processing object; used for get_long_id/get_working_dir.
    :returns: absolute path of the generated script.
    """
    executable = self.agent_attributes['nevergrad']['executable']
    arguments = self.agent_attributes['nevergrad']['arguments']

    param_values = {'MAX_POINTS': self.max_points,
                    'NUM_POINTS': self.points_to_generate,
                    'IN': self.input_json,
                    'OUT': self.output_json}
    proxy_path = os.environ.get('X509_USER_PROXY')
    if proxy_path:
        # Expose the grid proxy location for parameter substitution.
        param_values['X509_USER_PROXY_FULLNAME'] = proxy_path
        param_values['X509_USER_PROXY_BASENAME'] = os.path.basename(proxy_path)
    arguments = replace_parameters_with_values(arguments, param_values)

    # The script only invokes the executable's basename (it is expected to be
    # available in the job directory / PATH on the worker node).
    exe_name = os.path.basename(executable)
    script_lines = [
        "#!/bin/bash",
        "",
        "executable=%s" % exe_name,
        "arguments='%s'" % str(arguments),
        "input_json=%s" % str(self.input_json),
        "output_json=%s" % str(self.output_json),
        "",
        # Debug aids: environment, proxy location and worker identity.
        "env",
        "echo $X509_USER_PROXY",
        "",
        "echo 'user id:'",
        "id",
        "",
        "echo '%s' '%s'" % (exe_name, str(arguments)),
        '%s %s' % (exe_name, str(arguments)),
        "",
    ]
    script = "\n".join(script_lines) + "\n"

    long_id = self.get_long_id(processing)
    script_name = os.path.join(self.get_working_dir(processing),
                               'processing_%s.sh' % long_id)
    with open(script_name, 'w') as f:
        f.write(script)
    run_command("chmod +x %s" % script_name)
    return script_name
def generate_processing_script_sandbox(self, processing):
    """Write a bash wrapper that fetches the sandbox, downloads the primary input
    dataset with rucio, and runs the payload.

    The generated script sets up the rucio environment, wget+untars the
    sandbox, downloads the primary input collection, then executes the
    configured executable with the parsed arguments.

    :param processing: processing object; used for get_long_id/get_working_dir.
    :returns: absolute path of the generated script.
    """
    arguments = self.parse_arguments()

    script = "#!/bin/bash\n\n"
    # Environment setup lines for rucio (provided by the class).
    script += self.get_rucio_setup_env()
    script += "\n"
    script += "sandbox=%s\n" % str(self.sandbox)
    script += "executable=%s\n" % str(self.executable)
    script += "arguments=%s\n" % str(arguments)
    script += "output_json=%s\n" % str(self.output_json)
    script += "\n"
    # Debug aids: environment, proxy location and worker identity.
    script += "env\n"
    script += "echo $X509_USER_PROXY\n"
    script += "\n"
    script += "echo 'user id:'\n"
    script += "id\n"
    script += "\n"
    # Fetch and unpack the sandbox tarball.
    script += "wget $sandbox\n"
    script += 'base_sandbox="$(basename -- $sandbox)"\n'
    script += 'tar xzf $base_sandbox\n'

    # Download the primary input collection into the job directory.
    dataset = self.collections[self._primary_input_collection]
    script += 'rucio download %s:%s\n' % (dataset['scope'], dataset['name'])

    script += 'chmod +x %s\n' % str(self.executable)
    script += "echo '%s' '%s'\n" % (str(self.executable), str(arguments))
    script += '%s %s\n' % (str(self.executable), str(arguments))
    script += 'ls\n\n'

    long_id = self.get_long_id(processing)
    script_name = 'processing_%s.sh' % long_id
    script_name = os.path.join(self.get_working_dir(processing), script_name)
    with open(script_name, 'w') as f:
        f.write(script)
    run_command("chmod +x %s" % script_name)
    return script_name
def submit_job(self, processing_id, sandbox, executable, arguments,
               input_list, input_json, output_json,
               should_transfer_executable=False):
    """Generate a condor submit file and submit it with condor_submit.

    :param processing_id: id recorded in the submit file.
    :param sandbox: sandbox url/path passed to the submit-file generator.
    :param executable: payload executable.
    :param arguments: payload arguments.
    :param input_list: input files for the job.
    :param input_json: name of the input json file.
    :param output_json: name of the output json file.
    :param should_transfer_executable: whether condor should transfer the executable.
    :returns: tuple (jobid, None) on success, (None, combined output+error) on failure.
    """
    jdl_file = self.generate_submit_file(processing_id, sandbox, executable,
                                         arguments, input_list, input_json,
                                         output_json,
                                         should_transfer_executable=should_transfer_executable)
    cmd = "condor_submit " + jdl_file
    status, output, error = run_command(cmd)
    jobid = None
    self.logger.info("submiting the job to cluster: %s" % cmd)
    self.logger.info("status: %s, output: %s, error: %s " % (status, output, error))
    if status == 0 or str(status) == '0':
        if output and 'submitted to cluster' in output:
            # Parse the ClusterId from a line like:
            #   "1 job(s) submitted to cluster 123456."
            for line in output.split('\n'):
                if 'submitted to cluster' in line:
                    jobid = line.split(' ')[-1].replace('.', '')
                    return jobid, None
    return None, output + error
def submit_condor_processing(self, processing):
    """Generate a submit file for *processing* and submit it with condor_submit.

    :param processing: processing object handed to generate_processing_submit_file.
    :returns: tuple (jobid, None) on success, (None, error text) on failure.
    """
    jdl_file, err_msg = self.generate_processing_submit_file(processing)
    if not jdl_file:
        return None, err_msg

    cmd = "condor_submit " + jdl_file
    status, output, error = run_command(cmd)
    self.logger.info("submiting the job to cluster: %s" % cmd)
    self.logger.info("status: %s, output: %s, error: %s " % (status, output, error))

    if status == 0 or str(status) == '0':
        if output and 'submitted to cluster' in output:
            # Take the ClusterId from the first line that looks like:
            #   "1 job(s) submitted to cluster 123456."
            matched = next((ln for ln in output.split('\n')
                            if 'submitted to cluster' in ln), None)
            if matched is not None:
                return matched.split(' ')[-1].replace('.', ''), None
    return None, output + error
def poll_condor_job_status(self, processing, job_id):
    """Poll HTCondor for the status of ``job_id`` belonging to *processing*.

    Queries ``condor_q`` first; if the job has already left the queue (empty
    output), falls back to ``condor_history``. Also pulls the job's workdir,
    command and stderr file names so failure messages can be collected.

    HTCondor JobStatus codes:
    0 Unexpanded(U), 1 Idle(I), 2 Running(R), 3 Removed(X),
    4 Completed(C), 5 Held(H), 6 Submission_err(E)

    :param processing: processing object; its long id must match the job's
                       Processing_id attribute.
    :param job_id: condor ClusterId to poll.
    :returns: tuple (ProcessingStatus, error string).
    """
    cmd = "condor_q -format '%s' ClusterId -format ' %s' Processing_id -format ' %s' JobStatus -format ' %s' Iwd -format ' %s' Cmd -format ' %s' Err " + str(job_id)
    status, output, error = run_command(cmd)
    self.logger.info("poll job status: %s" % cmd)
    self.logger.info("status: %s, output: %s, error: %s" % (status, output, error))
    if status == 0 and len(output) == 0:
        # Job no longer in the queue; check the history instead.
        cmd = "condor_history -format '%s' ClusterId -format ' %s' Processing_id -format ' %s' JobStatus -format ' %s' Iwd -format ' %s' Cmd -format ' %s' Err " + str(job_id)
        status, output, error = run_command(cmd)
        self.logger.info("poll job status: %s" % cmd)
        self.logger.info("status: %s, output: %s, error: %s" % (status, output, error))

    ret_err = ''
    job_cmd_msg, job_err_msg = '', ''
    # Default when the query fails or the job cannot be found in the output;
    # previously final_job_status could be left unbound (NameError) when the
    # job id never matched a parsed line.
    final_job_status = ProcessingStatus.Submitted
    if status == 0:
        for line in output.split('\n'):
            fields = line.split(' ')
            if len(fields) != 6:
                # Skip blank/trailing or otherwise malformed lines, which
                # would previously raise ValueError on unpacking.
                continue
            c_job_id, c_processing_id, c_job_status, job_workdir, job_cmd, job_err = fields
            if str(c_job_id) != str(job_id):
                continue
            processing_id = self.get_long_id(processing)
            c_job_status = int(c_job_status)
            if c_processing_id != processing_id:
                final_job_status = ProcessingStatus.Failed
                ret_err = 'jobid and the processing_id mismatched'
            else:
                job_status = c_job_status
                if job_status < 2:
                    final_job_status = ProcessingStatus.Submitted
                elif job_status == 2:
                    # Bug fix: a duplicated "elif job_status == 2" branch made
                    # the Running mapping unreachable, so running jobs were
                    # reported as Submitted forever.
                    final_job_status = ProcessingStatus.Running
                elif job_status == 3:
                    final_job_status = ProcessingStatus.Cancelled
                elif job_status == 4:
                    final_job_status = ProcessingStatus.Finished
                else:
                    # Held / Submission_err / unknown codes count as failure.
                    final_job_status = ProcessingStatus.Failed
            if final_job_status in [ProcessingStatus.Failed]:
                # Collect the tail of the job's command output and its stderr
                # to report a meaningful error message.
                job_cmd_msg = self.get_job_err_message(job_workdir, job_cmd)
                job_cmd_msg = job_cmd_msg[-500:]
                job_err_msg = self.get_job_err_message(job_workdir, job_err)

    if error:
        ret_err += error
    if job_cmd_msg:
        ret_err += "Command output: " + job_cmd_msg
    if job_err_msg:
        ret_err += "Stderr: " + job_err_msg
    return final_job_status, ret_err
def poll_job_status(self, processing_id, job_id):
    """Poll HTCondor for the status of ``job_id`` and collect its out/err logs.

    Queries ``condor_q`` first; if the job has already left the queue (empty
    output), falls back to ``condor_history``. When the job has reached a
    terminal state, the contents of its Out and Err files are returned too.

    HTCondor JobStatus codes:
    0 Unexpanded(U), 1 Idle(I), 2 Running(R), 3 Removed(X),
    4 Completed(C), 5 Held(H), 6 Submission_err(E)

    :param processing_id: processing id the job is expected to carry.
    :param job_id: condor ClusterId to poll.
    :returns: tuple (ProcessingStatus, error string or None,
              stdout contents or None, stderr contents or None).
    """
    cmd = "condor_q -format '%s' ClusterId -format ' %s' Processing_id -format ' %s' JobStatus -format ' %s' Out -format ' %s' Err " + str(job_id)
    status, output, error = run_command(cmd)
    self.logger.info("poll job status: %s" % cmd)
    self.logger.info("status: %s, output: %s, error: %s" % (status, output, error))
    if status == 0 and len(output) == 0:
        # Job no longer in the queue; check the history instead.
        cmd = "condor_history -format '%s' ClusterId -format ' %s' Processing_id -format ' %s' JobStatus -format ' %s' Out -format ' %s' Err " + str(job_id)
        status, output, error = run_command(cmd)
        self.logger.info("poll job status: %s" % cmd)
        self.logger.info("status: %s, output: %s, error: %s" % (status, output, error))

    ret_err = None
    # Defaults guard against NameError: previously final_job_status and the
    # out/err file names were unbound when the job id never matched a line.
    final_job_status = ProcessingStatus.Submitted
    c_job_out_file, c_job_err_file = None, None
    if status == 0:
        for line in output.split('\n'):
            fields = line.split(' ')
            if len(fields) != 5:
                # Skip blank/trailing or otherwise malformed lines, which
                # would previously raise ValueError on unpacking.
                continue
            c_job_id, c_processing_id, c_job_status, c_job_out_file, c_job_err_file = fields
            if str(c_job_id) != str(job_id):
                continue
            c_processing_id = int(c_processing_id)
            c_job_status = int(c_job_status)
            if c_processing_id != processing_id:
                final_job_status = ProcessingStatus.Failed
                ret_err = 'jobid and the processing_id mismatched'
            else:
                job_status = c_job_status
                if job_status < 2:
                    final_job_status = ProcessingStatus.Submitted
                elif job_status == 2:
                    final_job_status = ProcessingStatus.Running
                elif job_status == 3:
                    final_job_status = ProcessingStatus.Cancel
                elif job_status == 4:
                    final_job_status = ProcessingStatus.Finished
                else:
                    # Held / Submission_err / unknown codes count as failure.
                    final_job_status = ProcessingStatus.Failed

    out_msg, err_msg = None, None
    if final_job_status in [ProcessingStatus.Cancel, ProcessingStatus.Finished, ProcessingStatus.Failed]:
        # Return the job's stdout/stderr for terminal states, when available.
        if c_job_out_file and os.path.exists(c_job_out_file):
            with open(c_job_out_file) as f:
                out_msg = f.read()
        if c_job_err_file and os.path.exists(c_job_err_file):
            with open(c_job_err_file) as f:
                err_msg = f.read()
    return final_job_status, ret_err, out_msg, err_msg