def kill_processing_force(self, processing):
    try:
        if processing:
            from pandaclient import Client

            proc = processing['processing_metadata']['processing']
            task_id = proc.workload_id
            # task_id = processing['processing_metadata']['task_id']
            Client.killTask(task_id)
            # Client.finishTask(task_id, soft=True)
    except Exception as ex:
        msg = "Failed to kill the processing (%s): %s" % (str(processing['processing_id']), str(ex))
        raise exceptions.IDDSException(msg)
def get_panda_task_id(self, processing):
    from pandaclient import Client

    start_time = datetime.datetime.utcnow() - datetime.timedelta(hours=10)
    start_time = start_time.strftime('%Y-%m-%d %H:%M:%S')
    status, results = Client.getJobIDsJediTasksInTimeRange(start_time,
                                                           task_type=self.task_type,
                                                           verbose=False)
    if status != 0:
        self.logger.warn("Failed to poll the latest tasks in the last ten hours: %s, %s" % (status, results))
        return None

    proc = processing['processing_metadata']['processing']
    task_id = None
    for req_id in results:
        task_name = results[req_id]['taskName']
        if proc.workload_id is None and task_name == self.task_name:
            task_id = results[req_id]['jediTaskID']
            # processing['processing_metadata']['task_id'] = task_id
            # processing['processing_metadata']['workload_id'] = task_id
            proc.workload_id = task_id
            if task_id:
                proc.submitted_at = datetime.datetime.utcnow()

    return task_id
def submit_panda_task(self, processing):
    try:
        from pandaclient import Client

        proc = processing['processing_metadata']['processing']
        task_param = proc.processing_metadata['task_param']
        return_code = Client.insertTaskParams(task_param, verbose=True)
        if return_code[0] == 0:
            try:
                task_id = int(return_code[1][1])
                return task_id
            except Exception as ex:
                self.logger.warn("task id is not returned: (%s) is not a task id: %s" % (return_code[1][1], str(ex)))
                # example output: jediTaskID=26468582
                if return_code[1][1] and 'jediTaskID=' in return_code[1][1]:
                    parts = return_code[1][1].split(" ")
                    for part in parts:
                        if 'jediTaskID=' in part:
                            task_id = int(part.split("=")[1])
                            return task_id
        else:
            self.logger.warn("submit_panda_task, return_code: %s" % str(return_code))
    except Exception as ex:
        self.logger.error(ex)
        self.logger.error(traceback.format_exc())
        # raise exceptions.AgentPluginError('%s: %s' % (str(ex), traceback.format_exc()))
    return None
def get_refresh_token_string(verbose=False):
    try:
        curl = Client._Curl()
        curl.verbose = verbose
        tmp_log = PLogger.getPandaLogger()
        oidc = curl.get_oidc(tmp_log)
        token_file = oidc.get_token_path()
        if os.path.exists(token_file):
            with open(token_file) as f:
                data = json.load(f)
                enc = data['id_token'].split('.')[1]
                enc += '=' * (-len(enc) % 4)
                dec = json.loads(base64.urlsafe_b64decode(enc.encode()))
                exp_time = datetime.datetime.utcfromtimestamp(dec['exp'])
                delta = exp_time - datetime.datetime.utcnow()
                minutes = delta.total_seconds() / 60
                print('Token will expire in %s minutes.' % minutes)
                print('Token expiration time : {0} UTC'.format(exp_time.strftime("%Y-%m-%d %H:%M:%S")))
                if delta < datetime.timedelta(minutes=0):
                    print("Token already expired. Cannot refresh.")
                    return False, None, None
                return True, data['refresh_token'], delta
        else:
            print("Cannot find token file.")
    except Exception as e:
        print('failed to decode cached token with {0}'.format(e))
    return False, None, None
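# Illustrative sketch (not part of the client code above): the expiry check in
# get_refresh_token_string() relies on the JWT payload being base64url-encoded, so it is
# re-padded to a multiple of four characters before decoding. The helper name below is
# hypothetical and only shows that decoding step in isolation.
def decode_jwt_expiry(id_token):
    """Return the 'exp' claim of a JWT as a UTC datetime (no signature verification)."""
    import base64
    import datetime
    import json
    payload = id_token.split('.')[1]
    payload += '=' * (-len(payload) % 4)  # restore the stripped base64url padding
    claims = json.loads(base64.urlsafe_b64decode(payload.encode()))
    return datetime.datetime.utcfromtimestamp(claims['exp'])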
def poll_panda_task_output(self, processing=None, input_output_maps=None):
    task_id = None
    try:
        from pandaclient import Client

        if processing:
            output_metadata = {}
            proc = processing['processing_metadata']['processing']
            task_id = proc.workload_id
            if task_id is None:
                task_id = self.get_panda_task_id(processing)

            if task_id:
                # ret_ids = Client.getPandaIDsWithTaskID(task_id, verbose=False)
                task_info = Client.getJediTaskDetails({'jediTaskID': task_id}, True, True, verbose=False)
                self.logger.info("poll_panda_task, task_info: %s" % str(task_info))
                if task_info[0] != 0:
                    self.logger.warn("poll_panda_task %s, error getting task status, task_info: %s" % (task_id, str(task_info)))
                    return ProcessingStatus.Submitting, [], {}, {}

                task_info = task_info[1]
                processing_status = self.get_processing_status_from_panda_status(task_info["status"])

                if processing_status in [ProcessingStatus.SubFinished]:
                    if self.retry_number < self.num_retries:
                        self.reactivate_processing(processing)
                        processing_status = ProcessingStatus.Submitted
                        self.retry_number += 1

                if processing_status in [ProcessingStatus.SubFinished, ProcessingStatus.Finished]:
                    output_status, output_metadata = self.process_outputs(processing)
                    if not output_status:
                        err = "Failed to process processing(processing_id: %s, task_id: %s) outputs" % (processing['processing_id'], task_id)
                        self.logger.error(err)
                        self.add_errors(err)
                        processing_status = ProcessingStatus.Failed

                return processing_status, [], {}, output_metadata
            else:
                return ProcessingStatus.Failed, [], {}, output_metadata
    except Exception as ex:
        msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex))
        self.logger.error(msg)
        self.logger.error(ex)
        self.logger.error(traceback.format_exc())
        # raise exceptions.IDDSException(msg)
    return ProcessingStatus.Submitting, [], {}, {}
def poll_panda_task_status(self, processing):
    if 'processing' in processing['processing_metadata']:
        from pandaclient import Client

        proc = processing['processing_metadata']['processing']
        status, task_status = Client.getTaskStatus(proc.workload_id)
        if status == 0:
            return task_status
        else:
            return 'failed'
    return None
def get_token_info(verbose=False):
    # c = panda_api.get_api()
    curl = Client._Curl()
    curl.verbose = verbose
    token_info = curl.get_token_info()
    # print(token_info)
    if token_info and type(token_info) in [dict]:
        for key in token_info:
            print("%s: %s" % (key, token_info[key]))
        get_expire_time()
    else:
        print(token_info)
def submit_panda_task(self, processing):
    try:
        from pandaclient import Client

        task_param = processing['processing_metadata']['task_param']
        return_code = Client.insertTaskParams(task_param, verbose=True)
        if return_code[0] == 0:
            return return_code[1][1]
    except Exception as ex:
        self.logger.error(ex)
        self.logger.error(traceback.format_exc())
        # raise exceptions.AgentPluginError('%s: %s' % (str(ex), traceback.format_exc()))
    return None
def refresh_token(minutes=30, verbose=False):
    curl = Client._Curl()
    curl.verbose = verbose
    tmp_log = PLogger.getPandaLogger()
    oidc = curl.get_oidc(tmp_log)

    status, refresh_token, delta = get_refresh_token_string()
    if not status:
        print("Cannot refresh token.")
        return False

    print("Fetching auth configuration from: %s" % str(oidc.auth_config_url))
    s, o = oidc.fetch_page(oidc.auth_config_url)
    if not s:
        print("Failed to get Auth configuration: " + o)
        return False
    auth_config = o

    print("Fetching endpoint configuration from: %s" % str(auth_config['oidc_config_url']))
    s, o = oidc.fetch_page(auth_config['oidc_config_url'])
    if not s:
        print("Failed to get endpoint configuration: " + o)
        return False
    endpoint_config = o

    # s, o = oidc.refresh_token(endpoint_config['token_endpoint'], auth_config['client_id'],
    #                           auth_config['client_secret'], refresh_token)
    s, o = oidc_refresh_token(oidc, endpoint_config['token_endpoint'], auth_config['client_id'],
                              auth_config['client_secret'], refresh_token)
    if not s:
        print("Failed to refresh token: " + o)
    else:
        print("Successfully refreshed token: " + o)

    if delta < datetime.timedelta(minutes=minutes):
        print("The remaining lifetime of the token is less than the required %s minutes" % minutes)
        return False
    return True
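# Illustrative sketch (hypothetical helper, not part of pandaclient): drive the token
# helpers defined above from a single call, e.g. from a periodic maintenance job.
# The 30-minute threshold is an arbitrary example value.
def example_refresh_flow(verbose=False):
    """Print the cached token info, then refresh it and warn if little lifetime is left."""
    get_token_info(verbose=verbose)
    if not refresh_token(minutes=30, verbose=verbose):
        print("token refresh failed or the remaining lifetime is below 30 minutes")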
def reactivate_processing(self, processing):
    try:
        if processing:
            from pandaclient import Client

            # task_id = processing['processing_metadata']['task_id']
            proc = processing['processing_metadata']['processing']
            task_id = proc.workload_id

            # Client.retryTask(task_id)
            status, out = Client.retryTask(task_id, newParams={})
            self.logger.warn("Retry processing(%s) with task id(%s): %s, %s" % (processing['processing_id'], task_id, status, out))
            # Client.reactivateTask(task_id)
            # Client.resumeTask(task_id)
    except Exception as ex:
        msg = "Failed to reactivate the processing (%s): %s" % (str(processing['processing_id']), str(ex))
        raise exceptions.IDDSException(msg)
def get_expire_time(verbose=False):
    try:
        # token_file = openidc_utils.OpenIdConnect_Utils().get_token_path()
        curl = Client._Curl()
        curl.verbose = verbose
        tmp_log = PLogger.getPandaLogger()
        oidc = curl.get_oidc(tmp_log)
        token_file = oidc.get_token_path()
        if os.path.exists(token_file):
            with open(token_file) as f:
                data = json.load(f)
                enc = data['id_token'].split('.')[1]
                enc += '=' * (-len(enc) % 4)
                dec = json.loads(base64.urlsafe_b64decode(enc.encode()))
                exp_time = datetime.datetime.utcfromtimestamp(dec['exp'])
                delta = exp_time - datetime.datetime.utcnow()
                minutes = delta.total_seconds() / 60
                print('Token will expire in %s minutes.' % minutes)
                print('Token expiration time : {0} UTC'.format(exp_time.strftime("%Y-%m-%d %H:%M:%S")))
        else:
            print("Cannot find token file.")
    except Exception as e:
        print('failed to decode cached token with {0}'.format(e))
def main(get_taskparams=False, ext_args=None, dry_mode=False):
    # tweak sys.argv
    sys.argv.pop(0)
    sys.argv.insert(0, 'phpo')

    usage = """phpo [options]
"""

    optP = GroupArgParser(usage=usage, conflict_handler="resolve")

    group_input = optP.add_group('input', 'input dataset(s)/files/format')
    group_output = optP.add_group('output', 'output dataset/files')
    group_config = optP.add_group('config', 'single configuration file to set multiple options')
    group_submit = optP.add_group('submit', 'job submission/site/retry')
    group_expert = optP.add_group('expert', 'for experts/developers only')

    optP.add_helpGroup()

    group_config.add_argument('--version', action='store_const', const=True, dest='version', default=False,
                              help='Displays version')
    group_config.add_argument('--loadJson', action='store', dest='loadJson', default=None,
                              help='Read task parameters from a json file. Some parameters can be overridden '
                                   'by using command-line arguments')
    group_config.add_argument('--dumpJson', action='store', dest='dumpJson', default=None,
                              help='Dump all command-line parameters and submission result '
                                   'such as returnCode, returnOut, and jediTaskID to a json file')
    group_config.add_argument('--nParallelEvaluation', action='store', dest='nParallelEvaluation', default=1, type=int,
                              help='The number of hyperparameter points being evaluated concurrently. 1 by default')
    group_config.add_argument('--maxPoints', action='store', dest='maxPoints', default=10, type=int,
                              help='The max number of hyperparameter points to be evaluated in the entire search '
                                   '(for each segment in segmented HPO). 10 by default')
    group_config.add_argument('--maxEvaluationJobs', action='store', dest='maxEvaluationJobs', default=None, type=int,
                              help='The max number of evaluation jobs in the entire search '
                                   '(for each segment in segmented HPO). 2*maxPoints by default. '
                                   'The task is terminated when all hyperparameter points are evaluated or '
                                   'the number of evaluation jobs reaches maxEvaluationJobs')
    group_config.add_argument('--maxPointsPerEvaluationJob', action='store', dest='maxPointsPerEvaluationJob',
                              default=None, type=int,
                              help='The max number of hyperparameter points taken in each evaluation job')
    group_config.add_argument('--nPointsPerIteration', action='store', dest='nPointsPerIteration', default=2, type=int,
                              help='The max number of hyperparameter points generated in each iteration. 2 by default. '
                                   'Simply speaking, the steering container is executed maxPoints/nPointsPerIteration '
                                   'times when minUnevaluatedPoints is 0. The number of new points is '
                                   'nPointsPerIteration-minUnevaluatedPoints')
    group_config.add_argument('--minUnevaluatedPoints', action='store', dest='minUnevaluatedPoints', default=None, type=int,
                              help='The next iteration is triggered to generate new hyperparameter points when the number '
                                   'of unevaluated hyperparameter points goes below minUnevaluatedPoints. 0 by default')
    group_config.add_argument('--steeringContainer', action='store', dest='steeringContainer', default=None,
                              help='The container image for steering run by docker')
    group_config.add_argument('--steeringExec', action='store', dest='steeringExec', default=None,
                              help='Execution string for steering. If --steeringContainer is specified, the string '
                                   'is executed inside of the container. Otherwise, the string is used as command-line '
                                   'arguments for the docker command')
    group_config.add_argument('--searchSpaceFile', action='store', dest='searchSpaceFile', default=None,
                              help='External json filename to define the search space which is described as a dictionary. '
                                   'None by default. '
                                   'If this option is used together with --segmentSpecFile the json file contains a list '
                                   'of search space dictionaries. It is possible to contain only one search space '
                                   'dictionary if all segments use the same search space. In this case the search space '
                                   'dictionary is cloned for every segment')
    group_config.add_argument('--evaluationContainer', action='store', dest='evaluationContainer', default=None,
                              help='The container image for evaluation')
    group_config.add_argument('--evaluationExec', action='store', dest='evaluationExec', default=None,
                              help='Execution string to run evaluation in singularity')
    group_config.add_argument('--evaluationInput', action='store', dest='evaluationInput', default='input.json',
                              help='Input filename for evaluation where a json-formatted hyperparameter point is placed. '
                                   'input.json by default')
    group_config.add_argument('--evaluationTrainingData', action='store', dest='evaluationTrainingData',
                              default='input_ds.json',
                              help='Input filename for evaluation where a json-formatted list of training data filenames '
                                   'is placed. input_ds.json by default. Can be omitted if the payload directly fetches '
                                   'the training data using wget or something')
    group_config.add_argument('--evaluationOutput', action='store', dest='evaluationOutput', default='output.json',
                              help='Output filename of evaluation. output.json by default')
    group_config.add_argument('--evaluationMeta', action='store', dest='evaluationMeta', default=None,
                              help='The name of metadata file produced by evaluation')
    group_config.add_argument('--evaluationMetrics', action='store', dest='evaluationMetrics', default=None,
                              help='The name of metrics file produced by evaluation')
    group_config.add_argument('--checkPointToSave', action='store', dest='checkPointToSave', default=None,
                              help='A comma-separated list of files and/or directories to be periodically saved '
                                   'to a tarball for checkpointing. Note that those files and directories must be placed '
                                   'in the working directory. None by default')
    group_config.add_argument('--checkPointToLoad', action='store', dest='checkPointToLoad', default=None,
                              help='The name of the saved tarball for checkpointing. The tarball is given to '
                                   'the evaluation container when the training is resumed, if this option is specified. '
                                   'Otherwise, the tarball is automatically extracted in the working directories')
    group_config.add_argument('--checkPointInterval', action='store', dest='checkPointInterval', default=None, type=int,
                              help='Frequency to check files for checkpointing in minutes. 5 by default')
    group_config.add_argument('--alrbArgs', action='store', dest='alrbArgs', default=None,
                              help='Additional arguments for ALRB to run the evaluation container. '
                                   '"setupATLAS -c --help" shows available ALRB arguments. For example, '
                                   '--alrbArgs "--nocvmfs --nohome" to skip mounting /cvmfs and $HOME. '
                                   'This option is mainly for experts who know how the system and the container '
                                   'communicate with each other and how additional ALRB arguments affect '
                                   'the consequence')
    group_config.add_argument('--architecture', action='store', dest='architecture', default='',
                              help="CPU and/or GPU requirements. #CPU_spec&GPU_spec where CPU or GPU spec can be "
                                   "omitted. CPU_spec = architecture<-vendor<-instruction set>>, "
                                   "GPU_spec = vendor<-model>. A wildcard can be used if there is no special "
                                   "requirement for the attribute. E.g., #x86_64-*-avx2&nvidia to ask for x86_64 "
                                   "CPU with avx2 support and nvidia GPU")
    group_config.add_argument('--segmentSpecFile', action='store', dest='segmentSpecFile', default=None,
                              help='External json filename to define segments for segmented HPO which has one model '
                                   'for each segment to be optimized independently. The file '
                                   "contains a list of dictionaries {'name': arbitrary_unique_segment_name, "
                                   "'files': [filename_used_for_the_segment_in_the_training_dataset, ... ]}. "
                                   "It is possible to specify 'datasets' instead of 'files' in those dictionaries "
                                   "if the training dataset has constituent datasets and "
                                   "is partitioned with the constituent dataset boundaries. "
                                   'None by default')
    group_config.add_argument('-v', action='store_const', const=True, dest='verbose', default=False,
                              help='Verbose')
    group_input.add_argument('--trainingDS', action='store', dest='trainingDS', default=None,
                             help='Name of training dataset')
    group_output.add_argument('--outDS', action='store', dest='outDS', default=None,
                              help='Name of the dataset for output and log files')
    group_output.add_argument('--official', action='store_const', const=True, dest='official', default=False,
                              help='Produce official dataset')
    group_submit.add_argument('--site', action='store', dest='site', default=None,
                              help='The site name where jobs are sent. If omitted, jobs are automatically sent to sites '
                                   'where input is available. A comma-separated list of sites can be specified '
                                   '(e.g. siteA,siteB,siteC), so that best sites are chosen from the given site list')
    group_submit.add_argument('--workingGroup', action='store', dest='workingGroup', default=None,
                              help="set working group")
    group_submit.add_argument('--noSubmit', action='store_const', const=True, dest='noSubmit', default=False,
                              help="Dry-run")
    group_submit.add_argument("-3", action="store_true", dest="python3", default=False,
                              help="Use python3")
    group_submit.add_argument('--voms', action='store', dest='vomsRoles', default=None, type=str,
                              help="generate proxy with particular roles. "
                                   "e.g., atlas:/atlas/ca/Role=production,atlas:/atlas/fr/Role=pilot")
    group_submit.add_argument('--noEmail', action='store_const', const=True, dest='noEmail', default=False,
                              help='Suppress email notification')
    group_expert.add_argument('--intrSrv', action='store_const', const=True, dest='intrSrv', default=False,
                              help="Please don't use this option. Only for developers to use the intr panda server")

    # get logger
    tmpLog = PLogger.getPandaLogger()

    options = optP.parse_args(ext_args)
    option_names = set(vars(options).keys())

    jsonExecStr = ''
    if options.loadJson is not None:
        with open(os.path.expanduser(options.loadJson)) as f:
            json_options = json.load(f)
        for k in json_options:
            if k in option_names:
                v = json_options[k]
                if isinstance(v, (str, unicode)):
                    try:
                        v = int(v)
                    except Exception:
                        pass
                setattr(options, k, v)
                if v is True:
                    jsonExecStr += ' --{0}'.format(k)
                else:
                    if isinstance(v, (str, unicode)):
                        jsonExecStr += " --{0}='{1}'".format(k, v)
                    else:
                        jsonExecStr += " --{0}={1}".format(k, v)
            else:
                tmpLog.warning('ignore unknown option {0} in {1}'.format(k, options.loadJson))

    if options.version:
        print("Version: %s" % PandaToolsPkgInfo.release_version)
        sys.exit(0)

    # check grid-proxy
    if not dry_mode:
        PsubUtils.check_proxy(options.verbose, options.vomsRoles)

    # check options
    # non_null_opts = ['outDS', 'evaluationContainer', 'evaluationExec', 'steeringContainer', 'steeringExec']
    non_null_opts = ['outDS', 'evaluationContainer', 'evaluationExec', 'steeringExec']
    for opt_name in non_null_opts:
        if getattr(options, opt_name) is None:
            tmpLog.error('--{0} is not specified'.format(opt_name))
            sys.exit(1)

    if not options.outDS.endswith('/'):
        options.outDS += '/'

    if options.maxEvaluationJobs is None:
        options.maxEvaluationJobs = 2 * options.maxPoints

    # check output name
    if not dry_mode:
        nickName = PsubUtils.getNickname()
        if not PsubUtils.checkOutDsName(options.outDS, options.official, nickName, verbose=options.verbose):
            tmpStr = "invalid output dataset name: %s" % options.outDS
            tmpLog.error(tmpStr)
            sys.exit(1)

    # full execution string
    fullExecString = PsubUtils.convSysArgv(ext_args)
    fullExecString += jsonExecStr

    # use INTR server
    if options.intrSrv:
        Client.useIntrServer()

    # create tmp dir
    curDir = os.getcwd()
    tmpDir = os.path.join(curDir, MiscUtils.wrappedUuidGen())
    os.makedirs(tmpDir)

    # exit action
    def _onExit(dir, del_command):
        del_command('rm -rf %s' % dir)

    atexit.register(_onExit, tmpDir, MiscUtils.commands_get_output)

    # sandbox
    if options.verbose:
        tmpLog.debug("=== making sandbox ===")
    archiveName = 'jobO.%s.tar' % MiscUtils.wrappedUuidGen()
    archiveFullName = os.path.join(tmpDir, archiveName)
    if not dry_mode:
        extensions = ['json', 'py', 'sh', 'yaml']
        find_opt = ' -o '.join(['-name "*.{0}"'.format(e) for e in extensions])
        tmpOut = MiscUtils.commands_get_output(
            'find . {0} | tar cvfz {1} --files-from - '.format(find_opt, archiveFullName))
        if options.verbose:
            print(tmpOut + '\n')
            tmpLog.debug("=== checking sandbox ===")
            tmpOut = MiscUtils.commands_get_output('tar tvfz {0}'.format(archiveFullName))
            print(tmpOut + '\n')

        if not options.noSubmit:
            if options.verbose:
                tmpLog.debug("=== uploading sandbox ===")
            os.chdir(tmpDir)
            status, out = Client.putFile(archiveName, options.verbose, useCacheSrv=True, reuseSandbox=True)
            os.chdir(curDir)
            if out.startswith('NewFileName:'):
                # found the same input sandbox to reuse
                archiveName = out.split(':')[-1]
            elif out != 'True':
                # failed
                print(out)
                tmpLog.error("Failed with %s" % status)
                sys.exit(1)

    matchURL = re.search("(http.*://[^/]+)/", Client.baseURLCSRVSSL)
    sourceURL = matchURL.group(1)

    # making task params
    taskParamMap = {}
    taskParamMap['noInput'] = True
    taskParamMap['nEventsPerJob'] = 1
    taskParamMap['nEvents'] = options.nParallelEvaluation
    taskParamMap['maxNumJobs'] = options.maxEvaluationJobs
    taskParamMap['totNumJobs'] = options.maxPoints
    taskParamMap['taskName'] = options.outDS
    taskParamMap['vo'] = 'atlas'
    taskParamMap['architecture'] = options.architecture
    taskParamMap['hpoWorkflow'] = True
    taskParamMap['transUses'] = ''
    taskParamMap['transHome'] = ''
    taskParamMap['transPath'] = 'http://pandaserver.cern.ch:25080/trf/user/runHPO-00-00-01'
    taskParamMap['processingType'] = 'panda-client-{0}-jedi-hpo'.format(PandaToolsPkgInfo.release_version)
    taskParamMap['prodSourceLabel'] = 'user'
    taskParamMap['useLocalIO'] = 1
    taskParamMap['cliParams'] = fullExecString
    taskParamMap['skipScout'] = True
    if options.noEmail:
        taskParamMap['noEmail'] = True
    if options.workingGroup is not None:
        taskParamMap['workingGroup'] = options.workingGroup
    taskParamMap['coreCount'] = 1
    if options.site is not None:
        if ',' in options.site:
            taskParamMap['includedSite'] = PsubUtils.splitCommaConcatenatedItems([options.site])
        else:
            taskParamMap['site'] = options.site
    if options.evaluationContainer is not None:
        taskParamMap['container_name'] = options.evaluationContainer

    taskParamMap['multiStepExec'] = {
        'preprocess': {'command': '${TRF}', 'args': '--preprocess ${TRF_ARGS}'},
        'postprocess': {'command': '${TRF}', 'args': '--postprocess ${TRF_ARGS}'},
        'containerOptions': {
            'containerExec': 'while [ ! -f __payload_in_sync_file__ ]; do sleep 5; done; '
                             'echo "=== cat exec script ==="; '
                             'cat __run_main_exec.sh; '
                             'echo; '
                             'echo "=== exec script ==="; '
                             '/bin/sh __run_main_exec.sh; '
                             'REAL_MAIN_RET_CODE=$?; '
                             'touch __payload_out_sync_file__; '
                             'exit $REAL_MAIN_RET_CODE ',
            'containerImage': options.evaluationContainer
        }
    }
    if options.checkPointToSave is not None:
        taskParamMap['multiStepExec']['coprocess'] = {'command': '${TRF}', 'args': '--coprocess ${TRF_ARGS}'}
    if options.alrbArgs is not None:
        taskParamMap['multiStepExec']['containerOptions']['execArgs'] = options.alrbArgs

    logDatasetName = re.sub('/$', '.log/', options.outDS)
    taskParamMap['log'] = {
        'dataset': logDatasetName,
        'container': logDatasetName,
        'type': 'template',
        'param_type': 'log',
        'value': '{0}.$JEDITASKID.${{SN}}.log.tgz'.format(logDatasetName[:-1])
    }

    taskParamMap['hpoRequestData'] = {
        'sandbox': options.steeringContainer,
        'executable': 'docker',
        'arguments': options.steeringExec,
        'output_json': 'output.json',
        'max_points': options.maxPoints,
        'num_points_per_generation': options.nPointsPerIteration,
    }
    if options.minUnevaluatedPoints is not None:
        taskParamMap['hpoRequestData']['min_unevaluated_points'] = options.minUnevaluatedPoints

    if options.searchSpaceFile is not None:
        with open(options.searchSpaceFile) as json_file:
            taskParamMap['hpoRequestData']['opt_space'] = json.load(json_file)

    taskParamMap['jobParameters'] = [
        {'type': 'constant',
         'value': '-o {0} -j "" --inSampleFile {1}'.format(options.evaluationOutput, options.evaluationInput)
         },
        {'type': 'constant',
         'value': '-a {0} --sourceURL {1}'.format(archiveName, sourceURL)
         },
    ]
    taskParamMap['jobParameters'] += [
        {'type': 'constant',
         'value': '-p "',
         'padding': False,
         },
    ]
    taskParamMap['jobParameters'] += PsubUtils.convertParamStrToJediParam(options.evaluationExec, {}, '',
                                                                          True, False, includeIO=False)
    taskParamMap['jobParameters'] += [
        {'type': 'constant',
         'value': '"',
         },
    ]
    if options.checkPointToSave is not None:
        taskParamMap['jobParameters'] += [
            {'type': 'constant',
             'value': '--checkPointToSave {0}'.format(options.checkPointToSave)
             },
        ]
        if options.checkPointInterval is not None:
            taskParamMap['jobParameters'] += [
                {'type': 'constant',
                 'value': '--checkPointInterval {0}'.format(options.checkPointInterval)
                 },
            ]
    if options.checkPointToLoad is not None:
        taskParamMap['jobParameters'] += [
            {'type': 'constant',
             'value': '--checkPointToLoad {0}'.format(options.checkPointToLoad)
             },
        ]
    if options.trainingDS is not None:
        taskParamMap['jobParameters'] += [
            {'type': 'constant',
             'value': '--writeInputToTxt IN_DATA:{0}'.format(options.evaluationTrainingData)
             },
            {'type': 'template',
             'param_type': 'input',
             'value': '-i "${IN_DATA/T}"',
             'dataset': options.trainingDS,
             'attributes': 'nosplit,repeat',
             },
            {'type': 'constant',
             'value': '--inMap "{\'IN_DATA\': ${IN_DATA/T}}"'
             },
        ]
    if options.evaluationMeta is not None:
        taskParamMap['jobParameters'] += [
            {'type': 'constant',
             'value': '--outMetaFile={0}'.format(options.evaluationMeta),
             },
        ]

    if options.segmentSpecFile is not None:
        taskParamMap['segmentedWork'] = True
        with open(options.segmentSpecFile) as f:
            # read segments
            segments = json.load(f)
            # search space
            if 'opt_space' in taskParamMap['hpoRequestData'] and \
                    isinstance(taskParamMap['hpoRequestData']['opt_space'], dict):
                space = taskParamMap['hpoRequestData']['opt_space']
                taskParamMap['hpoRequestData']['opt_space'] = []
            else:
                space = None
            # set model ID to each segment
            for i in range(len(segments)):
                segments[i].update({'id': i})
                # make clone of search space if needed
                if space is not None:
                    new_space = dict()
                    new_space['model_id'] = i
                    new_space['search_space'] = copy.deepcopy(space)
                    taskParamMap['hpoRequestData']['opt_space'].append(new_space)
            taskParamMap['segmentSpecs'] = segments
        # multiply by num of segments
        taskParamMap['maxNumJobs'] *= len(segments)
        taskParamMap['totNumJobs'] *= len(segments)
        taskParamMap['hpoRequestData']['max_points'] *= len(segments)
        taskParamMap['jobParameters'] += [
            {'type': 'constant',
             'value': '--segmentID=${SEGMENT_ID}',
             },
        ]

    if options.evaluationMetrics is not None:
        lfn = '$JEDITASKID.metrics.${SN}.tgz'
        if options.segmentSpecFile is not None:
            lfn = '${MIDDLENAME}.' + lfn
        taskParamMap['jobParameters'] += [
            {'type': 'template',
             'param_type': 'output',
             'value': lfn,
             'dataset': options.outDS,
             'hidden': True,
             'allowNoOutput': True,
             },
            {'type': 'constant',
             'value': '--outMetricsFile=${{OUTPUT0}}^{0}'.format(options.evaluationMetrics),
             },
        ]

    if options.maxPointsPerEvaluationJob:
        taskParamMap['jobParameters'] += [
            {'type': 'constant',
             'value': '--maxLoopCount={}'.format(options.maxPointsPerEvaluationJob),
             },
        ]

    if options.noSubmit:
        if options.verbose:
            tmpLog.debug("==== taskParams ====")
            tmpKeys = list(taskParamMap)
            tmpKeys.sort()
            for tmpKey in tmpKeys:
                print('%s : %s' % (tmpKey, taskParamMap[tmpKey]))
        sys.exit(0)

    if get_taskparams:
        return taskParamMap

    tmpLog.info("submit {0}".format(options.outDS))
    tmpStat, tmpOut = Client.insertTaskParams(taskParamMap, options.verbose, True)

    # result
    taskID = None
    exitCode = None
    if tmpStat != 0:
        tmpStr = "task submission failed with {0}".format(tmpStat)
        tmpLog.error(tmpStr)
        exitCode = 1
    else:
        if tmpOut[0] in [0, 3]:
            tmpStr = tmpOut[1]
            tmpLog.info(tmpStr)
            try:
                m = re.search(r'jediTaskID=(\d+)', tmpStr)
                taskID = int(m.group(1))
            except Exception:
                pass
        else:
            tmpStr = "task submission failed. {0}".format(tmpOut[1])
            tmpLog.error(tmpStr)
            exitCode = 1

    dumpItem = copy.deepcopy(vars(options))
    dumpItem['returnCode'] = exitCode
    dumpItem['returnOut'] = tmpStr
    dumpItem['jediTaskID'] = taskID

    # dump
    if options.dumpJson is not None:
        with open(os.path.expanduser(options.dumpJson), 'w') as f:
            json.dump(dumpItem, f)
def main():
    # tweak sys.argv
    sys.argv.pop(0)
    sys.argv.insert(0, 'pchain')

    usage = """pchain [options]
"""

    optP = GroupArgParser(usage=usage, conflict_handler="resolve")

    group_output = optP.add_group('output', 'output dataset/files')
    group_config = optP.add_group('config', 'single configuration file to set multiple options')
    group_submit = optP.add_group('submit', 'job submission/site/retry')
    group_expert = optP.add_group('expert', 'for experts/developers only')
    group_build = optP.add_group('build', 'build/compile the package and env setup')
    group_check = optP.add_group('check', 'check workflow description')

    optP.add_helpGroup()

    group_config.add_argument('--version', action='store_const', const=True, dest='version', default=False,
                              help='Displays version')
    group_config.add_argument('-v', action='store_const', const=True, dest='verbose', default=False,
                              help='Verbose')
    group_check.add_argument('--check', action='store_const', const=True, dest='checkOnly', default=False,
                             help='Check workflow description locally')
    group_check.add_argument('--debug', action='store_const', const=True, dest='debugCheck', default=False,
                             help='verbose mode when checking workflow description locally')
    group_output.add_argument('--cwl', action='store', dest='cwl', default=None,
                              help='Name of the main CWL file to describe the workflow')
    group_output.add_argument('--yaml', action='store', dest='yaml', default=None,
                              help='Name of the yaml file for workflow parameters')
    group_build.add_argument('--useAthenaPackages', action='store_const', const=True, dest='useAthenaPackages',
                             default=False,
                             help='One or more tasks in the workflow uses locally-built Athena packages')
    group_build.add_argument('--vo', action='store', dest='vo', default=None,
                             help="virtual organization name")
    group_build.add_argument('--extFile', action='store', dest='extFile', default='',
                             help='root or large files under WORKDIR are not sent to WNs by default. '
                                  'If you want to send some skipped files, specify their names, '
                                  'e.g., data.root,big.tgz,*.o')
    group_output.add_argument('--outDS', action='store', dest='outDS', default=None,
                              help='Name of the dataset for output and log files')
    group_output.add_argument('--official', action='store_const', const=True, dest='official', default=False,
                              help='Produce official dataset')
    group_submit.add_argument('--noSubmit', action='store_const', const=True, dest='noSubmit', default=False,
                              help="Dry-run")
    group_submit.add_argument("-3", action="store_true", dest="python3", default=False,
                              help="Use python3")
    group_submit.add_argument('--voms', action='store', dest='vomsRoles', default=None, type=str,
                              help="generate proxy with particular roles. "
                                   "e.g., atlas:/atlas/ca/Role=production,atlas:/atlas/fr/Role=pilot")
    group_submit.add_argument('--noEmail', action='store_const', const=True, dest='noEmail', default=False,
                              help='Suppress email notification')
    group_submit.add_argument('--prodSourceLabel', action='store', dest='prodSourceLabel', default='',
                              help="set prodSourceLabel")
    group_submit.add_argument('--workingGroup', action='store', dest='workingGroup', default=None,
                              help="set workingGroup")
    group_expert.add_argument('--intrSrv', action='store_const', const=True, dest='intrSrv', default=False,
                              help="Please don't use this option. Only for developers to use the intr panda server")
    group_expert.add_argument('--relayHost', action='store', dest='relayHost', default=None,
                              help="Please don't use this option. Only for developers to use the relay host")

    # get logger
    tmpLog = PLogger.getPandaLogger()

    # show version
    if '--version' in sys.argv:
        print("Version: %s" % PandaToolsPkgInfo.release_version)
        sys.exit(0)

    # parse args
    options = optP.parse_args()

    # check
    for arg_name in ['cwl', 'yaml', 'outDS']:
        if not getattr(options, arg_name):
            tmpStr = "argument --{0} is required".format(arg_name)
            tmpLog.error(tmpStr)
            sys.exit(1)

    # check grid-proxy
    PsubUtils.check_proxy(options.verbose, options.vomsRoles)

    # check output name
    nickName = PsubUtils.getNickname()
    if not PsubUtils.checkOutDsName(options.outDS, options.official, nickName, verbose=options.verbose):
        tmpStr = "invalid output dataset name: %s" % options.outDS
        tmpLog.error(tmpStr)
        sys.exit(1)

    # create tmp dir
    curDir = os.getcwd()
    tmpDir = os.path.join(curDir, MiscUtils.wrappedUuidGen())
    os.makedirs(tmpDir)

    # exit action
    def _onExit(dir, del_command):
        del_command('rm -rf %s' % dir)

    atexit.register(_onExit, tmpDir, MiscUtils.commands_get_output)

    # sandbox
    if options.verbose:
        tmpLog.debug("making sandbox")
    archiveName = 'jobO.%s.tar.gz' % MiscUtils.wrappedUuidGen()
    archiveFullName = os.path.join(tmpDir, archiveName)
    extensions = ['cwl', 'yaml', 'json']
    find_opt = ' -o '.join(['-name "*.{0}"'.format(e) for e in extensions])
    tmpOut = MiscUtils.commands_get_output(
        'find . {0} | tar cvfz {1} --files-from - '.format(find_opt, archiveFullName))
    if options.verbose:
        print(tmpOut + '\n')
        tmpLog.debug("checking sandbox")
        tmpOut = MiscUtils.commands_get_output('tar tvfz {0}'.format(archiveFullName))
        print(tmpOut + '\n')

    if not options.noSubmit:
        tmpLog.info("uploading workflow sandbox")
        if options.vo:
            use_cache_srv = False
        else:
            use_cache_srv = True
        os.chdir(tmpDir)
        status, out = Client.putFile(archiveName, options.verbose, useCacheSrv=use_cache_srv, reuseSandbox=True)
        os.chdir(curDir)
        if out.startswith('NewFileName:'):
            # found the same input sandbox to reuse
            archiveName = out.split(':')[-1]
        elif out != 'True':
            # failed
            print(out)
            tmpLog.error("Failed with %s" % status)
            sys.exit(1)

    matchURL = re.search("(http.*://[^/]+)/", Client.baseURLCSRVSSL)
    sourceURL = matchURL.group(1)

    params = {'taskParams': {},
              'sourceURL': sourceURL,
              'sandbox': archiveName,
              'workflowSpecFile': options.cwl,
              'workflowInputFile': options.yaml,
              'language': 'cwl',
              'outDS': options.outDS,
              'base_platform': os.environ.get('ALRB_USER_PLATFORM', 'centos7')
              }

    # making task params with dummy exec
    task_type_args = {'container': '--containerImage __dummy_container__'}
    if options.useAthenaPackages:
        task_type_args['athena'] = '--useAthenaPackages'
    for task_type in task_type_args:
        prun_exec_str = '--exec __dummy_exec_str__ --outDS {0} {1}'.format(options.outDS, task_type_args[task_type])
        if options.noSubmit:
            prun_exec_str += ' --noSubmit'
        if options.verbose:
            prun_exec_str += ' -v'
        if options.vo:
            prun_exec_str += ' --vo {0}'.format(options.vo)
        if options.prodSourceLabel:
            prun_exec_str += ' --prodSourceLabel {0}'.format(options.prodSourceLabel)
        if options.workingGroup:
            prun_exec_str += ' --workingGroup {0}'.format(options.workingGroup)
        if options.extFile:
            prun_exec_str += ' --extFile {0}'.format(options.extFile)
        arg_dict = {'get_taskparams': True,
                    'ext_args': shlex.split(prun_exec_str)}
        if options.checkOnly:
            arg_dict['dry_mode'] = True
        taskParamMap = PrunScript.main(**arg_dict)
        del taskParamMap['noInput']
        del taskParamMap['nEvents']
        del taskParamMap['nEventsPerJob']
        params['taskParams'][task_type] = taskParamMap

    if options.noSubmit:
        if options.verbose:
            tmpLog.debug("==== taskParams ====")
            tmpKeys = list(taskParamMap)
            tmpKeys.sort()
            for tmpKey in tmpKeys:
                if tmpKey in ['taskParams']:
                    continue
                print('%s : %s' % (tmpKey, taskParamMap[tmpKey]))
        sys.exit(0)

    data = {'relay_host': options.relayHost, 'verbose': options.verbose}
    if not options.checkOnly:
        action_type = 'submit'
    else:
        action_type = 'check'
        data['check'] = True

    # set to use INTR server just before taking action so that sandbox files go to the regular place
    if options.intrSrv:
        Client.useIntrServer()

    # action
    tmpLog.info("{0} workflow {1}".format(action_type, options.outDS))
    tmpStat, tmpOut = Client.send_workflow_request(params, **data)

    # result
    exitCode = None
    if tmpStat != 0:
        tmpStr = "workflow {0} failed with {1}".format(action_type, tmpStat)
        tmpLog.error(tmpStr)
        exitCode = 1
    if tmpOut[0]:
        if not options.checkOnly:
            tmpStr = tmpOut[1]
            tmpLog.info(tmpStr)
        else:
            check_stat = tmpOut[1]['status']
            check_log = 'messages from the server\n' + tmpOut[1]['log']
            tmpLog.info(check_log)
            if check_stat:
                tmpLog.info('successfully verified workflow description')
            else:
                tmpLog.error('workflow description is corrupted')
    else:
        tmpStr = "workflow {0} failed. {1}".format(action_type, tmpOut[1])
        tmpLog.error(tmpStr)
        exitCode = 1

    return exitCode
def get_panda_task_paramsmap(panda_task_id):
    status, task_param_map = Client.getTaskParamsMap(panda_task_id)
    if status == 0:
        task_param_map = json.loads(task_param_map)
        return task_param_map
    return None
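# Illustrative usage sketch for the helper above (the helper name below and the task ID
# are placeholders, not part of pandaclient): fetch the stored parameter map of an
# existing JEDI task and pretty-print it.
def example_dump_task_params(panda_task_id=12345678):
    """Pretty-print the parameter map of a JEDI task, if it can be retrieved."""
    params = get_panda_task_paramsmap(panda_task_id)
    if params:
        print(json.dumps(params, indent=2, sort_keys=True))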
try:
    # except for macOS X
    readline.read_history_file(historyFile)
except Exception:
    pass
readline.set_history_length(1024)

# set dummy CMTSITE
if 'CMTSITE' not in os.environ:
    os.environ['CMTSITE'] = ''

# make tmp dir
tmpDir = tempfile.mkdtemp()

# set tmp dir in Client
Client.setGlobalTmpDir(tmpDir)

# fork PID
fork_child_pid = None


# exit action
def _onExit(dirName, hFile):
    # save history only for master process
    if fork_child_pid == 0:
        readline.write_history_file(hFile)
    # remove tmp dir
    commands_get_output('rm -rf %s' % dirName)


atexit.register(_onExit, tmpDir, historyFile)
def main():
    # parse option
    parser = argparse.ArgumentParser(conflict_handler="resolve")
    parser.add_argument("-v", action="store_true", dest="verbose", default=False,
                        help="Verbose")
    parser.add_argument('-c', action='store', dest='comString', default='', type=str,
                        help='Execute a command in the batch mode')
    parser.add_argument("-3", action="store_true", dest="python3", default=False,
                        help="Use python3")
    parser.add_argument('--version', action='store_const', const=True, dest='version', default=False,
                        help='Displays version')
    parser.add_argument('--devSrv', action='store_const', const=True, dest='devSrv', default=False,
                        help=argparse.SUPPRESS)
    parser.add_argument('--intrSrv', action='store_const', const=True, dest='intrSrv', default=False,
                        help=argparse.SUPPRESS)
    # option for jupyter notebook
    parser.add_argument('--prompt_with_newline', action='store_const', const=True, dest='prompt_with_newline',
                        default=False, help=argparse.SUPPRESS)

    options, args = parser.parse_known_args()

    # display version
    if options.version:
        print("Version: %s" % PandaToolsPkgInfo.release_version)
        sys.exit(0)

    # use dev server
    if options.devSrv:
        Client.useDevServer()

    # use INTR server
    if options.intrSrv:
        Client.useIntrServer()

    # fork for Ctl-c
    global fork_child_pid
    fork_child_pid = os.fork()
    if fork_child_pid == -1:
        print("ERROR : Failed to fork")
        sys.exit(1)
    if fork_child_pid == 0:
        # main
        # instantiate core
        if options.verbose:
            print(options)
        if options.prompt_with_newline:
            sys.ps1 = ">>> \n"
        pbookCore = PBookCore.PBookCore(verbose=options.verbose)
        # CUI
        intmain(pbookCore, options.comString)
    else:
        # set handler
        signal.signal(signal.SIGINT, catch_sig)
        signal.signal(signal.SIGHUP, catch_sig)
        signal.signal(signal.SIGTERM, catch_sig)
        pid, status = os.wait()
        if os.WIFSIGNALED(status):
            sys.exit(-os.WTERMSIG(status))
        elif os.WIFEXITED(status):
            sys.exit(os.WEXITSTATUS(status))
        else:
            sys.exit(0)
print(job_info)
print(job_info.attemptNr)
print(job_info.maxAttempt)
print(job_info.Files)
print(job_info.Files[0])
for f in job_info.Files:
    # print(dir(f))
    print(f._attributes)
    print(f.values())
    print(f.type)
"""

jediTaskID = 10517  # 10607
jediTaskID = 59725

ret = Client.getJediTaskDetails({'jediTaskID': jediTaskID}, True, True, verbose=False)
print(ret)

# ret = Client.getTaskStatus(jediTaskID, verbose=False)
# print(ret)

task_info = ret[1]
jobids = task_info['PandaID']
ret = Client.getJobStatus(ids=jobids, verbose=False)
print(ret)

if ret[0] == 0:
    jobs = ret[1]
    left_jobids = []
    ret_jobs = []