def declare_output(job, work_report, worker_stageout_declaration):
    """
    Declare the output files of a job for stage-out.

    Every file listed in job.output_files that exists on disk is described (file type, absolute path,
    size and, when known, guid). If at least one file was described, the report is written as json to
    the stage-out declaration file. A missing output file sets the pilot state of the job to 'failed'.

    :param job: job object (jobid, output_files and log_file are used).
    :param work_report: work report dictionary; its optional 'outputfiles' entry is the fallback source for guids.
    :param worker_stageout_declaration: path to the stage-out declaration json file (string).
    :return:
    """

    declared_files = []
    out_file_report = {job.jobid: declared_files}

    # the work report may be empty or lack the 'outputfiles' entry; never fail on the guid lookup
    reported_files = (work_report or {}).get('outputfiles') or {}

    for outfile, file_info in job.output_files.items():
        logger.debug("File {} will be checked and declared for stage out".format(outfile))

        if not os.path.exists(outfile):
            logger.info("Expected output file {0} missed. Job {1} will be failed".format(outfile, job.jobid))
            set_pilot_state(job=job, state='failed')
            continue

        file_desc = {
            'filetype': 'log' if outfile == job.log_file else 'output',
            'path': os.path.abspath(outfile),
            'fsize': os.path.getsize(outfile),
        }

        # prefer the guid from the job definition, fall back to the work report
        if 'guid' in file_info:
            file_desc['guid'] = file_info['guid']
        else:
            reported = reported_files.get(outfile)
            if reported and 'guid' in reported:
                file_desc['guid'] = reported['guid']

        declared_files.append(file_desc)

    if declared_files:
        write_json(worker_stageout_declaration, out_file_report)
        logger.debug('Stagout declared in: {0}'.format(worker_stageout_declaration))
        logger.debug('Report for stageout: {}'.format(out_file_report))
def get_metadata_dict_from_txt(path, storejson=False, jobid=None):
    """
    Convert memory monitor text output to a dictionary, optionally store it as json, and return it.

    Note: the dictionary is currently returned unfiltered.

    :param path: path to the memory monitor text output (string).
    :param storejson: store dictionary on disk if True (boolean).
    :param jobid: job id (string).
    :return: prmon metadata (dictionary).
    """

    # get the raw memory monitor output, convert to dictionary
    dictionary = convert_text_file_to_dictionary(path)

    if not dictionary:
        # only report a missing dictionary when there really is none
        logger.debug('nothing to write (no prmon dictionary)')
    elif storejson:
        # add metadata
        dictionary['type'] = 'MemoryMonitorData'
        dictionary['pandaid'] = jobid

        # the json file is placed in the same directory as the text file
        json_path = os.path.join(os.path.dirname(path), get_memory_monitor_output_filename(suffix='json'))
        logger.debug('writing prmon dictionary to: %s' % json_path)
        write_json(json_path, dictionary)

    # filter dictionary?
    # ..

    return dictionary
def process_jobreport(payload_report_file, job_scratch_path, job_communication_point):
    """
    Write a shrunk copy of the job report to the location read by Harvester.

    The report is read from the scratch directory, the 'logfileReport' entry of every executor is
    replaced by an empty dictionary, and the result is written to the communication point.

    :param payload_report_file: name of job report (string).
    :param job_scratch_path: path to scratch directory (string).
    :param job_communication_point: path to updated job report accessible by Harvester (string).
    :raises FileHandlingFailure: in case of IOError.
    """

    source = os.path.join(job_scratch_path, payload_report_file)
    destination = os.path.join(job_communication_point, payload_report_file)

    try:
        logger.info("Copy of payload report [{0}] to access point: {1}".format(payload_report_file,
                                                                                job_communication_point))

        report = read_json(source)

        # shrink the report by dropping the (potentially large) log file reports
        executors = report['executor'] if 'executor' in report else []
        for entry in executors:
            if 'logfileReport' in entry:
                entry['logfileReport'] = {}

        write_json(destination, report)
    except IOError as exc:
        logger.error("Job report copy failed, execution terminated': \n %s " % exc)
        raise FileHandlingFailure("Job report copy from RAM failed")
def publish_job_report(job, args, job_report_file="jobReport.json"):
    """
    Write a shrunk copy of the job report to the Harvester work directory.

    The report is read from the job work directory, the 'logfileReport' entry of every executor is
    replaced by an empty dictionary, and the result is written to args.harvester_workdir.
    An IOError is logged and not propagated to the caller.

    :param job: job object (workdir is used).
    :param args: Pilot arguments object (harvester_workdir is used).
    :param job_report_file: name of job report (string).
    """

    source = join(job.workdir, job_report_file)
    destination = join(args.harvester_workdir, job_report_file)

    try:
        logger.info("copy of payload report [{0}] to access point: {1}".format(job_report_file,
                                                                                args.harvester_workdir))

        report = read_json(source)

        # shrink the report by dropping the (potentially large) log file reports
        executors = report['executor'] if 'executor' in report else []
        for entry in executors:
            if 'logfileReport' in entry:
                entry['logfileReport'] = {}

        write_json(destination, report)
    except IOError:
        logger.error("job report copy failed")
def get_schedconfig_queuedata(queue):
    """
    Return and store the schedconfig queuedata.

    A local copy of the queuedata file (in the PILOT_HOME directory, or relative to the current
    directory if PILOT_HOME is not set) is used if it exists. Otherwise the queuedata is retrieved
    from the schedconfig URL and a local copy is written for later use.

    :param queue: PanDA queue name (e.g. BNL_PROD_MCORE)
    :return: schedconfig queuedata json dictionary, or False if the schedconfig URL is not set.
    """

    # PILOT_HOME may be unset; os.path.join() cannot handle None, so fall back to an empty string
    filename = os.path.join(os.environ.get('PILOT_HOME', ''), config.Information.queuedata)

    # read it locally if the queuedata file already exists
    if os.path.exists(filename):
        return read_json(filename)

    url = config.Information.schedconfig
    if url == "":
        logger.fatal('URL for schedconfig not set')
        return False

    # add the queuename to the URL
    if not url.endswith('/'):
        url += '/'
    url += queue + '.all.json'
    queuedata = retrieve_json(url)

    # also write the queuedata to disk
    if not write_json(filename, queuedata):
        logger.warning("failed to write queuedata json to file")
    else:
        logger.info("wrote queuedata to local file %s" % filename)

    return queuedata
def request_new_jobs(njobs=1):
    """
    Inform Harvester that the pilot is ready to process new jobs by creating a job request file with the desired
    number of jobs.

    :param njobs: Number of jobs. Default is 1 since on grids and clouds the pilot does not know how many jobs it can
    process before it runs out of time.
    :raises FileHandlingFailure: propagated unchanged from write_json().
    :return:
    """

    path = get_job_request_file_name()
    dictionary = {'nJobs': njobs}

    # write it to file; a FileHandlingFailure is deliberately not caught and re-created here, since that would
    # replace the original exception (and its diagnostics) with an empty one
    write_json(path, dictionary)
def write_pilot_timing(pilot_timing_dictionary):
    """
    Store the pilot timing dictionary as json.

    The file name is taken from config.Pilot.timing_file and is placed in the PILOT_HOME directory
    (or relative to the current directory if PILOT_HOME is not set). The outcome is only logged.

    :param pilot_timing_dictionary: pilot timing dictionary.
    :return:
    """

    # NOTE: per-rank timing files (file name suffixed with the rank) are currently disabled
    file_name = config.Pilot.timing_file
    destination = os.path.join(os.environ.get('PILOT_HOME', ''), file_name)

    if not write_json(destination, pilot_timing_dictionary):
        logger.warning('failed to update pilot timing dictionary: %s' % destination)
        return

    logger.debug('updated pilot timing dictionary: %s' % destination)
def publish_work_report(work_report=None, worker_attributes_file="worker_attributes.json"):
    """
    Publish the work report to file.

    The report is time stamped and stripped of its 'outputfiles', 'inputfiles' and 'xml' entries
    (the given dictionary is modified in place) before it is written as json. Nothing is done for
    an empty or missing work report.

    The work report dictionary should contain the fields defined in get_initial_work_report().

    :param work_report: work report dictionary.
    :param worker_attributes_file: name of the file the work report is written to (string).
    :return:
    """

    if not work_report:
        return

    work_report['timestamp'] = time_stamp()

    # these entries are not part of the published report
    for unwanted in ("outputfiles", "inputfiles", "xml"):
        if unwanted in work_report:
            del work_report[unwanted]

    written = write_json(worker_attributes_file, work_report)
    if written:
        logger.info("work report published: {0}".format(work_report))
def publish_work_report(work_report=None, worker_attributes_file="worker_attributes.json"):
    """
    Publish the work report to file.

    The report is time stamped and stripped of its 'outputfiles', 'inputfiles' and 'xml' entries
    (the given dictionary is modified in place) before it is written as json. All exceptions are
    caught and logged; failures are reported through the return value only.

    The work report dictionary should contain the fields defined in get_initial_work_report().

    :param work_report: work report dictionary.
    :param worker_attributes_file: name of the file the work report is written to (string).
    :return: True if the work report was written, False otherwise, including for an empty or missing report (Boolean).
    """

    if not work_report:
        # no work report, nothing to publish
        return False

    try:
        work_report['timestamp'] = time_stamp()

        # these entries are not part of the published report
        for unwanted in ("outputfiles", "inputfiles", "xml"):
            if unwanted in work_report:
                del work_report[unwanted]

        if write_json(worker_attributes_file, work_report):
            logger.info("work report published: {0}".format(work_report))
            return True

        logger.error("work report publish failed: {0}".format(work_report))
        return False
    except IOError as exc:
        logger.error("work report publish failed (I/O error): {0}".format(exc))
        return False
    except Exception as exc:
        logger.error("write json file failed: {0}".format(exc))
        return False
except Exception as e: err = str(e) errcode = -1 message(err) # put file statuses in a dictionary to be written to file file_dictionary = { } # { 'error': [error_diag, -1], 'lfn1': [status, status_code], 'lfn2':.., .. } if xfiles: message('stagein script summary of transferred files:') for fspec in xfiles: add_to_dictionary(file_dictionary, fspec.lfn, fspec.status, fspec.status_code, fspec.turl) status = fspec.status if fspec.status else "(not transferred)" message(" -- lfn=%s, status_code=%s, status=%s" % (fspec.lfn, fspec.status_code, status)) # add error info, if any if err: errcode, err = extract_error_info(err) add_to_dictionary(file_dictionary, 'error', err, errcode, None) _status = write_json( os.path.join(args.workdir, config.Container.stagein_status_dictionary), file_dictionary) if err: message("containerised file transfers failed: %s" % err) exit(TRANSFER_ERROR) message("containerised file transfers finished") exit(0)
def publish_stageout_files(job, event_status_file):
    """
    Publish the stage-out file information of a job to a json file.

    A description (type, path, guid, size, checksum) is created for every FileSpec object in
    job.logdata and job.outdata, log files first. The files are searched for below the directory
    that contains the event status file.

    :param job: job object (jobid, logdata and outdata are used).
    :param event_status_file: name of the event status file the report is written to (string).
    :return: Boolean. True if the file information was written to the json file, False if the
             writing failed or if there was nothing to report.
    """

    # get the harvester workdir from the event_status_file
    work_dir = dirname(event_status_file)

    file_descriptions = []
    out_file_report = {job.jobid: file_descriptions}

    # first look at the logfile information (logdata), then at the output file(s) information (outdata)
    for fspecs in (job.logdata, job.outdata):
        for fspec in fspecs:
            logger.debug("File {} will be checked and declared for stage out".format(fspec.lfn))

            # find the file below the work directory (findfile is expected to return the first instance)
            filename = basename(fspec.surl)
            path = findfile(work_dir, filename)
            logger.debug("Found File {} at path - {}".format(fspec.lfn, path))

            file_desc = {
                'type': fspec.filetype,
                'path': path,
                'guid': fspec.guid,
                'fsize': fspec.filesize,
                'chksum': get_checksum_value(fspec.checksum),
            }
            logger.debug("File description - {} ".format(file_desc))
            file_descriptions.append(file_desc)

    if not file_descriptions:
        logger.debug('No Report for stageout')
        return False

    if write_json(event_status_file, out_file_report):
        logger.debug('Stagout declared in: {0}'.format(event_status_file))
        logger.debug('Report for stageout: {}'.format(out_file_report))
        return True

    logger.debug('Failed to declare stagout in: {0}'.format(event_status_file))
    return False
def get_command(job, xdata, queue, script, eventtype, localsite, remotesite, external_dir, label='stage-in',
                container_type='container'):
    """
    Get the middleware container execution command.

    Note: this function is tailor made for stage-in/out.

    Besides building the command, the function writes the replica dictionary to the job work
    directory (stage-in), copies the pilot source and the stage-in/out script into the job work
    directory and appends the job work directory to PYTHONPATH.

    :param job: job object.
    :param xdata: list of FileSpec objects.
    :param queue: queue name (string).
    :param script: name of stage-in/out script (string).
    :param eventtype: value passed to the script with --eventtype.
    :param localsite: value passed to the script with --localsite.
    :param remotesite: value passed to the script with --remotesite.
    :param external_dir: input or output files directory (string).
    :param label: optional 'stage-[in|out]' (string).
    :param container_type: optional 'container/bash' (string).
    :return: stage-in/out command (string).
    :raises PilotException: for stage-in/out related failures
    """

    if label == 'stage-out':
        filedata_dictionary = get_filedata_strings(xdata)
    else:
        filedata_dictionary = get_filedata(xdata)

        # write file data to file; only the stage-in command refers to it (--replicadictionary),
        # the stage-out command receives the file data as command line options instead
        try:
            status = write_json(path.join(job.workdir, config.Container.stagein_replica_dictionary),
                                filedata_dictionary)
        except Exception as exc:
            diagnostics = 'exception caught in get_command(): %s' % exc
            logger.warning(diagnostics)
            raise PilotException(diagnostics)
        else:
            if not status:
                diagnostics = 'failed to write replica dictionary to file'
                logger.warning(diagnostics)
                raise PilotException(diagnostics)

    # copy pilot source into container directory, unless it is already there
    diagnostics = copy_pilot_source(job.workdir)
    if diagnostics:
        raise PilotException(diagnostics)

    final_script_path = path.join(job.workdir, script)

    # append the work directory to PYTHONPATH; the variable may be unset, in which case
    # string concatenation with None would fail
    pythonpath = environ.get('PYTHONPATH')
    if pythonpath is None:
        environ['PYTHONPATH'] = job.workdir
    else:
        environ['PYTHONPATH'] = pythonpath + ':' + job.workdir

    # place a copy of the script directly in the work directory
    script_path = path.join('pilot/scripts', script)
    full_script_path = path.join(job.workdir, script_path)
    copy(full_script_path, final_script_path)

    if container_type == 'container':
        # correct the path when containers have been used
        final_script_path = path.join('.', script)
        workdir = '/srv'
    else:
        # for container_type=bash we need to add the rucio setup
        pilot_user = environ.get('PILOT_USER', 'generic').lower()
        user = __import__('pilot.user.%s.container' % pilot_user, globals(), locals(), [pilot_user], 0)  # Python 2/3
        try:
            final_script_path = user.get_middleware_container_script('', final_script_path, asetup=True)
        except PilotException:
            # fall back to running the script directly with python
            final_script_path = 'python %s' % final_script_path
        workdir = job.workdir

    cmd = "%s -d -w %s -q %s --eventtype=%s --localsite=%s --remotesite=%s --produserid=\"%s\" --jobid=%s" % \
          (final_script_path, workdir, queue, eventtype, localsite, remotesite,
           job.produserid.replace(' ', '%20'), job.jobid)

    if label == 'stage-in':
        cmd += " --eventservicemerge=%s --usepcache=%s --usevp=%s --replicadictionary=%s" % \
               (job.is_eventservicemerge, job.infosys.queuedata.use_pcache, job.use_vp,
                config.Container.stagein_replica_dictionary)
        if external_dir:
            cmd += ' --inputdir=%s' % external_dir
    else:  # stage-out
        cmd += ' --lfns=%s --scopes=%s --datasets=%s --ddmendpoints=%s --guids=%s' % \
               (filedata_dictionary['lfns'], filedata_dictionary['scopes'], filedata_dictionary['datasets'],
                filedata_dictionary['ddmendpoints'], filedata_dictionary['guids'])
        if external_dir:
            cmd += ' --outputdir=%s' % external_dir

    cmd += ' --taskid=%s' % job.taskid
    cmd += ' --jobdefinitionid=%s' % job.jobdefinitionid
    cmd += ' --catchall=%s' % job.infosys.queuedata.catchall

    if container_type == 'bash':
        # make the bash script exit with the exit code of the stage-in/out script
        cmd += '\nexit $?'

    return cmd
except Exception as error: print("caught exception: %s (skipping remote file open verification)" % error) exit(1) else: if not logname: print("remote file open verification not desired") exit(0) establish_logging(args, filename=logname) logger = logging.getLogger(__name__) # get the file info file_list_dictionary = get_file_lists(args.turls) turls = file_list_dictionary.get('turls') processed_turls_dictionary = {} if turls: message('got TURLs: %s' % str(turls)) for turl in turls: processed_turls_dictionary[turl] = try_open_file(turl) # write dictionary to file with results _status = write_json( os.path.join(args.workdir, config.Pilot.remotefileverification_dictionary), processed_turls_dictionary) else: message('no TURLs to verify') exit(0)
} # { 'error': [error_diag, -1], 'lfn1': [status, status_code], 'lfn2':.., .. } if xfiles: message('stageout script summary of transferred files:') for fspec in xfiles: add_to_dictionary(file_dictionary, fspec.lfn, fspec.status, fspec.status_code, fspec.surl, fspec.turl, fspec.checksum.get('adler32'), fspec.filesize) status = fspec.status if fspec.status else "(not transferred)" message( " -- lfn=%s, status_code=%s, status=%s, surl=%s, turl=%s, checksum=%s, filesize=%s" % (fspec.lfn, fspec.status_code, status, fspec.surl, fspec.turl, fspec.checksum.get('adler32'), fspec.filesize)) # add error info, if any if err: errcode, err = extract_error_info(err) add_to_dictionary(file_dictionary, 'error', err, errcode, None, None, None, None) path = os.path.join(args.workdir, config.Container.stageout_status_dictionary) if os.path.exists(path): path += '.log' _status = write_json(path, file_dictionary) if err: message("containerised file transfers failed: %s" % err) exit(TRANSFER_ERROR) message("wrote %s" % path) message("containerised file transfers finished") exit(0)