def process_jobreport(payload_report_file, job_scratch_path, job_communication_point):
    """
    Copy the payload job report to the Harvester access point, shrinking it first.

    The per-executor 'logfileReport' sections are emptied before writing so the
    published report stays small.

    :param payload_report_file: name of job report (string).
    :param job_scratch_path: path to scratch directory (string).
    :param job_communication_point: path to updated job report accessible by Harvester (string).
    :raises FileHandlingFailure: in case of IOError.
    """
    source_path = os.path.join(job_scratch_path, payload_report_file)
    target_path = os.path.join(job_communication_point, payload_report_file)

    try:
        logger.info("Copy of payload report [{0}] to access point: {1}".format(payload_report_file, job_communication_point))
        report = read_json(source_path)
        # shrink: drop the per-executor log file details before publishing
        if 'executor' in report:
            for entry in report['executor']:
                if 'logfileReport' in entry:
                    entry['logfileReport'] = {}
        write_json(target_path, report)
    except IOError:
        logger.error("Job report copy failed, execution terminated': \n %s " % (sys.exc_info()[1]))
        raise FileHandlingFailure("Job report copy from RAM failed")
def write_file(path, contents, mute=True):
    """
    Write the given contents to a file.

    :param path: full path for file (string).
    :param contents: file contents (string).
    :param mute: boolean to control stdout info message
    :raises PilotException: FileHandlingFailure.
    :return: True if successful, otherwise False.
    """
    written = False

    fileobj = open_file(path, 'w')
    if fileobj is not None:
        try:
            fileobj.write(contents)
        except IOError as error:
            # surface the I/O problem as a pilot-level failure
            raise FileHandlingFailure(error)
        else:
            written = True
        fileobj.close()

    if not mute:
        logger.info('created file: %s' % path)

    return written
def write_json(filename, data, sort_keys=True, indent=4, separators=(',', ': ')):
    """
    Write the dictionary to a JSON file.

    :param filename: file name (string).
    :param data: object to be written to file (dictionary or list).
    :param sort_keys: should entries be sorted? (boolean).
    :param indent: indentation level, default 4 (int).
    :param separators: field separators (default (',', ': ') for dictionaries, use e.g. (',\n') for lists) (tuple)
    :raises PilotException: FileHandlingFailure.
    :return: status (boolean).
    """
    try:
        with open(filename, 'w') as json_fh:
            dumpjson(data, json_fh, sort_keys=sort_keys, indent=indent, separators=separators)
    except IOError as error:
        # only I/O problems are translated; other exceptions propagate as-is
        raise FileHandlingFailure(error)
    # the except branch re-raises, so reaching this point means success
    return True
def postprocess_workdir(workdir):
    """
    Post-processing of working directory: unlink the 'poolcond' pseudo
    directory (a symlink created during stage-in) if it is present.

    :param workdir: path to directory to be processed (string).
    :raises FileHandlingFailure: in case of IOError.
    """
    pseudo_dir = "poolcond"
    pseudo_path = os.path.join(workdir, pseudo_dir)
    try:
        # bug fix: test the full path inside workdir; the original checked
        # the bare name relative to the current working directory while
        # removing the joined path, so the removal could be skipped
        if os.path.exists(pseudo_path):
            remove(pseudo_path)
    except IOError:
        raise FileHandlingFailure(
            "Post processing of working directory failed")
def copy_output(job, job_scratch_dir, work_dir):
    """
    Copy the job's declared output files from the scratch directory to the
    shared work directory, then change into the work directory.

    :param job: job object (uses job.output_files mapping).
    :param job_scratch_dir: path to the scratch directory (string).
    :param work_dir: destination access point / work directory (string).
    :raises FileHandlingFailure: in case of IOError.
    :return: 0 (int).
    """
    cp_start = time.time()
    try:
        for outfile in job.output_files.keys():
            # bug fix: check for the file where it is copied from (the
            # scratch dir), not relative to the current working directory
            src = os.path.join(job_scratch_dir, outfile)
            if os.path.exists(src):
                copy(src, os.path.join(work_dir, outfile))
        os.chdir(work_dir)
    except IOError:
        raise FileHandlingFailure(
            "Copy from scratch dir to access point failed")
    finally:
        # always report the elapsed time, even on failure
        cp_time = time.time() - cp_start
        logger.info("Copy of outputs took: {0} sec.".format(cp_time))
    return 0
def open_file(filename, mode):
    """
    Open and return a file pointer for the given mode.
    Note: the caller needs to close the file.

    :param filename: file name (string).
    :param mode: file mode (character).
    :raises PilotException: FileHandlingFailure.
    :return: file pointer.
    """
    try:
        pointer = open(filename, mode)
    except IOError as error:
        # translate low-level I/O errors into the pilot exception type
        raise FileHandlingFailure(error)
    return pointer
def move(path1, path2):
    """
    Move a file from path1 to path2.

    :param path1: source path (string).
    :param path2: destination path2 (string).
    """
    if not os.path.exists(path1):
        logger.warning('file copy failure: path does not exist: %s', path1)
        raise NoSuchFile("File does not exist: %s" % path1)

    import shutil
    try:
        shutil.move(path1, path2)
    except IOError as error:
        logger.warning("exception caught during file move: %s", error)
        raise FileHandlingFailure(error)
    else:
        logger.info("moved %s to %s", path1, path2)
def copy(path1, path2):
    """
    Copy path1 to path2.

    :param path1: file path (string).
    :param path2: file path (string).
    :raises PilotException: FileHandlingFailure, NoSuchFile
    :return:
    """
    if os.path.exists(path1):
        try:
            copy2(path1, path2)
        except IOError as error:
            logger.warning("exception caught during file copy: %s", error)
            raise FileHandlingFailure(error)
        # only reached when copy2 succeeded (the except branch re-raises)
        logger.info("copied %s to %s", path1, path2)
    else:
        logger.warning('file copy failure: path does not exist: %s', path1)
        raise NoSuchFile("File does not exist: %s" % path1)
def open_file(filename, mode):
    """
    Open and return a file pointer for the given mode.
    Note: the caller needs to close the file.

    :param filename: file name (string).
    :param mode: file mode (character).
    :raises PilotException: FileHandlingFailure.
    :return: file pointer.
    """
    # reading an absent file is an error; write mode is allowed to create it
    if mode != 'w' and not os.path.exists(filename):
        raise NoSuchFile("File does not exist: %s" % filename)

    try:
        return open(filename, mode)
    except IOError as error:
        raise FileHandlingFailure(error)
def calculate_checksum(filename, algorithm='adler32'):
    """
    Calculate the checksum value for the given file.

    The default algorithm is adler32. Md5 is also supported.
    Valid algorithms are 1) adler32/adler/ad32/ad, 2) md5/md5sum/md.

    :param filename: file name (string).
    :param algorithm: optional algorithm string.
    :raises FileHandlingFailure, NotImplementedError: exception raised when file
        does not exist or for unknown algorithm.
    :return: checksum value (string).
    """
    if not os.path.exists(filename):
        raise FileHandlingFailure('file does not exist: %s' % filename)

    if algorithm in ('adler32', 'adler', 'ad', 'ad32'):
        return calculate_adler32_checksum(filename)
    elif algorithm in ('md5', 'md5sum', 'md'):
        return calculate_md5_checksum(filename)
    else:
        msg = 'unknown checksum algorithm: %s' % algorithm
        logger.warning(msg)
        # bug fix: include the diagnostic message in the exception instead
        # of raising a bare NotImplementedError()
        raise NotImplementedError(msg)
def write_file(path, contents, mute=True, mode='w', unique=False):
    """
    Write the given contents to a file.
    If unique=True, then if the file already exists, an index will be added
    (e.g. 'out.txt' -> 'out-1.txt')

    :param path: full path for file (string).
    :param contents: file contents (object).
    :param mute: boolean to control stdout info message.
    :param mode: file mode (e.g. 'w', 'r', 'a', 'wb', 'rb') (string).
    :param unique: file must be unique (Boolean).
    :raises PilotException: FileHandlingFailure.
    :return: True if successful, otherwise False.
    """
    written = False

    # pick a fresh name (e.g. path-1, path-2, ...) when uniqueness is requested
    if unique:
        path = get_nonexistant_path(path)

    fileobj = open_file(path, mode)
    if fileobj is not None:
        try:
            fileobj.write(contents)
        except IOError as error:
            raise FileHandlingFailure(error)
        else:
            written = True
        fileobj.close()

    if not mute:
        if 'w' in mode:
            logger.info('created file: %s', path)
        if 'a' in mode:
            logger.info('appended file: %s', path)

    return written
def set_scratch_workdir(job, work_dir, args):
    """
    Copy input files and some db files to RAM disk.

    :param job: job object.
    :param work_dir: job working directory (permanent FS) (string).
    :param args: args dictionary to collect timing metrics.
    :return: job working directory in scratch (string).
    """
    scratch_path = config.HPC.scratch
    du = disk_usage(scratch_path)
    logger.info("Scratch dir available space: {0} used: {1}".format(
        du.free, du.used))
    job_scratch_dir = os.path.join(scratch_path, str(job.jobid))
    # record the scratch location on every input file entry so later
    # stage-in/copy steps know where to place them
    for inp_file in job.input_files:
        job.input_files[inp_file]["scratch_path"] = job_scratch_dir
    logger.debug("Job scratch path: {0}".format(job_scratch_dir))
    # special data, that should be preplaced in RAM disk
    # NOTE(review): facility-specific hard-coded DB release paths (OLCF/Titan
    # csc108 project area) — TODO confirm these are still valid for the target
    # machine before reuse
    dst_db_path = 'sqlite200/'
    dst_db_filename = 'ALLP200.db'
    dst_db_path_2 = 'geomDB/'
    dst_db_filename_2 = 'geomDB_sqlite'
    tmp_path = 'tmp/'
    src_file = '/ccs/proj/csc108/AtlasReleases/21.0.15/DBRelease/current/sqlite200/ALLP200.db'
    src_file_2 = '/ccs/proj/csc108/AtlasReleases/21.0.15/DBRelease/current/geomDB/geomDB_sqlite'
    if os.path.exists(scratch_path):
        try:
            # timing metric: start of stage-in to RAM disk
            add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEIN, time.time(), args)
            logger.debug("Prepare \'tmp\' dir in scratch ")
            if not os.path.exists(scratch_path + tmp_path):
                os.makedirs(scratch_path + tmp_path)
            logger.debug("Prepare dst and copy sqlite db files")
            t0 = time.time()
            if not os.path.exists(scratch_path + dst_db_path):
                os.makedirs(scratch_path + dst_db_path)
            shutil.copyfile(src_file, scratch_path + dst_db_path + dst_db_filename)
            logger.debug("")
            sql_cp_time = time.time() - t0
            logger.debug("Copy of sqlite files took: {0}".format(sql_cp_time))
            logger.debug("Prepare dst and copy geomDB files")
            t0 = time.time()
            if not os.path.exists(scratch_path + dst_db_path_2):
                os.makedirs(scratch_path + dst_db_path_2)
            shutil.copyfile(src_file_2, scratch_path + dst_db_path_2 + dst_db_filename_2)
            geomdb_cp_time = time.time() - t0
            logger.debug(
                "Copy of geomDB files took: {0} s".format(geomdb_cp_time))
            logger.debug("Prepare job scratch dir")
            t0 = time.time()
            if not os.path.exists(job_scratch_dir):
                os.makedirs(job_scratch_dir)
            logger.debug("Copy input file")
            for inp_file in job.input_files:
                logger.debug("Copy: {0} to {1}".format(
                    os.path.join(work_dir, inp_file),
                    job.input_files[inp_file]["scratch_path"]))
                shutil.copyfile(
                    os.path.join(work_dir, inp_file),
                    os.path.join(job.input_files[inp_file]["scratch_path"],
                                 inp_file))
            input_cp_time = time.time() - t0
            logger.debug(
                "Copy of input files took: {0} s".format(input_cp_time))
        except IOError as e:
            logger.error("I/O error({0}): {1}".format(e.errno, e.strerror))
            logger.error(
                "Copy to scratch failed, execution terminated': \n %s " %
                (sys.exc_info()[1]))
            raise FileHandlingFailure("Copy to RAM disk failed")
        finally:
            # timing metric recorded whether stage-in succeeded or failed
            add_to_pilot_timing(job.jobid, PILOT_POST_STAGEIN, time.time(), args)
    else:
        # no scratch available: fall back to running in the permanent FS workdir
        logger.info('Scratch directory (%s) dos not exist' % scratch_path)
        return work_dir
    os.chdir(job_scratch_dir)
    logger.debug("Current directory: {0}".format(os.getcwd()))
    # symlink the shared conditions-pool area into the scratch dir;
    # removed again later by postprocess_workdir()
    true_dir = '/ccs/proj/csc108/AtlasReleases/21.0.15/nfs_db_files'
    pseudo_dir = "./poolcond"
    os.symlink(true_dir, pseudo_dir)
    du = disk_usage(scratch_path)
    logger.info("Scratch dir available space for job: {0} used: {1}".format(
        du.free, du.used))
    return job_scratch_dir
def run(args):
    """
    Main execution function for the generic HPC workflow.

    Sequence: set up signal handling and tracing, import the resource/user
    plugins, fetch a job, stage in to scratch, execute the payload, parse and
    publish the job report, tar logs, stage out, and declare outputs.

    NOTE(review): this function uses Python-2-only constructs (__import__ with
    level=-1, list-returning map(), dict.keys() as a list, reduce as builtin)
    — it will not run unmodified on Python 3.

    :param args: pilot arguments.
    :returns: traces object.
    """
    # set communication point. Worker report should be placed there, matched with working directory of Harvester
    if args.harvester_workdir:
        communication_point = args.harvester_workdir
    else:
        communication_point = os.getcwd()
    work_report = get_initial_work_report()
    worker_attributes_file = config.Harvester.workerAttributesFile
    worker_stageout_declaration = config.Harvester.StageOutnFile
    payload_report_file = config.Payload.jobreport
    payload_stdout_file = config.Payload.payloadstdout
    payload_stderr_file = config.Payload.payloadstderr
    try:
        logger.info('setting up signal handling')
        signal.signal(signal.SIGINT, functools.partial(interrupt, args))
        logger.info('setting up tracing')
        traces = namedtuple('traces', ['pilot'])
        traces.pilot = {'state': SUCCESS, 'nr_jobs': 0}
        if args.hpc_resource == '':
            logger.critical('hpc resource not specified, cannot continue')
            traces.pilot['state'] = FAILURE
            return traces
        # get the resource reference
        resource = __import__('pilot.resource.%s' % args.hpc_resource,
                              globals(), locals(), [args.hpc_resource], -1)
        # get the user reference
        user = __import__('pilot.user.%s.common' % args.pilot_user.lower(),
                          globals(), locals(), [args.pilot_user.lower()], -1)
        # get job (and rank)
        add_to_pilot_timing('0', PILOT_PRE_GETJOB, time.time(), args)
        job, rank = resource.get_job(communication_point)
        add_to_pilot_timing(job.jobid, PILOT_POST_GETJOB, time.time(), args)
        # cd to job working directory
        add_to_pilot_timing(job.jobid, PILOT_PRE_SETUP, time.time(), args)
        work_dir = resource.set_job_workdir(job, communication_point)
        work_report['workdir'] = work_dir
        worker_attributes_file = os.path.join(work_dir, worker_attributes_file)
        logger.debug("Worker attributes will be publeshied in: {0}".format(
            worker_attributes_file))
        set_pilot_state(job=job, state="starting")
        work_report["jobStatus"] = job.state
        publish_work_report(work_report, worker_attributes_file)
        # Get HPC specific setup commands
        logger.info('setup for resource %s: %s' % (args.hpc_resource,
                                                   str(resource.get_setup())))
        setup_str = "; ".join(resource.get_setup())
        # Prepare job scratch directory (RAM disk etc.)
        job_scratch_dir = resource.set_scratch_workdir(job, work_dir, args)
        # build the payload command: setup commands prefixed to the
        # (resource-fixed) payload invocation
        my_command = " ".join([job.script, job.script_parameters])
        my_command = resource.command_fix(my_command, job_scratch_dir)
        my_command = setup_str + my_command
        add_to_pilot_timing(job.jobid, PILOT_POST_SETUP, time.time(), args)
        # Basic execution. Should be replaced with something like 'run_payload'
        logger.debug("Going to launch: {0}".format(my_command))
        logger.debug("Current work directory: {0}".format(job_scratch_dir))
        payloadstdout = open(payload_stdout_file, "w")
        payloadstderr = open(payload_stderr_file, "w")
        add_to_pilot_timing(job.jobid, PILOT_PRE_PAYLOAD, time.time(), args)
        set_pilot_state(job=job, state="running")
        work_report["jobStatus"] = job.state
        work_report["startTime"] = datetime.utcnow().strftime(
            "%Y-%m-%d %H:%M:%S")
        start_time = time.asctime(time.localtime(time.time()))
        job.startTime = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
        publish_work_report(work_report, worker_attributes_file)
        stime = time.time()
        t0 = os.times()
        exit_code, stdout, stderr = execute(my_command, stdout=payloadstdout,
                                            stderr=payloadstderr, shell=True)
        logger.debug("Payload exit code: {0}".format(exit_code))
        t1 = os.times()
        exetime = time.time() - stime
        end_time = time.asctime(time.localtime(time.time()))
        # CPU consumption: element-wise delta of os.times(), summed over the
        # children-system slice (t[2:3]) — Python 2 list-returning map/reduce
        t = map(lambda x, y: x - y, t1, t0)
        t_tot = reduce(lambda x, y: x + y, t[2:3])
        job.endTime = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
        payloadstdout.close()
        payloadstderr.close()
        add_to_pilot_timing(job.jobid, PILOT_POST_PAYLOAD, time.time(), args)
        state = 'finished' if exit_code == 0 else 'failed'
        set_pilot_state(job=job, state=state)
        job.exitcode = exit_code
        work_report["startTime"] = job.startTime
        work_report["endTime"] = job.endTime
        work_report["jobStatus"] = job.state
        work_report["cpuConsumptionTime"] = t_tot
        work_report["transExitCode"] = job.exitcode
        log_jobreport = "\nPayload exit code: {0} JobID: {1} \n".format(
            exit_code, job.jobid)
        log_jobreport += "CPU comsumption time: {0} JobID: {1} \n".format(
            t_tot, job.jobid)
        log_jobreport += "Start time: {0} JobID: {1} \n".format(
            start_time, job.jobid)
        log_jobreport += "End time: {0} JobID: {1} \n".format(
            end_time, job.jobid)
        log_jobreport += "Execution time: {0} sec. JobID: {1} \n".format(
            exetime, job.jobid)
        logger.info(log_jobreport)
        log_jobreport = "\nJob report start time: {0}\nJob report end time: {1}".format(
            job.startTime, job.endTime)
        logger.debug(log_jobreport)
        # Parse job report file and update of work report
        if os.path.exists(payload_report_file):
            payload_report = user.parse_jobreport_data(
                read_json(payload_report_file))
            work_report.update(payload_report)
            resource.process_jobreport(payload_report_file, job_scratch_dir,
                                       work_dir)
        resource.postprocess_workdir(job_scratch_dir)
        # output files should not be packed with logs
        protectedfiles = job.output_files.keys()
        # log file not produced (yet), so should be excluded
        if job.log_file in protectedfiles:
            protectedfiles.remove(job.log_file)
        else:
            logger.info("Log files was not declared")
        logger.info("Cleanup of working directory")
        protectedfiles.extend(
            [worker_attributes_file, worker_stageout_declaration])
        user.remove_redundant_files(job_scratch_dir, protectedfiles)
        res = tar_files(job_scratch_dir, protectedfiles, job.log_file)
        if res > 0:
            raise FileHandlingFailure("Log file tar failed")
        add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEOUT, time.time(), args)
        # Copy of output to shared FS for stageout
        if not job_scratch_dir == work_dir:
            copy_output(job, job_scratch_dir, work_dir)
        add_to_pilot_timing(job.jobid, PILOT_POST_STAGEOUT, time.time(), args)
        logger.info("Declare stage-out")
        add_to_pilot_timing(job.jobid, PILOT_PRE_FINAL_UPDATE, time.time(), args)
        declare_output(job, work_report, worker_stageout_declaration)
        logger.info("All done")
        publish_work_report(work_report, worker_attributes_file)
        traces.pilot['state'] = SUCCESS
        logger.debug("Final report: {0}".format(work_report))
        add_to_pilot_timing(job.jobid, PILOT_POST_FINAL_UPDATE, time.time(), args)
    except Exception as e:
        # top-level boundary: record the failure in the work report for
        # Harvester, log the traceback, and mark the traces as failed
        work_report["jobStatus"] = "failed"
        work_report["exitMsg"] = str(e)
        publish_work_report(work_report, worker_attributes_file)
        logging.exception('exception caught:')
        traces.pilot['state'] = FAILURE
    return traces