def write_workflow_taskfile(config, jobnum, tasks):
    """Write the wrapper executions of a single job to its task list file.

    The file name comes from ``config.get_filename('jobtasklist', ...)`` and
    the file is created inside the padded job number directory, which is
    created first.  Tasks are written one per line, ordered by the integer
    value of their first field; the first five fields of each task are
    emitted separated by ", ".

    Returns the task file name (without the job directory prefix).
    """
    filename_opts = {
        pfwdefs.PF_CURRVALS: {'jobnum': jobnum},
        'required': True,
        intgdefs.REPLACE_VARS: True,
    }
    taskfile = config.get_filename('jobtasklist', filename_opts)

    tjpad = pfwutils.pad_jobnum(jobnum)
    miscutils.coremakedirs(tjpad)

    with open(f"{tjpad}/{taskfile}", 'w') as tasksfh:
        ordered = sorted(tasks, key=lambda entry: int(entry[0]))
        for entry in ordered:
            line = f"{entry[0]}, {entry[1]}, {entry[2]}, {entry[3]}, {entry[4]}\n"
            tasksfh.write(line)

    return taskfile
def begblock(argv):
    """Program entry point: build all job files for the current block.

    Reads the WCL config named by ``argv[0]``, changes into the block
    directory, and for every module in the module list runs its queries,
    reads and fixes the master lists, creates the wrapper instances and
    divides them into jobs.  It then writes the runjob script, checks that
    no input file is requested before the module that generates it, writes
    the per-job files (task list, input tarball, job wcl) and finally the
    job manager DAG.  The config is written back to ``argv[0]`` on both the
    success and the failure path because job/wrapper counters were updated.

    Args:
        argv: argument list; ``argv[0]`` is the config (wcl) file name.
            When None, ``sys.argv`` is used.
            NOTE(review): with the ``sys.argv`` fallback ``argv[0]`` is the
            script name, not a config file -- confirm callers always pass
            an explicit list.

    Returns:
        ``pfwdefs.PF_EXIT_DRYRUN`` when the config has a true dryrun value,
        otherwise ``pfwdefs.PF_EXIT_SUCCESS``.

    Raises:
        Re-raises anything raised while building the block, after saving
        the config and (when DB output is enabled) ending the begblock and
        block tasks with ``pfwdefs.PF_EXIT_FAILURE``.
    """
    # Local import: only needed for the debug timing of wrapper instances.
    import time

    if argv is None:
        argv = sys.argv

    configfile = argv[0]
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    config.set_block_info()
    blknum = config[pfwdefs.PF_BLKNUM]

    blkdir = config.getfull('block_dir')
    os.chdir(blkdir)

    (exists, submit_des_services) = config.search('submit_des_services')
    if exists and submit_des_services is not None:
        os.environ['DES_SERVICES'] = submit_des_services
    (exists, submit_des_db_section) = config.search('submit_des_db_section')
    if exists and submit_des_db_section is not None:
        os.environ['DES_DB_SECTION'] = submit_des_db_section

    dbh = None
    blktid = -1
    if miscutils.fwdebug_check(3, 'PFWBLOCK_DEBUG'):
        miscutils.fwdebug_print("blknum = %s" % (config[pfwdefs.PF_BLKNUM]))
    if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
        dbh = pfwdb.PFWDB(submit_des_services, submit_des_db_section)
        dbh.insert_block(config)
        blktid = config['task_id']['block'][str(blknum)]
        config['task_id']['begblock'] = dbh.create_task(
            name='begblock',
            info_table=None,
            parent_task_id=blktid,
            root_task_id=int(config['task_id']['attempt']),
            label=None,
            do_begin=True,
            do_commit=True)

    try:
        modulelist = miscutils.fwsplit(
            config.getfull(pfwdefs.SW_MODULELIST).lower())
        modules_prev_in_list = {}

        joblist = {}
        parlist = OrderedDict()
        masterdata = OrderedDict()
        # file name -> index of the module that first reads / last writes it
        filelist = {'infiles': {}, 'outfiles': {}}
        for num, modname in enumerate(modulelist):
            print("XXXXXXXXXXXXXXXXXXXX %s XXXXXXXXXXXXXXXXXXXX" % modname)
            if modname not in config[pfwdefs.SW_MODULESECT]:
                miscutils.fwdie(
                    "Error: Could not find module description for module %s\n" % (modname),
                    pfwdefs.PF_EXIT_FAILURE)
            moddict = config[pfwdefs.SW_MODULESECT][modname]

            runqueries(config, configfile, modname, modules_prev_in_list)
            pfwblock.read_master_lists(config, modname, masterdata,
                                       modules_prev_in_list)

            (infsect, outfsect) = pfwblock.get_datasect_types(config, modname)
            pfwblock.fix_master_lists(config, modname, masterdata, outfsect)

            if pfwdefs.PF_NOOP not in moddict or not miscutils.convertBool(
                    moddict[pfwdefs.PF_NOOP]):
                pfwblock.create_fullnames(config, modname, masterdata)
                if miscutils.fwdebug_check(
                        9, 'PFWBLOCK_DEBUG') and modname in masterdata:
                    with open('%s-masterdata.txt' % modname, 'w') as fh:
                        miscutils.pretty_print_dict(masterdata[modname], fh)

                pfwblock.add_file_metadata(config, modname)
                sublists = pfwblock.create_sublists(config, modname, masterdata)
                if sublists is not None:
                    if miscutils.fwdebug_check(3, 'PFWBLOCK_DEBUG'):
                        miscutils.fwdebug_print("sublists.keys() = %s" %
                                                (list(sublists.keys())))
                loopvals = pfwblock.get_wrapper_loopvals(config, modname)
                wrapinst = pfwblock.create_wrapper_inst(
                    config, modname, loopvals)
                wcnt = 1
                for winst in list(wrapinst.values()):
                    # The END debug message reports the elapsed time; the
                    # original referenced stime/etime without defining them.
                    stime = time.time()
                    if miscutils.fwdebug_check(6, 'PFWBLOCK_DEBUG'):
                        miscutils.fwdebug_print("winst %d - BEG" % wcnt)
                    pfwblock.assign_data_wrapper_inst(config, modname, winst,
                                                      masterdata, sublists,
                                                      infsect, outfsect)
                    pfwblock.finish_wrapper_inst(config, modname, winst,
                                                 outfsect)
                    tempfiles = pfwblock.create_module_wrapper_wcl(
                        config, modname, winst)
                    # Keep the first module that reads a file ...
                    for fl in tempfiles['infiles']:
                        if fl not in filelist['infiles']:
                            filelist['infiles'][fl] = num
                    # ... and the last module that writes it.
                    for fl in tempfiles['outfiles']:
                        filelist['outfiles'][fl] = num
                    pfwblock.divide_into_jobs(config, modname, winst, joblist,
                                              parlist)
                    if miscutils.fwdebug_check(6, 'PFWBLOCK_DEBUG'):
                        etime = time.time()
                        miscutils.fwdebug_print("winst %d - %s - END" %
                                                (wcnt, etime - stime))
                    wcnt += 1
            modules_prev_in_list[modname] = True

            if miscutils.fwdebug_check(
                    9, 'PFWBLOCK_DEBUG') and modname in masterdata:
                with open('%s-masterdata.txt' % modname, 'w') as fh:
                    miscutils.pretty_print_dict(masterdata[modname], fh)

        scriptfile = pfwblock.write_runjob_script(config)

        # Files that are both read and written inside this block must be
        # written by an earlier module than the one that first reads them.
        intersect = set(filelist['infiles']) & set(filelist['outfiles'])
        finallist = []

        for fl in list(filelist['infiles'].keys()):
            if fl not in intersect:
                finallist.append(fl)
            else:
                if filelist['infiles'][fl] <= filelist['outfiles'][fl]:
                    raise Exception(
                        'Input file %s requested before it is generated.' % (fl))

        if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
            missingfiles = dbh.check_files(config, finallist)
            if len(missingfiles) > 0:
                raise Exception(
                    "The following input files cannot be found in the archive:"
                    + ",".join(missingfiles))

        miscutils.fwdebug_print("Creating job files - BEG")
        for jobkey, jobdict in sorted(joblist.items()):
            jobdict['jobnum'] = pfwutils.pad_jobnum(config.inc_jobnum())
            jobdict['jobkeys'] = jobkey
            jobdict['numexpwrap'] = len(jobdict['tasks'])
            if miscutils.fwdebug_check(6, 'PFWBLOCK_DEBUG'):
                # Arguments were swapped relative to the labels originally.
                miscutils.fwdebug_print("jobnum = %s, jobkey = %s:" %
                                        (jobdict['jobnum'], jobkey))
            jobdict['tasksfile'] = write_workflow_taskfile(
                config, jobdict['jobnum'], jobdict['tasks'])
            if (len(jobdict['inlist']) > 0
                    and config.getfull(pfwdefs.USE_HOME_ARCHIVE_OUTPUT) != 'never'
                    and 'submit_files_mvmt' in config
                    and (pfwdefs.PF_DRYRUN not in config
                         or not miscutils.convertBool(
                             config.getfull(pfwdefs.PF_DRYRUN)))):
                # get home archive info
                home_archive = config.getfull('home_archive')
                archive_info = config[pfwdefs.SW_ARCHIVESECT][home_archive]

                # load filemgmt class
                attempt_tid = config['task_id']['attempt']
                filemgmt = pfwutils.pfw_dynam_load_class(
                    dbh, config, attempt_tid, attempt_tid, "filemgmt",
                    archive_info['filemgmt'], archive_info)
                # save file information
                filemgmt.register_file_data('list', jobdict['inlist'],
                                            config['pfw_attempt_id'],
                                            attempt_tid, False, None, None)
                pfwblock.copy_input_lists_home_archive(config, filemgmt,
                                                       archive_info,
                                                       jobdict['inlist'])
                filemgmt.commit()
            jobdict['inputwcltar'] = pfwblock.tar_inputfiles(
                config, jobdict['jobnum'],
                jobdict['inwcl'] + jobdict['inlist'])
            if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
                dbh.insert_job(config, jobdict)
            pfwblock.write_jobwcl(config, jobkey, jobdict)
            if ('glidein_use_wall' in config
                    and miscutils.convertBool(config.getfull('glidein_use_wall'))
                    and 'jobwalltime' in config):
                jobdict['wall'] = config['jobwalltime']

        miscutils.fwdebug_print("Creating job files - END")

        numjobs = len(joblist)
        if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
            dbh.update_block_numexpjobs(config, numjobs)

        #if miscutils.fwdebug_check(6, 'PFWBLOCK_DEBUG'):
        #    miscutils.fwdebug_print("inputfiles: %s, %s" % (type(inputfiles), inputfiles))
        #    miscutils.fwdebug_print("outputfiles: %s, %s" % (type(outputfiles), outputfiles))
        #files2stage = set(inputfiles) - set(outputfiles)
        #pfwblock.stage_inputs(config, files2stage)

        #if pfwdefs.USE_HOME_ARCHIVE_OUTPUT in config and \
        #   config.getfull(pfwdefs.USE_HOME_ARCHIVE_OUTPUT).lower() == 'block':
        #    config['block_outputlist'] = 'potential_outputfiles.list'
        #    pfwblock.write_output_list(config, outputfiles)

        dagfile = config.get_filename('jobdag')
        pfwblock.create_jobmngr_dag(config, dagfile, scriptfile, joblist)
    except BaseException:
        # Deliberately catches everything (same as the original bare
        # except): record the failure, then re-raise unchanged.
        retval = pfwdefs.PF_EXIT_FAILURE
        with open(configfile, 'w') as cfgfh:
            config.write(cfgfh)  # save config, have updated jobnum, wrapnum, etc
        if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
            dbh.end_task(config['task_id']['begblock'], retval, True)
            dbh.end_task(blktid, retval, True)
        raise

    # save config, have updated jobnum, wrapnum, etc
    with open(configfile, 'w') as cfgfh:
        config.write(cfgfh)

    (exists, dryrun) = config.search(pfwdefs.PF_DRYRUN)
    if exists and miscutils.convertBool(dryrun):
        retval = pfwdefs.PF_EXIT_DRYRUN
    else:
        retval = pfwdefs.PF_EXIT_SUCCESS
    if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
        dbh.end_task(config['task_id']['begblock'], retval, True)
    miscutils.fwdebug_print("END - exiting with code %s" % retval)

    return retval
def blockpost(argv=None):
    """Program entry point: summarize a finished block and prepare the next.

    Redirects stdout/stderr to a log file, reads the config named by
    ``argv[1]`` and, when DB output is enabled, optionally verifies archive
    file sizes and gathers block task, job and wrapper status from the DB to
    build the body of a status email.  Depending on ``when_to_email`` and the
    resulting return value an email is sent, the block (and on failure the
    attempt) task is ended in the DB, and on success the block number in the
    config file is incremented.

    Args:
        argv: ``[program, configfile, retval]``; defaults to ``sys.argv``.

    Returns:
        Integer exit status: ``pfwdefs.PF_EXIT_FAILURE`` on a usage error,
        ``pfwdefs.PF_EXIT_DRYRUN`` for dryruns, otherwise the incoming
        ``retval`` possibly replaced by a job/task failure status, plus the
        file verification status.
    """
    if argv is None:
        argv = sys.argv

    # open file to catch error messages about command line
    debugfh = open('blockpost.out', 'w')
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv))  # print command line for debugging
    print("running on %s" % (socket.gethostname()))

    if len(argv) != 3:
        print('Usage: blockpost.py configfile retval')
        debugfh.close()
        return pfwdefs.PF_EXIT_FAILURE

    configfile = argv[1]
    retval = int(argv[2])

    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("configfile = %s" % configfile)
    miscutils.fwdebug_print("retval = %s" % retval)

    # read sysinfo file
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("done reading config file")
    blockname = config.getfull('blockname')
    blkdir = config.getfull('block_dir')

    # now that have more information, can rename output file
    miscutils.fwdebug_print("getting new_log_name")
    new_log_name = config.get_filename(
        'block',
        {pfwdefs.PF_CURRVALS: {'flabel': 'blockpost', 'fsuffix': 'out'}})
    new_log_name = "%s/%s" % (blkdir, new_log_name)
    miscutils.fwdebug_print("new_log_name = %s" % new_log_name)

    debugfh.close()
    os.chmod('blockpost.out', 0o666)
    os.rename('blockpost.out', new_log_name)
    debugfh = open(new_log_name, 'a+')
    sys.stdout = debugfh
    sys.stderr = debugfh

    os.chdir(blkdir)

    log_pfw_event(config, blockname, 'blockpost', 'j', ['posttask', retval])

    dryrun = config.getfull(pfwdefs.PF_DRYRUN)
    run = config.getfull('run')
    attid = config['pfw_attempt_id']
    reqnum = config.getfull(pfwdefs.REQNUM)
    unitname = config.getfull(pfwdefs.UNITNAME)
    attnum = config.getfull(pfwdefs.ATTNUM)
    blknum = int(config.getfull(pfwdefs.PF_BLKNUM))
    blktid = None

    msg2 = ""
    dbh = None
    # Bound up front: the QCF handle is only created when PF_USE_QCF is
    # true, but it is tested/passed along unconditionally further down.
    qdbh = None
    job_byblk = {}
    wrap_byjob = {}
    wrap_bymod = {}
    wrapinfo = {}
    jobinfo = {}
    failedwraps = {}
    whyfailwraps = {}  # mod failures for other modname, shouldn't happen
    usedb = miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT))
    verify_files = miscutils.convertBool(config.getfull('verify_files'))
    verify_status = 0
    if verify_files and not usedb:
        print('Skipping file verification due to lack of database connection')
    if usedb:
        sem = None
        try:
            miscutils.fwdebug_print("Connecting to DB")
            dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                              config.getfull('submit_des_db_section'))
            if verify_files:
                curs = dbh.cursor()
                # NOTE(review): SQL is built by string interpolation from a
                # config value; consider a bind variable if the driver's
                # parameter style is known.
                curs.execute("select root from ops_archive where name='%s'" %
                             (config.getfull('home_archive')))
                rows = curs.fetchall()
                if rows is None or len(rows) != 1:
                    nrows = 0 if rows is None else len(rows)
                    raise Exception(
                        "Invalid archive name (%s). Found %s rows in ops_archive"
                        % (config.getfull('home_archive'), nrows))
                root = rows[0][0]
                if not os.path.isdir(root):
                    # NOTE(review): message interpolates the archive name,
                    # not the root directory that was tested -- confirm.
                    print(
                        "Cannot read archive root directory:%s This program must be run on an NCSA machine with access to the archive storage system."
                        % (config.getfull('home_archive')))
                sem = dbsem.DBSemaphore(
                    'verify_files_10', None,
                    config.getfull('submit_des_services'),
                    config.getfull('submit_des_db_section'))
                print("\n\nVerifying archive file sizes on disk (0 is success)")
                verify_status = cu.compare(
                    dbh=dbh,
                    archive=config.getfull('home_archive'),
                    pfwid=attid,
                    filesize=True,
                    md5sum=False,
                    quick=True,
                    debug=False,
                    script=False,
                    verbose=False,
                    silent=True)
                # Drop the reference by rebinding rather than `del`: the
                # except handler below still reads `sem`, and a deleted
                # local would raise UnboundLocalError there.
                sem = None
                print(" Verification of files returned status %i" %
                      (verify_status))
                if verify_status != 0:
                    print(
                        " This indicates that one or more files do not have the correct file size (based on DB entries). Run"
                    )
                    print(
                        "\n compare_db.py --des_services %s --section %s --archive %s --pfwid %i --filesize --verbose"
                        % (config.getfull('submit_des_services'),
                           config.getfull('submit_des_db_section'),
                           config.getfull('home_archive'), int(attid)))
                    print("\n to see the details.")

            if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_QCF)):
                import qcframework.qcfdb as qcfdb
                qdbh = qcfdb.QCFDB(config.getfull('submit_des_services'),
                                   config.getfull('submit_des_db_section'))

            print("\n\nChecking non-job block task status from task table in DB (%s is success)" % \
                  pfwdefs.PF_EXIT_SUCCESS)
            num_bltasks_failed = 0
            bltasks = {}
            blktid = None
            if ('block' in config['task_id']
                    and str(blknum) in config['task_id']['block']):
                blktid = int(config['task_id']['block'][str(blknum)])
                miscutils.fwdebug_print("Getting block task info from DB")
                start_time = time.time()
                bltasks = dbh.get_block_task_info(blktid)
                end_time = time.time()
                miscutils.fwdebug_print(
                    "Done getting block task info from DB (%s secs)" %
                    (end_time - start_time))
                for bltdict in list(bltasks.values()):
                    print("Block status = ", bltdict['status'])
                    if bltdict['status'] == pfwdefs.PF_EXIT_DRYRUN:
                        print("setting return value to dryrun")
                        retval = bltdict['status']
                    elif bltdict['status'] != pfwdefs.PF_EXIT_SUCCESS:
                        num_bltasks_failed += 1
                        msg2 += "\t%s" % (bltdict['name'])
                        if bltdict['label'] is not None:
                            msg2 += " - %s" % (bltdict['label'])
                        msg2 += " failed\n"

                        if bltdict['name'] == 'begblock':
                            # try to read the begblock.out and begblock.err files
                            print("Trying to get begblock.out and begblock.err")
                            msg2 += get_subblock_output("begblock")

                            # try to get QCF messages (especially from query codes)
                            begblock_tid = int(config['task_id']['begblock'])
                            sql = "select id from task where parent_task_id=%i and status!=0" % (
                                begblock_tid)
                            curs = dbh.cursor()
                            curs.execute(sql)
                            res = curs.fetchall()
                            msg2 += "\n===== QCF Messages =====\n"
                            msg2 += "\n begblock\n"
                            wrapids = [blktid, begblock_tid]
                            for r in res:
                                wrapids.append(r[0])

                            wrapmsg = {}
                            if qdbh is not None:
                                miscutils.fwdebug_print("Querying QCF messages")
                                start_time = time.time()
                                wrapmsg = qdbh.get_qcf_messages_for_wrappers(
                                    wrapids)
                                end_time = time.time()
                                miscutils.fwdebug_print(
                                    "Done querying QCF messages (%s secs)" %
                                    (end_time - start_time))
                            miscutils.fwdebug_print("wrapmsg = %s" % wrapmsg)
                            if len(wrapmsg) == 0:
                                msg2 += " No QCF messages\n"
                            else:
                                for msgs in list(wrapmsg.values()):
                                    for m in msgs:
                                        msg2 += " " + m['message'] + "\n"

                        retval = pfwdefs.PF_EXIT_FAILURE

                if retval != pfwdefs.PF_EXIT_DRYRUN:
                    print("\n\nChecking job status from pfw_job table in DB (%s is success)" % \
                          pfwdefs.PF_EXIT_SUCCESS)

                    miscutils.fwdebug_print("Getting job info from DB")
                    start_time = time.time()
                    jobinfo = dbh.get_job_info({'pfw_block_task_id': blktid})
                    end_time = time.time()
                    miscutils.fwdebug_print(
                        "Done getting job info from DB (%s secs)" %
                        (end_time - start_time))

                    miscutils.fwdebug_print("Getting wrapper info from DB")
                    start_time = time.time()
                    wrapinfo = dbh.get_wrapper_info(pfw_attempt_id=attid,
                                                    pfw_block_task_id=blktid)
                    end_time = time.time()
                    miscutils.fwdebug_print(
                        "Done getting wrapper info from DB (%s secs)" %
                        (end_time - start_time))
            else:
                msg = "Could not find task id for block %s in config.des" % blockname
                print("Error:", msg)
                if 'attempt' in config['task_id']:
                    miscutils.fwdebug_print("Saving pfw message")
                    start_time = time.time()
                    # NOTE(review): `pfw_utils.PFW_DB_INFO` is not a name used
                    # anywhere else in the visible code (other calls use
                    # pfwdefs.PFWDB_MSG_*) -- confirm it resolves.
                    Messaging.pfw_message(dbh, attid,
                                          config['task_id']['attempt'], msg,
                                          pfw_utils.PFW_DB_INFO,
                                          'blockpost.out', 0)
                    end_time = time.time()
                    miscutils.fwdebug_print(
                        "Done saving pfw message (%s secs)" %
                        (end_time - start_time))
                print("all the task ids:", config['task_id'])

            archive = None
            if pfwdefs.HOME_ARCHIVE in config:
                archive = config.getfull(pfwdefs.HOME_ARCHIVE)
            logfullnames = dbh.get_fail_log_fullnames(attid, archive)
            dbh.close()
            print("len(jobinfo) = ", len(jobinfo))
            print("len(wrapinfo) = ", len(wrapinfo))
            job_byblk = pfwutils.index_job_info(jobinfo)
            print("blktid: ", blktid)
            print("job_byblk:", job_byblk)

            if blktid not in job_byblk:
                print("Warn: could not find jobs for block %s" % blknum)
                print(" This is ok if attempt died before jobs ran")
                # The format string originally had no conversion specifier,
                # which raises TypeError when applied to the key list.
                print(" block task_ids in job_byblk: %s" %
                      list(job_byblk.keys()))
            else:
                wrap_byjob, wrap_bymod = pfwutils.index_wrapper_info(wrapinfo)
                for jobtid, jobdict in sorted(job_byblk[blktid].items()):
                    failedwraps[jobtid] = []
                    whyfailwraps[jobtid] = []

                    jobkeys = ""

                    # don't print out successful wrappers
                    if jobtid in wrap_byjob and jobdict[
                            'status'] == pfwdefs.PF_EXIT_SUCCESS:
                        continue

                    if jobdict['jobkeys'] is not None:
                        jobkeys = jobdict['jobkeys']
                    submit_job_path = "%s/B%02d-%s/%04d" % (
                        config.getfull('work_dir'),
                        int(config.getfull('blknum')),
                        config.getfull('blockname'), int(jobdict['jobnum']))
                    msg2 += "\n\t%s (%s) " % (pfwutils.pad_jobnum(
                        jobdict['jobnum']), jobkeys)

                    if jobtid not in wrap_byjob:
                        msg2 += "\tNo wrapper instances"
                    else:
                        # The module of the highest-numbered wrapper is taken
                        # as the module the job reached.
                        maxwrap = max(wrap_byjob[jobtid].keys())
                        modname = wrap_byjob[jobtid][maxwrap]['modname']
                        msg2 += "%d/%s %s" % (len(wrap_byjob[jobtid]),
                                              jobdict['expect_num_wrap'],
                                              modname)

                        # determine wrappers for this job without success exit
                        for wrapnum, wdict in list(wrap_byjob[jobtid].items()):
                            if wdict['status'] is None or wdict[
                                    'status'] != pfwdefs.PF_EXIT_SUCCESS:
                                if wdict['modname'] == modname:
                                    failedwraps[jobtid].append(wrapnum)
                                else:
                                    whyfailwraps[jobtid].append(wrapnum)

                    if jobdict['status'] == pfwdefs.PF_EXIT_EUPS_FAILURE:
                        msg2 += " - FAIL - EUPS setup failure"
                        retval = jobdict['status']
                    elif jobdict['status'] == pfwdefs.PF_EXIT_CONDOR:
                        msg2 += " - FAIL - Condor/Globus failure"
                        retval = jobdict['status']
                    elif jobdict['status'] is None:
                        msg2 += " - FAIL - NULL status"
                        retval = pfwdefs.PF_EXIT_FAILURE
                    elif jobdict['status'] != pfwdefs.PF_EXIT_SUCCESS:
                        msg2 += " - FAIL - Non-zero status"
                        retval = jobdict['status']

                    if jobdict['status'] != pfwdefs.PF_EXIT_SUCCESS:
                        msg2 += "\n\t\t%s/runjob.out " % (submit_job_path)

                    msg2 += '\n'

                    # print pfw_messages
                    if 'message' in jobdict:
                        print(jobdict['message'])
                        for msgdict in sorted(jobdict['message'],
                                              key=lambda k: k['message_time']):
                            level = int(msgdict['message_lvl'])
                            levelstr = 'info'
                            if level == pfwdefs.PFWDB_MSG_WARN:
                                levelstr = 'WARN'
                            elif level == pfwdefs.PFWDB_MSG_ERROR:
                                levelstr = 'ERROR'

                            msg2 += "\t\t%s - %s\n" % (
                                levelstr, msgdict['message'].replace(
                                    '\n', '\n\t\t\t'))

                    if jobtid in wrap_byjob:
                        # print log file name for failed/unfinished wrappers
                        for wrapnum in failedwraps[jobtid]:
                            wrapdict = wrap_byjob[jobtid][wrapnum]
                            if wrapdict['log'] in logfullnames:
                                msg2 += "\t\t%s - %s\n" % (
                                    wrapnum, logfullnames[wrapdict['log']])
                            else:
                                msg2 += "\t\t%s - Could not find log in archive (%s)\n" % (
                                    wrapnum, wrapdict['log'])
                            wrapmsg = get_qcf_messages(qdbh, config,
                                                       [wrapdict['task_id']])
                            msg2 = print_qcf_messages(config, wrapdict,
                                                      wrapmsg, msg2)

                        msg2 += '\n'

                        # If weirdness happened in run, print a message
                        if len(whyfailwraps[jobtid]) > 0:
                            msg2 += "\n*** Contact framework developers. Wrappers ran after at least 1 wrapper from a previous module that doesn't have success status.\n"
                            # str() each entry: str.join rejects non-string
                            # wrapper numbers.
                            msg2 += "\t%s\n" % ','.join(
                                str(wnum) for wnum in whyfailwraps[jobtid])

        except Exception as exc:
            # Drop any semaphore reference still held (rebinding, see above).
            sem = None
            msg2 += "\n\nEncountered error trying to gather status information for email."
            msg2 += "\nCheck output for blockpost for further details."
            print(
                "\n\nEncountered error trying to gather status information for email"
            )
            print("%s: %s" % (exc.__class__.__name__, str(exc)))
            (extype, exvalue, trback) = sys.exc_info()
            traceback.print_exception(extype, exvalue, trback, file=sys.stdout)
            retval = pfwdefs.PF_EXIT_FAILURE

    retval = int(retval) + verify_status
    print("before email retval =", retval)

    when_to_email = 'run'
    if 'when_to_email' in config:
        when_to_email = config.getfull('when_to_email').lower()

    if miscutils.convertBool(dryrun):
        if when_to_email != 'never':
            print("dryrun = ", dryrun)
            print("Sending dryrun email")
            if retval == pfwdefs.PF_EXIT_DRYRUN:
                msg1 = "%s: In dryrun mode, block %s has finished successfully." % (
                    run, blockname)
            else:
                msg1 = "%s: In dryrun mode, block %s has failed." % (
                    run, blockname)

            send_email(config, blockname, retval, "", msg1, msg2)
        else:
            print("Not sending dryrun email")
            print("retval = ", retval)
        retval = pfwdefs.PF_EXIT_DRYRUN
    elif retval:
        if when_to_email != 'never':
            print("Sending block failed email\n")
            msg1 = "%s: block %s has failed." % (run, blockname)
            send_email(config, blockname, retval, "", msg1, msg2)
        else:
            print("Not sending failed email")
            print("retval = ", retval)
    elif retval == pfwdefs.PF_EXIT_SUCCESS:
        if when_to_email == 'block':
            msg1 = "%s: block %s has finished successfully." % (run, blockname)
            msg2 = ""
            print("Sending success email\n")
            send_email(config, blockname, retval, "", msg1, msg2)
        elif when_to_email == 'run':
            numblocks = len(
                miscutils.fwsplit(config[pfwdefs.SW_BLOCKLIST], ','))
            if int(config[pfwdefs.PF_BLKNUM]) == numblocks:
                msg1 = "%s: run has finished successfully." % (run)
                msg2 = ""
                print("Sending success email\n")
                send_email(config, blockname, retval, "", msg1, msg2)
            else:
                print("Not sending run email because not last block")
                print("retval = ", retval)
        else:
            print("Not sending success email")
            print("retval = ", retval)
    else:
        print("Not sending email")
        print("retval = ", retval)

    # Store values in DB and hist file
    dbh = None
    if miscutils.convertBool(config[pfwdefs.PF_USE_DB_OUT]):
        dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                          config.getfull('submit_des_db_section'))
        if blktid is not None:
            print("Updating end of block task", blktid)
            dbh.end_task(blktid, retval, True)
        else:
            print("Could not update end of block task without block task id")
        if retval != pfwdefs.PF_EXIT_SUCCESS:
            print("Updating end of attempt", config['task_id']['attempt'])
            dbh.end_task(config['task_id']['attempt'], retval, True)
        dbh.commit()
        dbh.close()

    print("before next block retval = ", retval)
    if retval == pfwdefs.PF_EXIT_SUCCESS:
        # Get ready for next block
        config.inc_blknum()
        with open(configfile, 'w') as cfgfh:
            config.write(cfgfh)
        print("new blknum = ", config[pfwdefs.PF_BLKNUM])
        print("number of blocks = ",
              len(miscutils.fwsplit(config[pfwdefs.SW_BLOCKLIST], ',')))

    miscutils.fwdebug_print("Returning retval = %s (%s)" %
                            (retval, type(retval)))
    miscutils.fwdebug_print("END")
    debugfh.close()
    return int(retval)
def jobpre(argv=None):
    """Program entry point: record the submit time of a job before it runs.

    stdout/stderr are redirected to a randomly named temporary log, which is
    renamed into the job directory once the config has been read.  When DB
    output is enabled the job's condor/target submit times are set to the
    DB's current timestamp.  The original stdout/stderr are restored before
    returning.

    Args:
        argv: ``[program, configfile, jobnum, ...]``; defaults to
            ``sys.argv``.

    Returns:
        ``pfwdefs.PF_EXIT_FAILURE`` on a usage error, otherwise
        ``pfwdefs.PF_EXIT_SUCCESS``.
    """
    if argv is None:
        argv = sys.argv

    default_log = f"jobpre_{random.randint(1,10000000):08d}.out"
    debugfh = open(default_log, 'w')

    tmpfn = debugfh.name
    outorig = sys.stdout
    errorig = sys.stderr
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv))  # command line for debugging
    print(os.getcwd())

    if len(argv) < 3:
        print("Usage: jobpre configfile jobnum")
        debugfh.close()
        # Do not leave the process writing to a closed file.
        sys.stdout = outorig
        sys.stderr = errorig
        return pfwdefs.PF_EXIT_FAILURE

    # Read from the argv that was passed in, not sys.argv, so explicit
    # callers are honoured.
    configfile = argv[1]
    jobnum = argv[2]  # could also be uberctrl

    # read wcl file
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    blockname = config.getfull('blockname')
    blkdir = config.get('block_dir')
    tjpad = pfwutils.pad_jobnum(jobnum)

    # now that have more information, can rename output file
    miscutils.fwdebug_print("getting new_log_name")
    new_log_name = config.get_filename(
        'job',
        {pfwdefs.PF_CURRVALS: {pfwdefs.PF_JOBNUM: jobnum,
                               'flabel': 'jobpre',
                               'fsuffix': 'out'}})
    new_log_name = f"{blkdir}/{tjpad}/{new_log_name}"
    miscutils.fwdebug_print(f"new_log_name = {new_log_name}")

    debugfh.close()
    sys.stdout = outorig
    sys.stderr = errorig
    os.chmod(tmpfn, 0o666)
    os.rename(tmpfn, new_log_name)

    dbh = None
    if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
        # Reuse the config's DB handle when it already has one.
        if config.dbh is None:
            dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                              config.getfull('submit_des_db_section'))
        else:
            dbh = config.dbh

    if 'use_qcf' in config and config['use_qcf']:
        debugfh = Messaging.Messaging(new_log_name, 'jobpre.py',
                                      config['pfw_attempt_id'], dbh=dbh,
                                      mode='a+', usedb=dbh is not None)
    else:
        debugfh = open(new_log_name, 'a+')

    sys.stdout = debugfh
    sys.stderr = debugfh

    if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
        ctstr = dbh.get_current_timestamp_str()
        dbh.update_job_info(config, tjpad, {'condor_submit_time': ctstr,
                                            'target_submit_time': ctstr})

    log_pfw_event(config, blockname, tjpad, 'j', ['pretask'])

    miscutils.fwdebug_print("jobpre done")
    debugfh.close()
    sys.stdout = outorig
    sys.stderr = errorig
    return pfwdefs.PF_EXIT_SUCCESS
def jobpost(argv=None):
    """Performs steps needed after a pipeline job.

    Redirects stdout/stderr to a log file in the job directory, parses the
    job output and the condor user log (``runjob.log``), updates job and
    task rows when DB output is enabled, removes the input tarball and
    untars/removes the output wcl tarball.

    Args:
        argv: ``[program, configfile, block, jobnum, inputtar, outputtar,
            retval]``; defaults to ``sys.argv``.

    Returns:
        Integer exit status: ``pfwdefs.PF_EXIT_SUCCESS`` when the incoming
        retval was success, otherwise ``pfwdefs.PF_EXIT_FAILURE`` (also
        returned for a usage error).
    """
    # condor log key -> job table column
    condor2db = {'jobid': 'condor_job_id',
                 'csubmittime': 'condor_submit_time',
                 'gsubmittime': 'target_submit_time',
                 'starttime': 'condor_start_time',
                 'endtime': 'condor_end_time'}

    if argv is None:
        argv = sys.argv

    debugfh = tempfile.NamedTemporaryFile(mode='w+', prefix='jobpost_',
                                          dir='.', delete=False)
    tmpfn = debugfh.name
    sys.stdout = debugfh
    sys.stderr = debugfh

    miscutils.fwdebug_print("temp log name = %s" % tmpfn)
    print('cmd>', ' '.join(argv))  # print command line for debugging

    if len(argv) < 7:
        # open file to catch error messages about command line
        print('Usage: jobpost.py configfile block jobnum inputtar outputtar retval')
        debugfh.close()
        return pfwdefs.PF_EXIT_FAILURE

    configfile = argv[1]
    blockname = argv[2]
    jobnum = argv[3]
    inputtar = argv[4]
    outputtar = argv[5]
    retval = pfwdefs.PF_EXIT_FAILURE
    if len(argv) == 7:
        # Use the argv that was passed in rather than sys.argv.
        retval = int(argv[6])

    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("configfile = %s" % configfile)
        miscutils.fwdebug_print("block = %s" % blockname)
        miscutils.fwdebug_print("jobnum = %s" % jobnum)
        miscutils.fwdebug_print("inputtar = %s" % inputtar)
        miscutils.fwdebug_print("outputtar = %s" % outputtar)
        miscutils.fwdebug_print("retval = %s" % retval)

    # read sysinfo file
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("done reading config file")

    # now that have more information, rename output file
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("before get_filename")
    blockname = config.getfull('blockname')
    blkdir = config.getfull('block_dir')
    tjpad = pfwutils.pad_jobnum(jobnum)

    os.chdir("%s/%s" % (blkdir, tjpad))
    new_log_name = config.get_filename(
        'job',
        {pfwdefs.PF_CURRVALS: {pfwdefs.PF_JOBNUM: jobnum,
                               'flabel': 'jobpost',
                               'fsuffix': 'out'}})
    new_log_name = "%s" % (new_log_name)
    miscutils.fwdebug_print("new_log_name = %s" % new_log_name)

    debugfh.close()
    os.chmod(tmpfn, 0o666)
    os.rename(tmpfn, new_log_name)
    debugfh = open(new_log_name, 'a+')
    sys.stdout = debugfh
    sys.stderr = debugfh

    dbh = None
    if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
        dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                          config.getfull('submit_des_db_section'))

    # get job information from the job stdout if exists
    (tjobinfo, tjobinfo_task) = parse_job_output(config, jobnum, dbh, retval)

    if dbh and len(tjobinfo) > 0:
        print("tjobinfo: ", tjobinfo)
        dbh.update_tjob_info(config['task_id']['job'][jobnum], tjobinfo)

    # get job information from the condor job log
    logfilename = 'runjob.log'
    if os.path.exists(logfilename) and os.path.getsize(logfilename) > 0:
        # if made it to submitting/running jobs
        try:
            # update job info in DB from condor log
            print("Updating job info in DB from condor log")
            condorjobinfo = pfwcondor.parse_condor_user_log(logfilename)
            if len(list(condorjobinfo.keys())) > 1:
                print("More than single job in job log")
            j = list(condorjobinfo.keys())[0]
            cjobinfo = condorjobinfo[j]
            djobinfo = {}
            for ckey, dkey in list(condor2db.items()):
                if ckey in cjobinfo:
                    djobinfo[dkey] = cjobinfo[ckey]
            print(djobinfo)
            # Without a DB handle the unguarded call raised AttributeError,
            # which the handler below swallowed -- skipping the hold/abort
            # processing that follows.
            if dbh:
                dbh.update_job_info(config, cjobinfo['jobname'], djobinfo)

            if 'holdreason' in cjobinfo and cjobinfo['holdreason'] is not None:
                msg = "Condor HoldReason: %s" % cjobinfo['holdreason']
                print(msg)
                if dbh:
                    Messaging.pfw_message(dbh, config['pfw_attempt_id'],
                                          config['task_id']['job'][jobnum],
                                          msg, pfwdefs.PFWDB_MSG_WARN)

            if 'abortreason' in cjobinfo and cjobinfo['abortreason'] is not None:
                tjobinfo_task['start_time'] = cjobinfo['starttime']
                tjobinfo_task['end_time'] = cjobinfo['endtime']
                if 'condor_rm' in cjobinfo['abortreason']:
                    tjobinfo_task['status'] = pfwdefs.PF_EXIT_OPDELETE
                else:
                    tjobinfo_task['status'] = pfwdefs.PF_EXIT_CONDOR
        except Exception:
            # Best effort: log the problem and continue with the post steps.
            (extype, exvalue, trback) = sys.exc_info()
            traceback.print_exception(extype, exvalue, trback, file=sys.stdout)
    else:
        print("Warning: no job condor log file")

    if dbh:
        # update job task
        if 'status' not in tjobinfo_task:
            tjobinfo_task['status'] = pfwdefs.PF_EXIT_CONDOR
        if 'end_time' not in tjobinfo_task:
            tjobinfo_task['end_time'] = datetime.now()
        wherevals = {'id': config['task_id']['job'][jobnum]}
        dbh.basic_update_row('task', tjobinfo_task, wherevals)
        dbh.commit()

    log_pfw_event(config, blockname, jobnum, 'j', ['posttask', retval])

    # input wcl should already exist in untar form
    if os.path.exists(inputtar):
        print("found inputtar: %s" % inputtar)
        os.unlink(inputtar)
    else:
        print("Could not find inputtar: %s" % inputtar)

    # untar output wcl tar and delete tar
    if os.path.exists(outputtar):
        print("Size of output wcl tar:", os.path.getsize(outputtar))
        if os.path.getsize(outputtar) > 0:
            print("found outputtar: %s" % outputtar)
            pfwutils.untar_dir(outputtar, '..')
            os.unlink(outputtar)
        else:
            msg = "Warn: outputwcl tarball (%s) is 0 bytes." % outputtar
            print(msg)
            if dbh:
                Messaging.pfw_message(dbh, config['pfw_attempt_id'],
                                      config['task_id']['job'][jobnum],
                                      msg, pfwdefs.PFWDB_MSG_WARN)
    else:
        msg = "Warn: outputwcl tarball (%s) does not exist." % outputtar
        print(msg)
        if dbh:
            Messaging.pfw_message(dbh, config['pfw_attempt_id'],
                                  config['task_id']['job'][jobnum],
                                  msg, pfwdefs.PFWDB_MSG_WARN)

    # Any non-success status is collapsed to the generic failure code.
    if retval != pfwdefs.PF_EXIT_SUCCESS:
        miscutils.fwdebug_print("Setting failure retval")
        retval = pfwdefs.PF_EXIT_FAILURE

    miscutils.fwdebug_print("Returning retval = %s" % retval)

    miscutils.fwdebug_print("jobpost done")
    debugfh.close()
    return int(retval)
def print_single_wrap(wrapnum, numwraps, expnumwrap, jdict, jwdict, wdict, indent='\t'):
    """Print a one-line status summary for a job and one of its wrappers.

    The job state (PRE/EXEC/POST/DONE/FAIL) is derived from ``jdict`` and the
    wrapper state (PRE/EXEC/POST/DONE/FAIL/UNK) from ``jwdict`` and ``wdict``;
    any of the three dictionaries may be None.  Fields that cannot be
    determined are printed as "UNK".

    Args:
        wrapnum: wrapper number shown in the output line.
        numwraps: number of wrappers so far (printed with ``%d``).
        expnumwrap: expected number of wrappers (printed with ``%d``).
        jdict: job info with 'start_time', 'end_time', 'status', 'jobnum'
            and 'jobkeys' entries, or None.
        jwdict: job-wrapper info with 'end_time' and 'status', or None.
        wdict: wrapper info with 'start_time', 'end_time', 'status',
            'modname' and 'wrapkeys', or None.
        indent: prefix for the printed line.
    """
    state = "UNK"
    modname = "UNK"
    wrapkeys = ""
    # Bound up front so the "Didn't fit conditions" branch can still print.
    status = "UNK"

    jstate = "UNK"
    jstatus = "UNK"
    if jdict is None or jdict['start_time'] is None:
        jstate = "PRE"
        jstatus = None
    else:
        jstatus = jdict['status']
        if jdict['end_time'] is None:
            # jwdict may be None (handled below); don't dereference it here.
            if (numwraps == expnumwrap and jwdict is not None
                    and jwdict['end_time'] is not None):
                jstate = "POST"
            else:
                jstate = "EXEC"
        elif jstatus == 0:
            jstate = "DONE"
        else:
            jstate = "FAIL"

    if jwdict is None:
        if jdict is None or jdict['end_time'] is None:
            status = "UNK - maybe first wrapper hasn't started yet"
        else:
            status = "UNK"
    elif jwdict['end_time'] is not None:
        status = jwdict['status']
        if status == 0:
            state = "DONE"
        else:
            state = "FAIL"
        if wdict is not None:
            modname = wdict['modname']
            wrapkeys = wdict['wrapkeys']
    elif wdict is None:
        state = "PRE"
        if jwdict['status'] is None:
            status = jdict['status'] if jdict is not None else None
        else:
            status = jwdict['status']
    elif wdict['end_time'] is not None and jwdict['end_time'] is None:
        state = "POST"  # after wrapper, but still in job_wrapper
        status = wdict['status']
        modname = wdict['modname']
        wrapkeys = wdict['wrapkeys']
    elif wdict['end_time'] is None and wdict['start_time'] is not None:
        state = "EXEC"
        status = ""
        modname = wdict['modname']
        wrapkeys = wdict['wrapkeys']
    else:
        print("Didn't fit conditions:")
        print(jwdict)
        print(wdict)

    # jdict is allowed to be None (see the PRE case above).
    if jdict is None:
        jobnum_str = "UNK"
        jobkeys = None
    else:
        jobnum_str = pfwutils.pad_jobnum(jdict['jobnum'])
        jobkeys = jdict['jobkeys']

    print("%sjob: %s (jk=%s) %d/%d %s - %s wrap: %s %s (wk=%s) - %s %s" % \
          (indent, jobnum_str, jobkeys, numwraps, expnumwrap, jstate,
           jstatus, wrapnum, modname, wrapkeys, state, status))
def jobpre(argv=None):
    """Program entry point: record the submit time of a job before it runs.

    stdout/stderr are redirected to a temporary log file, which is renamed
    into the job directory once the config has been read.  When DB output is
    enabled the job's condor/target submit times are set to the DB's current
    timestamp.

    Args:
        argv: ``[program, configfile, jobnum, ...]``; defaults to
            ``sys.argv``.

    Returns:
        ``pfwdefs.PF_EXIT_FAILURE`` on a usage error, otherwise
        ``pfwdefs.PF_EXIT_SUCCESS``.
    """
    if argv is None:
        argv = sys.argv

    debugfh = tempfile.NamedTemporaryFile(mode='w+', prefix='jobpre_',
                                          dir='.', delete=False)
    tmpfn = debugfh.name
    sys.stdout = debugfh
    sys.stderr = debugfh

    # Report and parse the argv that was passed in, not sys.argv, so
    # explicit callers are honoured.
    print(' '.join(argv))  # command line for debugging
    print(os.getcwd())

    if len(argv) < 3:
        print('Usage: jobpre configfile jobnum')
        debugfh.close()
        return pfwdefs.PF_EXIT_FAILURE

    configfile = argv[1]
    jobnum = argv[2]  # could also be uberctrl

    # read wcl file
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    blockname = config.getfull('blockname')
    blkdir = config.get('block_dir')
    tjpad = pfwutils.pad_jobnum(jobnum)

    # now that have more information, can rename output file
    miscutils.fwdebug_print("getting new_log_name")
    new_log_name = config.get_filename(
        'job', {
            pfwdefs.PF_CURRVALS: {
                pfwdefs.PF_JOBNUM: jobnum,
                'flabel': 'jobpre',
                'fsuffix': 'out'
            }
        })
    new_log_name = "%s/%s/%s" % (blkdir, tjpad, new_log_name)
    miscutils.fwdebug_print("new_log_name = %s" % new_log_name)

    debugfh.close()
    os.chmod(tmpfn, 0o666)
    os.rename(tmpfn, new_log_name)
    debugfh = open(new_log_name, 'a+')
    sys.stdout = debugfh
    sys.stderr = debugfh

    if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
        dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                          config.getfull('submit_des_db_section'))
        ctstr = dbh.get_current_timestamp_str()
        dbh.update_job_info(config, tjpad, {
            'condor_submit_time': ctstr,
            'target_submit_time': ctstr
        })

    log_pfw_event(config, blockname, tjpad, 'j', ['pretask'])

    miscutils.fwdebug_print("jobpre done")
    debugfh.close()

    return pfwdefs.PF_EXIT_SUCCESS