def main(argv=None):
    """Run the query step for every module named in the config.

    ``argv`` (default ``sys.argv``) must hold exactly three items: the program
    name, the config (wcl) file and the condor job id.  The condor id is
    logged, then ``runqueries`` is invoked once per module in list order,
    passing along the modules already handled.  Usage or configuration
    problems are reported through ``miscutils.fwdie``.

    Returns 0 after all modules have been processed.
    """
    args = sys.argv if argv is None else argv

    if len(args) != 3:
        miscutils.fwdie("Usage: runqueries.pl configfile condorjobid\n",
                        pfwdefs.PF_EXIT_FAILURE)

    configfile = args[1]
    condorid = args[2]

    config = pfwconfig.PfwConfig({'wclfile': configfile})

    # record which condor job is running the queries
    log_pfw_event(config, config['curr_block'], 'runqueries', 'j',
                  ['cid', condorid])

    if pfwdefs.SW_MODULELIST not in config:
        miscutils.fwdie("Error: No modules to run.", pfwdefs.PF_EXIT_FAILURE)

    # Get master lists and files, calling external codes when needed
    already_run = {}
    for modname in miscutils.fwsplit(config[pfwdefs.SW_MODULELIST].lower()):
        if modname not in config[pfwdefs.SW_MODULESECT]:
            miscutils.fwdie(
                "Error: Could not find module description for module %s\n" % (modname),
                pfwdefs.PF_EXIT_FAILURE)
        runqueries(config, configfile, modname, already_run)
        already_run[modname] = True

    return 0
def submit_main_dag(config, dagfile, logfh):
    """Submit the main DAG file to Condor and log the manager's condor id.

    Args:
        config: workflow config; read via ``getfull`` and item access.
        dagfile: base name of the DAG; ``<dagfile>.condor.sub`` is submitted.
        logfh: writable file object that receives the submit output on failure.

    Returns:
        The condor id (int) of the DAG manager job when exactly one job is
        found in ``<uberctrl_dir>/<dagfile>.dagman.log``, otherwise None.
    """
    (exitcode, outtuple) = pfwcondor.condor_submit('%s.condor.sub' % (dagfile))
    if exitcode or re.search('ERROR', outtuple[0]):
        sys.stderr.write('\n%s\n' % (outtuple[0]))
        logfh.write('\ncondor_submit %s.condor.sub\n%s\n' % (dagfile, outtuple[0]))
        # make sure the failure text reaches the log even if we die later
        logfh.flush()
    else:
        # Single-argument, parenthesised prints behave the same under the
        # Python 2 print statement and the Python 3 print function.
        print('\nImage processing successfully submitted to condor:')
        print('\tRun = %s' % (config.getfull('submit_run')))
        print("\tpfw_attempt_id = %s" % (config['pfw_attempt_id']))
        print("\tpfw_attempt task_id = %s" % (config['task_id']['attempt']))
        print('\n')

    # for completeness, log condorid of pipeline manager
    dagjob = pfwcondor.parse_condor_user_log('%s/%s.dagman.log' % \
                                             (config.getfull('uberctrl_dir'), dagfile))
    # list() so the keys can be indexed on Python 3, where keys() is a view
    jobids = list(dagjob.keys())
    condorid = None
    if len(jobids) == 1:
        condorid = int(jobids[0])
    pfwlog.log_pfw_event(config, 'analysis', 'j', 'mngr', 'pretask')
    pfwlog.log_pfw_event(config, 'analysis', 'j', 'mngr', {'cid': condorid})

    return condorid
def submit_main_dag(config, dagfile, logfh):
    """Hand the main DAG's submit file to Condor and record the manager job id.

    On a failed submission (non-zero exit code or 'ERROR' in the submit
    output) the output is echoed to stderr and appended to ``logfh``;
    otherwise a short success banner is printed.  In both cases the dagman
    user log is parsed afterwards and two 'mngr' events are logged.

    Returns the condor id as an int when the dagman log holds exactly one
    job, else None.
    """
    submit_file = f"{dagfile}.condor.sub"
    exitcode, outtuple = pfwcondor.condor_submit(submit_file)
    submit_output = outtuple[0]

    submit_failed = exitcode or re.search('ERROR', submit_output)
    if not submit_failed:
        print('\nImage processing successfully submitted to condor:')
        run_name = config.getfull('submit_run')
        print(f"\tRun = {run_name}")
        attempt_id = config['pfw_attempt_id']
        print(f"\tpfw_attempt_id = {attempt_id}")
        attempt_tid = config['task_id']['attempt']
        print(f"\tpfw_attempt task_id = {attempt_tid}")
        print('\n')
    else:
        sys.stderr.write(f"\n{submit_output}\n")
        logfh.write(f"\ncondor_submit {submit_file}\n{submit_output}\n")
        logfh.flush()

    # for completeness, log condorid of pipeline manager
    uberctrl_dir = config.getfull('uberctrl_dir')
    dagman_log = f"{uberctrl_dir}/{dagfile}.dagman.log"
    dagjob = pfwcondor.parse_condor_user_log(dagman_log)
    jobids = list(dagjob.keys())
    condorid = int(jobids[0]) if len(jobids) == 1 else None

    for event_info in ('pretask', {'cid': condorid}):
        pfwlog.log_pfw_event(config, 'analysis', 'j', 'mngr', event_info)

    return condorid
def logpre(argv=None):
    """Log the 'pretask' event for a sub-block.

    Output is first captured in ``logpre.out`` in the current directory; once
    the config has been read the file is renamed to the block-level log name
    and reopened in append mode.

    Args:
        argv: ``[program, configfile, block, subblocktype, subblock]``;
            defaults to ``sys.argv``.  The block given on the command line is
            not used: the block name is taken from the config.

    Returns:
        ``pfwdefs.PF_EXIT_FAILURE`` on a usage error, otherwise
        ``pfwdefs.PF_EXIT_SUCCESS``.
    """
    if argv is None:
        argv = sys.argv

    default_log = 'logpre.out'
    # remember the real streams so they can be put back before returning
    outorig = sys.stdout
    errorig = sys.stderr
    debugfh = open(default_log, 'w')
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv))  # command line for debugging

    if len(argv) < 5:
        print('Usage: logpre configfile block subblocktype subblock')
        debugfh.close()
        sys.stdout = outorig
        sys.stderr = errorig
        return pfwdefs.PF_EXIT_FAILURE

    # Read the arguments from argv (not sys.argv) so callers that pass their
    # own argument list are honoured.
    configfile = argv[1]
    subblocktype = argv[3]
    subblock = argv[4]

    # read sysinfo file
    config = pfwconfig.PfwConfig({'wclfile': configfile})

    # now that have more information, can rename output file
    miscutils.fwdebug_print("getting new_log_name")
    blockname = config.getfull('blockname')
    blkdir = config.getfull('block_dir')
    new_log_name = config.get_filename(
        'block', {
            pfwdefs.PF_CURRVALS: {
                'subblock': subblock,
                'flabel': '${subblock}_logpre',
                'fsuffix': 'out'
            }
        })
    new_log_name = "%s/%s" % (blkdir, new_log_name)
    miscutils.fwdebug_print("new_log_name = %s" % new_log_name)

    debugfh.close()
    os.chmod(default_log, 0o666)
    os.rename(default_log, new_log_name)

    debugfh = open(new_log_name, 'a+')
    sys.stdout = debugfh
    sys.stderr = debugfh

    log_pfw_event(config, blockname, subblock, subblocktype, ['pretask'])

    print("logpre done")
    debugfh.close()
    # do not leave the process-wide streams pointing at a closed file
    sys.stdout = outorig
    sys.stderr = errorig

    return pfwdefs.PF_EXIT_SUCCESS
def blockpost(argv=None):
    """Post-processing for a block: gather status, send email, end tasks.

    All output is redirected to ``blockpost.out``; once the config has been
    read, that file is renamed to the block-level log name inside the block
    directory and the process changes into that directory.

    When database output is enabled the function optionally verifies archive
    file sizes, collects block-task, job and wrapper status from the
    database, and builds a textual report of failures (``msg2``) for the
    email.  Any exception raised while gathering that information is printed
    to the log and turns the block status into ``PF_EXIT_FAILURE``.

    Afterwards an email is sent according to ``when_to_email`` and dry-run
    settings, the block task (and, on failure, the attempt task) is ended in
    the database, and on success the config's block number is incremented
    and the config file rewritten.

    Args:
        argv: ``[program, configfile, retval]``; defaults to ``sys.argv``.
            ``retval`` must be convertible to int.

    Returns:
        int: ``PF_EXIT_FAILURE`` on a usage error, otherwise the final status
        computed for the block.
    """
    if argv is None:
        argv = sys.argv

    # open file to catch error messages about command line
    debugfh = open('blockpost.out', 'w')
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv))  # print command line for debugging
    print("running on %s" % (socket.gethostname()))

    if len(argv) != 3:
        print('Usage: blockpost.py configfile retval')
        debugfh.close()
        return pfwdefs.PF_EXIT_FAILURE

    configfile = argv[1]
    retval = int(argv[2])

    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("configfile = %s" % configfile)
        miscutils.fwdebug_print("retval = %s" % retval)

    # read sysinfo file
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("done reading config file")
    blockname = config.getfull('blockname')
    blkdir = config.getfull('block_dir')

    # now that have more information, can rename output file
    miscutils.fwdebug_print("getting new_log_name")
    new_log_name = config.get_filename(
        'block',
        {pfwdefs.PF_CURRVALS: {
            'flabel': 'blockpost',
            'fsuffix': 'out'
        }})
    new_log_name = "%s/%s" % (blkdir, new_log_name)
    miscutils.fwdebug_print("new_log_name = %s" % new_log_name)
    debugfh.close()
    os.chmod('blockpost.out', 0o666)
    os.rename('blockpost.out', new_log_name)
    debugfh = open(new_log_name, 'a+')
    sys.stdout = debugfh
    sys.stderr = debugfh

    os.chdir(blkdir)

    log_pfw_event(config, blockname, 'blockpost', 'j', ['posttask', retval])

    dryrun = config.getfull(pfwdefs.PF_DRYRUN)
    run = config.getfull('run')
    attid = config['pfw_attempt_id']
    blknum = int(config.getfull(pfwdefs.PF_BLKNUM))
    blktid = None

    msg2 = ""
    dbh = None
    # Must exist even when QCF is disabled: it is tested/passed on below.
    qdbh = None
    job_byblk = {}
    wrap_byjob = {}
    wrapinfo = {}
    jobinfo = {}
    failedwraps = {}
    whyfailwraps = {}  # mod failures for other modname, shouldn't happen
    usedb = miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT))
    verify_files = miscutils.convertBool(config.getfull('verify_files'))
    verify_status = 0
    if verify_files and not usedb:
        print('Skipping file verification due to lack of database connection')
    if usedb:
        sem = None
        try:
            miscutils.fwdebug_print("Connecting to DB")
            dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                              config.getfull('submit_des_db_section'))
            if verify_files:
                curs = dbh.cursor()
                curs.execute("select root from ops_archive where name='%s'" %
                             (config.getfull('home_archive')))
                rows = curs.fetchall()
                # count defensively so a None result still yields the
                # intended error message instead of a TypeError from len()
                nrows = 0 if rows is None else len(rows)
                if nrows != 1:
                    raise Exception(
                        "Invalid archive name (%s). Found %s rows in ops_archive"
                        % (config.getfull('home_archive'), nrows))
                root = rows[0][0]
                if not os.path.isdir(root):
                    print(
                        "Cannot read archive root directory:%s This program must be run on an NCSA machine with access to the archive storage system."
                        % (config.getfull('home_archive')))
                sem = dbsem.DBSemaphore(
                    'verify_files_10', None,
                    config.getfull('submit_des_services'),
                    config.getfull('submit_des_db_section'))
                print(
                    "\n\nVerifying archive file sizes on disk (0 is success)")
                verify_status = cu.compare(
                    dbh=dbh,
                    archive=config.getfull('home_archive'),
                    pfwid=attid,
                    filesize=True,
                    md5sum=False,
                    quick=True,
                    debug=False,
                    script=False,
                    verbose=False,
                    silent=True)
                # Drop our reference by rebinding rather than `del`, so the
                # name stays bound for the exception handler below.
                sem = None
                print(" Verification of files returned status %i" %
                      (verify_status))
                if verify_status != 0:
                    print(
                        " This indicates that one or more files do not have the correct file size (based on DB entries). Run"
                    )
                    print(
                        "\n compare_db.py --des_services %s --section %s --archive %s --pfwid %i --filesize --verbose"
                        % (config.getfull('submit_des_services'),
                           config.getfull('submit_des_db_section'),
                           config.getfull('home_archive'), int(attid)))
                    print("\n to see the details.")

            if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_QCF)):
                import qcframework.qcfdb as qcfdb
                qdbh = qcfdb.QCFDB(config.getfull('submit_des_services'),
                                   config.getfull('submit_des_db_section'))

            print("\n\nChecking non-job block task status from task table in DB (%s is success)" % \
                  pfwdefs.PF_EXIT_SUCCESS)
            num_bltasks_failed = 0
            bltasks = {}
            blktid = None
            if ('block' in config['task_id']
                    and str(blknum) in config['task_id']['block']):
                blktid = int(config['task_id']['block'][str(blknum)])
                miscutils.fwdebug_print("Getting block task info from DB")
                start_time = time.time()
                bltasks = dbh.get_block_task_info(blktid)
                end_time = time.time()
                miscutils.fwdebug_print(
                    "Done getting block task info from DB (%s secs)" %
                    (end_time - start_time))
                for bltdict in bltasks.values():
                    print("Block status = ", bltdict['status'])
                    if bltdict['status'] == pfwdefs.PF_EXIT_DRYRUN:
                        print("setting return value to dryrun")
                        retval = bltdict['status']
                    elif bltdict['status'] != pfwdefs.PF_EXIT_SUCCESS:
                        num_bltasks_failed += 1
                        msg2 += "\t%s" % (bltdict['name'])
                        if bltdict['label'] is not None:
                            msg2 += " - %s" % (bltdict['label'])
                        msg2 += " failed\n"

                        if bltdict['name'] == 'begblock':
                            # try to read the begblock.out and begblock.err files
                            print(
                                "Trying to get begblock.out and begblock.err")
                            msg2 += get_subblock_output("begblock")

                            # try to get QCF messages (especially from query codes)
                            begblock_tid = int(config['task_id']['begblock'])
                            sql = "select id from task where parent_task_id=%i and status!=0" % (
                                begblock_tid)
                            curs = dbh.cursor()
                            curs.execute(sql)
                            res = curs.fetchall()
                            msg2 += "\n===== QCF Messages =====\n"
                            msg2 += "\n begblock\n"
                            wrapids = [blktid, begblock_tid]
                            for r in res:
                                wrapids.append(r[0])

                            wrapmsg = {}
                            if qdbh is not None:
                                miscutils.fwdebug_print(
                                    "Querying QCF messages")
                                start_time = time.time()
                                wrapmsg = qdbh.get_qcf_messages_for_wrappers(
                                    wrapids)
                                end_time = time.time()
                                miscutils.fwdebug_print(
                                    "Done querying QCF messages (%s secs)" %
                                    (end_time - start_time))
                                miscutils.fwdebug_print("wrapmsg = %s" %
                                                        wrapmsg)
                            if len(wrapmsg) == 0:
                                msg2 += " No QCF messages\n"
                            else:
                                for msgs in wrapmsg.values():
                                    for m in msgs:
                                        msg2 += " " + m['message'] + "\n"

                        # any non-successful block task fails the block
                        retval = pfwdefs.PF_EXIT_FAILURE

                if retval != pfwdefs.PF_EXIT_DRYRUN:
                    print("\n\nChecking job status from pfw_job table in DB (%s is success)" % \
                          pfwdefs.PF_EXIT_SUCCESS)

                    miscutils.fwdebug_print("Getting job info from DB")
                    start_time = time.time()
                    jobinfo = dbh.get_job_info({'pfw_block_task_id': blktid})
                    end_time = time.time()
                    miscutils.fwdebug_print(
                        "Done getting job info from DB (%s secs)" %
                        (end_time - start_time))

                    miscutils.fwdebug_print("Getting wrapper info from DB")
                    start_time = time.time()
                    wrapinfo = dbh.get_wrapper_info(pfw_attempt_id=attid,
                                                    pfw_block_task_id=blktid)
                    end_time = time.time()
                    miscutils.fwdebug_print(
                        "Done getting wrapper info from DB (%s secs)" %
                        (end_time - start_time))
            else:
                msg = "Could not find task id for block %s in config.des" % blockname
                print("Error:", msg)
                if 'attempt' in config['task_id']:
                    miscutils.fwdebug_print("Saving pfw message")
                    start_time = time.time()
                    # NOTE(review): `pfw_utils` is not referenced anywhere else
                    # in this function (only `pfwutils` is) - confirm the name
                    # resolves at module level.
                    Messaging.pfw_message(dbh, attid,
                                          config['task_id']['attempt'], msg,
                                          pfw_utils.PFW_DB_INFO,
                                          'blockpost.out', 0)
                    end_time = time.time()
                    miscutils.fwdebug_print(
                        "Done saving pfw message (%s secs)" %
                        (end_time - start_time))
                print("all the task ids:", config['task_id'])

            archive = None
            if pfwdefs.HOME_ARCHIVE in config:
                archive = config.getfull(pfwdefs.HOME_ARCHIVE)
            logfullnames = dbh.get_fail_log_fullnames(attid, archive)
            dbh.close()
            print("len(jobinfo) = ", len(jobinfo))
            print("len(wrapinfo) = ", len(wrapinfo))
            job_byblk = pfwutils.index_job_info(jobinfo)
            print("blktid: ", blktid)
            print("job_byblk:", job_byblk)

            if blktid not in job_byblk:
                print("Warn: could not find jobs for block %s" % blknum)
                print(" This is ok if attempt died before jobs ran")
                # The original format string had no conversion specifier, so
                # this informational line raised TypeError and failed the block.
                print(" block task_ids in job_byblk: %s" %
                      (list(job_byblk.keys()),))
            else:
                wrap_byjob, _ = pfwutils.index_wrapper_info(wrapinfo)
                for jobtid, jobdict in sorted(job_byblk[blktid].items()):
                    failedwraps[jobtid] = []
                    whyfailwraps[jobtid] = []

                    jobkeys = ""

                    # don't print out successful wrappers
                    if jobtid in wrap_byjob and jobdict[
                            'status'] == pfwdefs.PF_EXIT_SUCCESS:
                        continue

                    if jobdict['jobkeys'] is not None:
                        jobkeys = jobdict['jobkeys']
                    submit_job_path = "%s/B%02d-%s/%04d" % (
                        config.getfull('work_dir'),
                        int(config.getfull('blknum')),
                        config.getfull('blockname'), int(jobdict['jobnum']))
                    msg2 += "\n\t%s (%s) " % (pfwutils.pad_jobnum(
                        jobdict['jobnum']), jobkeys)

                    if jobtid not in wrap_byjob:
                        msg2 += "\tNo wrapper instances"
                    else:
                        # the module of the highest-numbered wrapper is the
                        # one the job was working on when it stopped
                        maxwrap = max(wrap_byjob[jobtid].keys())
                        modname = wrap_byjob[jobtid][maxwrap]['modname']
                        msg2 += "%d/%s %s" % (len(
                            wrap_byjob[jobtid]), jobdict['expect_num_wrap'],
                                              modname)

                        # determine wrappers for this job without success exit
                        for wrapnum, wdict in wrap_byjob[jobtid].items():
                            if wdict['status'] is None or wdict[
                                    'status'] != pfwdefs.PF_EXIT_SUCCESS:
                                if wdict['modname'] == modname:
                                    failedwraps[jobtid].append(wrapnum)
                                else:
                                    whyfailwraps[jobtid].append(wrapnum)

                    if jobdict['status'] == pfwdefs.PF_EXIT_EUPS_FAILURE:
                        msg2 += " - FAIL - EUPS setup failure"
                        retval = jobdict['status']
                    elif jobdict['status'] == pfwdefs.PF_EXIT_CONDOR:
                        msg2 += " - FAIL - Condor/Globus failure"
                        retval = jobdict['status']
                    elif jobdict['status'] is None:
                        msg2 += " - FAIL - NULL status"
                        retval = pfwdefs.PF_EXIT_FAILURE
                    elif jobdict['status'] != pfwdefs.PF_EXIT_SUCCESS:
                        msg2 += " - FAIL - Non-zero status"
                        retval = jobdict['status']

                    if jobdict['status'] != pfwdefs.PF_EXIT_SUCCESS:
                        msg2 += "\n\t\t%s/runjob.out " % (submit_job_path)

                    msg2 += '\n'

                    # print pfw_messages
                    if 'message' in jobdict:
                        print(jobdict['message'])
                        for msgdict in sorted(jobdict['message'],
                                              key=lambda k: k['message_time']):
                            level = int(msgdict['message_lvl'])
                            levelstr = 'info'
                            if level == pfwdefs.PFWDB_MSG_WARN:
                                levelstr = 'WARN'
                            elif level == pfwdefs.PFWDB_MSG_ERROR:
                                levelstr = 'ERROR'

                            msg2 += "\t\t%s - %s\n" % (
                                levelstr, msgdict['message'].replace(
                                    '\n', '\n\t\t\t'))

                    if jobtid in wrap_byjob:
                        # print log file name for failed/unfinished wrappers
                        for wrapnum in failedwraps[jobtid]:
                            wrapdict = wrap_byjob[jobtid][wrapnum]
                            if wrapdict['log'] in logfullnames:
                                msg2 += "\t\t%s - %s\n" % (
                                    wrapnum, logfullnames[wrapdict['log']])
                            else:
                                msg2 += "\t\t%s - Could not find log in archive (%s)\n" % (
                                    wrapnum, wrapdict['log'])
                            wrapmsg = get_qcf_messages(qdbh, config,
                                                       [wrapdict['task_id']])
                            msg2 = print_qcf_messages(config, wrapdict,
                                                      wrapmsg, msg2)

                        msg2 += '\n'

                        # If weirdness happened in run, print a message
                        if len(whyfailwraps[jobtid]) > 0:
                            msg2 += "\n*** Contact framework developers. Wrappers ran after at least 1 wrapper from a previous module that doesn't have success status.\n"
                            # str() each entry: join() rejects non-string items
                            msg2 += "\t%s\n" % ','.join(
                                str(wnum) for wnum in whyfailwraps[jobtid])

        except Exception as exc:
            # release the semaphore reference, if one is still held
            sem = None
            msg2 += "\n\nEncountered error trying to gather status information for email."
            msg2 += "\nCheck output for blockpost for further details."
            print(
                "\n\nEncountered error trying to gather status information for email"
            )
            print("%s: %s" % (exc.__class__.__name__, str(exc)))
            (extype, exvalue, trback) = sys.exc_info()
            traceback.print_exception(extype,
                                      exvalue,
                                      trback,
                                      file=sys.stdout)
            retval = pfwdefs.PF_EXIT_FAILURE

    # a non-zero verification status also makes the block non-successful
    retval = int(retval) + verify_status
    print("before email retval =", retval)

    when_to_email = 'run'
    if 'when_to_email' in config:
        when_to_email = config.getfull('when_to_email').lower()

    if miscutils.convertBool(dryrun):
        if when_to_email != 'never':
            print("dryrun = ", dryrun)
            print("Sending dryrun email")
            if retval == pfwdefs.PF_EXIT_DRYRUN:
                msg1 = "%s: In dryrun mode, block %s has finished successfully." % (
                    run, blockname)
            else:
                msg1 = "%s: In dryrun mode, block %s has failed." % (
                    run, blockname)

            send_email(config, blockname, retval, "", msg1, msg2)
        else:
            print("Not sending dryrun email")
            print("retval = ", retval)
        retval = pfwdefs.PF_EXIT_DRYRUN
    elif retval:
        if when_to_email != 'never':
            print("Sending block failed email\n")
            msg1 = "%s: block %s has failed." % (run, blockname)
            send_email(config, blockname, retval, "", msg1, msg2)
        else:
            print("Not sending failed email")
            print("retval = ", retval)
    elif retval == pfwdefs.PF_EXIT_SUCCESS:
        if when_to_email == 'block':
            msg1 = "%s: block %s has finished successfully." % (run,
                                                                 blockname)
            msg2 = ""
            print("Sending success email\n")
            send_email(config, blockname, retval, "", msg1, msg2)
        elif when_to_email == 'run':
            numblocks = len(
                miscutils.fwsplit(config[pfwdefs.SW_BLOCKLIST], ','))
            if int(config[pfwdefs.PF_BLKNUM]) == numblocks:
                msg1 = "%s: run has finished successfully." % (run)
                msg2 = ""
                print("Sending success email\n")
                send_email(config, blockname, retval, "", msg1, msg2)
            else:
                print("Not sending run email because not last block")
                print("retval = ", retval)
        else:
            print("Not sending success email")
            print("retval = ", retval)
    else:
        print("Not sending email")
        print("retval = ", retval)

    # Store values in DB and hist file
    dbh = None
    if miscutils.convertBool(config[pfwdefs.PF_USE_DB_OUT]):
        dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                          config.getfull('submit_des_db_section'))
        if blktid is not None:
            print("Updating end of block task", blktid)
            dbh.end_task(blktid, retval, True)
        else:
            print("Could not update end of block task without block task id")
        if retval != pfwdefs.PF_EXIT_SUCCESS:
            print("Updating end of attempt", config['task_id']['attempt'])
            dbh.end_task(config['task_id']['attempt'], retval, True)
        dbh.commit()
        dbh.close()

    print("before next block retval = ", retval)
    if retval == pfwdefs.PF_EXIT_SUCCESS:
        # Get ready for next block
        config.inc_blknum()
        with open(configfile, 'w') as cfgfh:
            config.write(cfgfh)
        print("new blknum = ", config[pfwdefs.PF_BLKNUM])
        print("number of blocks = ",
              len(miscutils.fwsplit(config[pfwdefs.SW_BLOCKLIST], ',')))

    miscutils.fwdebug_print("Returning retval = %s (%s)" %
                            (retval, type(retval)))
    miscutils.fwdebug_print("END")
    debugfh.close()
    return int(retval)
def summary(argv=None):
    """Create and send the summary email for a processing attempt.

    Output is redirected to ``summary.out`` in the current directory.

    Args:
        argv: ``[program, configfile, status]``; defaults to ``sys.argv``.
            ``status`` is optional; when it is absent (or not numeric) the
            results are reported as unknown instead of raising.

    Returns:
        ``pfwdefs.PF_EXIT_FAILURE`` on a usage error, otherwise the status
        value that was reported (None when no status was supplied).
    """
    if argv is None:
        argv = sys.argv

    debugfh = open('summary.out', 'w')
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv))

    if len(argv) < 2:
        print("Usage: summary configfile status")
        debugfh.close()
        return pfwdefs.PF_EXIT_FAILURE

    if len(argv) == 3:
        status = argv[2]
        # dagman always exits with 0 or 1
        # Command-line values are strings, so compare textually; comparing
        # against the int 1 could never match a value taken from sys.argv.
        if str(status) == '1':
            status = pfwdefs.PF_EXIT_FAILURE
    else:
        print("summary: Missing status value")
        status = None

    # read sysinfo file
    config = pfwconfig.PfwConfig({'wclfile': argv[1]})

    log_pfw_event(config, 'process', 'mngr', 'j', ['posttask', status])

    # Numeric view of the status; None when it is missing or not a number,
    # so the comparisons below cannot raise.
    try:
        status_code = int(status)
    except (TypeError, ValueError):
        status_code = None

    msgstr = ""
    subject = ""
    # NOTE(review): the original also assigned a "Processing ended after
    # DRYRUN" message, but the success/aborted branch always overwrote it
    # for any numeric status - confirm whether dry runs should get their own
    # message before reinstating it.
    if status_code is None:
        msg1 = f"Processing finished with unknown results.\n{msgstr}"
    elif status_code == pfwdefs.PF_EXIT_SUCCESS:
        msg1 = "Processing has successfully completed.\n"
    else:
        print(f"status = '{status}'")
        print("type(status) =", type(status))
        print(f"SUCCESS = '{pfwdefs.PF_EXIT_SUCCESS}'")
        print("type(SUCCESS) =", type(pfwdefs.PF_EXIT_SUCCESS))
        msg1 = f"Processing aborted with status {status}.\n"

    pfwemail.send_email(config, "processing", status, subject, msg1, '')

    if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
        dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                          config.getfull('submit_des_db_section'))
        dbh.update_attempt_end_vals(config['pfw_attempt_id'], status)
    print(f"summary: status = '{status}'")
    print("summary:", msg1)
    print("summary: End")
    debugfh.close()
    return status
def jobpre(argv=None):
    """Pre-job step: record submit times and log the job 'pretask' event.

    Output is first captured in a randomly named ``jobpre_<number>.out`` file
    in the current directory, which is renamed to the job-level log name once
    the config has been read.  When ``use_qcf`` is set in the config the log
    is reopened through ``Messaging.Messaging``; otherwise as a plain file.

    Args:
        argv: ``[program, configfile, jobnum]``; defaults to ``sys.argv``.

    Returns:
        ``pfwdefs.PF_EXIT_FAILURE`` on a usage error, otherwise
        ``pfwdefs.PF_EXIT_SUCCESS``.
    """
    if argv is None:
        argv = sys.argv

    default_log = f"jobpre_{random.randint(1,10000000):08d}.out"
    debugfh = open(default_log, 'w')
    tmpfn = debugfh.name

    outorig = sys.stdout
    errorig = sys.stderr
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv))  # command line for debugging
    print(os.getcwd())

    if len(argv) < 3:
        print("Usage: jobpre configfile jobnum")
        debugfh.close()
        # put the real streams back so later output does not hit a closed file
        sys.stdout = outorig
        sys.stderr = errorig
        return pfwdefs.PF_EXIT_FAILURE

    # Read from argv (not sys.argv) so an explicitly passed argument list is
    # honoured.
    configfile = argv[1]
    jobnum = argv[2]  # could also be uberctrl

    # read wcl file
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    blockname = config.getfull('blockname')
    blkdir = config.get('block_dir')
    tjpad = pfwutils.pad_jobnum(jobnum)

    # now that have more information, can rename output file
    miscutils.fwdebug_print("getting new_log_name")
    new_log_name = config.get_filename('job', {pfwdefs.PF_CURRVALS: {pfwdefs.PF_JOBNUM: jobnum,
                                                                     'flabel': 'jobpre',
                                                                     'fsuffix': 'out'}})
    new_log_name = f"{blkdir}/{tjpad}/{new_log_name}"
    miscutils.fwdebug_print(f"new_log_name = {new_log_name}")

    debugfh.close()
    sys.stdout = outorig
    sys.stderr = errorig
    os.chmod(tmpfn, 0o666)
    os.rename(tmpfn, new_log_name)

    # reuse the config's DB handle when it already has one
    dbh = None
    if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
        if config.dbh is None:
            dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                              config.getfull('submit_des_db_section'))
        else:
            dbh = config.dbh

    if 'use_qcf' in config and config['use_qcf']:
        debugfh = Messaging.Messaging(new_log_name, 'jobpre.py', config['pfw_attempt_id'],
                                      dbh=dbh, mode='a+', usedb=dbh is not None)
    else:
        debugfh = open(new_log_name, 'a+')

    sys.stdout = debugfh
    sys.stderr = debugfh

    # dbh is set exactly when DB output is enabled (see above)
    if dbh is not None:
        ctstr = dbh.get_current_timestamp_str()
        dbh.update_job_info(config, tjpad, {'condor_submit_time': ctstr,
                                            'target_submit_time': ctstr})

    log_pfw_event(config, blockname, tjpad, 'j', ['pretask'])

    miscutils.fwdebug_print("jobpre done")
    debugfh.close()
    sys.stdout = outorig
    sys.stderr = errorig

    return pfwdefs.PF_EXIT_SUCCESS
def jobpost(argv=None):
    """Perform the steps needed after a pipeline job.

    Output is captured in a temporary ``jobpost_*`` file which, once the
    config has been read, is renamed to the job-level log name inside the
    job directory (the process changes into ``<block_dir>/<padded jobnum>``).

    The function then:
      * parses the job output and, with a DB connection, stores target-job
        info;
      * parses ``runjob.log`` (the condor user log) if present, copies the
        condor fields named in ``condor2db`` to the DB, and records hold and
        abort reasons;
      * with a DB connection, updates the job's task row;
      * logs the 'posttask' event, removes the input tarball and unpacks
        and removes the output tarball.

    Args:
        argv: ``[program, configfile, block, jobnum, inputtar, outputtar,
            retval]``; defaults to ``sys.argv``.

    Returns:
        int: ``PF_EXIT_FAILURE`` on a usage error or when ``retval`` is not
        ``PF_EXIT_SUCCESS``; otherwise ``PF_EXIT_SUCCESS``.
    """
    # condor user-log field name -> DB column name
    condor2db = {'jobid': 'condor_job_id',
                 'csubmittime': 'condor_submit_time',
                 'gsubmittime': 'target_submit_time',
                 'starttime': 'condor_start_time',
                 'endtime': 'condor_end_time'}

    if argv is None:
        argv = sys.argv

    debugfh = tempfile.NamedTemporaryFile(mode='w+', prefix='jobpost_', dir='.', delete=False)
    tmpfn = debugfh.name
    sys.stdout = debugfh
    sys.stderr = debugfh

    miscutils.fwdebug_print("temp log name = %s" % tmpfn)
    print('cmd>', ' '.join(argv))  # print command line for debugging

    if len(argv) < 7:
        # open file to catch error messages about command line
        print('Usage: jobpost.py configfile block jobnum inputtar outputtar retval')
        debugfh.close()
        return pfwdefs.PF_EXIT_FAILURE

    configfile = argv[1]
    blockname = argv[2]
    jobnum = argv[3]
    inputtar = argv[4]
    outputtar = argv[5]
    retval = pfwdefs.PF_EXIT_FAILURE
    if len(argv) == 7:
        # take the value from argv (not sys.argv) so callers passing their
        # own argument list are honoured
        retval = int(argv[6])

    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("configfile = %s" % configfile)
        miscutils.fwdebug_print("block = %s" % blockname)
        miscutils.fwdebug_print("jobnum = %s" % jobnum)
        miscutils.fwdebug_print("inputtar = %s" % inputtar)
        miscutils.fwdebug_print("outputtar = %s" % outputtar)
        miscutils.fwdebug_print("retval = %s" % retval)

    # read sysinfo file
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("done reading config file")

    # now that have more information, rename output file
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("before get_filename")
    blockname = config.getfull('blockname')
    blkdir = config.getfull('block_dir')
    tjpad = pfwutils.pad_jobnum(jobnum)

    os.chdir("%s/%s" % (blkdir, tjpad))
    new_log_name = config.get_filename('job', {pfwdefs.PF_CURRVALS: {pfwdefs.PF_JOBNUM: jobnum,
                                                                     'flabel': 'jobpost',
                                                                     'fsuffix': 'out'}})
    new_log_name = "%s" % (new_log_name)
    miscutils.fwdebug_print("new_log_name = %s" % new_log_name)

    debugfh.close()
    os.chmod(tmpfn, 0o666)
    os.rename(tmpfn, new_log_name)
    debugfh = open(new_log_name, 'a+')
    sys.stdout = debugfh
    sys.stderr = debugfh

    dbh = None
    if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
        dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                          config.getfull('submit_des_db_section'))

    # get job information from the job stdout if exists
    (tjobinfo, tjobinfo_task) = parse_job_output(config, jobnum, dbh, retval)

    if dbh and len(tjobinfo) > 0:
        print("tjobinfo: ", tjobinfo)
        dbh.update_tjob_info(config['task_id']['job'][jobnum], tjobinfo)

    # get job information from the condor job log
    logfilename = 'runjob.log'
    if os.path.exists(logfilename) and os.path.getsize(logfilename) > 0:
        # if made it to submitting/running jobs
        try:
            # update job info in DB from condor log
            print("Updating job info in DB from condor log")
            condorjobinfo = pfwcondor.parse_condor_user_log(logfilename)
            if len(list(condorjobinfo.keys())) > 1:
                print("More than single job in job log")
            j = list(condorjobinfo.keys())[0]
            cjobinfo = condorjobinfo[j]
            djobinfo = {}
            for ckey, dkey in condor2db.items():
                if ckey in cjobinfo:
                    djobinfo[dkey] = cjobinfo[ckey]
            print(djobinfo)
            # Without a DB handle this call used to raise AttributeError,
            # which the handler below swallowed - skipping the hold/abort
            # processing that follows.
            if dbh:
                dbh.update_job_info(config, cjobinfo['jobname'], djobinfo)

            if 'holdreason' in cjobinfo and cjobinfo['holdreason'] is not None:
                msg = "Condor HoldReason: %s" % cjobinfo['holdreason']
                print(msg)
                if dbh:
                    Messaging.pfw_message(dbh, config['pfw_attempt_id'],
                                          config['task_id']['job'][jobnum],
                                          msg, pfwdefs.PFWDB_MSG_WARN)

            if 'abortreason' in cjobinfo and cjobinfo['abortreason'] is not None:
                tjobinfo_task['start_time'] = cjobinfo['starttime']
                tjobinfo_task['end_time'] = cjobinfo['endtime']
                if 'condor_rm' in cjobinfo['abortreason']:
                    tjobinfo_task['status'] = pfwdefs.PF_EXIT_OPDELETE
                else:
                    tjobinfo_task['status'] = pfwdefs.PF_EXIT_CONDOR
        except Exception:
            # best effort: record the problem in the log and carry on
            (extype, exvalue, trback) = sys.exc_info()
            traceback.print_exception(extype, exvalue, trback, file=sys.stdout)
    else:
        print("Warning: no job condor log file")

    if dbh:
        # update job task
        if 'status' not in tjobinfo_task:
            tjobinfo_task['status'] = pfwdefs.PF_EXIT_CONDOR
        if 'end_time' not in tjobinfo_task:
            tjobinfo_task['end_time'] = datetime.now()
        wherevals = {'id': config['task_id']['job'][jobnum]}
        dbh.basic_update_row('task', tjobinfo_task, wherevals)
        dbh.commit()

    log_pfw_event(config, blockname, jobnum, 'j', ['posttask', retval])

    # input wcl should already exist in untar form
    if os.path.exists(inputtar):
        print("found inputtar: %s" % inputtar)
        os.unlink(inputtar)
    else:
        print("Could not find inputtar: %s" % inputtar)

    # untar output wcl tar and delete tar
    if os.path.exists(outputtar):
        print("Size of output wcl tar:", os.path.getsize(outputtar))
        if os.path.getsize(outputtar) > 0:
            print("found outputtar: %s" % outputtar)
            pfwutils.untar_dir(outputtar, '..')
            os.unlink(outputtar)
        else:
            msg = "Warn: outputwcl tarball (%s) is 0 bytes." % outputtar
            print(msg)
            if dbh:
                Messaging.pfw_message(dbh, config['pfw_attempt_id'],
                                      config['task_id']['job'][jobnum],
                                      msg, pfwdefs.PFWDB_MSG_WARN)
    else:
        msg = "Warn: outputwcl tarball (%s) does not exist." % outputtar
        print(msg)
        if dbh:
            Messaging.pfw_message(dbh, config['pfw_attempt_id'],
                                  config['task_id']['job'][jobnum],
                                  msg, pfwdefs.PFWDB_MSG_WARN)

    # collapse every non-success status to the generic failure code
    if retval != pfwdefs.PF_EXIT_SUCCESS:
        miscutils.fwdebug_print("Setting failure retval")
        retval = pfwdefs.PF_EXIT_FAILURE

    miscutils.fwdebug_print("Returning retval = %s" % retval)
    miscutils.fwdebug_print("jobpost done")
    debugfh.close()
    return int(retval)
def logpre(argv=None):
    """Log the 'pretask' event for a sub-block.

    Output is first captured in ``logpre.out`` in the current directory; once
    the config has been read the file is renamed to the block-level log name.
    When ``use_qcf`` is set in the config the log is reopened through
    ``Messaging.Messaging`` (reusing ``config.dbh`` when available),
    otherwise as a plain file in append mode.

    Args:
        argv: ``[program, configfile, block, subblocktype, subblock]``;
            defaults to ``sys.argv``.  The block given on the command line is
            not used: the block name is taken from the config.

    Returns:
        ``pfwdefs.PF_EXIT_FAILURE`` on a usage error, otherwise
        ``pfwdefs.PF_EXIT_SUCCESS``.
    """
    if argv is None:
        argv = sys.argv

    default_log = 'logpre.out'
    debugfh = open(default_log, 'w')
    outorig = sys.stdout
    errorig = sys.stderr
    sys.stdout = debugfh
    sys.stderr = debugfh

    # log the arguments actually in use, not unconditionally sys.argv
    print(' '.join(argv))  # command line for debugging

    if len(argv) < 5:
        print("Usage: logpre configfile block subblocktype subblock")
        debugfh.close()
        # put the real streams back so later output does not hit a closed file
        sys.stdout = outorig
        sys.stderr = errorig
        return pfwdefs.PF_EXIT_FAILURE

    configfile = argv[1]
    subblocktype = argv[3]
    subblock = argv[4]

    # read sysinfo file
    config = pfwconfig.PfwConfig({'wclfile': configfile})

    # now that have more information, can rename output file
    miscutils.fwdebug_print("getting new_log_name")
    blockname = config.getfull('blockname')
    blkdir = config.getfull('block_dir')
    new_log_name = config.get_filename('block', {pfwdefs.PF_CURRVALS: {'subblock': subblock,
                                                                       'flabel': '${subblock}_logpre',
                                                                       'fsuffix': 'out'}})
    new_log_name = f"{blkdir}/{new_log_name}"
    miscutils.fwdebug_print(f"new_log_name = {new_log_name}")

    debugfh.close()
    os.chmod(default_log, 0o666)
    os.rename(default_log, new_log_name)
    sys.stdout = outorig
    sys.stderr = errorig

    if 'use_qcf' in config and config['use_qcf']:
        if config.dbh is None:
            # no shared handle: the DB service/section are passed through the
            # environment before the Messaging object is created
            if 'submit_des_services' in config:
                os.environ['DES_SERVICES'] = config.getfull('submit_des_services')
            os.environ['DES_DB_SECTION'] = config.getfull('submit_des_db_section')
            debugfh = Messaging.Messaging(new_log_name, 'logpre.py',
                                          config['pfw_attempt_id'], mode='a+')
        else:
            debugfh = Messaging.Messaging(new_log_name, 'logpre.py',
                                          config['pfw_attempt_id'],
                                          dbh=config.dbh, mode='a+')
    else:
        debugfh = open(new_log_name, 'a+')

    sys.stdout = debugfh
    sys.stderr = debugfh

    log_pfw_event(config, blockname, subblock, subblocktype, ['pretask'])

    print("logpre done")
    debugfh.close()
    sys.stdout = outorig
    sys.stderr = errorig

    return pfwdefs.PF_EXIT_SUCCESS
def logpost(argv=None):
    """Log the 'posttask' event for a sub-block and return its exit status.

    Output is first captured in ``logpost.out`` in the current directory;
    once the config has been read the file is renamed to the block-level log
    name.  When ``use_qcf`` is set in the config the log is reopened through
    ``Messaging.Messaging`` (reusing ``config.dbh`` when available),
    otherwise as a plain file in append mode.

    Args:
        argv: ``[program, configfile, block, subblocktype, subblock,
            retval]``; defaults to ``sys.argv``.  ``retval`` is only read
            when exactly six items are given; otherwise the status defaults
            to ``PF_EXIT_FAILURE``.

    Returns:
        int: ``PF_EXIT_FAILURE`` on a usage error.  Otherwise ``retval``,
        except that for any sub-block other than 'begblock' every
        non-success value is collapsed to ``PF_EXIT_FAILURE``.
    """
    if argv is None:
        argv = sys.argv

    # open file to catch error messages about command line
    debugfh = open('logpost.out', 'w')
    outorig = sys.stdout
    errorig = sys.stderr
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv))  # print command line for debugging

    if len(argv) < 5:
        print("Usage: logpost configfile block subblocktype subblock retval")
        debugfh.close()
        # put the real streams back so later output does not hit a closed file
        sys.stdout = outorig
        sys.stderr = errorig
        return pfwdefs.PF_EXIT_FAILURE

    configfile = argv[1]
    blockname = argv[2]
    subblocktype = argv[3]
    subblock = argv[4]
    retval = pfwdefs.PF_EXIT_FAILURE
    if len(argv) == 6:
        # take the value from argv (not sys.argv) so callers passing their
        # own argument list are honoured
        retval = int(argv[5])

    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print(f"configfile = {configfile}")
        miscutils.fwdebug_print(f"block = {blockname}")
        miscutils.fwdebug_print(f"subblock = {subblock}")
        miscutils.fwdebug_print(f"retval = {retval}")

    # read sysinfo file
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("done reading config file")

    # now that have more information, rename output file
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("before get_filename")
    blockname = config.getfull('blockname')
    blkdir = config.getfull('block_dir')

    new_log_name = config.get_filename(
        'block', {
            pfwdefs.PF_CURRVALS: {
                'flabel': '${subblock}_logpost',
                'subblock': subblock,
                'fsuffix': 'out'
            }
        })
    new_log_name = f"{blkdir}/{new_log_name}"
    miscutils.fwdebug_print(f"new_log_name = {new_log_name}")

    debugfh.close()
    sys.stdout = outorig
    sys.stderr = errorig
    os.chmod('logpost.out', 0o666)
    os.rename('logpost.out', new_log_name)

    if 'use_qcf' in config and config['use_qcf']:
        if config.dbh is None:
            # no shared handle: the DB service/section are passed through the
            # environment before the Messaging object is created
            if 'submit_des_services' in config:
                os.environ['DES_SERVICES'] = config.getfull(
                    'submit_des_services')
            os.environ['DES_DB_SECTION'] = config.getfull(
                'submit_des_db_section')
            debugfh = Messaging.Messaging(new_log_name,
                                          'logpost.py',
                                          config['pfw_attempt_id'],
                                          mode='a+')
        else:
            debugfh = Messaging.Messaging(new_log_name,
                                          'logpost.py',
                                          config['pfw_attempt_id'],
                                          dbh=config.dbh,
                                          mode='a+')
    else:
        debugfh = open(new_log_name, 'a+')

    sys.stdout = debugfh
    sys.stderr = debugfh

    log_pfw_event(config, blockname, subblock, subblocktype,
                  ['posttask', retval])

    # In order to continue, make pipelines dagman jobs exit with success status
    #if 'pipelinesmngr' not in subblock:
    #    retval = pfwdefs.PF_EXIT_SUCCESS

    ## If error at non-manager level, send failure email
    #if retval != pfwdefs.PF_EXIT_SUCCESS and \
    #    'mngr' not in subblock:
    #    send_subblock_email(config, blockname, subblock, retval)

    # begblock keeps its specific status; everything else is collapsed
    if subblock != 'begblock' and retval != pfwdefs.PF_EXIT_SUCCESS:
        miscutils.fwdebug_print("Setting failure retval")
        retval = pfwdefs.PF_EXIT_FAILURE

    miscutils.fwdebug_print(f"returning retval = {retval}")
    miscutils.fwdebug_print("logpost done")
    debugfh.close()
    sys.stdout = outorig
    sys.stderr = errorig
    miscutils.fwdebug_print(f"Exiting with = {retval}")
    return int(retval)
def jobpre(argv=None):
    """Program entry point: record the pre-task step of a job.

    stdout/stderr are redirected to a temporary ``jobpre_*`` file in the
    current directory, which is renamed to the job-specific log name once the
    config has been read. When ``pfwdefs.PF_USE_DB_OUT`` is true in the config
    the job's condor/target submit times are updated in the database.

    Args:
        argv: ``[prog, configfile, jobnum]``. Defaults to ``sys.argv``.

    Returns:
        ``pfwdefs.PF_EXIT_FAILURE`` on a usage error, otherwise
        ``pfwdefs.PF_EXIT_SUCCESS``.
    """
    if argv is None:
        argv = sys.argv

    # remember the real streams so they can be put back on normal return
    outorig = sys.stdout
    errorig = sys.stderr

    debugfh = tempfile.NamedTemporaryFile(mode='w+', prefix='jobpre_',
                                          dir='.', delete=False)
    tmpfn = debugfh.name
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv))  # command line for debugging
    print(os.getcwd())

    if len(argv) < 3:
        print('Usage: jobpre configfile jobnum')
        debugfh.close()
        sys.stdout = outorig
        sys.stderr = errorig
        return pfwdefs.PF_EXIT_FAILURE

    # use the argv that was passed in, not the process-wide sys.argv
    configfile = argv[1]
    jobnum = argv[2]  # could also be uberctrl

    # read wcl file
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    blockname = config.getfull('blockname')
    # NOTE(review): the sibling entry points read block_dir with
    # config.getfull(); confirm that plain get() is intended here.
    blkdir = config.get('block_dir')
    tjpad = pfwutils.pad_jobnum(jobnum)

    # now that have more information, can rename output file
    miscutils.fwdebug_print("getting new_log_name")
    new_log_name = config.get_filename(
        'job',
        {
            pfwdefs.PF_CURRVALS: {
                pfwdefs.PF_JOBNUM: jobnum,
                'flabel': 'jobpre',
                'fsuffix': 'out',
            }
        })
    new_log_name = f"{blkdir}/{tjpad}/{new_log_name}"
    miscutils.fwdebug_print(f"new_log_name = {new_log_name}")

    debugfh.close()
    os.chmod(tmpfn, 0o666)
    os.rename(tmpfn, new_log_name)
    debugfh = open(new_log_name, 'a+')
    sys.stdout = debugfh
    sys.stderr = debugfh

    if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
        dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                          config.getfull('submit_des_db_section'))
        ctstr = dbh.get_current_timestamp_str()
        dbh.update_job_info(config, tjpad, {
            'condor_submit_time': ctstr,
            'target_submit_time': ctstr,
        })

    log_pfw_event(config, blockname, tjpad, 'j', ['pretask'])

    miscutils.fwdebug_print("jobpre done")
    debugfh.close()
    sys.stdout = outorig
    sys.stderr = errorig
    return pfwdefs.PF_EXIT_SUCCESS
def blockpost(argv=None):
    """Program entry point: wrap up a block after its jobs have run.

    Steps, in order:
      1. Redirect stdout/stderr to ``blockpost.out``, read the config and
         rename the log to its block-specific name inside the block directory.
      2. If DB output is enabled: optionally verify archive file sizes, then
         collect block-task, job and wrapper status from the DB into a text
         summary (``msg2``) used as the e-mail body.
      3. Send a dryrun/failure/success e-mail depending on ``when_to_email``.
      4. Record the end of the block task (and of the attempt on failure) in
         the DB, and on success increment the block number in the config file.

    Args:
        argv: ``[prog, configfile, retval]``. Defaults to ``sys.argv``.

    Returns:
        int: ``pfwdefs.PF_EXIT_FAILURE`` on a usage error, otherwise the final
        status of the block (incoming retval adjusted by the DB checks, plus
        the file-verification status; ``pfwdefs.PF_EXIT_DRYRUN`` for dryruns).
    """
    realstdout = sys.stdout
    realstderr = sys.stderr
    if argv is None:
        argv = sys.argv

    # open file to catch error messages about command line
    debugfh = open('blockpost.out', 'w')
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv))  # print command line for debugging

    print(f"running on {socket.gethostname()}")

    if len(argv) != 3:
        print('Usage: blockpost.py configfile retval')
        debugfh.close()
        # do not leave the process streams bound to the closed file
        sys.stdout = realstdout
        sys.stderr = realstderr
        return pfwdefs.PF_EXIT_FAILURE

    configfile = argv[1]
    retval = int(argv[2])

    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print(f"configfile = {configfile}")
        miscutils.fwdebug_print(f"retval = {retval}")

    # read sysinfo file
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("done reading config file")
    blockname = config.getfull('blockname')
    blkdir = config.getfull('block_dir')

    # now that have more information, can rename output file
    miscutils.fwdebug_print("getting new_log_name")
    new_log_name = config.get_filename('block',
                                       {pfwdefs.PF_CURRVALS: {'flabel': 'blockpost',
                                                              'fsuffix': 'out'}})
    new_log_name = f"{blkdir}/{new_log_name}"
    miscutils.fwdebug_print(f"new_log_name = {new_log_name}")

    debugfh.close()
    os.chmod('blockpost.out', 0o666)
    os.rename('blockpost.out', new_log_name)
    debugfh = open(new_log_name, 'a+')
    sys.stdout = debugfh
    sys.stderr = debugfh

    os.chdir(blkdir)

    log_pfw_event(config, blockname, 'blockpost', 'j', ['posttask', retval])

    dryrun = config.getfull(pfwdefs.PF_DRYRUN)
    run = config.getfull('run')
    attid = config['pfw_attempt_id']
    blknum = int(config.getfull(pfwdefs.PF_BLKNUM))
    blktid = None

    msg2 = ""
    dbh = None
    qdbh = None
    job_byblk = {}
    wrap_byjob = {}
    wrapinfo = {}
    jobinfo = {}
    jobwrap = {}   # bound up front so every later code path can read it
    failedwraps = {}
    whyfailwraps = {}   # mod failures for other modname, shouldn't happen
    usedb = miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT))
    verify_files = miscutils.convertBool(config.getfull('verify_files'))
    verify_status = 0
    sem = None
    if verify_files and not usedb:
        print('Skipping file verification due to lack of database connection')
    if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)):
        # Any failure while gathering status is reported in the e-mail and
        # turns the block into a failure instead of crashing blockpost.
        try:
            miscutils.fwdebug_print("Connecting to DB")
            if config.dbh is None:
                dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                                  config.getfull('submit_des_db_section'))
            else:
                dbh = config.dbh
            if verify_files:
                curs = dbh.cursor()
                curs.execute(f"select root from ops_archive where name='{config.getfull('home_archive')}'")
                rows = curs.fetchall()
                if rows is None or len(rows) != 1:
                    # guard len() so the None case reports cleanly
                    nrows = 0 if rows is None else len(rows)
                    raise Exception(f"Invalid archive name ({config.getfull('home_archive')}). Found {nrows} rows in ops_archive")
                root = rows[0][0]
                if not os.path.isdir(root):
                    print(f"Cannot read archive root directory:{config.getfull('home_archive')} This program must be run on an NCSA machine with access to the archive storage system.")
                sem = dbsem.DBSemaphore('verify_files_10', None,
                                        config.getfull('submit_des_services'),
                                        config.getfull('submit_des_db_section'),
                                        connection=dbh)
                print("\n\nVerifying archive file sizes on disk (0 is success)")
                verify_status = cu.compare(dbh=dbh,
                                           archive=config.getfull('home_archive'),
                                           pfwid=attid, md5sum=False, debug=False,
                                           script=False, verbose=False, silent=True)
                if sem is not None:
                    del sem
                    sem = None
                print(f" Verification of files returned status {verify_status:d}")
                if verify_status != 0:
                    print(" This indicates that one or more files do not have the correct file size (based on DB entries). Run")
                    print(f"\n compare_db.py --des_services {config.getfull('submit_des_services')} --section {config.getfull('submit_des_db_section')} --archive {config.getfull('home_archive')} --pfwid {int(attid):d} --verbose")
                    print("\n to see the details.")

            if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_QCF)):
                import qcframework.qcfdb as qcfdb
                qdbh = qcfdb.QCFDB(connection=dbh)

            print(f"\n\nChecking non-job block task status from task table in DB ({pfwdefs.PF_EXIT_SUCCESS} is success)")
            num_bltasks_failed = 0
            bltasks = {}
            blktid = None
            if ('block' in config['task_id'] and
                    str(blknum) in config['task_id']['block']):
                blktid = int(config['task_id']['block'][str(blknum)])
                miscutils.fwdebug_print("Getting block task info from DB")
                start_time = time.time()
                bltasks = dbh.get_block_task_info(blktid)
                end_time = time.time()
                miscutils.fwdebug_print(f"Done getting block task info from DB ({end_time - start_time} secs)")
                for bltdict in bltasks.values():
                    print("Block status = ", bltdict['status'])
                    if bltdict['status'] == pfwdefs.PF_EXIT_DRYRUN:
                        print("setting return value to dryrun")
                        retval = bltdict['status']
                    elif bltdict['status'] != pfwdefs.PF_EXIT_SUCCESS:
                        num_bltasks_failed += 1
                        msg2 += f"\t{bltdict['name']}"
                        if bltdict['label'] is not None:
                            msg2 += f" - {bltdict['label']}"
                        msg2 += " failed\n"

                        if bltdict['name'] == 'begblock':
                            # try to read the begblock.out and begblock.err files
                            print("Trying to get begblock.out and begblock.err")
                            msg2 += get_subblock_output("begblock")

                            # try to get QCF messages (especially from query codes)
                            begblock_tid = int(config['task_id']['begblock'])
                            sql = f"select id from task where parent_task_id={begblock_tid:d} and status!=0"
                            curs = dbh.cursor()
                            curs.execute(sql)
                            res = curs.fetchall()
                            msg2 += "\n===== QCF Messages =====\n"
                            msg2 += "\n begblock\n"
                            wrapids = [blktid, begblock_tid]
                            for r in res:
                                wrapids.append(r[0])

                            wrapmsg = {}
                            if qdbh is not None:
                                miscutils.fwdebug_print("Querying QCF messages")
                                start_time = time.time()
                                wrapmsg = qdbh.get_qcf_messages_for_wrappers(wrapids)
                                end_time = time.time()
                                miscutils.fwdebug_print(f"Done querying QCF messages ({end_time-start_time} secs)")
                            miscutils.fwdebug_print(f"wrapmsg = {wrapmsg}")
                            if not wrapmsg:
                                msg2 += " No QCF messages\n"
                            else:
                                for msgs in wrapmsg.values():
                                    for m in msgs:
                                        msg2 += " " + m['message'] + "\n"

                        retval = pfwdefs.PF_EXIT_FAILURE

                if retval != pfwdefs.PF_EXIT_DRYRUN:
                    print(f"\n\nChecking job status from pfw_job table in DB ({pfwdefs.PF_EXIT_SUCCESS} is success)")

                    miscutils.fwdebug_print("Getting job info from DB")
                    start_time = time.time()
                    jobinfo = dbh.get_job_info({'pfw_block_task_id': blktid})
                    end_time = time.time()
                    miscutils.fwdebug_print(f"Done getting job info from DB ({end_time - start_time} secs)")

                    miscutils.fwdebug_print("Getting wrapper info from DB")
                    start_time = time.time()
                    wrapinfo = dbh.get_wrapper_info(pfw_attempt_id=attid,
                                                    pfw_block_task_id=blktid)
                    # job-wrapper info is only fetched when something failed
                    if retval != pfwdefs.PF_EXIT_SUCCESS:
                        jobwrap = dbh.get_jobwrapper_info(id=attid)
                    else:
                        jobwrap = {}
                    end_time = time.time()
                    miscutils.fwdebug_print(f"Done getting wrapper info from DB ({end_time - start_time} secs)")
            else:
                msg = f"Could not find task id for block {blockname} in config.des"
                print("Error:", msg)
                if 'attempt' in config['task_id']:
                    miscutils.fwdebug_print("Saving pfw message")
                    start_time = time.time()
                    Messaging.pfw_message(dbh, attid, config['task_id']['attempt'],
                                          msg, pfwdefs.PFWDB_MSG_INFO,
                                          'blockpost.out', 0)
                    end_time = time.time()
                    miscutils.fwdebug_print(f"Done saving pfw message ({end_time - start_time} secs)")
                print("all the task ids:", config['task_id'])

            archive = None
            if pfwdefs.HOME_ARCHIVE in config:
                archive = config.getfull(pfwdefs.HOME_ARCHIVE)
            logfullnames = dbh.get_log_fullnames(attid, archive)
            #dbh.close()
            print("len(jobinfo) = ", len(jobinfo))
            print("len(wrapinfo) = ", len(wrapinfo))
            job_byblk = pfwutils.index_job_info(jobinfo)
            print("blktid: ", blktid)
            print("job_byblk:", job_byblk)

            if blktid not in job_byblk:
                print(f"Warn: could not find jobs for block {blknum}")
                print(" This is ok if attempt died before jobs ran")
                print(" block task_ids in job_byblk:", list(job_byblk.keys()))
            else:
                wrap_byjob, _ = pfwutils.index_wrapper_info(wrapinfo)
                # in case the post wrapper stuff failed, internally mark the task
                # as failed to retrieve the info later
                for wrapb in wrap_byjob.values():
                    for wrapper in wrapb.values():
                        if wrapper['parent_task_id'] in jobwrap and jobwrap[wrapper['parent_task_id']]['status'] is not None \
                                and wrapper['status'] is not None and jobwrap[wrapper['parent_task_id']]['status'] > wrapper['status']:
                            wrapper['status'] = jobwrap[wrapper['parent_task_id']]['status']

                for jobtid, jobdict in sorted(job_byblk[blktid].items()):
                    failedwraps[jobtid] = []
                    whyfailwraps[jobtid] = []

                    jobkeys = ""

                    # don't print out successful wrappers
                    if jobtid in wrap_byjob and jobdict['status'] == pfwdefs.PF_EXIT_SUCCESS:
                        continue

                    if jobdict['jobkeys'] is not None:
                        jobkeys = jobdict['jobkeys']
                    submit_job_path = f"{config.getfull('work_dir')}/B{int(config.getfull('blknum')):02d}-{config.getfull('blockname'):s}/{int(jobdict['jobnum']):04d}"
                    msg2 += f"\n\t{pfwutils.pad_jobnum(jobdict['jobnum'])} ({jobkeys}) "

                    if jobtid not in wrap_byjob:
                        msg2 += "\tNo wrapper instances"
                    else:
                        # the module of the highest-numbered wrapper is taken
                        # as the module the job was running when it stopped
                        maxwrap = max(wrap_byjob[jobtid])
                        modname = wrap_byjob[jobtid][maxwrap]['modname']
                        msg2 += f"{len(wrap_byjob[jobtid]):d}/{jobdict['expect_num_wrap']} {modname}"

                        # determine wrappers for this job without success exit
                        for wrapnum, wdict in wrap_byjob[jobtid].items():
                            if wdict['status'] is None or wdict['status'] != pfwdefs.PF_EXIT_SUCCESS:
                                if wdict['modname'] == modname:
                                    failedwraps[jobtid].append(wrapnum)
                                else:
                                    whyfailwraps[jobtid].append(wrapnum)

                    if jobdict['status'] == pfwdefs.PF_EXIT_EUPS_FAILURE:
                        msg2 += " - FAIL - EUPS setup failure"
                        retval = jobdict['status']
                    elif jobdict['status'] == pfwdefs.PF_EXIT_CONDOR:
                        msg2 += " - FAIL - Condor/Globus failure"
                        retval = jobdict['status']
                    elif jobdict['status'] is None:
                        msg2 += " - FAIL - NULL status"
                        retval = pfwdefs.PF_EXIT_FAILURE
                    elif jobdict['status'] != pfwdefs.PF_EXIT_SUCCESS:
                        msg2 += " - FAIL - Non-zero status"
                        retval = jobdict['status']

                    if jobdict['status'] != pfwdefs.PF_EXIT_SUCCESS:
                        msg2 += f"\n\t\t{submit_job_path}/runjob.out "

                    msg2 += '\n'

                    # print pfw_messages
                    if 'message' in jobdict:
                        print('\nmessages: ', jobdict['message'])
                        for msgdict in sorted(jobdict['message'], key=lambda k: k['message_time']):
                            level = int(msgdict['message_lvl'])
                            levelstr = 'info'
                            if level == pfwdefs.PFWDB_MSG_WARN:
                                levelstr = 'WARN'
                            elif level == pfwdefs.PFWDB_MSG_ERROR:
                                levelstr = 'ERROR'

                            msg2 += "\t\t{} - {}\n".format(levelstr, msgdict['message'].replace('\n', '\n\t\t\t'))

                    if jobtid in wrap_byjob:
                        # print log file name for failed/unfinished wrappers
                        for wrapnum in failedwraps[jobtid]:
                            wrapdict = wrap_byjob[jobtid][wrapnum]
                            if wrapdict['log'] in logfullnames:
                                msg2 += f"\t\t{wrapnum} - {logfullnames[wrapdict['log']]}\n"
                            else:
                                msg2 += f"\t\t{wrapnum} - Could not find log in archive {wrapdict['log']})\n"
                            wrapmsg = get_qcf_messages(qdbh, [wrapdict['task_id']])
                            msg2 += print_qcf_messages(wrapdict, wrapmsg)

                        msg2 += '\n'

                        # If weirdness happened in run, print a message
                        if whyfailwraps[jobtid]:
                            msg2 += "\n*** Contact framework developers. Wrappers ran after at least 1 wrapper from a previous module that doesn't have success status.\n"
                            # wrapper numbers may not be strings; str.join
                            # only accepts strings
                            msg2 += f"\t{','.join(str(wnum) for wnum in whyfailwraps[jobtid])}\n"

        except Exception as exc:
            if sem is not None:
                del sem
            msg2 += "\n\nEncountered error trying to gather status information for email."
            msg2 += "\nCheck output for blockpost for further details."
            print("\n\nEncountered error trying to gather status information for email")
            print(f"{exc.__class__.__name__}: {str(exc)}")
            (extype, exvalue, trback) = sys.exc_info()
            traceback.print_exception(extype, exvalue, trback, file=sys.stdout)
            retval = pfwdefs.PF_EXIT_FAILURE

    # a non-zero verification status makes the block fail
    retval = int(retval) + verify_status
    print("before email retval =", retval)

    when_to_email = 'run'
    if 'when_to_email' in config:
        when_to_email = config.getfull('when_to_email').lower()

    if miscutils.convertBool(dryrun):
        if when_to_email != 'never':
            print("dryrun = ", dryrun)
            print("Sending dryrun email")
            if retval == pfwdefs.PF_EXIT_DRYRUN:
                msg1 = f"{run}: In dryrun mode, block {blockname} has finished successfully."
            else:
                msg1 = f"{run}: In dryrun mode, block {blockname} has failed."

            send_email(config, blockname, retval, "", msg1, msg2)
        else:
            print("Not sending dryrun email")
            print("retval = ", retval)
        retval = pfwdefs.PF_EXIT_DRYRUN
    elif retval:
        if when_to_email != 'never':
            print("Sending block failed email\n")
            msg1 = f"{run}: block {blockname} has failed."
            send_email(config, blockname, retval, "", msg1, msg2)
        else:
            print("Not sending failed email")
            print("retval = ", retval)
    elif retval == pfwdefs.PF_EXIT_SUCCESS:
        if when_to_email == 'block':
            msg1 = f"{run}: block {blockname} has finished successfully."
            msg2 = ""
            print("Sending success email\n")
            send_email(config, blockname, retval, "", msg1, msg2)
        elif when_to_email == 'run':
            numblocks = len(miscutils.fwsplit(config[pfwdefs.SW_BLOCKLIST], ','))
            if int(config[pfwdefs.PF_BLKNUM]) == numblocks:
                msg1 = f"{run}: run has finished successfully."
                msg2 = ""
                print("Sending success email\n")
                send_email(config, blockname, retval, "", msg1, msg2)
            else:
                print("Not sending run email because not last block")
                print("retval = ", retval)
        else:
            print("Not sending success email")
            print("retval = ", retval)
    else:
        print("Not sending email")
        print("retval = ", retval)

    # Store values in DB and hist file
    if miscutils.convertBool(config[pfwdefs.PF_USE_DB_OUT]):
        if dbh is None:
            dbh = pfwdb.PFWDB(config.getfull('submit_des_services'),
                              config.getfull('submit_des_db_section'))
        if blktid is not None:
            print("Updating end of block task", blktid)
            dbh.end_task(blktid, retval, True)
        else:
            print("Could not update end of block task without block task id")
        if retval != pfwdefs.PF_EXIT_SUCCESS:
            print("Updating end of attempt", config['task_id']['attempt'])
            dbh.end_task(config['task_id']['attempt'], retval, True)
        dbh.commit()
        #dbh.close()

    print("before next block retval = ", retval)
    if retval == pfwdefs.PF_EXIT_SUCCESS:
        # Get ready for next block
        config.inc_blknum()
        with open(configfile, 'w') as cfgfh:
            config.write(cfgfh)
        print("new blknum = ", config[pfwdefs.PF_BLKNUM])
        print("number of blocks = ", len(miscutils.fwsplit(config[pfwdefs.SW_BLOCKLIST], ',')))
        # block number past the end of the block list: mark the attempt PASS
        if int(config[pfwdefs.PF_BLKNUM]) > len(miscutils.fwsplit(config[pfwdefs.SW_BLOCKLIST], ',')) and miscutils.convertBool(config[pfwdefs.PF_USE_DB_OUT]):
            updatevals = {'PROCESSING_STATE': 'PASS'}
            wherevals = {'PFW_ATTEMPT_ID': attid}
            dbh.basic_update_row('ATTEMPT_STATE', updatevals, wherevals)
            dbh.commit()
    elif miscutils.convertBool(config[pfwdefs.PF_USE_DB_OUT]):
        updatevals = {'PROCESSING_STATE': 'FAIL'}
        wherevals = {'PFW_ATTEMPT_ID': attid}
        dbh.basic_update_row('ATTEMPT_STATE', updatevals, wherevals)
        dbh.commit()

    if dbh is not None:
        dbh.close()
    miscutils.fwdebug_print(f"Returning retval = {retval} ({type(retval)})")
    miscutils.fwdebug_print("END")
    debugfh.close()
    # Put the real streams back before any further debug output so nothing is
    # written to the log file that was just closed.
    sys.stdout = realstdout
    sys.stderr = realstderr
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print(f"Exiting with = {retval}")
        miscutils.fwdebug_print(f"type of exitcode = {type(retval)}")
    return int(retval)
def blockpre(argv=None):
    """Program entry point: prepare a block before its jobs run.

    Redirects stdout/stderr to ``blockpre.out``, sets the block-dependent
    values in the config and writes the config file back, renames the log to
    its block-specific name, changes into the block directory, writes the
    block condor description and logs the ``pretask`` event.

    Args:
        argv: ``[prog, configfile]`` (one extra trailing argument is
            tolerated). Defaults to ``sys.argv``.

    Returns:
        ``pfwdefs.PF_EXIT_FAILURE`` on a usage error, otherwise
        ``pfwdefs.PF_EXIT_SUCCESS``.
    """
    if argv is None:
        argv = sys.argv

    # remember the real streams so they can be put back on normal return
    outorig = sys.stdout
    errorig = sys.stderr

    default_log = 'blockpre.out'
    debugfh = open(default_log, 'w')
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv))  # command line for debugging

    if len(argv) < 2 or len(argv) > 3:
        print('Usage: blockpre configfile')
        debugfh.close()
        sys.stdout = outorig
        sys.stderr = errorig
        return pfwdefs.PF_EXIT_FAILURE

    # use the argv that was passed in, not the process-wide sys.argv
    configfile = argv[1]

    # read sysinfo file
    config = pfwconfig.PfwConfig({'wclfile': configfile})

    # make sure values which depend upon block are set correctly
    config.set_block_info()
    miscutils.fwdebug_print(f"blknum = {config[pfwdefs.PF_BLKNUM]}")
    with open(configfile, 'w') as cfgfh:
        config.write(cfgfh)

    blockname = config.getfull('blockname')
    miscutils.fwdebug_print(f"blockname = {blockname}")
    blkdir = config.getfull('block_dir')

    # now that have more information, can rename output file
    miscutils.fwdebug_print("getting new_log_name")
    new_log_name = config.get_filename(
        'block',
        {pfwdefs.PF_CURRVALS: {
            'flabel': 'blockpre',
            'fsuffix': 'out',
        }})
    new_log_name = f"{blkdir}/{new_log_name}"
    miscutils.fwdebug_print(f"new_log_name = {new_log_name}")

    debugfh.close()
    os.chmod(default_log, 0o666)
    os.rename(default_log, new_log_name)
    debugfh = open(new_log_name, 'a+')
    sys.stdout = debugfh
    sys.stderr = debugfh

    os.chdir(blkdir)

    write_block_condor(config)

    log_pfw_event(config, blockname, 'blockpre', 'j', ['pretask'])

    miscutils.fwdebug_print("blockpre done")
    debugfh.close()
    sys.stdout = outorig
    sys.stderr = errorig
    return pfwdefs.PF_EXIT_SUCCESS
def logpost(argv=None):
    """Program entry point: log the end of a sub-block task.

    While running, stdout/stderr are redirected to ``logpost.out`` in the
    current directory; once the config has been read that file is renamed to
    the block-specific log name and re-opened in append mode.

    Args:
        argv: ``[prog, configfile, block, subblocktype, subblock, retval]``.
            ``retval`` is optional; without it the task is treated as failed.
            Defaults to ``sys.argv``.

    Returns:
        int: ``pfwdefs.PF_EXIT_FAILURE`` on a usage error; otherwise the given
        retval, where any non-success retval of a sub-block other than
        ``begblock`` is normalised to ``pfwdefs.PF_EXIT_FAILURE``.
    """
    if argv is None:
        argv = sys.argv

    # remember the real streams so they can be put back on normal return
    outorig = sys.stdout
    errorig = sys.stderr

    # open file to catch error messages about command line
    debugfh = open('logpost.out', 'w')
    sys.stdout = debugfh
    sys.stderr = debugfh

    print(' '.join(argv))  # print command line for debugging

    if len(argv) < 5:
        print('Usage: logpost configfile block subblocktype subblock retval')
        debugfh.close()
        sys.stdout = outorig
        sys.stderr = errorig
        return pfwdefs.PF_EXIT_FAILURE

    configfile = argv[1]
    blockname = argv[2]
    subblocktype = argv[3]
    subblock = argv[4]
    retval = pfwdefs.PF_EXIT_FAILURE
    if len(argv) == 6:
        # read from the argv we were given, not the process-wide sys.argv
        retval = int(argv[5])

    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print(f"configfile = {configfile}")
        miscutils.fwdebug_print(f"block = {blockname}")
        miscutils.fwdebug_print(f"subblock = {subblock}")
        miscutils.fwdebug_print(f"retval = {retval}")

    # read sysinfo file
    config = pfwconfig.PfwConfig({'wclfile': configfile})
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("done reading config file")

    # now that have more information, rename output file
    if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'):
        miscutils.fwdebug_print("before get_filename")
    # the block name from the config replaces the one given on the command line
    blockname = config.getfull('blockname')
    blkdir = config.getfull('block_dir')

    new_log_name = config.get_filename(
        'block',
        {
            pfwdefs.PF_CURRVALS: {
                'flabel': '${subblock}_logpost',
                'subblock': subblock,
                'fsuffix': 'out',
            }
        })
    new_log_name = f"{blkdir}/{new_log_name}"
    miscutils.fwdebug_print(f"new_log_name = {new_log_name}")

    debugfh.close()
    os.chmod('logpost.out', 0o666)
    os.rename('logpost.out', new_log_name)
    debugfh = open(new_log_name, 'a+')
    sys.stdout = debugfh
    sys.stderr = debugfh

    log_pfw_event(config, blockname, subblock, subblocktype, ['posttask', retval])

    # In order to continue, make pipelines dagman jobs exit with success status
    #if 'pipelinesmngr' not in subblock:
    #    retval = pfwdefs.PF_EXIT_SUCCESS

    #    # If error at non-manager level, send failure email
    #    if retval != pfwdefs.PF_EXIT_SUCCESS and \
    #        'mngr' not in subblock:
    #        send_subblock_email(config, blockname, subblock, retval)

    if subblock != 'begblock' and retval != pfwdefs.PF_EXIT_SUCCESS:
        miscutils.fwdebug_print("Setting failure retval")
        retval = pfwdefs.PF_EXIT_FAILURE

    miscutils.fwdebug_print(f"returning retval = {retval}")
    miscutils.fwdebug_print("logpost done")
    debugfh.close()
    sys.stdout = outorig
    sys.stderr = errorig
    return int(retval)