def pfw_dynam_load_class(pfw_dbh, wcl, parent_tid, attempt_task_id, label, classname, extra_info): """ Dynamically load a class save timing info in task table """ #task_id = -1 #if pfw_dbh is not None: # task_id = pfw_dbh.create_task(name='dynclass', # info_table=None, # parent_task_id=parent_tid, # root_task_id=attempt_task_id, # label=label, # do_begin=True, # do_commit=True) the_class_obj = None try: the_class = miscutils.dynamically_load_class(classname) valdict = {} try: valdict = miscutils.get_config_vals(extra_info, wcl, the_class.requested_config_vals()) except AttributeError: # in case the_class doesn't have requested_config_vals pass the_class_obj = the_class(valdict, wcl) except: (extype, exvalue, _) = sys.exc_info() msg = f"Error: creating {label} object - {extype} - {exvalue}" print(f"\n{msg}") if pfw_dbh is not None: Messaging.pfw_message(pfw_dbh, wcl['pfw_attempt_id'], parent_tid, msg, pfwdefs.PFWDB_MSG_ERROR) raise #if pfw_dbh is not None: # pfw_dbh.end_task(task_id, pfwdefs.PF_EXIT_SUCCESS, True) return the_class_obj
def begrun(argv): """TODO: short summary. Performs steps executed on submit machine at beginning of processing attempt. """ pfw_dbh = None try: configfile = argv[0] config = pfwconfig.PfwConfig({'wclfile': configfile}) if miscutils.fwdebug_check(6, 'BEGRUN_DEBUG'): miscutils.fwdebug_print( 'use_home_archive_output = %s' % config.getfull(pfwdefs.USE_HOME_ARCHIVE_OUTPUT)) if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)): import processingfw.pfwdb as pfwdb pfw_dbh = pfwdb.PFWDB(config.getfull('submit_des_services'), config.getfull('submit_des_db_section')) pfw_dbh.begin_task(config['task_id']['attempt'], True) # the three wcl files to copy to the home archive origwcl = config['origwcl'] expwcl = config['expwcl'] fullwcl = config['fullwcl'] # if not a dryrun and using a home archive for output if (config.getfull(pfwdefs.USE_HOME_ARCHIVE_OUTPUT) != 'never' and 'submit_files_mvmt' in config and (pfwdefs.PF_DRYRUN not in config or not miscutils.convertBool(config.getfull(pfwdefs.PF_DRYRUN)))): # get home archive info home_archive = config.getfull('home_archive') archive_info = config[pfwdefs.SW_ARCHIVESECT][home_archive] # load filemgmt class attempt_tid = config['task_id']['attempt'] filemgmt = pfwutils.pfw_dynam_load_class(pfw_dbh, config, attempt_tid, attempt_tid, "filemgmt", archive_info['filemgmt'], archive_info) # save file information filemgmt.register_file_data('wcl', [origwcl, expwcl, fullwcl], config['pfw_attempt_id'], attempt_tid, False, None, None) copy_files_home(config, archive_info, filemgmt) filemgmt.commit() if pfw_dbh is not None: print("Saving attempt's archive path into PFW tables...", end=' ') pfw_dbh.update_attempt_archive_path(config) pfw_dbh.commit() except Exception as exc: msg = "begrun: %s: %s" % (exc.__class__.__name__, str(exc)) if pfw_dbh is not None: Messaging.pfw_message(pfw_dbh, config['pfw_attempt_id'], config['task_id']['attempt'], msg, pfw_utils.PFW_DB_WARN, 'begrun.out', 0) send_failed_email(config, msg) raise except SystemExit as exc: msg = "begrun: SysExit=%s" % str(exc) if pfw_dbh is not None: Messaging.pfw_message(pfw_dbh, config['pfw_attempt_id'], config['task_id']['attempt'], msg, pfw_utils.PFW_DB_WARN, 'begrun.out', 0) send_failed_email(config, msg) raise
def blockpost(argv=None): """Program entry point. """ if argv is None: argv = sys.argv # open file to catch error messages about command line debugfh = open('blockpost.out', 'w') sys.stdout = debugfh sys.stderr = debugfh print(' '.join(argv)) # print command line for debugging print("running on %s" % (socket.gethostname())) if len(argv) != 3: print('Usage: blockpost.py configfile retval') debugfh.close() return pfwdefs.PF_EXIT_FAILURE configfile = argv[1] retval = int(argv[2]) if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'): miscutils.fwdebug_print("configfile = %s" % configfile) miscutils.fwdebug_print("retval = %s" % retval) # read sysinfo file config = pfwconfig.PfwConfig({'wclfile': configfile}) if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'): miscutils.fwdebug_print("done reading config file") blockname = config.getfull('blockname') blkdir = config.getfull('block_dir') # now that have more information, can rename output file miscutils.fwdebug_print("getting new_log_name") new_log_name = config.get_filename( 'block', {pfwdefs.PF_CURRVALS: { 'flabel': 'blockpost', 'fsuffix': 'out' }}) new_log_name = "%s/%s" % (blkdir, new_log_name) miscutils.fwdebug_print("new_log_name = %s" % new_log_name) debugfh.close() os.chmod('blockpost.out', 0o666) os.rename('blockpost.out', new_log_name) debugfh = open(new_log_name, 'a+') sys.stdout = debugfh sys.stderr = debugfh os.chdir(blkdir) log_pfw_event(config, blockname, 'blockpost', 'j', ['posttask', retval]) dryrun = config.getfull(pfwdefs.PF_DRYRUN) run = config.getfull('run') attid = config['pfw_attempt_id'] reqnum = config.getfull(pfwdefs.REQNUM) unitname = config.getfull(pfwdefs.UNITNAME) attnum = config.getfull(pfwdefs.ATTNUM) blknum = int(config.getfull(pfwdefs.PF_BLKNUM)) blktid = None msg2 = "" dbh = None job_byblk = {} wrap_byjob = {} wrap_bymod = {} wrapinfo = {} jobinfo = {} failedwraps = {} whyfailwraps = {} # mod failures for other modname, shouldn't happen usedb = miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)) verify_files = miscutils.convertBool(config.getfull('verify_files')) verify_status = 0 if verify_files and not usedb: print('Skipping file verification due to lack of database connection') if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)): sem = None try: miscutils.fwdebug_print("Connecting to DB") dbh = pfwdb.PFWDB(config.getfull('submit_des_services'), config.getfull('submit_des_db_section')) if verify_files: curs = dbh.cursor() curs.execute("select root from ops_archive where name='%s'" % (config.getfull('home_archive'))) rows = curs.fetchall() if rows is None or len(rows) != 1: raise Exception( "Invalid archive name (%s). Found %s rows in ops_archive" % (config.getfull('home_archive'), len(rows))) root = rows[0][0] if not os.path.isdir(root): print( "Cannot read archive root directory:%s This program must be run on an NCSA machine with access to the archive storage system." % (config.getfull('home_archive'))) sem = dbsem.DBSemaphore( 'verify_files_10', None, config.getfull('submit_des_services'), config.getfull('submit_des_db_section')) print( "\n\nVerifying archive file sizes on disk (0 is success)") verify_status = cu.compare( dbh=dbh, archive=config.getfull('home_archive'), pfwid=attid, filesize=True, md5sum=False, quick=True, debug=False, script=False, verbose=False, silent=True) if sem is not None: del sem print(" Verification of files returned status %i" % (verify_status)) if verify_status != 0: print( " This indicates that one or more files do not have the correct file size (based on DB entries). Run" ) print( "\n compare_db.py --des_services %s --section %s --archive %s --pfwid %i --filesize --verbose" % (config.getfull('submit_des_services'), config.getfull('submit_des_db_section'), config.getfull('home_archive'), int(attid))) print("\n to see the details.") if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_QCF)): import qcframework.qcfdb as qcfdb qdbh = qcfdb.QCFDB(config.getfull('submit_des_services'), config.getfull('submit_des_db_section')) print("\n\nChecking non-job block task status from task table in DB (%s is success)" % \ pfwdefs.PF_EXIT_SUCCESS) num_bltasks_failed = 0 bltasks = {} blktid = None if ('block' in config['task_id'] and str(blknum) in config['task_id']['block']): blktid = int(config['task_id']['block'][str(blknum)]) miscutils.fwdebug_print("Getting block task info from DB") start_time = time.time() bltasks = dbh.get_block_task_info(blktid) end_time = time.time() miscutils.fwdebug_print( "Done getting block task info from DB (%s secs)" % (end_time - start_time)) for bltdict in list(bltasks.values()): print("Block status = ", bltdict['status']) if bltdict['status'] == pfwdefs.PF_EXIT_DRYRUN: print("setting return value to dryrun") retval = bltdict['status'] elif bltdict['status'] != pfwdefs.PF_EXIT_SUCCESS: num_bltasks_failed += 1 msg2 += "\t%s" % (bltdict['name']) if bltdict['label'] is not None: msg2 += " - %s" % (bltdict['label']) msg2 += " failed\n" if bltdict['name'] == 'begblock': # try to read the begblock.out and begblock.err files print( "Trying to get begblock.out and begblock.err") msg2 += get_subblock_output("begblock") # try to get QCF messages (especially from query codes) begblock_tid = int(config['task_id']['begblock']) sql = "select id from task where parent_task_id=%i and status!=0" % ( begblock_tid) curs = dbh.cursor() curs.execute(sql) res = curs.fetchall() msg2 += "\n===== QCF Messages =====\n" msg2 += "\n begblock\n" wrapids = [blktid, begblock_tid] for r in res: wrapids.append(r[0]) wrapmsg = {} if qdbh is not None: miscutils.fwdebug_print( "Querying QCF messages") start_time = time.time() wrapmsg = qdbh.get_qcf_messages_for_wrappers( wrapids) end_time = time.time() miscutils.fwdebug_print( "Done querying QCF messages (%s secs)" % (end_time - start_time)) miscutils.fwdebug_print("wrapmsg = %s" % wrapmsg) if len(wrapmsg) == 0: msg2 += " No QCF messages\n" else: for msgs in list(wrapmsg.values()): for m in msgs: msg2 += " " + m['message'] + "\n" retval = pfwdefs.PF_EXIT_FAILURE if retval != pfwdefs.PF_EXIT_DRYRUN: print("\n\nChecking job status from pfw_job table in DB (%s is success)" % \ pfwdefs.PF_EXIT_SUCCESS) miscutils.fwdebug_print("Getting job info from DB") start_time = time.time() jobinfo = dbh.get_job_info({'pfw_block_task_id': blktid}) end_time = time.time() miscutils.fwdebug_print( "Done getting job info from DB (%s secs)" % (end_time - start_time)) miscutils.fwdebug_print("Getting wrapper info from DB") start_time = time.time() wrapinfo = dbh.get_wrapper_info(pfw_attempt_id=attid, pfw_block_task_id=blktid) end_time = time.time() miscutils.fwdebug_print( "Done getting wrapper info from DB (%s secs)" % (end_time - start_time)) else: msg = "Could not find task id for block %s in config.des" % blockname print("Error:", msg) if 'attempt' in config['task_id']: miscutils.fwdebug_print("Saving pfw message") start_time = time.time() Messaging.pfw_message(dbh, attid, config['task_id']['attempt'], msg, pfw_utils.PFW_DB_INFO, 'blockpost.out', 0) end_time = time.time() miscutils.fwdebug_print( "Done saving pfw message (%s secs)" % (end_time - start_time)) print("all the task ids:", config['task_id']) archive = None if pfwdefs.HOME_ARCHIVE in config: archive = config.getfull(pfwdefs.HOME_ARCHIVE) logfullnames = dbh.get_fail_log_fullnames(attid, archive) dbh.close() print("len(jobinfo) = ", len(jobinfo)) print("len(wrapinfo) = ", len(wrapinfo)) job_byblk = pfwutils.index_job_info(jobinfo) print("blktid: ", blktid) print("job_byblk:", job_byblk) if blktid not in job_byblk: print("Warn: could not find jobs for block %s" % blknum) print(" This is ok if attempt died before jobs ran") print(" block task_ids in job_byblk:" % list(job_byblk.keys())) else: wrap_byjob, wrap_bymod = pfwutils.index_wrapper_info(wrapinfo) #print "wrap_byjob:", wrap_byjob #print "wrap_bymod:", wrap_bymod for jobtid, jobdict in sorted(job_byblk[blktid].items()): failedwraps[jobtid] = [] whyfailwraps[jobtid] = [] jobkeys = "" # don't print out successful wrappers if jobtid in wrap_byjob and jobdict[ 'status'] == pfwdefs.PF_EXIT_SUCCESS: continue if jobdict['jobkeys'] is not None: jobkeys = jobdict['jobkeys'] #print "jobkeys = ", jobkeys, type(jobkeys) submit_job_path = "%s/B%02d-%s/%04d" % ( config.getfull('work_dir'), int(config.getfull('blknum')), config.getfull('blockname'), int(jobdict['jobnum'])) msg2 += "\n\t%s (%s) " % (pfwutils.pad_jobnum( jobdict['jobnum']), jobkeys) if jobtid not in wrap_byjob: msg2 += "\tNo wrapper instances" else: #print "wrapnum in job =", wrap_byjob[jobtid].keys() maxwrap = max(wrap_byjob[jobtid].keys()) #print "maxwrap =", maxwrap modname = wrap_byjob[jobtid][maxwrap]['modname'] #print "modname =", modname msg2 += "%d/%s %s" % (len( wrap_byjob[jobtid]), jobdict['expect_num_wrap'], modname) # determine wrappers for this job without success exit for wrapnum, wdict in list(wrap_byjob[jobtid].items()): if wdict['status'] is None or wdict[ 'status'] != pfwdefs.PF_EXIT_SUCCESS: if wdict['modname'] == modname: failedwraps[jobtid].append(wrapnum) else: whyfailwraps[jobtid].append(wrapnum) if jobdict['status'] == pfwdefs.PF_EXIT_EUPS_FAILURE: msg2 += " - FAIL - EUPS setup failure" retval = jobdict['status'] elif jobdict['status'] == pfwdefs.PF_EXIT_CONDOR: msg2 += " - FAIL - Condor/Globus failure" retval = jobdict['status'] elif jobdict['status'] is None: msg2 += " - FAIL - NULL status" retval = pfwdefs.PF_EXIT_FAILURE elif jobdict['status'] != pfwdefs.PF_EXIT_SUCCESS: msg2 += " - FAIL - Non-zero status" retval = jobdict['status'] if jobdict['status'] != pfwdefs.PF_EXIT_SUCCESS: msg2 += "\n\t\t%s/runjob.out " % (submit_job_path) msg2 += '\n' # print pfw_messages if 'message' in jobdict: print(jobdict['message']) for msgdict in sorted(jobdict['message'], key=lambda k: k['message_time']): level = int(msgdict['message_lvl']) levelstr = 'info' if level == pfwdefs.PFWDB_MSG_WARN: levelstr = 'WARN' elif level == pfwdefs.PFWDB_MSG_ERROR: levelstr = 'ERROR' msg2 += "\t\t%s - %s\n" % ( levelstr, msgdict['message'].replace( '\n', '\n\t\t\t')) if jobtid in wrap_byjob: # print log file name for failed/unfinished wrappers for wrapnum in failedwraps[jobtid]: wrapdict = wrap_byjob[jobtid][wrapnum] if wrapdict['log'] in logfullnames: msg2 += "\t\t%s - %s\n" % ( wrapnum, logfullnames[wrapdict['log']]) else: msg2 += "\t\t%s - Could not find log in archive (%s)\n" % ( wrapnum, wrapdict['log']) wrapmsg = get_qcf_messages(qdbh, config, [wrapdict['task_id']]) msg2 = print_qcf_messages(config, wrapdict, wrapmsg, msg2) msg2 += '\n' # If weirdness happened in run, print a message if len(whyfailwraps[jobtid]) > 0: msg2 += "\n*** Contact framework developers. Wrappers ran after at least 1 wrapper from a previous module that doesn't have success status.\n" msg2 += "\t%s\n" % ','.join(whyfailwraps[jobtid]) except Exception as exc: if sem is not None: del sem msg2 += "\n\nEncountered error trying to gather status information for email." msg2 += "\nCheck output for blockpost for further details." print( "\n\nEncountered error trying to gather status information for email" ) print("%s: %s" % (exc.__class__.__name__, str(exc))) (extype, exvalue, trback) = sys.exc_info() traceback.print_exception(extype, exvalue, trback, file=sys.stdout) retval = pfwdefs.PF_EXIT_FAILURE retval = int(retval) + verify_status print("before email retval =", retval) when_to_email = 'run' if 'when_to_email' in config: when_to_email = config.getfull('when_to_email').lower() if miscutils.convertBool(dryrun): if when_to_email != 'never': print("dryrun = ", dryrun) print("Sending dryrun email") if retval == pfwdefs.PF_EXIT_DRYRUN: msg1 = "%s: In dryrun mode, block %s has finished successfully." % ( run, blockname) else: msg1 = "%s: In dryrun mode, block %s has failed." % ( run, blockname) send_email(config, blockname, retval, "", msg1, msg2) else: print("Not sending dryrun email") print("retval = ", retval) retval = pfwdefs.PF_EXIT_DRYRUN elif retval: if when_to_email != 'never': print("Sending block failed email\n") msg1 = "%s: block %s has failed." % (run, blockname) send_email(config, blockname, retval, "", msg1, msg2) else: print("Not sending failed email") print("retval = ", retval) elif retval == pfwdefs.PF_EXIT_SUCCESS: if when_to_email == 'block': msg1 = "%s: block %s has finished successfully." % (run, blockname) msg2 = "" print("Sending success email\n") send_email(config, blockname, retval, "", msg1, msg2) elif when_to_email == 'run': numblocks = len( miscutils.fwsplit(config[pfwdefs.SW_BLOCKLIST], ',')) if int(config[pfwdefs.PF_BLKNUM]) == numblocks: msg1 = "%s: run has finished successfully." % (run) msg2 = "" print("Sending success email\n") send_email(config, blockname, retval, "", msg1, msg2) else: print("Not sending run email because not last block") print("retval = ", retval) else: print("Not sending success email") print("retval = ", retval) else: print("Not sending email") print("retval = ", retval) # Store values in DB and hist file dbh = None if miscutils.convertBool(config[pfwdefs.PF_USE_DB_OUT]): dbh = pfwdb.PFWDB(config.getfull('submit_des_services'), config.getfull('submit_des_db_section')) if blktid is not None: print("Updating end of block task", blktid) dbh.end_task(blktid, retval, True) else: print("Could not update end of block task without block task id") if retval != pfwdefs.PF_EXIT_SUCCESS: print("Updating end of attempt", config['task_id']['attempt']) dbh.end_task(config['task_id']['attempt'], retval, True) dbh.commit() dbh.close() print("before next block retval = ", retval) if retval == pfwdefs.PF_EXIT_SUCCESS: # Get ready for next block config.inc_blknum() with open(configfile, 'w') as cfgfh: config.write(cfgfh) print("new blknum = ", config[pfwdefs.PF_BLKNUM]) print("number of blocks = ", len(miscutils.fwsplit(config[pfwdefs.SW_BLOCKLIST], ','))) miscutils.fwdebug_print("Returning retval = %s (%s)" % (retval, type(retval))) miscutils.fwdebug_print("END") debugfh.close() return int(retval)
def jobpre(argv=None): """ Program entry point """ if argv is None: argv = sys.argv #debugfh = tempfile.NamedTemporaryFile(prefix='jobpre_', dir='.', delete=False) default_log = f"jobpre_{random.randint(1,10000000):08d}.out" debugfh = open(default_log, 'w') tmpfn = debugfh.name outorig = sys.stdout errorig = sys.stderr sys.stdout = debugfh sys.stderr = debugfh print(' '.join(argv)) # command line for debugging print(os.getcwd()) if len(argv) < 3: print("Usage: jobpre configfile jobnum") debugfh.close() return pfwdefs.PF_EXIT_FAILURE configfile = sys.argv[1] jobnum = sys.argv[2] # could also be uberctrl # read wcl file config = pfwconfig.PfwConfig({'wclfile': configfile}) blockname = config.getfull('blockname') blkdir = config.get('block_dir') tjpad = pfwutils.pad_jobnum(jobnum) # now that have more information, can rename output file miscutils.fwdebug_print("getting new_log_name") new_log_name = config.get_filename('job', {pfwdefs.PF_CURRVALS: {pfwdefs.PF_JOBNUM:jobnum, 'flabel': 'jobpre', 'fsuffix':'out'}}) new_log_name = f"{blkdir}/{tjpad}/{new_log_name}" miscutils.fwdebug_print(f"new_log_name = {new_log_name}") debugfh.close() sys.stdout = outorig sys.stderr = errorig os.chmod(tmpfn, 0o666) os.rename(tmpfn, new_log_name) dbh = None if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)): if config.dbh is None: dbh = pfwdb.PFWDB(config.getfull('submit_des_services'), config.getfull('submit_des_db_section')) else: dbh = config.dbh if 'use_qcf' in config and config['use_qcf']: debugfh = Messaging.Messaging(new_log_name, 'jobpre.py', config['pfw_attempt_id'], dbh=dbh, mode='a+', usedb=dbh is not None) else: debugfh = open(new_log_name, 'a+') sys.stdout = debugfh sys.stderr = debugfh if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)): ctstr = dbh.get_current_timestamp_str() dbh.update_job_info(config, tjpad, {'condor_submit_time': ctstr, 'target_submit_time': ctstr}) log_pfw_event(config, blockname, tjpad, 'j', ['pretask']) miscutils.fwdebug_print("jobpre done") debugfh.close() sys.stdout = outorig sys.stderr = errorig return pfwdefs.PF_EXIT_SUCCESS
def parse_job_output(config, jobnum, dbh=None, retval=None): """TODO: short summary. Search stdout/stderr for timing stats as well as eups setup or DB connection error messages and insert them into db. """ jobbase = config.get_filename('job', {pfwdefs.PF_CURRVALS: {pfwdefs.PF_JOBNUM: jobnum, 'flabel': 'runjob', 'fsuffix': ''}}) tjobinfo = {} tjobinfo_task = {} for jobfile in ['%sout'%jobbase, '%serr'%jobbase]: if os.path.exists(jobfile): with open(jobfile, 'r') as jobfh: for no, line in enumerate(jobfh): line = line.strip() if line.startswith('PFW:'): parts = line.split() if parts[1] == 'batchid': if parts[2] == '=': # older pfwrunjob.py tjobinfo['target_job_id'] = parts[3] else: tjobinfo['target_job_id'] = parts[2] elif parts[1] == 'condorid': tjobinfo['condor_job_id'] = parts[2] elif parts[1] == 'job_shell_script': print("parts[2]", parts[2]) print("parts[3]", parts[3]) if parts[2] == 'exechost:': #tjobinfo['target_exec_host'] = parts[3] tjobinfo_task['exec_host'] = parts[3] elif parts[2] == 'starttime:': tjobinfo_task['start_time'] = datetime.fromtimestamp(float(parts[3])) elif parts[2] == 'endtime:': #tjobinfo['target_end_time'] = datetime.fromtimestamp(float(parts[3])) tjobinfo_task['end_time'] = datetime.fromtimestamp(float(parts[3])) elif parts[2] == 'exit_status:': tjobinfo_task['status'] = parts[3] elif 'ORA-' in line: print("Found:", line) if not 'DBD' in line: print("Setting retval to failure") tjobinfo_task['status'] = pfwdefs.PF_EXIT_FAILURE else: print(" Ignoring QCF perl error message.") if dbh: Messaging.pfw_message(dbh, config['pfw_attempt_id'], config['task_id']['job'][jobnum], line, pfwdefs.PFWDB_MSG_ERROR, jobfile, no) elif 'No such file or directory: ' in line and \ config.getfull('target_des_services') in line: print("Found:", line) if dbh: Messaging.pfw_message(dbh, config['pfw_attempt_id'], config['task_id']['job'][jobnum], line, pfwdefs.PFWDB_MSG_ERROR, jobfile, no) elif 'Error: eups setup' in line: print("Found:", line) print("Setting retval to failure") tjobinfo_task['status'] = pfwdefs.PF_EXIT_EUPS_FAILURE if dbh: Messaging.pfw_message(dbh, config['pfw_attempt_id'], config['task_id']['job'][jobnum], line, pfwdefs.PFWDB_MSG_ERROR, jobfile, no) elif 'Exiting with status' in line: lmatch = re.search(r'Exiting with status (\d+)', line) if lmatch: if int(lmatch.group(1)) != 0 and retval == 0: print("Found:", line) msg = "Info: Job exit status was %s, but retval was %s." % \ (lmatch.group(1), retval) msg += "Setting retval to failure." print(msg) tjobinfo['status'] = pfwdefs.PF_EXIT_FAILURE if dbh: Messaging.pfw_message(dbh, config['pfw_attempt_id'], config['task_id']['job'][jobnum], msg, pfwdefs.PFWDB_MSG_ERROR, jobfile, no) elif 'Could not connect to database'in line: print("Found:", line) if dbh: Messaging.pfw_message(dbh, config['pfw_attempt_id'], config['task_id']['job'][jobnum], line, pfwdefs.PFWDB_MSG_INFO, jobfile, no) return tjobinfo, tjobinfo_task
def jobpost(argv=None): """Performs steps needed after a pipeline job. """ condor2db = {'jobid': 'condor_job_id', 'csubmittime': 'condor_submit_time', 'gsubmittime': 'target_submit_time', 'starttime': 'condor_start_time', 'endtime': 'condor_end_time'} if argv is None: argv = sys.argv debugfh = tempfile.NamedTemporaryFile(mode='w+', prefix='jobpost_', dir='.', delete=False) tmpfn = debugfh.name sys.stdout = debugfh sys.stderr = debugfh miscutils.fwdebug_print("temp log name = %s" % tmpfn) print('cmd>', ' '.join(argv)) # print command line for debugging if len(argv) < 7: # open file to catch error messages about command line print('Usage: jobpost.py configfile block jobnum inputtar outputtar retval') debugfh.close() return pfwdefs.PF_EXIT_FAILURE configfile = argv[1] blockname = argv[2] jobnum = argv[3] inputtar = argv[4] outputtar = argv[5] retval = pfwdefs.PF_EXIT_FAILURE if len(argv) == 7: retval = int(sys.argv[6]) if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'): miscutils.fwdebug_print("configfile = %s" % configfile) miscutils.fwdebug_print("block = %s" % blockname) miscutils.fwdebug_print("jobnum = %s" % jobnum) miscutils.fwdebug_print("inputtar = %s" % inputtar) miscutils.fwdebug_print("outputtar = %s" % outputtar) miscutils.fwdebug_print("retval = %s" % retval) # read sysinfo file config = pfwconfig.PfwConfig({'wclfile': configfile}) if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'): miscutils.fwdebug_print("done reading config file") # now that have more information, rename output file if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'): miscutils.fwdebug_print("before get_filename") blockname = config.getfull('blockname') blkdir = config.getfull('block_dir') tjpad = pfwutils.pad_jobnum(jobnum) os.chdir("%s/%s" % (blkdir, tjpad)) new_log_name = config.get_filename('job', {pfwdefs.PF_CURRVALS: {pfwdefs.PF_JOBNUM: jobnum, 'flabel': 'jobpost', 'fsuffix': 'out'}}) new_log_name = "%s" % (new_log_name) miscutils.fwdebug_print("new_log_name = %s" % new_log_name) debugfh.close() os.chmod(tmpfn, 0o666) os.rename(tmpfn, new_log_name) debugfh = open(new_log_name, 'a+') sys.stdout = debugfh sys.stderr = debugfh dbh = None if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)): dbh = pfwdb.PFWDB(config.getfull('submit_des_services'), config.getfull('submit_des_db_section')) # get job information from the job stdout if exists (tjobinfo, tjobinfo_task) = parse_job_output(config, jobnum, dbh, retval) if dbh and len(tjobinfo) > 0: print("tjobinfo: ", tjobinfo) dbh.update_tjob_info(config['task_id']['job'][jobnum], tjobinfo) # get job information from the condor job log logfilename = 'runjob.log' if os.path.exists(logfilename) and os.path.getsize(logfilename) > 0: # if made it to submitting/running jobs try: # update job info in DB from condor log print("Updating job info in DB from condor log") condorjobinfo = pfwcondor.parse_condor_user_log(logfilename) if len(list(condorjobinfo.keys())) > 1: print("More than single job in job log") j = list(condorjobinfo.keys())[0] cjobinfo = condorjobinfo[j] djobinfo = {} for ckey, dkey in list(condor2db.items()): if ckey in cjobinfo: djobinfo[dkey] = cjobinfo[ckey] print(djobinfo) dbh.update_job_info(config, cjobinfo['jobname'], djobinfo) if 'holdreason' in cjobinfo and cjobinfo['holdreason'] is not None: msg = "Condor HoldReason: %s" % cjobinfo['holdreason'] print(msg) if dbh: Messaging.pfw_message(dbh, config['pfw_attempt_id'], config['task_id']['job'][jobnum], msg, pfwdefs.PFWDB_MSG_WARN) if 'abortreason' in cjobinfo and cjobinfo['abortreason'] is not None: tjobinfo_task['start_time'] = cjobinfo['starttime'] tjobinfo_task['end_time'] = cjobinfo['endtime'] if 'condor_rm' in cjobinfo['abortreason']: tjobinfo_task['status'] = pfwdefs.PF_EXIT_OPDELETE else: tjobinfo_task['status'] = pfwdefs.PF_EXIT_CONDOR else: pass except Exception: (extype, exvalue, trback) = sys.exc_info() traceback.print_exception(extype, exvalue, trback, file=sys.stdout) else: print("Warning: no job condor log file") if dbh: # update job task if 'status' not in tjobinfo_task: tjobinfo_task['status'] = pfwdefs.PF_EXIT_CONDOR if 'end_time' not in tjobinfo_task: tjobinfo_task['end_time'] = datetime.now() wherevals = {'id': config['task_id']['job'][jobnum]} dbh.basic_update_row('task', tjobinfo_task, wherevals) dbh.commit() log_pfw_event(config, blockname, jobnum, 'j', ['posttask', retval]) # input wcl should already exist in untar form if os.path.exists(inputtar): print("found inputtar: %s" % inputtar) os.unlink(inputtar) else: print("Could not find inputtar: %s" % inputtar) # untar output wcl tar and delete tar if os.path.exists(outputtar): print("Size of output wcl tar:", os.path.getsize(outputtar)) if os.path.getsize(outputtar) > 0: print("found outputtar: %s" % outputtar) pfwutils.untar_dir(outputtar, '..') os.unlink(outputtar) else: msg = "Warn: outputwcl tarball (%s) is 0 bytes." % outputtar print(msg) if dbh: Messaging.pfw_message(dbh, config['pfw_attempt_id'], config['task_id']['job'][jobnum], msg, pfwdefs.PFWDB_MSG_WARN) else: msg = "Warn: outputwcl tarball (%s) does not exist." % outputtar print(msg) if dbh: Messaging.pfw_message(dbh, config['pfw_attempt_id'], config['task_id']['job'][jobnum], msg, pfwdefs.PFWDB_MSG_WARN) if retval != pfwdefs.PF_EXIT_SUCCESS: miscutils.fwdebug_print("Setting failure retval") retval = pfwdefs.PF_EXIT_FAILURE miscutils.fwdebug_print("Returning retval = %s" % retval) miscutils.fwdebug_print("jobpost done") debugfh.close() return int(retval)
def run_cmd_qcf(cmd, logfilename, wid, execnames, use_qcf=False, dbh=None, pfwattid=0, patterns={}): """Execute the command piping stdout/stderr to log and QCF. """ bufsize = 1024 * 10 if miscutils.fwdebug_check(3, "PFWUTILS_DEBUG"): miscutils.fwdebug_print("BEG") miscutils.fwdebug_print("working dir = %s" % (os.getcwd())) miscutils.fwdebug_print("cmd = %s" % cmd) miscutils.fwdebug_print("logfilename = %s" % logfilename) miscutils.fwdebug_print("wid = %s" % wid) miscutils.fwdebug_print("execnames = %s" % execnames) miscutils.fwdebug_print("use_qcf = %s" % use_qcf) use_qcf = miscutils.convertBool(use_qcf) sys.stdout.flush() try: messaging = Messaging.Messaging(logfilename, execnames, pfwattid=pfwattid, taskid=wid, dbh=dbh, usedb=use_qcf, qcf_patterns=patterns) process_wrap = subprocess.Popen(shlex.split(cmd), shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) except: (extype, exvalue, _) = sys.exc_info() print("********************") print("Unexpected error: %s - %s" % (extype, exvalue)) print("cmd> %s" % cmd) print("Probably could not find %s in path" % cmd.split()[0]) print("Check for mispelled execname in submit wcl or") print( " make sure that the corresponding eups package is in the metapackage " ) print(" and it sets up the path correctly") raise try: buf = os.read(process_wrap.stdout.fileno(), bufsize).decode() while process_wrap.poll() == None or len(buf) != 0: messaging.write(buf) buf = os.read(process_wrap.stdout.fileno(), bufsize).decode() except IOError as exc: print("\tI/O error({0}): {1}".format(exc.errno, exc.strerror)) except: (extype, exvalue, _) = sys.exc_info() print("\tError: Unexpected error: %s - %s" % (extype, exvalue)) raise sys.stdout.flush() if miscutils.fwdebug_check(3, "PFWUTILS_DEBUG"): if process_wrap.returncode != 0: miscutils.fwdebug_print( "\tInfo: cmd exited with non-zero exit code = %s" % process_wrap.returncode) miscutils.fwdebug_print("\tInfo: failed cmd = %s" % cmd) else: miscutils.fwdebug_print("\tInfo: cmd exited with exit code = 0") if miscutils.fwdebug_check(3, "PFWUTILS_DEBUG"): miscutils.fwdebug_print("END") return process_wrap.returncode
def run_cmd_qcf(cmd, logfilename, wid, execnames, use_qcf=False, dbh=None, pfwattid=0, patterns={}, threaded=False): """ Execute the command piping stdout/stderr to log and QCF """ bufsize = 1024 * 10 lasttime = time.time() if miscutils.fwdebug_check(3, "PFWUTILS_DEBUG"): miscutils.fwdebug_print("BEG") miscutils.fwdebug_print(f"working dir = {os.getcwd()}") miscutils.fwdebug_print(f"cmd = {cmd}") miscutils.fwdebug_print(f"logfilename = {logfilename}") miscutils.fwdebug_print(f"wid = {wid}") miscutils.fwdebug_print(f"execnames = {execnames}") miscutils.fwdebug_print(f"use_qcf = {use_qcf}") use_qcf = miscutils.convertBool(use_qcf) sys.stdout.flush() try: messaging = Messaging.Messaging(logfilename, execnames, pfwattid=pfwattid, taskid=wid, dbh=dbh, usedb=use_qcf, qcf_patterns=patterns, threaded=threaded) process_wrap = subprocess.Popen(shlex.split(cmd), shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) except: (extype, exvalue, _) = sys.exc_info() print("********************") print(f"Unexpected error: {extype} - {exvalue}") print(f"cmd> {cmd}") print(f"Probably could not find {cmd.split()[0]} in path") print("Check for mispelled execname in submit wcl or") print(" make sure that the corresponding eups package is in the metapackage ") print(" and it sets up the path correctly") raise try: buf = os.read(process_wrap.stdout.fileno(), bufsize) while process_wrap.poll() is None or buf: if dbh is not None: now = time.time() if now - lasttime > 30.*60.: if not dbh.ping(): dbh.reconnect() lasttime = now messaging.write(buf) #print buf buf = os.read(process_wrap.stdout.fileno(), bufsize) # brief sleep if process_wrap.poll() is None: time.sleep(0.1) except IOError as exc: print(f"\tI/O error({exc.errno}): {exc.strerror}") except: (extype, exvalue, _) = sys.exc_info() print(f"\tError: Unexpected error: {extype} - {exvalue}") raise sys.stdout.flush() if miscutils.fwdebug_check(3, "PFWUTILS_DEBUG"): if process_wrap.returncode != 0: miscutils.fwdebug_print(f"\tInfo: cmd exited with non-zero exit code = {process_wrap.returncode}") miscutils.fwdebug_print(f"\tInfo: failed cmd = {cmd}") else: miscutils.fwdebug_print("\tInfo: cmd exited with exit code = 0") if miscutils.fwdebug_check(3, "PFWUTILS_DEBUG"): miscutils.fwdebug_print("END") return process_wrap.returncode
def logpre(argv=None): """ Program entry point """ if argv is None: argv = sys.argv default_log = 'logpre.out' debugfh = open(default_log, 'w') outorig = sys.stdout errorig = sys.stderr sys.stdout = debugfh sys.stderr = debugfh print(' '.join(sys.argv)) # command line for debugging if len(argv) < 5: print("Usage: logpre configfile block subblocktype subblock") debugfh.close() return pfwdefs.PF_EXIT_FAILURE configfile = argv[1] blockname = argv[2] # could also be uberctrl subblocktype = argv[3] subblock = argv[4] # read sysinfo file config = pfwconfig.PfwConfig({'wclfile': configfile}) # now that have more information, can rename output file miscutils.fwdebug_print("getting new_log_name") blockname = config.getfull('blockname') blkdir = config.getfull('block_dir') new_log_name = config.get_filename('block', {pfwdefs.PF_CURRVALS: {'subblock': subblock, 'flabel': '${subblock}_logpre', 'fsuffix':'out'}}) new_log_name = f"{blkdir}/{new_log_name}" miscutils.fwdebug_print(f"new_log_name = {new_log_name}") debugfh.close() os.chmod(default_log, 0o666) os.rename(default_log, new_log_name) #debugfh.close() sys.stdout = outorig sys.stderr = errorig if 'use_qcf' in config and config['use_qcf']: if config.dbh is None: if 'submit_des_services' in config: os.environ['DES_SERVICES'] = config.getfull('submit_des_services') os.environ['DES_DB_SECTION'] = config.getfull('submit_des_db_section') debugfh = Messaging.Messaging(new_log_name, 'logpre.py', config['pfw_attempt_id'], mode='a+') else: debugfh = Messaging.Messaging(new_log_name, 'logpre.py', config['pfw_attempt_id'], dbh=config.dbh, mode='a+') else: debugfh = open(new_log_name, 'a+') sys.stdout = debugfh sys.stderr = debugfh log_pfw_event(config, blockname, subblock, subblocktype, ['pretask']) print("logpre done") debugfh.close() sys.stdout = outorig sys.stderr = errorig return pfwdefs.PF_EXIT_SUCCESS
def parse_job_output(config, jobnum, dbh=None, retval=None): """ Search stdout/stderr for timing stats as well as eups setup or DB connection error messages and insert them into db """ jobbase = config.get_filename( 'job', { pfwdefs.PF_CURRVALS: { pfwdefs.PF_JOBNUM: jobnum, 'flabel': 'runjob', 'fsuffix': '' } }) tjobinfo = {} tjobinfo_task = {} for jobfile in [f"{jobbase}out", f"{jobbase}err"]: if os.path.exists(jobfile): with open(jobfile, 'r') as jobfh: for no, line in enumerate(jobfh): line = line.strip() if line.startswith('PFW:'): parts = line.split() if parts[1] == 'batchid': if parts[2] == '=': # older pfwrunjob.py tjobinfo['target_job_id'] = parts[3] else: tjobinfo['target_job_id'] = parts[2] elif parts[1] == 'condorid': tjobinfo['condor_job_id'] = parts[2] elif parts[1] == 'job_shell_script': #print("parts[2]", parts[2]) #print("parts[3]", parts[3]) if parts[2] == 'exechost:': #tjobinfo['target_exec_host'] = parts[3] tjobinfo_task['exec_host'] = parts[3] elif parts[2] == 'starttime:': tjobinfo_task[ 'start_time'] = datetime.fromtimestamp( float(parts[3])) elif parts[2] == 'endtime:': #tjobinfo['target_end_time'] = datetime.fromtimestamp(float(parts[3])) tjobinfo_task[ 'end_time'] = datetime.fromtimestamp( float(parts[3])) elif parts[2] == 'exit_status:': tjobinfo_task['status'] = parts[3] # skip ORA messages as they are caught by the QCF earlier, and not all are fatal #elif 'ORA-' in line: #print "Found:", line #if not 'DBD' in line: # print "Setting retval to failure" # tjobinfo_task['status'] = pfwdefs.PF_EXIT_FAILURE #else: # print " Ignoring QCF perl error message." # if dbh: # Messaging.pfw_message(dbh, config['pfw_attempt_id'], # config['task_id']['job'][jobnum], # line, pfwdefs.PFWDB_MSG_ERROR, jobfile, no) elif "No such file or directory:" in line and \ config.getfull('target_des_services') in line: #print "Found:", line if dbh: Messaging.pfw_message( dbh, config['pfw_attempt_id'], config['task_id']['job'][jobnum], line, pfwdefs.PFWDB_MSG_ERROR, jobfile, no) elif "Error: eups setup" in line: #print "Found:", line print("Setting retval to failure") tjobinfo_task['status'] = pfwdefs.PF_EXIT_EUPS_FAILURE if dbh: Messaging.pfw_message( dbh, config['pfw_attempt_id'], config['task_id']['job'][jobnum], line, pfwdefs.PFWDB_MSG_ERROR, jobfile, no) elif "Exiting with status" in line: lmatch = re.search(r'Exiting with status (\d+)', line) if lmatch: if int(lmatch.group(1)) != 0 and retval == 0: #print "Found:", line msg = f"Info: Job exit status was {lmatch.group(1)}, but retval was {retval}." msg += "Setting retval to failure." #print msg tjobinfo['status'] = pfwdefs.PF_EXIT_FAILURE if dbh: Messaging.pfw_message( dbh, config['pfw_attempt_id'], config['task_id']['job'][jobnum], msg, pfwdefs.PFWDB_MSG_ERROR, jobfile, no) elif "Could not connect to database" in line: #print "Found:", line if dbh: Messaging.pfw_message( dbh, config['pfw_attempt_id'], config['task_id']['job'][jobnum], line, pfwdefs.PFWDB_MSG_INFO, jobfile, no) return tjobinfo, tjobinfo_task
def logpost(argv=None): """ Program entry point """ if argv is None: argv = sys.argv # open file to catch error messages about command line debugfh = open('logpost.out', 'w') outorig = sys.stdout errorig = sys.stderr sys.stdout = debugfh sys.stderr = debugfh print(' '.join(argv)) # print command line for debugging if len(argv) < 5: print("Usage: logpost configfile block subblocktype subblock retval") debugfh.close() return pfwdefs.PF_EXIT_FAILURE configfile = argv[1] blockname = argv[2] subblocktype = argv[3] subblock = argv[4] retval = pfwdefs.PF_EXIT_FAILURE if len(argv) == 6: retval = int(sys.argv[5]) if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'): miscutils.fwdebug_print(f"configfile = {configfile}") miscutils.fwdebug_print(f"block = {blockname}") miscutils.fwdebug_print(f"subblock = {subblock}") miscutils.fwdebug_print(f"retval = {retval}") # read sysinfo file config = pfwconfig.PfwConfig({'wclfile': configfile}) if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'): miscutils.fwdebug_print("done reading config file") # now that have more information, rename output file if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'): miscutils.fwdebug_print("before get_filename") blockname = config.getfull('blockname') blkdir = config.getfull('block_dir') new_log_name = config.get_filename( 'block', { pfwdefs.PF_CURRVALS: { 'flabel': '${subblock}_logpost', 'subblock': subblock, 'fsuffix': 'out' } }) new_log_name = f"{blkdir}/{new_log_name}" miscutils.fwdebug_print(f"new_log_name = {new_log_name}") debugfh.close() sys.stdout = outorig sys.stderr = errorig os.chmod('logpost.out', 0o666) os.rename('logpost.out', new_log_name) if 'use_qcf' in config and config['use_qcf']: if config.dbh is None: if 'submit_des_services' in config: os.environ['DES_SERVICES'] = config.getfull( 'submit_des_services') os.environ['DES_DB_SECTION'] = config.getfull( 'submit_des_db_section') debugfh = Messaging.Messaging(new_log_name, 'logpost.py', config['pfw_attempt_id'], mode='a+') else: debugfh = Messaging.Messaging(new_log_name, 'logpost.py', config['pfw_attempt_id'], dbh=config.dbh, mode='a+') else: debugfh = open(new_log_name, 'a+') sys.stdout = debugfh sys.stderr = debugfh log_pfw_event(config, blockname, subblock, subblocktype, ['posttask', retval]) # In order to continue, make pipelines dagman jobs exit with success status #if 'pipelinesmngr' not in subblock: # retval = pfwdefs.PF_EXIT_SUCCESS # # If error at non-manager level, send failure email # if retval != pfwdefs.PF_EXIT_SUCCESS and \ # 'mngr' not in subblock: # send_subblock_email(config, blockname, subblock, retval) if subblock != 'begblock' and retval != pfwdefs.PF_EXIT_SUCCESS: miscutils.fwdebug_print("Setting failure retval") retval = pfwdefs.PF_EXIT_FAILURE miscutils.fwdebug_print(f"returning retval = {retval}") miscutils.fwdebug_print("logpost done") debugfh.close() sys.stdout = outorig sys.stderr = errorig miscutils.fwdebug_print(f"Exiting with = {retval}") return int(retval)
def update_job_target_info(self, wcl, submit_condor_id=None, target_batch_id=None, exechost=None): """Save information about target job from pfwrunjob. """ params = {} setvals = [] if submit_condor_id is not None: setvals.append('condor_job_id=%s' % self.get_named_bind_string('condor_job_id')) params['condor_job_id'] = float(submit_condor_id) if target_batch_id is not None: setvals.append('target_job_id=%s' % self.get_named_bind_string('target_job_id')) params['target_job_id'] = target_batch_id if 'jobroot' in wcl: setvals.append('jobroot=%s' % self.get_named_bind_string('jobroot')) params['jobroot'] = wcl['jobroot'] if len(setvals) > 0: params['task_id'] = wcl['task_id']['job'] sql = "update pfw_job set %s where task_id=%s and condor_job_id is NULL" % ( ','.join(setvals), self.get_named_bind_string('task_id')) if miscutils.fwdebug_check(3, 'PFWDB_DEBUG'): miscutils.fwdebug_print("sql> %s" % sql) if miscutils.fwdebug_check(3, 'PFWDB_DEBUG'): miscutils.fwdebug_print("params> %s" % params) curs = self.cursor() try: curs.execute(sql, params) except: (type, value, _) = sys.exc_info() print("******************************") print("Error:", type, value) print("sql> %s\n" % (sql)) print("params> %s\n" % params) raise if curs.rowcount == 0: Messaging.pfw_message(self, wcl['pfw_attempt_id'], wcl['task_id']['job'], "Job attempted to run more than once", pfw_utils.PFWDB_MSG_ERROR) print("******************************") print("Error: This job has already been run before.") print("pfw_attempt_id = ", wcl['pfw_attempt_id']) print("reqnum = ", wcl[pfwdefs.REQNUM]) print("unitname = ", wcl[pfwdefs.UNITNAME]) print("attnum = ", wcl[pfwdefs.ATTNUM]) print("blknum = ", wcl[pfwdefs.PF_BLKNUM]) print("jobnum = ", wcl[pfwdefs.PF_JOBNUM]) print("job task_id = ", wcl['task_id']['job']) print("\nThe 1st job information:") curs2 = self.cursor() sql = "select * from pfw_job, task where pfw_job.task_id=task.id and pfw_job.task_id=%s" % ( self.get_named_bind_string('task_id')) curs2.execute(sql, {'task_id': wcl['task_id']['job']}) desc = [d[0].lower() for d in curs2.description] for row in curs2: d = dict(list(zip(desc, row))) for k, v in list(d.items()): print(k, v) print("\n") print("\nThe 2nd job information:") print("submit_condor_id = ", submit_condor_id) print("target_batch_id = ", target_batch_id) print("exechost = ", exechost) print("current time = ", str(datetime.now())) print("\nupdate statement information") print("sql> %s\n" % sql) print("params> %s\n" % params) raise Exception("Error: job attempted to run more than once") if exechost is not None: sql = "update task set exec_host='%s'" % (exechost) if 'PFW_JOB_START_EPOCH' in os.environ: # doing conversion on DB to avoid any timezone issues sql += ", start_time = (from_tz(to_timestamp('1970-01-01','YYYY-MM-DD') + numtodsinterval(%s,'SECOND'), 'UTC') at time zone 'US/Central')" % ( os.environ['PFW_JOB_START_EPOCH']) sql += ' where id=%s' % (wcl['task_id']['job']) curs = self.cursor() curs.execute(sql) self.commit()
def blockpost(argv=None): """ Program entry point """ realstdout = sys.stdout realstderr = sys.stderr if argv is None: argv = sys.argv # open file to catch error messages about command line debugfh = open('blockpost.out', 'w') sys.stdout = debugfh sys.stderr = debugfh print(' '.join(argv)) # print command line for debugging print(f"running on {socket.gethostname()}") if len(argv) != 3: print('Usage: blockpost.py configfile retval') debugfh.close() return pfwdefs.PF_EXIT_FAILURE configfile = argv[1] retval = int(argv[2]) if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'): miscutils.fwdebug_print(f"configfile = {configfile}") miscutils.fwdebug_print(f"retval = {retval}") # read sysinfo file config = pfwconfig.PfwConfig({'wclfile': configfile}) if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'): miscutils.fwdebug_print("done reading config file") blockname = config.getfull('blockname') blkdir = config.getfull('block_dir') # now that have more information, can rename output file miscutils.fwdebug_print("getting new_log_name") new_log_name = config.get_filename('block', {pfwdefs.PF_CURRVALS: {'flabel': 'blockpost', 'fsuffix':'out'}}) new_log_name = f"{blkdir}/{new_log_name}" miscutils.fwdebug_print(f"new_log_name = {new_log_name}") debugfh.close() os.chmod('blockpost.out', 0o666) os.rename('blockpost.out', new_log_name) debugfh = open(new_log_name, 'a+') sys.stdout = debugfh sys.stderr = debugfh os.chdir(blkdir) log_pfw_event(config, blockname, 'blockpost', 'j', ['posttask', retval]) dryrun = config.getfull(pfwdefs.PF_DRYRUN) run = config.getfull('run') attid = config['pfw_attempt_id'] blknum = int(config.getfull(pfwdefs.PF_BLKNUM)) blktid = None msg2 = "" dbh = None qdbh = None job_byblk = {} wrap_byjob = {} wrapinfo = {} jobinfo = {} failedwraps = {} whyfailwraps = {} # mod failures for other modname, shouldn't happen usedb = miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)) verify_files = miscutils.convertBool(config.getfull('verify_files')) verify_status = 0 sem = None if verify_files and not usedb: print('Skipping file verification due to lack of database connection') if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_DB_OUT)): try: miscutils.fwdebug_print("Connecting to DB") if config.dbh is None: dbh = pfwdb.PFWDB(config.getfull('submit_des_services'), config.getfull('submit_des_db_section')) else: dbh = config.dbh if verify_files: curs = dbh.cursor() curs.execute(f"select root from ops_archive where name='{config.getfull('home_archive')}'") rows = curs.fetchall() if rows is None or len(rows) != 1: raise Exception(f"Invalid archive name ({config.getfull('home_archive')}). Found {len(rows)} rows in ops_archive") root = rows[0][0] if not os.path.isdir(root): print(f"Cannot read archive root directory:{config.getfull('home_archive')} This program must be run on an NCSA machine with access to the archive storage system.") sem = dbsem.DBSemaphore('verify_files_10', None, config.getfull('submit_des_services'), config.getfull('submit_des_db_section'), connection=dbh) print("\n\nVerifying archive file sizes on disk (0 is success)") verify_status = cu.compare(dbh=dbh, archive=config.getfull('home_archive'), pfwid=attid, md5sum=False, debug=False, script=False, verbose=False, silent=True) if sem is not None: del sem sem = None print(f" Verification of files returned status {verify_status:d}") if verify_status != 0: print(" This indicates that one or more files do not have the correct file size (based on DB entries). Run") print(f"\n compare_db.py --des_services {config.getfull('submit_des_services')} --section {config.getfull('submit_des_db_section')} --archive {config.getfull('home_archive')} --pfwid {int(attid):d} --verbose") print("\n to see the details.") if miscutils.convertBool(config.getfull(pfwdefs.PF_USE_QCF)): import qcframework.qcfdb as qcfdb #qdbh = qcfdb.QCFDB(config.getfull('submit_des_services'), # config.getfull('submit_des_db_section')) qdbh = qcfdb.QCFDB(connection=dbh) print(f"\n\nChecking non-job block task status from task table in DB ({pfwdefs.PF_EXIT_SUCCESS} is success)") num_bltasks_failed = 0 bltasks = {} blktid = None if ('block' in config['task_id'] and str(blknum) in config['task_id']['block']): blktid = int(config['task_id']['block'][str(blknum)]) miscutils.fwdebug_print("Getting block task info from DB") start_time = time.time() bltasks = dbh.get_block_task_info(blktid) end_time = time.time() miscutils.fwdebug_print(f"Done getting block task info from DB ({end_time - start_time} secs)") for bltdict in bltasks.values(): print("Block status = ", bltdict['status']) if bltdict['status'] == pfwdefs.PF_EXIT_DRYRUN: print("setting return value to dryrun") retval = bltdict['status'] elif bltdict['status'] != pfwdefs.PF_EXIT_SUCCESS: num_bltasks_failed += 1 msg2 += f"\t{bltdict['name']}" if bltdict['label'] is not None: msg2 += f" - {bltdict['label']}" msg2 += " failed\n" if bltdict['name'] == 'begblock': # try to read the begblock.out and begblock.err files print("Trying to get begblock.out and begblock.err") msg2 += get_subblock_output("begblock") # try to get QCF messages (especially from query codes) begblock_tid = int(config['task_id']['begblock']) sql = f"select id from task where parent_task_id={begblock_tid:d} and status!=0" curs = dbh.cursor() curs.execute(sql) res = curs.fetchall() msg2 += "\n===== QCF Messages =====\n" msg2 += "\n begblock\n" wrapids = [blktid, begblock_tid] for r in res: wrapids.append(r[0]) wrapmsg = {} if qdbh is not None: miscutils.fwdebug_print("Querying QCF messages") start_time = time.time() wrapmsg = qdbh.get_qcf_messages_for_wrappers(wrapids) end_time = time.time() miscutils.fwdebug_print(f"Done querying QCF messages ({end_time-start_time} secs)") miscutils.fwdebug_print(f"wrapmsg = {wrapmsg}") if not wrapmsg: msg2 += " No QCF messages\n" else: for msgs in wrapmsg.values(): for m in msgs: msg2 += " " + m['message'] + "\n" retval = pfwdefs.PF_EXIT_FAILURE if retval != pfwdefs.PF_EXIT_DRYRUN: print(f"\n\nChecking job status from pfw_job table in DB ({pfwdefs.PF_EXIT_SUCCESS} is success)") miscutils.fwdebug_print("Getting job info from DB") start_time = time.time() jobinfo = dbh.get_job_info({'pfw_block_task_id': blktid}) end_time = time.time() miscutils.fwdebug_print(f"Done getting job info from DB ({end_time - start_time} secs)") miscutils.fwdebug_print("Getting wrapper info from DB") start_time = time.time() wrapinfo = dbh.get_wrapper_info(pfw_attempt_id=attid, pfw_block_task_id=blktid) if retval != pfwdefs.PF_EXIT_SUCCESS: jobwrap = dbh.get_jobwrapper_info(id=attid) else: jobwrap = {} end_time = time.time() miscutils.fwdebug_print(f"Done getting wrapper info from DB ({end_time - start_time} secs)") else: msg = f"Could not find task id for block {blockname} in config.des" print("Error:", msg) if 'attempt' in config['task_id']: miscutils.fwdebug_print("Saving pfw message") start_time = time.time() Messaging.pfw_message(dbh, attid, config['task_id']['attempt'], msg, pfwdefs.PFWDB_MSG_INFO, 'blockpost.out', 0) end_time = time.time() miscutils.fwdebug_print(f"Done saving pfw message ({end_time - start_time} secs)") print("all the task ids:", config['task_id']) archive = None if pfwdefs.HOME_ARCHIVE in config: archive = config.getfull(pfwdefs.HOME_ARCHIVE) logfullnames = dbh.get_log_fullnames(attid, archive) #dbh.close() print("len(jobinfo) = ", len(jobinfo)) print("len(wrapinfo) = ", len(wrapinfo)) job_byblk = pfwutils.index_job_info(jobinfo) print("blktid: ", blktid) print("job_byblk:", job_byblk) if blktid not in job_byblk: print(f"Warn: could not find jobs for block {blknum}") print(" This is ok if attempt died before jobs ran") print(" block task_ids in job_byblk:", list(job_byblk.keys())) else: wrap_byjob, _ = pfwutils.index_wrapper_info(wrapinfo) #for wid,jwr in jobwrap.iteritems(): #print wid,jwr # in case the post wrapper stuff failed, internally mark the task # as failed to retrieve the info later for wrapb in wrap_byjob.values(): for wrapper in wrapb.values(): if wrapper['parent_task_id'] in jobwrap and jobwrap[wrapper['parent_task_id']]['status'] is not None \ and wrapper['status'] is not None and jobwrap[wrapper['parent_task_id']]['status'] > wrapper['status']: wrapper['status'] = jobwrap[wrapper['parent_task_id']]['status'] #print "wrap_bymod:", wrap_bymod jobtid = '' jobdict = {} for jobtid, jobdict in sorted(job_byblk[blktid].items()): failedwraps[jobtid] = [] whyfailwraps[jobtid] = [] jobkeys = "" # don't print out successful wrappers if jobtid in wrap_byjob and jobdict['status'] == pfwdefs.PF_EXIT_SUCCESS: continue if jobdict['jobkeys'] is not None: jobkeys = jobdict['jobkeys'] #print "jobkeys = ", jobkeys, type(jobkeys) submit_job_path = f"{config.getfull('work_dir')}/B{int(config.getfull('blknum')):02d}-{config.getfull('blockname'):s}/{int(jobdict['jobnum']):04d}" msg2 += f"\n\t{pfwutils.pad_jobnum(jobdict['jobnum'])} ({jobkeys}) " if jobtid not in wrap_byjob: msg2 += "\tNo wrapper instances" else: #print "wrapnum in job =", wrap_byjob[jobtid].keys() maxwrap = max(wrap_byjob[jobtid]) #print "maxwrap =", maxwrap modname = wrap_byjob[jobtid][maxwrap]['modname'] #print "modname =", modname msg2 += f"{len(wrap_byjob[jobtid]):d}/{jobdict['expect_num_wrap']} {modname}" # determine wrappers for this job without success exit for wrapnum, wdict in wrap_byjob[jobtid].items(): if wdict['status'] is None or wdict['status'] != pfwdefs.PF_EXIT_SUCCESS: if wdict['modname'] == modname: failedwraps[jobtid].append(wrapnum) else: whyfailwraps[jobtid].append(wrapnum) if jobdict['status'] == pfwdefs.PF_EXIT_EUPS_FAILURE: msg2 += " - FAIL - EUPS setup failure" retval = jobdict['status'] elif jobdict['status'] == pfwdefs.PF_EXIT_CONDOR: msg2 += " - FAIL - Condor/Globus failure" retval = jobdict['status'] elif jobdict['status'] is None: msg2 += " - FAIL - NULL status" retval = pfwdefs.PF_EXIT_FAILURE elif jobdict['status'] != pfwdefs.PF_EXIT_SUCCESS: msg2 += " - FAIL - Non-zero status" retval = jobdict['status'] if jobdict['status'] != pfwdefs.PF_EXIT_SUCCESS: msg2 += f"\n\t\t{submit_job_path}/runjob.out " msg2 += '\n' # print pfw_messages if 'message' in jobdict: print('\nmessages: ', jobdict['message']) for msgdict in sorted(jobdict['message'], key=lambda k: k['message_time']): level = int(msgdict['message_lvl']) levelstr = 'info' if level == pfwdefs.PFWDB_MSG_WARN: levelstr = 'WARN' elif level == pfwdefs.PFWDB_MSG_ERROR: levelstr = 'ERROR' msg2 += "\t\t{} - {}\n".format(levelstr, msgdict['message'].replace('\n', '\n\t\t\t')) if jobtid in wrap_byjob: # print log file name for failed/unfinished wrappers for wrapnum in failedwraps[jobtid]: wrapdict = wrap_byjob[jobtid][wrapnum] if wrapdict['log'] in logfullnames: msg2 += f"\t\t{wrapnum} - {logfullnames[wrapdict['log']]}\n" else: msg2 += f"\t\t{wrapnum} - Could not find log in archive {wrapdict['log']})\n" wrapmsg = get_qcf_messages(qdbh, [wrapdict['task_id']]) msg2 += print_qcf_messages(wrapdict, wrapmsg) msg2 += '\n' # If weirdness happened in run, print a message if whyfailwraps[jobtid]: msg2 += "\n*** Contact framework developers. Wrappers ran after at least 1 wrapper from a previous module that doesn't have success status.\n" msg2 += f"\t{','.join(whyfailwraps[jobtid])}\n" except Exception as exc: if sem is not None: del sem msg2 += "\n\nEncountered error trying to gather status information for email." msg2 += "\nCheck output for blockpost for further details." print("\n\nEncountered error trying to gather status information for email") print(f"{exc.__class__.__name__}: {str(exc)}") (extype, exvalue, trback) = sys.exc_info() traceback.print_exception(extype, exvalue, trback, file=sys.stdout) retval = pfwdefs.PF_EXIT_FAILURE retval = int(retval) + verify_status print("before email retval =", retval) when_to_email = 'run' if 'when_to_email' in config: when_to_email = config.getfull('when_to_email').lower() if miscutils.convertBool(dryrun): if when_to_email != 'never': print("dryrun = ", dryrun) print("Sending dryrun email") if retval == pfwdefs.PF_EXIT_DRYRUN: msg1 = f"{run}: In dryrun mode, block {blockname} has finished successfully." else: msg1 = f"{run}: In dryrun mode, block {blockname} has failed." send_email(config, blockname, retval, "", msg1, msg2) else: print("Not sending dryrun email") print("retval = ", retval) retval = pfwdefs.PF_EXIT_DRYRUN elif retval: if when_to_email != 'never': print("Sending block failed email\n") msg1 = f"{run}: block {blockname} has failed." send_email(config, blockname, retval, "", msg1, msg2) else: print("Not sending failed email") print("retval = ", retval) elif retval == pfwdefs.PF_EXIT_SUCCESS: if when_to_email == 'block': msg1 = f"{run}: block {blockname} has finished successfully." msg2 = "" print("Sending success email\n") send_email(config, blockname, retval, "", msg1, msg2) elif when_to_email == 'run': numblocks = len(miscutils.fwsplit(config[pfwdefs.SW_BLOCKLIST], ',')) if int(config[pfwdefs.PF_BLKNUM]) == numblocks: msg1 = f"{run}: run has finished successfully." msg2 = "" print("Sending success email\n") send_email(config, blockname, retval, "", msg1, msg2) else: print("Not sending run email because not last block") print("retval = ", retval) else: print("Not sending success email") print("retval = ", retval) else: print("Not sending email") print("retval = ", retval) # Store values in DB and hist file #dbh = None if miscutils.convertBool(config[pfwdefs.PF_USE_DB_OUT]): if dbh is None: dbh = pfwdb.PFWDB(config.getfull('submit_des_services'), config.getfull('submit_des_db_section')) if blktid is not None: print("Updating end of block task", blktid) dbh.end_task(blktid, retval, True) else: print("Could not update end of block task without block task id") if retval != pfwdefs.PF_EXIT_SUCCESS: print("Updating end of attempt", config['task_id']['attempt']) dbh.end_task(config['task_id']['attempt'], retval, True) dbh.commit() #dbh.close() print("before next block retval = ", retval) if retval == pfwdefs.PF_EXIT_SUCCESS: # Get ready for next block config.inc_blknum() with open(configfile, 'w') as cfgfh: config.write(cfgfh) print("new blknum = ", config[pfwdefs.PF_BLKNUM]) print("number of blocks = ", len(miscutils.fwsplit(config[pfwdefs.SW_BLOCKLIST], ','))) if int(config[pfwdefs.PF_BLKNUM]) > len(miscutils.fwsplit(config[pfwdefs.SW_BLOCKLIST], ',')) and miscutils.convertBool(config[pfwdefs.PF_USE_DB_OUT]): #dbh = pfwdb.PFWDB(config.getfull('submit_des_services'), config.getfull('submit_des_db_section')) updatevals = {'PROCESSING_STATE': 'PASS'} wherevals = {'PFW_ATTEMPT_ID': attid} dbh.basic_update_row('ATTEMPT_STATE', updatevals, wherevals) dbh.commit() #dbh.close() elif miscutils.convertBool(config[pfwdefs.PF_USE_DB_OUT]): #dbh = pfwdb.PFWDB(config.getfull('submit_des_services'), config.getfull('submit_des_db_section')) updatevals = {'PROCESSING_STATE': 'FAIL'} wherevals = {'PFW_ATTEMPT_ID': attid} dbh.basic_update_row('ATTEMPT_STATE', updatevals, wherevals) dbh.commit() #dbh.close() if dbh is not None: dbh.close() miscutils.fwdebug_print(f"Returning retval = {retval} ({type(retval)})") miscutils.fwdebug_print("END") debugfh.close() if miscutils.fwdebug_check(3, 'PFWPOST_DEBUG'): miscutils.fwdebug_print(f"Exiting with = {exitcode}") miscutils.fwdebug_print(f"type of exitcode = {type(exitcode)}") sys.stdout = realstdout sys.stderr = realstderr return int(retval)