Example #1
0
def main(argv):
    # Parse arguments. User must input a job name.
    parser = argparse.ArgumentParser(description='Main program to start or restart ' + \
             'calibration spinup for the National Water Model')
    parser.add_argument('jobID',metavar='jobID',type=str,nargs='+',
                        help='Job ID specific to calibration spinup.')
    parser.add_argument('--optDbPath',type=str,nargs='?',
                        help='Optional alternative path to SQLite DB file.')
    
    args = parser.parse_args()
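    # Usage sketch (the entry-point file name and job ID below are hypothetical;
    # neither is shown in this snippet):
    #   python spinup.py 5
    #   python spinup.py 5 --optDbPath /path/to/wrfHydroCalib.db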
    
    # If the SQLite file does not exist, throw an error.
    if args.optDbPath is not None:
        if not os.path.isfile(args.optDbPath):
            print "ERROR: " + args.optDbPath + " Does Not Exist."
            sys.exit(1)
        else:
            dbPath = args.optDbPath
    else:
        dbPath = topDir + "wrfHydroCalib.db"
        if not os.path.isfile(dbPath):
            print "ERROR: SQLite3 DB file: " + dbPath + " Does Not Exist."
            sys.exit(1)
    
    # Establish the beginning timestamp for this program.
    begTimeStamp = datetime.datetime.now()
    
    # Get current user who is running this program.
    userTmp = pwd.getpwuid(os.getuid()).pw_name
    
    # Initialize object to hold status and job information
    jobData = statusMod.statusMeta()
    jobData.jobID = int(args.jobID[0])
    jobData.dbPath = dbPath
    
    # Establish database connection.
    db = dbMod.Database(jobData)
    try:
        db.connect(jobData)
    except:
        print jobData.errMsg
        sys.exit(1)
        
    # Extract job data from database
    try:
        db.jobStatus(jobData)
    except:
        print jobData.errMsg
        sys.exit(1)
    
    # Pull extensive meta-data describing the job from the config file.
    configPath = str(jobData.jobDir) + "/setup.config"
    if not os.path.isfile(configPath):
        print "ERROR: Configuration file: " + configPath + " not found."
        sys.exit(1)

    try:        
        staticData = configMod.readConfig(configPath)
    except:
        print "ERROR: Failure to read configuration file: " + configPath
        sys.exit(1)
        
    if staticData.coldStart == 1:
        print "ERROR: User has specified a cold-start option for calibration. Exiting...."
        sys.exit(0)
    if staticData.optSpinFlag == 1:
        print "ERROR: User has specified an optional spinup file. Exiting...."
        sys.exit(0)
    
    # Assign the SQL command from the config file into the jobData structure
    jobData.gSQL = staticData.gSQL
        
    # Check gages in directory to match what's in the database
    try:
        jobData.checkGages2(db)
    except:
        errMod.errOut(jobData)
    
    # Establish a LOCK file to secure this Python program and make sure
    # no other instances over-step it. This is mostly designed to deal
    # with nohup processes being kicked off on Yellowstone/Cheyenne/crontabs arbitrarily.
    # Just another check/balance here.
    pyLockPath = str(jobData.jobDir) + "/PYTHON.LOCK"
    if os.path.isfile(pyLockPath):
        # Either a job is still running, or was running
        # and was killed.

        print 'LOCK FILE FOUND.'
        # Read in to get PID number
        pidObj = pd.read_csv(pyLockPath)
        pidCheck = int(pidObj.PID[0])
        if errMod.check_pid(pidCheck):
                print "JOB: " + str(pidCheck) + \
                      " Is still running."
                sys.exit(0)
        else:
                print "JOB: " + str(pidCheck) + \
                      " Has Failed. Removing LOCK " + \
                      " file."
                os.remove(pyLockPath)
                fileObj = open(pyLockPath,'w')
                fileObj.write('\"PID\"\n')
                fileObj.write(str(os.getpid()))
                fileObj.close()
        # TEMPORARY FOR CHEYENNE. Since all cron jobs are launched
        # from an administrative node, we cannot monitor the process at 
        # all, which is an inconvenience. So.... we will check the last
        # modified time. If it's more than 15 minutes old, we will assume
        # the process is no longer running and can continue on with calibration.
        #dtRunCheck = datetime.datetime.now() - datetime.datetime.fromtimestamp(os.path.getmtime(pyLockPath))
        #if dtRunCheck.seconds/60.0 < 15.0:
        #    # We are going to assume a previous process is still running on the system. 
        #    # exit gracefully.
        #    print 'ASSUMING PROCESS STILL RUNNING'
        #    sys.exit(0)
        #else:
        #    # We are assuming the process is no longer running on the system. Allow
        #    # the workflow to continue.
        #    print 'ALLOWING WORKFLOW TO CONTINUE. REMOVING LOCK FILE'
        #    os.remove(pyLockPath)
        #    fileObj = open(pyLockPath,'w')
        #    fileObj.write('\"PID\"\n')
        #    fileObj.write(str(os.getpid()))
        #    fileObj.close()
    else:
        # Write a LOCK file for this program.
        fileObj = open(pyLockPath,'w')
        fileObj.write('\"PID\"\n')
        fileObj.write(str(os.getpid()))
        fileObj.close()
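    # The LOCK file is a tiny two-line CSV ("PID" header plus the process ID) so that
    # pd.read_csv() above can recover the PID on the next invocation. A helper like the
    # commented sketch below could replace the duplicated write blocks; it is illustrative
    # only (writeLockFile is not part of the workflow modules).
    #def writeLockFile(lockFile):
    #    fileObj = open(lockFile,'w')
    #    fileObj.write('\"PID\"\n')
    #    fileObj.write(str(os.getpid()))
    #    fileObj.close()
    #writeLockFile(pyLockPath)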
    
    # Some housekeeping here. If the spinup is already complete, throw an error.
    # Also, if this is a re-initiation under a different user, require the new
    # user to enter a new contact that will be updated in the database.
    if int(jobData.spinComplete) == 1:
        jobData.errMsg = "ERROR: Spinup for job ID: " + str(jobData.jobID) + \
                         " has already completed."
        errMod.errOut(jobData)
        
    if userTmp != jobData.owner:
        print "User: "******" is requesting to takeover jobID: " + \
              str(jobData.jobID) + " from owner: " + str(jobData.owner)
        strTmp = "Please enter new email address. Leave blank if no email " + \
                 "change is desired. NOTE if you leave both email and Slack " + \
                 "information blank, no change in contact will occur. Only " + \
                 "the owner will be modified:"
        newEmail = raw_input(strTmp)
        #strTmp = "Please enter Slack channel:"
        #newSlackChannel = raw_input(strTmp)
        #strTmp = "Please enter Slack token:"
        #newSlackToken = raw_input(strTmp)
        #strTmp = "Please enter Slack user name:"
        #newSlackUName = raw_input(strTmp)
        changeFlag = 1
        #if len(newSlackChannel) != 0 and len(newSlackToken) == 0:
        #    print "ERROR: You must specify an associated Slacker API token."
        #    sys.exit(1)
        #if len(newSlackChannel) != 0 and len(newSlackUName) == 0:
        #    print "ERROR: You must specify an associated Slacker user name."
        #    sys.exit(1)
        #if len(newSlackToken) != 0 and len(newSlackChannel) == 0:
        #    print "ERROR: You must specify an associated Slacker channel name."
        #    sys.exit(1)
        #if len(newSlackToken) != 0 and len(newSlackUName) == 0:
        #    print "ERROR: You must specify an associated Slacker user name."
        #    sys.exit(1)
        #if len(newSlackUName) != 0 and len(newSlackChannel) == 0:
        #    print "ERROR: You must specify an associated Slacker channel name."
        #    sys.exit(1)
        #if len(newSlackUName) != 0 and len(newSlackToken) == 0:
        #    print "ERROR: You must specify an associated Slacker API token."
        #    sys.exit(1)
        #if len(newSlackChannel) != 0 and len(newEmail) != 0:
        #    print "ERROR: You cannot specify both email and Slack for notifications."
        #    sys.exit(1)
        #if len(newSlackChannel) == 0 and len(newEmail) == 0:
        #    changeFlag = 0
            
        # PLACEHOLDER FOR CHECKING SLACK CREDENTIALS
        
        # TEMPORARY FOR VERSION 1.2 NWM CALIBRATION!!!!
        # If a new owner takes over, simply change the owner, but keep all 
        # other contact information the same.
        newEmail = jobData.email
        newSlackChannel = jobData.slChan
        newSlackToken = jobData.slToken
        newSlackUName = jobData.slUser
        if not newEmail:
            newEmail = ''
        if not newSlackChannel:
            newSlackChannel = ''
            newSlackToken = ''
            newSlackUName = ''
            
        try:
            db.updateJobOwner(jobData,userTmp,newEmail,newSlackChannel,newSlackToken,newSlackUName,changeFlag)
        except:
            errMod.errOut(jobData)
            
        jobData.genMsg = "MSG: User: "******" Is Taking Over JobID: " + str(jobData.jobID) + \
                         " From Owner: " + str(jobData.owner)
        errMod.sendMsg(jobData)
        
    # Begin an "infinite" do loop. This loop will continue to loop through all 
    # the basins until spinups are complete. Basins are allowed ONE failure. A restart
    # will be attempted. If the restart fails again, a LOCK file is placed into the
    # run directory and an error email is sent to the user.
    completeStatus = False
    
    # Create a "key" array. This array is of length [numBasins] and is initialized to 0.0.
    # Each array element can have the following values based on current model status:
    # 0.0 - Initial value
    # 0.5 - Model simulation in progress
    # 1.0 - Model simulation complete
    # -0.5 - Model simulation failed once and a restart has been attempted
    # -1.0 - Model has failed twice. A LOCK file has been created.
    # Once all array elements are 1.0, then completeStatus goes to True, an entry into
    # the database occurs, and the program will complete.
    keySlot = np.empty(len(jobData.gages))
    keySlot[:] = 0.0
    entryValue = float(len(jobData.gages))
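    # For readability when logging, the spinup status codes above could be mapped to
    # labels with a small dictionary. Illustrative sketch only; spinStatusLabels does
    # not exist elsewhere in the workflow.
    #spinStatusLabels = {0.0: 'NOT STARTED',
    #                    0.5: 'MODEL RUNNING',
    #                    1.0: 'MODEL COMPLETE',
    #                    -0.5: 'FAILED ONCE - RESTART ATTEMPTED',
    #                    -1.0: 'FAILED TWICE - LOCK FILE CREATED'}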
    
    # Create an array to hold system job ID values. This will only be used for
    # PBS as qstat has demonstrated slow behavior when doing a full qstat command. 
    # We will track job ID values and do a qstat <jobID> and populate this array
    # to keep track of things. 
    pbsJobId = np.empty([len(jobData.gages)],np.int64)
    pbsJobId[:] = -9999
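    # A per-job "qstat <jobID>" check is typically far cheaper than a full qstat listing.
    # A minimal sketch of such a check is commented below; it assumes a PBS qstat command
    # is on PATH and is NOT how the workflow's job-submission modules actually do it.
    #def pbsJobActive(jobIdTmp):
    #    import subprocess
    #    devNull = open(os.devnull,'w')
    #    status = subprocess.call(['qstat',str(jobIdTmp)],stdout=devNull,stderr=devNull)
    #    devNull.close()
    #    return status == 0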

    while not completeStatus:
        # Walk through spinup directory for each basin. Determine the status of
        # the model runs by the files available. If restarting, modify the 
        # namelist files appropriately. Then, restart the model. Once all
        # basins have been accounted for, fire off the monitoring program through
        # nohup to keep track of the models. If anything goes wrong, notifications
        # will either be emailed per the user's info, or piped to Slack for group
        # notification.
        # Loop through each basin. Perform the following steps:
        # 1.) If status is -0.5,0.0, or 0.5, check to see if the model is running
        #     for this basin.
        # 2.) If the model is not running, check for expected output and perform
        #     necessary logistics. Continue to the next basin.
        # If the status goes to -1.0, a LOCK file is created and must be manually
        # removed from the user. Once the program detects this, it will restart the
        # model and the status goes back to 0.5.
        # If the status is -0.5 and no job is running, output must be complete, or 
        # status goes to -1.0.
        # If output is not complete, the model is still running, status stays at 0.5.
        # If job is not running, and output has been completed, status goes to 1.0.
        # This continues indefinitely until statuses for ALL basins go to 1.0.
        for basin in range(0,len(jobData.gages)):
            try:
                spinupMod.runModel(jobData,staticData,db,jobData.gageIDs[basin],jobData.gages[basin],keySlot,basin,pbsJobId)
            except:
                errMod.errOut(jobData)
            
            # TEMPORARY FOR CHEYENNE
            # Check to make sure program hasn't passed a prescribed time limit. If it has,
            # exit gracefully.
            #timeCheckStamp = datetime.datetime.now()
            #programDtCheck = timeCheckStamp - begTimeStamp
            #if programDtCheck.seconds/60.0 > 90.0: 
            #    # 90-minutes)
            #    try:
            #        fileObj = open(pyLockPath,'a')
            #        fileObj.write('WORKFLOW HAS HIT TIME LIMIT - EXITING....\n')
            #        fileObj.close()
            #    except:
            #        jobData.errMsg = "ERROR: Unable to update workflow LOCK file: " + pyLockPath
            #        errMod.errOut(jobData)
        
        # Check to see if program requirements have been met.
        if keySlot.sum() == entryValue:
            jobData.spinComplete = 1
            try:
                db.updateSpinupStatus(jobData)
            except:
                errMod.errOut(jobData)
            jobData.genMsg = "SPINUP FOR JOB ID: " + str(jobData.jobID) + " COMPLETE."
            errMod.sendMsg(jobData)
            completeStatus = True
            
        # Open the Python LOCK file. Write a blank line to the file and close it.
        # This action will simply modify the file modification time while only adding
        # a blank line.
        try:
            fileObj = open(pyLockPath,'a')
            fileObj.write('\n')
            fileObj.close()
        except:
            jobData.errMsg = "ERROR: Unable to update workflow LOCK file: " + pyLockPath
            errMod.errOut(jobData)
            
    # Remove LOCK file
    os.remove(pyLockPath)
Example #2
0
def main(argv):
    # Parse arguments. User must input a job name.
    parser = argparse.ArgumentParser(description='Main program to initialize ' + \
             'calibration for the National Water Model')
    parser.add_argument('configFile',
                        metavar='config',
                        type=str,
                        nargs='+',
                        help='Config file to initialize job.')
    parser.add_argument('--optDbPath',
                        type=str,
                        nargs='?',
                        help='Optional alternative path to SQLite DB file.')

    args = parser.parse_args()
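    # Usage sketch (the entry-point file name is hypothetical and the config path is
    # only an example):
    #   python initCalibJob.py setup.config
    #   python initCalibJob.py setup.config --optDbPath /path/to/wrfHydroCalib.db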

    # If the SQLite file does not exist, throw an error.
    if args.optDbPath is not None:
        if not os.path.isfile(args.optDbPath):
            print "ERROR: " + args.optDbPath + " Does Not Exist."
            sys.exit(1)
        else:
            dbPath = args.optDbPath
    else:
        dbPath = topDir + "wrfHydroCalib.db"
        if not os.path.isfile(dbPath):
            print "ERROR: SQLite3 DB file: " + dbPath + " Does Not Exist."
            sys.exit(1)

    # Initialize job using setup.parm and calibration DB.
    try:
        jobData = configMod.createJob(args)
    except:
        print "ERROR: Failure to initialize calibration workflow job."
        sys.exit(1)

    jobData.dbPath = dbPath

    # Establish database connection.
    db = dbMod.Database(jobData)
    try:
        db.connect(jobData)
    except:
        errMod.errOut(jobData)

    # First check to see if unique Job ID already exists.
    try:
        db.getJobID(jobData)
    except:
        errMod.errOut(jobData)

    # If a job ID value was found, this means information from this configuration
    # file has already been initiated by the workflow into the database.
    if int(jobData.jobID) != -9999:
        jobData.errMsg = "ERROR: Information for this job has already " + \
                         "been entered as job ID: " + str(jobData.jobID)
        errMod.errOut(jobData)

    # Extract list of gages to perform workflow on
    try:
        calibIoMod.getGageList(jobData, db)
    except:
        errMod.errOut(jobData)

    # Check to see if this job ID contains any entries in other tables. If it does,
    # warn the user that this data will be wiped, and prompt the user to confirm
    # they want to delete the data from the other tables.
    try:
        statusTmp = db.checkPreviousEntries(jobData)
    except:
        errMod.errOut(jobData)

    # If any entries in the tables were found, warn the user that tables from an
    # orphaned ghost job are being deleted. This may be a situation where a previous
    # job was run in the DB, it was removed from Job_Meta, but the remaining tables
    # weren't cleaned up.
    if not statusTmp:
        print "WARNING: Old orphaned table entries from this jobID are being deleted."
        try:
            db.cleanupJob(jobData)
        except:
            errMod.errOut(jobData)

    # Create DB entries for job name
    try:
        db.enterJobID(jobData)
    except:
        errMod.errOut(jobData)

    # Pull Job ID from newly created job. Will be used for calibration
    # parameter DB entries
    try:
        db.getJobID(jobData)
    except:
        errMod.errOut(jobData)

    # Create necessary run directories to hold output, analysis, etc.
    try:
        calibIoMod.setupModels(jobData, db, args, libPathTop)
    except:
        errMod.errOut(jobData)

    # Create DB entries to log the parameters being calibrated.
    try:
        db.enterJobParms(jobData)
    except:
        errMod.errOut(jobData)

    # Create empty table to hold calibrated parameter values that will be
    # calculated during calibration.
    try:
        db.populateParmTable(jobData)
    except:
        errMod.errOut(jobData)

    jobData.nGages = len(jobData.gages)
    try:
        jobData.checkGages2(db)
    except:
        errMod.errOut(jobData)

    # Create empty table entries into the Calib_Stats/Sens_Stats tables to be filled in as the workflow progresses.
    # If table entries have already been entered, continue on. This only needs to be done ONCE. Moved this
    # from calib.py as there's no reason to do this during the spinup program.
    for basin in range(0, len(jobData.gages)):
        domainID = jobData.gageIDs[basin]

        if domainID == -9999:
            jobData.errMsg = "ERROR: Unable to locate domainID for gage: " + str(
                jobData.gages[basin])
            errMod.errOut(jobData)

        if jobData.calibFlag == 1:
            try:
                db.populateCalibTable(jobData, domainID,
                                      str(jobData.gages[basin]))
            except:
                errMod.errOut(jobData)

        if jobData.sensFlag == 1:
            try:
                db.populateSensTable(jobData, domainID,
                                     str(jobData.gages[basin]))
            except:
                errMod.errOut(jobData)

    # Disconnect from the calibration database.
    try:
        db.disconnect(jobData)
    except:
        errMod.errOut(jobData)

    # Print the newly created job ID to the user
    jobData.genMsg = "WORKFLOW HAS BEEN SETUP FOR OWNER: " + str(jobData.owner) + \
                     " JOB ID = " + str(jobData.jobID)
    print jobData.genMsg
    errMod.sendMsg(jobData)
Example #3
0
def main(argv):
    # Parse arguments. User must input a job name.
    parser = argparse.ArgumentParser(description='Main program to start or restart ' + \
             'sensitivity analysis for WRF-Hydro')
    parser.add_argument('jobID',metavar='jobID',type=str,nargs='+',
                        help='Job ID specific to your sensitivity/calibration workflow job.')
    parser.add_argument('--optDbPath',type=str,nargs='?',
                        help='Optional alternative path to SQLite DB file.')
    
    args = parser.parse_args()
    
    # If the SQLite file does not exist, throw an error.
    if args.optDbPath is not None:
        if not os.path.isfile(args.optDbPath):
            print "ERROR: " + args.optDbPath + " Does Not Exist."
            sys.exit(1)
        else:
            dbPath = args.optDbPath
    else:
        dbPath = topDir + "wrfHydroCalib.db"
        if not os.path.isfile(dbPath):
            print "ERROR: SQLite3 DB file: " + dbPath + " Does Not Exist."
            sys.exit(1)
    
    # Establish the beginning timestamp for this program.
    begTimeStamp = datetime.datetime.now()
    
    # Get current user who is running this program.
    userTmp = pwd.getpwuid(os.getuid()).pw_name
    
    # Initialize object to hold status and job information
    jobData = statusMod.statusMeta()
    jobData.jobID = int(args.jobID[0])
    jobData.dbPath = dbPath
    
    # Establish database connection.
    db = dbMod.Database(jobData)
    try:
        db.connect(jobData)
    except:
        print jobData.errMsg
        sys.exit(1)
        
    # Extract job data from database
    try:
        db.jobStatus(jobData)
    except:
        print jobData.errMsg
        sys.exit(1)
        
    # If the sensitivity flag is 0, simply exit gracefully as the user specified
    # not to run the sensitivity analysis.
    if jobData.sensFlag != 1:
        print "ERROR: Sensitivity flag was set to 0 for this workflow."
        sys.exit(1)
        
    # Establish a LOCK file to secure this Python program and make sure
    # no other instances over-step it. This is mostly designed to deal
    # with nohup processes being kicked off on Yellowstone/Cheyenne/crontabs arbitrarily.
    # Just another check/balance here.
    lockPath = str(jobData.jobDir) + "/PYTHON.LOCK"
    if os.path.isfile(lockPath):
        # Either a job is still running, or was running
        # and was killed.

        print 'LOCK FILE FOUND.'
        # Read in to get PID number
        pidObj = pd.read_csv(lockPath)
        pidCheck = int(pidObj.PID[0])
        if errMod.check_pid(pidCheck):
                print "JOB: " + str(pidCheck) + \
                      " Is still running."
                sys.exit(0)
        else:
                print "JOB: " + str(pidCheck) + \
                      " Has Failed. Removing LOCK " + \
                      " file."
                os.remove(lockPath)
                fileObj = open(lockPath,'w')
                fileObj.write('\"PID\"\n')
                fileObj.write(str(os.getpid()))
                fileObj.close()
        # TEMPORARY FOR CHEYENNE. Since all cron jobs are launched
        # from an administrative node, we cannot monitor the process at 
        # all, which is an inconvenience. So.... we will check the last
        # modified time. If it's more than 30 minutes old, we will assume
        # the process is no longer running and can continue on with calibration.
        #dtRunCheck = datetime.datetime.now() - datetime.datetime.fromtimestamp(os.path.getmtime(lockPath))
        #if dtRunCheck.seconds/60.0 < 15.0:
        #    # We are going to assume a previous process is still running on the system. 
        #    # exit gracefully.
        #    print 'ASSUMING PROCESS STILL RUNNING'
        #    sys.exit(0)
        #else:
        #    # We are assuming the process is no longer running on the system. Allow
        #    # the workflow to continue.
        #    print 'ALLOWING WORKFLOW TO CONTINUE. REMOVING LOCK FILE'
        #    os.remove(lockPath)
        #    fileObj = open(lockPath,'w')
        #    fileObj.write('\"PID\"\n')
        #    fileObj.write(str(os.getpid()))
        #    fileObj.close()
    else:
        # Write a LOCK file for this program.
        fileObj = open(lockPath,'w')
        fileObj.write('\"PID\"\n')
        fileObj.write(str(os.getpid()))
        fileObj.close()
        
    # Pull extensive meta-data describing the job from the config file.
    configPath = str(jobData.jobDir) + "/setup.config"
    if not os.path.isfile(configPath):
        print "ERROR: Configuration file: " + configPath + " not found."
        sys.exit(1)
    try:        
        staticData = configMod.readConfig(configPath)
    except:
        print "ERROR: Failure to read configuration file: " + configPath
        sys.exit(1)
        
    # Assign the SQL command from the config file into the jobData structure
    jobData.gSQL = staticData.gSQL
    
    # Check gages in directory to match what's in the database
    try:
        jobData.checkGages2(db)
    except:
        errMod.errOut(jobData)
        
    # Some housekeeping here. If the sensitivity analysis is already complete, throw an error.
    # Also ensure the spinup has been entered as complete. This is necessary for the
    # sensitivity analysis to run.
    # Also, if this is a re-initiation under a different user, require the new
    # user to enter a new contact that will be updated in the database.
    if int(jobData.spinComplete) != 1:
        # Check to see if optional spinup options were enabled. If so, update the spinup status.
        if staticData.coldStart == 1 or staticData.optSpinFlag != 0:
            print "Found optional spinup alternatives"
            jobData.spinComplete = 1
            try:
                db.updateSpinupStatus(jobData)
            except:
                errMod.errOut(jobData)
        else:
            jobData.errMsg = "ERROR: Spinup for job ID: " + str(jobData.jobID) + \
                             " is NOT complete. You must complete the spinup in order" + \
                             " to run calibration."
            errMod.errOut(jobData)
        
    if int(jobData.sensComplete) == 1:
        jobData.errMsg = "ERROR: Sensitivity for job ID: " + str(jobData.jobID) + \
                         " has already completed."
        errMod.errOut(jobData)
        
    if userTmp != jobData.owner:
        print "User: "******" is requesting to takeover jobID: " + \
              str(jobData.jobID) + " from owner: " + str(jobData.owner)
        strTmp = "Please enter new email address. Leave blank if no email " + \
                 "change is desired. NOTE if you leave both email and Slack " + \
                 "information blank, no change in contact will occur. Only " + \
                 "the owner will be modified:"
        newEmail = raw_input(strTmp)
        #strTmp = "Please enter Slack channel:"
        #newSlackChannel = raw_input(strTmp)
        #strTmp = "Please enter Slack token:"
        #newSlackToken = raw_input(strTmp)
        #strTmp = "Please enter Slack user name:"
        #newSlackUName = raw_input(strTmp)
        changeFlag = 1
        #if len(newSlackChannel) != 0 and len(newSlackToken) == 0:
        #    print "ERROR: You must specify an associated Slacker API token."
        #    sys.exit(1)
        #if len(newSlackChannel) != 0 and len(newSlackUName) == 0:
        #    print "ERROR: You must specify an associated Slacker user name."
        #    sys.exit(1)
        #if len(newSlackToken) != 0 and len(newSlackChannel) == 0:
        #    print "ERROR: You must specify an associated Slacker channel name."
        #    sys.exit(1)
        #if len(newSlackToken) != 0 and len(newSlackUName) == 0:
        #    print "ERROR: You must specify an associated Slacker user name."
        #    sys.exit(1)
        #if len(newSlackUName) != 0 and len(newSlackChannel) == 0:
        #    print "ERROR: You must specify an associated Slacker channel name."
        #    sys.exit(1)
        #if len(newSlackUName) != 0 and len(newSlackToken) == 0:
        #    print "ERROR: You must specify an associated Slacker API token."
        #    sys.exit(1)
        #if len(newSlackChannel) != 0 and len(newEmail) != 0:
        #    print "ERROR: You cannot specify both email and Slack for notifications."
        #    sys.exit(1)
        #if len(newSlackChannel) == 0 and len(newEmail) == 0:
        #    changeFlag = 0
            
        # PLACEHOLDER FOR CHECKING SLACK CREDENTIALS
            
        jobData.genMsg = "MSG: User: "******" Is Taking Over JobID: " + str(jobData.jobID) + \
                         " From Owner: " + str(jobData.owner)
        errMod.sendMsg(jobData)
        
        # TEMPORARY FOR VERSION 1.2 NWM CALIBRATION!!!!
        # If a new owner takes over, simply change the owner, but keep all 
        # other contact information the same.
        newEmail = jobData.email
        newSlackChannel = jobData.slChan
        newSlackToken = jobData.slToken
        newSlackUName = jobData.slUser
        if not newEmail:
            newEmail = ''
        if not newSlackChannel:
            newSlackChannel = ''
            newSlackToken = ''
            
        try:
            db.updateJobOwner(jobData,userTmp,newEmail,newSlackChannel,newSlackToken,newSlackUName,changeFlag)
        except:
            errMod.errOut(jobData)
            
    # Begin an "infinite" do loop. This loop will continue to loop through all 
    # the basins until sensitivity jobs are complete. Basins are allowed ONE failure. A restart
    # will be attempted. If the restart fails again, a LOCK file is placed into the
    # run directory and an error email is sent to the user.
    completeStatus = False
    
    # Create a "key" array. This array is of length [numBasins] and is initialized to 0.0.
    # Each array element can have the following values based on current model status:
    # 0.0 - Initial value
    # 0.10 - Job to generate parameter grids for each model job is being run.
    # 0.25 - Job to generate parameter grids is complete. Ready to run models.
    # 0.5 - Model simulations are in progress
    # 0.75 - Job to read in model output and run sensitivity analysis is ready to be run.
    # 0.90 - Job to read in model output and run sensitivity analysis is running.
    # 1.0 - Model simulation for this iteration is complete.
    # 2.0 - Post-processing/sensitivity analysis for this basin is complete.
    # -0.1 - Parameter generation failed. A LOCK file has been created.
    # -0.5 - Model simulation failed once and a restart has been attempted
    # -0.90 - Sensitivity analysis job has failed. A LOCK file has been created.
    # -1.0 - Model has failed twice. A LOCK file has been created.
    # Once all array elements reach 2.0 (so keySlot.sum() equals entryValue below), completeStatus
    # goes to True, an entry into the database occurs, and the program will complete.
    keySlot = np.empty([len(jobData.gages),int(jobData.nSensIter)])
    keySlot[:,:] = 0.0
    entryValue = float(len(jobData.gages)*int(jobData.nSensIter)*2.0)
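    # As in the spinup program, a label mapping can make keySlot values easier to log.
    # Illustrative sketch only; sensStatusLabels is not part of the workflow.
    #sensStatusLabels = {0.0: 'NOT STARTED', 0.10: 'PARAM GENERATION RUNNING',
    #                    0.25: 'PARAMS READY', 0.5: 'MODEL RUNNING',
    #                    0.75: 'READY FOR SENSITIVITY ANALYSIS',
    #                    0.90: 'SENSITIVITY ANALYSIS RUNNING',
    #                    1.0: 'MODEL ITERATION COMPLETE',
    #                    2.0: 'SENSITIVITY ANALYSIS COMPLETE',
    #                    -0.1: 'PARAM GENERATION LOCKED', -0.5: 'MODEL RESTARTED',
    #                    -0.90: 'SENSITIVITY ANALYSIS LOCKED', -1.0: 'MODEL LOCKED'}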
    
    # Create an array to hold system job ID values. This will only be used for
    # PBS as qstat has demonstrated slow behavior when doing a full qstat command. 
    # We will track job ID values and do a qstat <jobID> and populate this array
    # to keep track of things. 
    pbsJobId = np.empty([len(jobData.gages),int(jobData.nSensIter)],np.int64)
    pbsJobId[:,:] = -9999
    pbsCollectId = np.empty([len(jobData.gages),int(jobData.nSensIter)],np.int64)
    pbsCollectId[:,:] = -9999
    pbsPreId = np.empty([len(jobData.gages)],np.int64)
    pbsPreId[:] = -9999
    pbsPostId = np.empty([len(jobData.gages)],np.int64)
    pbsPostId[:] = -9999
    
    # Pull all the status values into the keySlot array. 
    for basin in range(0,len(jobData.gages)):
        domainID = jobData.gageIDs[basin]
            
        if domainID == -9999:
            jobData.errMsg = "ERROR: Unable to locate domainID for gage: " + str(jobData.gages[basin])
            errMod.errOut(jobData)
            
        # We are going to pull all values for one basin, then place them into the array.
        # This is faster than looping over each iteration at a time.
        statusData = db.sensIterationStatus(jobData,domainID,str(jobData.gages[basin]))
        statusData = [list(item) for item in statusData]
        for iteration in range(0,int(jobData.nSensIter)):
            for iteration2 in range(0,int(jobData.nSensIter)):
                if statusData[iteration2][0] == iteration+1:
                    keySlot[basin,iteration] = float(statusData[iteration2][1])
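        # The nested loop above is O(nSensIter^2) per basin. Since iteration numbers are
        # unique, a dictionary keyed on iteration number would do the same fill in one
        # pass. Illustrative sketch only, assuming the same statusData layout:
        #statusLookup = {int(row[0]): float(row[1]) for row in statusData}
        #for iteration in range(0,int(jobData.nSensIter)):
        #    keySlot[basin,iteration] = statusLookup.get(iteration+1,0.0)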
            
    if len(np.where(keySlot != 0.0)[0]) == 0:
        # We need to either check to see if pre-processing has taken place, or
        # run it.
        preProcStatus = False
        
    while not completeStatus:
        # Walk through each basin undergoing sensitivity analysis. 
        for basin in range(0,len(jobData.gages)):
            print "GAGE: " + jobData.gages[basin]
            # Establish a status value for pre-processing the parameter values from R/Python code. 
            preProcStatus = False 
    
            # Establish a status value for post-processing the model output and running sensitivity
            # analysis.
            postProcStatus = False
            
            # Calculate the number of "batches" we are going to run
            nBatches = int(jobData.nSensIter/jobData.nSensBatch)
            entryValueBatch = float(jobData.nSensBatch)
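            # For example (hypothetical numbers), nSensIter = 100 with nSensBatch = 20 gives
            # nBatches = 5, and a batch is considered finished once its keySlot values sum
            # to entryValueBatch = 20.0.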
            
            # If we have a pre-processing complete file, set our pre-proc status to True. 
            # Also, log parameter values generated if the log file hasn't been created. 
            preProcComplete = jobData.jobDir + "/" + jobData.gages[basin] + "/RUN.SENSITIVITY/preProc.COMPLETE"
            parmsLogged =  jobData.jobDir + "/" + jobData.gages[basin] + "/RUN.SENSITIVITY/PARAMS_LOGGED.COMPLETE"
            parmTxtFile = jobData.jobDir + "/" + jobData.gages[basin] + "/RUN.SENSITIVITY/params_new.txt"
            sensLogged = jobData.jobDir + "/" + jobData.gages[basin] + "/RUN.SENSITIVITY/SENS_LOGGED.COMPLETE"
            sensStats = jobData.jobDir + "/" + jobData.gages[basin] + "/RUN.SENSITIVITY/stat_sensitivity.txt"
            missingFlag = jobData.jobDir + "/" + jobData.gages[basin] + "/RUN.SENSITIVITY/CALC_STATS_MISSING"
            if os.path.isfile(preProcComplete):
                preProcStatus = True
                print "PRE PROCESSING COMPLETE!"
                if not os.path.isfile(parmsLogged):
                    # Log parameter values generated by pre-processing.
                    print "LOGGING PRE-PROC PARAM FILES."
                    try:
                        db.insertSensParms(jobData,parmsLogged,parmTxtFile,jobData.gageIDs[basin])
                    except:
                        jobData.errMsg = ("WARNING: Unable to log sensitivity parameters for basin: " + str(basin) + \
                                          " Job: " + str(jobData.jobID))
                        errMod.errOut(jobData)
            if not preProcStatus:
                try:
                    sensitivityMod.preProc(preProcStatus,jobData,staticData,db,jobData.gageIDs[basin],jobData.gages[basin],pbsPreId,basin)
                except:
                    errMod.errOut(jobData)
            else:
                # The goal here is to only operate on a fixed number of model runs at a time.
                # If you have a large parameter sample size, it's possible to have hundreds,
                # if not thousands of model permutations. This workflow allows
                # only batches of model runs to be run at a time so as to not bog down the system.
                for batchIter in range(0,nBatches):
                    time.sleep(30)
                    batchCheck = keySlot[basin,(batchIter*jobData.nSensBatch):((batchIter+1)*jobData.nSensBatch)]
                    if batchIter == 0:
                        batchCheckPrev = entryValueBatch
                    else:
                        batchCheckPrev = keySlot[basin,((batchIter-1)*jobData.nSensBatch):(batchIter*jobData.nSensBatch)]
                        batchCheckPrev = batchCheckPrev.sum()
                    if batchCheck.sum() != entryValueBatch and batchCheckPrev == entryValueBatch:
                        for iterTmp in range(0,jobData.nSensBatch):
                            iteration = batchIter*jobData.nSensBatch + iterTmp
                            keyCheck1 = keySlot[basin,iteration]
                            if keyCheck1 < 1:
                                # This model iteration has not completed. 
                                try:
                                    sensitivityMod.runModel(jobData,staticData,db,jobData.gageIDs[basin],jobData.gages[basin],keySlot,basin,iteration,pbsJobId,pbsCollectId)
                                except:
                                    errMod.errOut(jobData)
                                
                                if keySlot[basin,iteration] == 0.0 and keyCheck1 == 0.5:
                                    # Put some spacing between launching model simulations to keep the
                                    # queue from getting overloaded.
                                    time.sleep(3)
                                    
                                # Update the temporary status array as it will be checked for this batch of model runs.
                                batchCheck[iterTmp] = keySlot[basin,iteration]
                                
            # Run post-processing ONLY when all model simulations are finished.
            if not postProcStatus and preProcStatus and len(np.where(batchCheck != 1.0)[0]) == 0:
                print "READY FOR POST PROCESSING"
                try:
                    sensitivityMod.postProc(postProcStatus,jobData,staticData,db,jobData.gageIDs[basin],jobData.gages[basin],pbsPostId,basin)
                except:
                    errMod.errOut(jobData)
                                
            postProcComplete = jobData.jobDir + "/" + jobData.gages[basin] + "/RUN.SENSITIVITY/postProc.COMPLETE"
            if os.path.isfile(postProcComplete):
                if not os.path.isfile(sensLogged) and not os.path.isfile(missingFlag):
                    # Log sensitivity statistics into the database.
                    if not os.path.isfile(sensStats):
                        jobData.errMsg = "ERROR: Expected to find: " + sensStats + " after post-processing. Not found."
                        errMod.errOut(jobData)
                    else:
                        try:
                            db.logSensStats(jobData,sensStats,jobData.gageIDs[basin],sensLogged)
                        except:
                            errMod.errOut(jobData)
                    # Check for complete flag on logging sensitivity statistics. 
                    if os.path.isfile(sensLogged):
                        postProcStatus = True
                        # Upgrade key status values as necessary
                        for iterTmp in range(0,jobData.nSensIter):
                            keySlot[basin,iterTmp] = 2.0
                elif os.path.isfile(sensLogged):
                    # Post-processing complete and statistics were successfully logged.
                    postProcStatus = True
                    # Upgrade key status values as necessary
                    for iterTmp in range(0,jobData.nSensIter):
                        keySlot[basin,iterTmp] = 2.0
                elif os.path.isfile(missingFlag):
                    # Missing obs were found. We will default to making this basin complete.
                    for iterTmp in range(0,jobData.nSensIter):
                        keySlot[basin,iterTmp] = 2.0
                        
            # TEMPORARY FOR CHEYENNE
            # Check to make sure program hasn't passed a prescribed time limit. If it has,
            # exit gracefully.
            #timeCheckStamp = datetime.datetime.now()
            #programDtCheck = timeCheckStamp - begTimeStamp
            #if programDtCheck.seconds/60.0 > 90.0: 
            #    # 90-minutes)
            #    try:
            #        fileObj = open(lockPath,'a')
            #        fileObj.write('WORKFLOW HAS HIT TIME LIMIT - EXITING....\n')
            #        fileObj.close()
            #    except:
            #        jobData.errMsg = "ERROR: Unable to update workflow LOCK file: " + lockPath
            #        errMod.errOut(jobData)
            
        # Check to see if program requirements have been met.
        if keySlot.sum() == entryValue and postProcStatus:
            jobData.sensComplete = 1
            try:
                db.updateSensStatus(jobData)
            except:
                errMod.errOut(jobData)
            jobData.genMsg = "SENSITIVITY FOR JOB ID: " + str(jobData.jobID) + " COMPLETE."
            errMod.sendMsg(jobData)
            completeStatus = True
            
        # Open the Python LOCK file. Write a blank line to the file and close it.
        # This action will simply modify the file modification time while only adding
        # a blank line.
        try:
            fileObj = open(lockPath,'a')
            fileObj.write('\n')
            fileObj.close()
        except:
            jobData.errMsg = "ERROR: Unable to update workflow LOCK file: " + lockPath
            errMod.errOut(jobData)
            
    # Remove LOCK file
    os.remove(lockPath)
Example #4
0
def main(argv):
    # Parse arguments. User must input a job name and directory.
    parser = argparse.ArgumentParser(description='Utility program to report the position' + \
                                     ' of a calibration job.')
    parser.add_argument('jobID',
                        metavar='jobID',
                        type=str,
                        nargs='+',
                        help='Job ID specific to calibration spinup.')
    parser.add_argument('contactFlag',
                        metavar='ctFlag',
                        type=int,
                        nargs='+',
                        help='1 = send to job contact, 0 = print to screen.')
    parser.add_argument('inDB',
                        metavar='inDB',
                        type=str,
                        nargs='+',
                        help='Required path to SQLite3 DB file.')
    parser.add_argument('--email',
                        nargs='?',
                        help='Optional email to pipe output to.')
    #parser.add_argument('--hostname',type=str,nargs='?',
    #                    help='Optional hostname MySQL DB resides on. Will use localhost if not passed.')
    #parser.add_argument('--pwd',metavar='pwd',type=str,nargs='?',help='Password to the Database.')

    args = parser.parse_args()
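    # Usage sketch (the entry-point file name and email are hypothetical; jobID,
    # contactFlag, and the DB path are the positional arguments defined above):
    #   python jobStatus.py 5 0 /path/to/wrfHydroCalib.db
    #   python jobStatus.py 5 1 /path/to/wrfHydroCalib.db --email user@example.com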

    # If the SQLite3 DB file does not exist, throw an error to the user.
    if not os.path.isfile(args.inDB[0]):
        print "ERROR: Unable to locate DB file: " + args.inDB[0]
        sys.exit(1)

    # Create dictionary of specified status messages.
    msgDict = {
        '-1.0': 'MODEL RUN LOCKED.',
        '-0.75': 'MAIN CALIBRATION PROGRAM LOCKED',
        '-0.5': 'MODEL FAILED ONCE - RUNNING AGAIN',
        '-0.25': 'MODEL FAILED ONCE - WAITING',
        '-0.1': 'CALIBRATION PROGRAM FOR DEFAULT PARAMETERS LOCKED',
        '0.0': 'NOT STARTED',
        '0.25': 'CALIBRATION PROGRAM FOR DEFAULT PARAMETERS RUNNING',
        '0.5': 'MODEL CURRENTLY RUNNING',
        '0.75': 'MODEL COMPLETE READY FOR PARAMETER ESTIMATION',
        '0.9': 'PARAMETER ESTIMATION OCCURRING',
        '1.0': 'MODEL ITERATION COMPLETE'
    }

    # Initialize object to hold status and job information
    jobData = statusMod.statusMeta()
    jobData.jobID = int(args.jobID[0])
    jobData.dbPath = args.inDB[0]

    # Lookup database username/login credentials based on username
    # running program.
    #if not args.pwd:
    #    try:
    #        pwdTmp = getpass.getpass('Enter Database Password: ')
    #        jobData.dbPwd = str(pwdTmp)
    #    except:
    #        print "ERROR: Unable to authenticate credentials for database."
    #        sys.exit(1)
    #else:
    #    jobData.dbPwd = args.pwd
    #jobData.dbUName= 'WH_Calib_rw'
    #jobData.port = 5432

    #if not args.hostname:
    #    # We will assume localhost for Postgres DB
    #    hostTmp = 'localhost'
    #else:
    #    hostTmp = str(args.hostname)
    #jobData.host = hostTmp

    # Establish database connection.
    db = dbMod.Database(jobData)
    try:
        db.connect(jobData)
    except:
        print jobData.errMsg
        sys.exit(1)

    # Extract job data from database
    try:
        db.jobStatus(jobData)
    except:
        print jobData.errMsg
        sys.exit(1)

    # Pull extensive meta-data describing the job from the config file.
    configPath = str(jobData.jobDir) + "/setup.config"
    if not os.path.isfile(configPath):
        print "ERROR: Configuration file: " + configPath + " not found."
        sys.exit(1)
    try:
        staticData = configMod.readConfig(configPath)
    except:
        print "ERROR: Failure to read configuration file: " + configPath
        sys.exit(1)

    # Assign the SQL command from the config file into the jobData structure
    jobData.gSQL = staticData.gSQL

    # Check gages in directory to match what's in the database
    try:
        jobData.checkGages2(db)
    except:
        errMod.errOut(jobData)

    # If an optional email was passed to the program, update the job object to
    # reflect this for information dissemination.
    if args.email:
        jobData.slackObj = None
        jobData.email = str(args.email)

    # Loop through each basin. Determine which iteration we are on, then report the status
    # of the job for this basin.
    msgOut = ''
    iterArray = np.empty([int(jobData.nIter)], np.int)
    completeArray = np.empty([int(jobData.nIter)], np.float)

    meanSum = 0.0
    for basin in range(0, len(jobData.gages)):
        iterArray[:] = 0
        completeArray[:] = 0.0
        keyStatus = 0.0
        keyStatusPrev = 0.0
        domainID = jobData.gageIDs[basin]
        iterComplete = 1
        statusData = db.iterationStatus(jobData, domainID,
                                        str(jobData.gages[basin]))
        for iteration in range(0, int(jobData.nIter)):
            keyStatus = float(statusData[iteration][1])
            iterationTmp = int(statusData[iteration][0])
            iterArray[iteration] = iterationTmp
            completeArray[iteration] = keyStatus

        indComplete = np.where(completeArray == 1)
        indCheck1 = np.where(completeArray != 1.0)
        indCheck2 = np.where(completeArray == 0.0)
        meanSum = meanSum + len(indComplete[0])
        if len(indComplete[0]) == int(jobData.nIter):
            msgOut = msgOut + "BASIN: " + str(jobData.gages[basin]) + \
                            ": CALIBRATION COMPLETE.\n"
        else:
            if len(indCheck2[0]) == int(jobData.nIter):
                msgOut = msgOut + "BASIN: " + str(jobData.gages[basin]) + \
                         " - HAS NOT BEGUN CALIBRATION.\n"
            else:
                iterLastComplete = len(indComplete[0])
                iterCurrent = iterLastComplete + 1
                indCurrent = np.where(iterArray == iterCurrent)
                statusCurrent = completeArray[indCurrent[0][0]]
                ind2 = np.where(completeArray != 0.0) and np.where(
                    completeArray != 1.0)
                iterTmp = iterArray[ind2[0][0]]
                msgOut = msgOut + "BASIN: " + str(jobData.gages[basin]) + \
                         ": " + str(msgDict[str(statusCurrent)]) + \
                         " - ITERATION: " + str(iterCurrent) + "\n"

    print "MEAN COMPLETENESS = " + str(float(meanSum) / len(jobData.gages))
    jobData.genMsg = msgOut
    if int(args.contactFlag[0]) == 0:
        print jobData.genMsg
    else:
        errMod.sendMsg(jobData)
Example #5
0
def main(argv):
    # Parse arguments. User must input a job name.
    parser = argparse.ArgumentParser(description='Main program to start or restart ' + \
             'calibration for WRF-Hydro')
    parser.add_argument('jobID',
                        metavar='jobID',
                        type=str,
                        nargs='+',
                        help='Job ID specific to calibration spinup.')
    parser.add_argument('--optDbPath',
                        type=str,
                        nargs='?',
                        help='Optional alternative path to SQLite DB file.')

    args = parser.parse_args()

    # If the SQLite file does not exist, throw an error.
    if args.optDbPath is not None:
        if not os.path.isfile(args.optDbPath):
            print "ERROR: " + args.optDbPath + " Does Not Exist."
            sys.exit(1)
        else:
            dbPath = args.optDbPath
    else:
        dbPath = topDir + "wrfHydroCalib.db"
        if not os.path.isfile(dbPath):
            print "ERROR: SQLite3 DB file: " + dbPath + " Does Not Exist."
            sys.exit(1)

    # Establish the beginning timestamp for this program.
    begTimeStamp = datetime.datetime.now()

    # Get current user who is running this program.
    userTmp = pwd.getpwuid(os.getuid()).pw_name

    # Initialize object to hold status and job information
    jobData = statusMod.statusMeta()
    jobData.jobID = int(args.jobID[0])
    jobData.dbPath = dbPath

    # Establish database connection.
    db = dbMod.Database(jobData)
    try:
        db.connect(jobData)
    except:
        print jobData.errMsg
        sys.exit(1)

    # Extract job data from database
    try:
        db.jobStatus(jobData)
    except:
        print jobData.errMsg
        sys.exit(1)

    # If the calibration flag is 0, simply exit gracefully as the user specified
    # not to run calibration.
    if jobData.calibFlag != 1:
        print "ERROR: Calibration flag was set to 0 for this workflow."
        sys.exit(1)

    # Establish a LOCK file to secure this Python program and make sure
    # no other instances over-step it. This is mostly designed to deal
    # with nohup processes being kicked off on Yellowstone/Cheyenne/crontabs arbitrarily.
    # Just another check/balance here.
    lockPath = str(jobData.jobDir) + "/PYTHON.LOCK"
    if os.path.isfile(lockPath):
        # Either a job is still running, or was running
        # and was killed.

        print 'LOCK FILE FOUND.'
        # Read in to get PID number
        pidObj = pd.read_csv(lockPath)
        pidCheck = int(pidObj.PID[0])
        if errMod.check_pid(pidCheck):
            print "JOB: " + str(pidCheck) + \
                  " Is still running."
            sys.exit(0)
        else:
            print "JOB: " + str(pidCheck) + \
                  " Has Failed. Removing LOCK " + \
                  " file."
            os.remove(lockPath)
            fileObj = open(lockPath, 'w')
            fileObj.write('\"PID\"\n')
            fileObj.write(str(os.getpid()))
            fileObj.close()
        # TEMPORARY FOR CHEYENNE. Since all cron jobs are launched
        # from an administrative node, we cannot monitor the process at
        # all, which is an inconvenience. So.... we will check the last
        # modified time. If it's more than 30 minutes old, we will assume
        # the process is no longer running and can continue on with calibration.
        #dtRunCheck = datetime.datetime.now() - datetime.datetime.fromtimestamp(os.path.getmtime(lockPath))
        #if dtRunCheck.seconds/60.0 < 15.0:
        #    # We are going to assume a previous process is still running on the system.
        #    # exit gracefully.
        #    print 'ASSUMING PROCESS STILL RUNNING'
        #    sys.exit(0)
        #else:
        #    # We are assuming the process is no longer running on the system. Allow
        #    # the workflow to continue.
        #    print 'ALLOWING WORKFLOW TO CONTINUE. REMOVING LOCK FILE'
        #    os.remove(lockPath)
        #    fileObj = open(lockPath,'w')
        #    fileObj.write('\"PID\"\n')
        #    fileObj.write(str(os.getpid()))
        #    fileObj.close()
    else:
        print 'LOCK FILE NOT FOUND.'
        # Write a LOCK file for this program.
        fileObj = open(lockPath, 'w')
        fileObj.write('\"PID\"\n')
        fileObj.write(str(os.getpid()))
        fileObj.close()

    # Pull extensive meta-data describing the job from the config file.
    configPath = str(jobData.jobDir) + "/setup.config"
    if not os.path.isfile(configPath):
        print "ERROR: Configuration file: " + configPath + " not found."
        sys.exit(1)
    try:
        staticData = configMod.readConfig(configPath)
    except:
        print "ERROR: Failure to read configuration file: " + configPath
        sys.exit(1)

    # Assign the SQL command from the config file into the jobData structure
    jobData.gSQL = staticData.gSQL

    # Check gages in directory to match what's in the database
    try:
        jobData.checkGages2(db)
    except:
        errMod.errOut(jobData)

    # Some housekeeping here. If the calibration is already complete, throw an error.
    # Also ensure the spinup has been entered as complete. This is necessary for the
    # calibration to run.
    # Also, if this is a re-initiation under a different user, require the new
    # user to enter a new contact that will be updated in the database.
    if int(jobData.spinComplete) != 1:
        # Check to see if optional spinup options were enabled. If so, update the spinup status.
        if staticData.coldStart == 1 or staticData.optSpinFlag != 0:
            print "Found optional spinup alternatives"
            jobData.spinComplete = 1
            try:
                db.updateSpinupStatus(jobData)
            except:
                errMod.errOut(jobData)
        else:
            jobData.errMsg = "ERROR: Spinup for job ID: " + str(jobData.jobID) + \
                             " is NOT complete. You must complete the spinup in order" + \
                             " to run calibration."
            errMod.errOut(jobData)

    if int(jobData.calibComplete) == 1:
        jobData.errMsg = "ERROR: Calibration for job ID: " + str(jobData.jobID) + \
                         " has already completed."
        errMod.errOut(jobData)

    if userTmp != jobData.owner:
        print "User: "******" is requesting to takeover jobID: " + \
              str(jobData.jobID) + " from owner: " + str(jobData.owner)
        strTmp = "Please enter new email address. Leave blank if no email " + \
                 "change is desired. NOTE if you leave both email and Slack " + \
                 "information blank, no change in contact will occur. Only " + \
                 "the owner will be modified:"
        newEmail = raw_input(strTmp)
        #strTmp = "Please enter Slack channel:"
        #newSlackChannel = raw_input(strTmp)
        #strTmp = "Please enter Slack token:"
        #newSlackToken = raw_input(strTmp)
        #strTmp = "Please enter Slack user name:"
        #newSlackUName = raw_input(strTmp)
        # V1.2 NOTE!!!!!
        # Given the automation of the workflow on Yellowstone, we are simply
        # keeping contact information the same, but only changing the ownership
        # of the workflow
        changeFlag = 1
        #if len(newSlackChannel) != 0 and len(newSlackToken) == 0:
        #    print "ERROR: You must specify an associated Slacker API token."
        #    sys.exit(1)
        #if len(newSlackChannel) != 0 and len(newSlackUName) == 0:
        #    print "ERROR: You must specify an associated Slacker user name."
        #    sys.exit(1)
        #if len(newSlackToken) != 0 and len(newSlackChannel) == 0:
        #    print "ERROR: You must specify an associated Slacker channel name."
        #    sys.exit(1)
        #if len(newSlackToken) != 0 and len(newSlackUName) == 0:
        #    print "ERROR: You must specify an associated Slacker user name."
        #    sys.exit(1)
        #if len(newSlackUName) != 0 and len(newSlackChannel) == 0:
        #    print "ERROR: You must specify an associated Slacker channel name."
        #    sys.exit(1)
        #if len(newSlackUName) != 0 and len(newSlackToken) == 0:
        #    print "ERROR: You must specify an associated Slacker API token."
        #    sys.exit(1)
        #if len(newSlackChannel) != 0 and len(newEmail) != 0:
        #    print "ERROR: You cannot specify both email and Slack for notifications."
        #    sys.exit(1)
        #if len(newSlackChannel) == 0 and len(newEmail) == 0:
        #    changeFlag = 0

        # PLACEHOLDER FOR CHECKING SLACK CREDENTIALS

        jobData.genMsg = "MSG: User: "******" Is Taking Over JobID: " + str(jobData.jobID) + \
                         " From Owner: " + str(jobData.owner)
        errMod.sendMsg(jobData)

        # If a new owner takes over, simply change the owner, but keep all
        # other contact information the same.
        newEmail = jobData.email
        newSlackChannel = jobData.slChan
        newSlackToken = jobData.slToken
        newSlackUName = jobData.slUser
        if not newEmail:
            newEmail = ''
        if not newSlackChannel:
            newSlackChannel = ''
            newSlackToken = ''

        try:
            db.updateJobOwner(jobData, userTmp, newEmail, newSlackChannel,
                              newSlackToken, newSlackUName, changeFlag)
        except:
            errMod.errOut(jobData)

    # Begin an "infinite" do loop. This loop will continue to loop through all
    # the basins until calibrations are complete. Basins are allowed ONE failure. A restart
    # will be attempted. If the restart fails again, a LOCK file is placed into the
    # run directory and an error email is sent to the user. Additionally, if the R calibration
    # code fails, a separate LOCK file will be placed into the directory, and the user
    # will be notified about the failure.
    completeStatus = False

    # Create a "key" array. This array is of length [numBasins] and is initialized to 0.0.
    # Each array element can have the following values based on current model status:
    # 0.0 - Initial value
    # 0.25 - This is a special value for the first iteration. The initial default
    #        parameter values specified in the parameter table by the user are being
    #        applied and entered into the DB.
    # 0.5 - Model simulation in progress
    # 0.75 - The model simulation has completed. We are ready to run the R code to
    #        generate the next set of parameter values and enter evaluation statistics
    #        into the DB.
    # 0.90 - The R code is running to generate new parameter estimates. Python is
    #        also generating new files.
    # 1.0 - R/Python code is complete and param/stats have been entered into the DB. Ready to
    #       run the next model iteration.
    # -0.1 - The R code to generate the initial parameter values has failed. CALIB.LOCK
    #        is put into place.
    # -0.25 - The workflow has found the model simulation to have failed.
    # -0.5 - Model simulation failed once and a restart is being attempted.
    # -0.75 - The R/Python code to generate new parameters/stats has failed. CALIB.LOCK
    #         is put into place.
    # -1.0 - Model has failed twice. A RUN.LOCK file has been created.
    # Once all array elements are 1.0, then completeStatus goes to True, an entry into
    # the database occurs, and the program will complete.
    keySlot = np.empty([len(jobData.gages), int(jobData.nIter)])
    keySlot[:, :] = 0.0
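
    # Illustrative sketch only (an addition for readability, not part of the
    # original workflow): the status codes documented above can also be kept
    # in a lookup table so that log messages can print a human-readable label
    # instead of a raw float. The variable name below is an assumption.
    keyStatusLegend = {0.0: "initial", 0.25: "default params -> DB",
                       0.5: "model running", 0.75: "model complete",
                       0.9: "R/Python running", 1.0: "iteration complete",
                       -0.1: "initial R code failed (CALIB.LOCK)",
                       -0.25: "model failure detected",
                       -0.5: "restarting after one failure",
                       -0.75: "R/Python step failed (CALIB.LOCK)",
                       -1.0: "model failed twice (RUN.LOCK)"}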

    # Create an array to hold system job ID values. This will only be used for
    # PBS, as a full qstat command has demonstrated slow behavior. We instead
    # track job ID values, run a targeted qstat <jobID>, and populate this array
    # to keep track of things.
    pbsJobId = np.empty([len(jobData.gages)], np.int64)
    pbsJobId[:] = -9999
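
    # Illustrative sketch (an assumption, not code used by the workflow): a
    # targeted status check for a single tracked PBS job could look roughly
    # like this (assuming the subprocess module is imported), and is far
    # cheaper than listing the entire queue.
    #
    #     def pbsJobActive(jobIdTmp):
    #         try:
    #             subprocess.check_output(['qstat', str(jobIdTmp)])
    #             return True
    #         except subprocess.CalledProcessError:
    #             # qstat exits non-zero once the job has left the queue.
    #             return False
    #
    # The actual checks are performed by statusMod/calibMod using the
    # pbsJobId array populated above.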

    # NOTE this is different from the spinup. We have a 2D array of values to account
    # for all the iterations.
    entryValue = float(len(jobData.gages) * int(jobData.nIter))
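    # For example (illustrative numbers only): 300 gages with nIter = 100 gives
    # entryValue = 30000.0, which keySlot.sum() must reach before the
    # calibration job is marked complete further below.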

    # Pull all the status values into the keySlot array.
    for basin in range(0, len(jobData.gages)):
        domainID = jobData.gageIDs[basin]

        if domainID == -9999:
            jobData.errMsg = "ERROR: Unable to locate domainID for gage: " + str(
                jobData.gages[basin])
            errMod.errOut(jobData)

        # We are going to pull all values for one basin, then place them into the array.
        # This is faster than looping over each iteration one at a time.
        statusData = db.iterationStatus(jobData, domainID,
                                        str(jobData.gages[basin]))
        statusData = [list(item) for item in statusData]
        for iteration in range(0, int(jobData.nIter)):
            for iteration2 in range(0, int(jobData.nIter)):
                if statusData[iteration2][0] == iteration + 1:
                    keySlot[basin,
                            iteration] = float(statusData[iteration2][1])
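
    # Note (an illustrative alternative, not part of the original code): the
    # nested scan above is O(nIter^2) per basin. A dictionary keyed by the
    # iteration number would avoid the inner loop, e.g.:
    #
    #     statusLookup = {int(row[0]): float(row[1]) for row in statusData}
    #     for iteration in range(0, int(jobData.nIter)):
    #         if iteration + 1 in statusLookup:
    #             keySlot[basin, iteration] = statusLookup[iteration + 1]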

    while not completeStatus:
        # Walk through calibration directories for each basin. Determine the status of
        # the model runs by the files available. If restarting, modify the
        # namelist files appropriately. Then, restart the model. If anything goes wrong, notifications
        # will either be emailed per the user's info, or piped to Slack for group
        # notification. A simulation is deemed complete when all expected RESTART
        # files are present and there are no jobs running for the basin. The parameter
        # estimation is deemed complete when CALIB_ITER.COMPLETE is present and
        # no calibration jobs for this basin are running.
        # Loop through each basin. Perform the following steps:
        # 1.) If status is -0.5,0.0, or 0.5, check to see if the model is running
        #     for this basin or if parameter estimation is occurring.
        # 2.) If the model is not running, check for expected output and perform
        #     necessary logistics. Continue to parameter estimation. Note that
        #     for the first iteration, R needs to be ran before the model to get
        #     initial default parameters.
        # 3.) Once the model is complete, the status goes to 0.75.
        # 4.) Fire off a job to run R/Python code for parameter estimation/generation,
        #     plot generation, and generation of model evaluation statistics to be
        #     entered into the DB. Status goes to 0.90.
        # 5.) Once the calibration job is complete, the status goes to 1.0 and the
        #     workflow is ready for the next iteration.
        # If the status goes to -1.0, a LOCK file is created and must be manually
        # removed by the user. Once the program detects this, it will restart the
        # model and the status goes back to 0.5.
        # If the status goes to -0.75, a LOCK file is created and needs to be removed
        # manually by the user before the workflow can continue.

        for basin in range(0, len(jobData.gages)):
            for iteration in range(0, int(jobData.nIter)):
                # Holding onto the status value before the workflow iterates for checking below.
                keyStatusCheck1 = keySlot[basin, iteration]
                # If the status is already 1.0, then continue the loop as no work needs to be done.
                if keyStatusCheck1 == 1.0:
                    continue
                else:
                    try:
                        calibMod.runModel(jobData, staticData, db,
                                          jobData.gageIDs[basin],
                                          jobData.gages[basin], keySlot, basin,
                                          iteration, pbsJobId)
                    except:
                        errMod.errOut(jobData)
                # Temporary measure for Cheyenne to reduce the strain on PBS.
                keyStatusCheck2 = keySlot[basin, iteration]
                # Put some spacing between launching model simulations to keep the queue
                # from getting overloaded.
                throttlePairs = [(0.25, 0.5), (0.0, 0.5), (0.5, 0.5),
                                 (0.0, 0.25), (0.5, 0.75), (0.75, 0.9),
                                 (0.5, 0.9), (0.9, 0.9), (0.9, 1.0)]
                if (keyStatusCheck1, keyStatusCheck2) in throttlePairs:
                    time.sleep(15)

                # TEMPORARY FOR CHEYENNE
                # Check to make sure program hasn't passed a prescribed time limit. If it has,
                # exit gracefully.
                #timeCheckStamp = datetime.datetime.now()
                #programDtCheck = timeCheckStamp - begTimeStamp
                #if programDtCheck.seconds/60.0 > 90.0:
                #    # Exceeded the 90-minute limit.
                #    try:
                #        fileObj = open(lockPath,'a')
                #        fileObj.write('WORKFLOW HAS HIT TIME LIMIT - EXITING....\n')
                #        fileObj.close()
                #    except:
                #        jobData.errMsg = "ERROR: Unable to update workflow LOCK file: " + lockPath
                #        errMod.errOut(jobData)

        # Check to see if program requirements have been met.
        if keySlot.sum() == entryValue:
            jobData.calibComplete = 1
            try:
                db.updateCalibStatus(jobData)
            except:
                errMod.errOut(jobData)
            jobData.genMsg = "CALIBRATION FOR JOB ID: " + str(
                jobData.jobID) + " COMPLETE."
            errMod.sendMsg(jobData)
            completeStatus = True

        # Open the Python LOCK file. Write a blank line to the file and close it.
        # This action will simply modify the file modification time while only adding
        # a blank line.
        try:
            fileObj = open(lockPath, 'a')
            fileObj.write('\n')
            fileObj.close()
        except:
            jobData.errMsg = "ERROR: Unable to update workflow LOCK file: " + lockPath
            errMod.errOut(jobData)

    # Remove LOCK file
    os.remove(lockPath)
Example #6
0
def runModel(statusData, staticData, db, gageID, gage, keySlot, basinNum,
             pbsJobId):
    """
    Generic function for running the model. Some basic information about
    the run directory, beginning and ending dates, account keys,
    number of cores to use, etc. will be used to compose a BSUB
    submission script, a QSUB script, or an mpiexec/mpirun command.
    This function will walk the run directory
    to determine where the model left off. If no restart files exist,
    then the function will assume the model has not run at all. Both
    the LSM and hydro restart files must be present in order for the
    model to restart.
    """
    runDir = statusData.jobDir + "/" + gage + "/RUN.SPINUP/OUTPUT"
    workDir = statusData.jobDir + "/" + gage + "/RUN.SPINUP"
    if not os.path.isdir(workDir):
        statusData.errMsg = "ERROR: " + workDir + " not found."
        raise Exception()
    if not os.path.isdir(runDir):
        statusData.errMsg = "ERROR: " + runDir + " not found."
        raise Exception()

    # Pull gage metadata
    gageMeta = calibIoMod.gageMeta()
    try:
        gageMeta.pullGageMeta(statusData, db, gage, gageID)
    except:
        raise

    if statusData.jobRunType == 1:
        # If BSUB run script doesn't exist, create it here.
        bsubFile = runDir + "/run_WH.sh"
        if not os.path.isfile(bsubFile):
            try:
                generateBsubScript(statusData, int(gageID), runDir, gageMeta)
            except:
                raise
    if statusData.jobRunType == 2:
        pbsFile = runDir + "/run_WH.sh"
        if not os.path.isfile(pbsFile):
            try:
                generatePbsScript(statusData, int(gageID), runDir, gageMeta)
            except:
                raise
    if statusData.jobRunType == 3 or statusData.jobRunType == 6:
        slurmFile = runDir + "/run_WH.sh"
        if not os.path.isfile(slurmFile):
            try:
                generateSlurmScript(statusData, int(gageID), runDir, gageMeta)
            except:
                raise
    if statusData.jobRunType == 4 or statusData.jobRunType == 5:
        # If run script doesn't exist, create it here.
        runScript = runDir + "/run_WH.sh"
        if not os.path.isfile(runScript):
            try:
                generateMpiScript(statusData, int(gageID), runDir, gageMeta)
            except:
                raise

    # Calculate datetime objects
    begDate = statusData.bSpinDate
    endDate = statusData.eSpinDate

    # Initialize status
    keyStatus = keySlot[basinNum]

    try:
        basinStatus = statusMod.checkBasJob(statusData, basinNum, pbsJobId)
    except:
        raise

    # Create path to LOCK file if needed
    lockPath = workDir + "/RUN.LOCK"

    # If the LOCK file is present, report this and lock things up.
    if os.path.isfile(lockPath):
        keySlot[basinNum] = -1.0
        keyStatus = -1.0
        runFlag = False
        print "MODEL IS LOCKED"
        #statusData.genMsg = "ERROR: Basin ID: " + str(gageID) + " Is locked. " + \
        #                    "Please remove: " + lockPath + " before continuing."
        #errMod.sendMsg(statusData)

    if keyStatus == 1.0:
        # Model has already completed
        runFlag = False
        return

    # For uncompleted simulations that are still listed as running.
    if keyStatus == 0.5:
        # If a model is running for this basin, continue and set keyStatus to 0.5
        if basinStatus:
            keySlot[basinNum] = 0.5
            keyStatus = 0.5
            runFlag = False
        else:
            # Either simulation has completed, or potentially crashed.
            runStatus = statusMod.walkMod(begDate, endDate, runDir)
            begDate = runStatus[0]
            endDate = runStatus[1]
            runFlag = runStatus[2]
            if runFlag:
                # Model crashed as simulation is not complete but no processes are running.
                statusData.genMsg = "WARNING: Simulation for gage: " + statusData.gages[basinNum] + \
                                    " Failed. Attempting to restart."
                print statusData.genMsg
                keySlot[basinNum] = -0.25
                keyStatus = -0.25
            else:
                # Model has completed!
                keySlot[basinNum] = 1.0
                keyStatus = 1.0
                runFlag = False

    # For simulations that are fresh
    if keyStatus == 0.0:
        if basinStatus:
            # Model is still running from previous instance of workflow. Allow it to continue.
            keySlot[basinNum] = 0.5
            keyStatus = 0.5
            runFlag = False
        else:
            runStatus = statusMod.walkMod(begDate, endDate, runDir)
            begDate = runStatus[0]
            endDate = runStatus[1]
            runFlag = runStatus[2]
            if not runFlag:
                # Model simulation completed before workflow was restarted
                keySlot[basinNum] = 1.0
                keyStatus = 1.0
                runFlag = False

    # For when the model failed TWICE and is locked.
    if keyStatus == -1.0:
        # If LOCK file exists, no simulation will take place. File must be removed
        # manually by user.
        if os.path.isfile(lockPath):
            runFlag = False
        else:
            # LOCK file was removed, upgrade status to 0.0 temporarily
            runStatus = statusMod.walkMod(begDate, endDate, runDir)
            begDate = runStatus[0]
            endDate = runStatus[1]
            runFlag = runStatus[2]
            if runFlag:
                keySlot[basinNum] = 0.0
                keyStatus = 0.0
            else:
                # Model successfully completed.
                keySlot[basinNum] = 1.0
                keyStatus = 1.0
                runFlag = False

    # For when the model crashed ONCE
    if keyStatus == -0.5:
        if basinStatus:
            # Model is running again, upgrade status
            # PLACEHOLDER FOR MORE ROBUST METHOD HERE.
            keySlot[basinNum] = 0.5
            keyStatus = 0.5
            runFlag = False
        else:
            runStatus = statusMod.walkMod(begDate, endDate, runDir)
            begDate = runStatus[0]
            endDate = runStatus[1]
            runFlag = runStatus[2]
            if runFlag:
                # Model has crashed again, time to lock it up and send a message out.
                statusData.genMsg = "ERROR: SIMULATION FOR GAGE: " + statusData.gages[basinNum] + \
                                    " HAS FAILED A SECOND TIME. PLEASE FIX ISSUE AND " + \
                                    "MANUALLY REMOVE LOCK FILE: " + lockPath
                errMod.sendMsg(statusData)
                open(lockPath, 'a').close()
                keySlot[basinNum] = -1.0
                keyStatus = -1.0
                runFlag = False
            else:
                # Model successfully completed after the first failed attempt.
                keySlot[basinNum] = 1.0
                keyStatus = 1.0

    if keyStatus == -0.25 and runFlag:
        # Restarting model from one crash
        # First delete namelist files if they exist.
        check = runDir + "/namelist.hrldas"
        check2 = runDir + "/hydro.namelist"
        if os.path.isfile(check):
            os.remove(check)
        if os.path.isfile(check2):
            os.remove(check2)

        if begDate == staticData.bSpinDate:
            # Always cold start the model at the beginning of the spinup period.
            startType = 1
        else:
            # Otherwise, the model HAS to have a restart file to start from.
            startType = 2
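
        # Note (an assumption about the generated namelists, for illustration
        # only): startType == 1 presumably leaves the restart entries unset
        # (e.g. RESTART_FILENAME_REQUESTED in namelist.hrldas, RESTART_FILE in
        # hydro.namelist) for a cold start, while startType == 2 points them at
        # the restart files matching begDate.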

        try:
            namelistMod.createHrldasNL(statusData, gageMeta, staticData,
                                       runDir, startType, begDate, endDate, 0)
            namelistMod.createHydroNL(statusData, gageMeta, staticData, runDir,
                                      startType, begDate, endDate, 0)
        except:
            raise

        if startType == 2:
            # Clean run directory of any old diagnostics files
            try:
                errMod.cleanRunDir(statusData, runDir)
            except:
                raise

        # Fire off model.
        if statusData.jobRunType == 1:
            cmd = "bsub < " + runDir + "/run_WH.sh"
        if statusData.jobRunType == 2:
            try:
                jobTmp = subprocess.check_output(
                    ['qsub', runDir + '/run_WH.sh'])
                pbsJobId[basinNum] = int(jobTmp.split('.')[0])
            except:
                statusData.errMsg = "ERROR: Unable to launch WRF-Hydro job for gage: " + str(
                    gageMeta.gage[basinNum])
                raise
        if statusData.jobRunType == 3 or statusData.jobRunType == 6:
            cmd = "sbatch " + runDir + "/run_WH.sh"
        if statusData.jobRunType == 4 or statusData.jobRunType == 5:
            cmd = runDir + "/run_WH.sh 1>" + runDir + "/WH_" + \
                  str(statusData.jobID) + "_" + str(gageID) + ".out" + \
                  ' 2>' + runDir + "/WH_" + str(statusData.jobID) + "_" + str(gageID) + ".err"
        try:
            if statusData.jobRunType == 1 or statusData.jobRunType == 3 or statusData.jobRunType == 6:
                subprocess.call(cmd, shell=True)
            if statusData.jobRunType == 4 or statusData.jobRunType == 5:
                p = subprocess.Popen([cmd], shell=True)
        except:
            statusData.errMsg = "ERROR: Unable to launch WRF-Hydro job for gage: " + str(
                gageMeta.gage[basinNum])
            raise

        # Revert statuses to -0.5 for next loop to convey the model crashed once.
        keyStatus = -0.5
        keySlot[basinNum] = -0.5

    if keyStatus == 0.0 and runFlag:
        # Model needs to be either ran, or restarted
        # First delete namelist files if they exist.
        check = runDir + "/namelist.hrldas"
        check2 = runDir + "/hydro.namelist"
        if os.path.isfile(check):
            os.remove(check)
        if os.path.isfile(check2):
            os.remove(check2)

        if begDate == staticData.bSpinDate:
            # The model always has to start from a cold start at the beginning.
            startType = 1
        else:
            # Otherwise, there HAS to be a restart file to start from.
            startType = 2

        try:
            namelistMod.createHrldasNL(statusData, gageMeta, staticData,
                                       runDir, startType, begDate, endDate, 0)
            namelistMod.createHydroNL(statusData, gageMeta, staticData, runDir,
                                      startType, begDate, endDate, 0)
        except:
            raise

        if startType == 2:
            # Clean run directory of any old diagnostics files
            try:
                errMod.cleanRunDir(statusData, runDir)
            except:
                raise

        # Fire off model.
        if statusData.jobRunType == 1:
            cmd = "bsub < " + runDir + "/run_WH.sh"
        if statusData.jobRunType == 2:
            try:
                jobTmp = subprocess.check_output(
                    ['qsub', runDir + '/run_WH.sh'])
                pbsJobId[basinNum] = int(jobTmp.split('.')[0])
            except:
                statusData.errMsg = "ERROR: Unable to launch WRF-Hydro job for gage: " + str(
                    gageMeta.gage[basinNum])
                raise
        if statusData.jobRunType == 3 or statusData.jobRunType == 6:
            cmd = "sbatch " + runDir + "/run_WH.sh"
        if statusData.jobRunType == 4 or statusData.jobRunType == 5:
            cmd = runDir + "/run_WH.sh 1>" + runDir + "/WH_" + \
                  str(statusData.jobID) + "_" + str(gageID) + ".out" + \
                  ' 2>' + runDir + "/WH_" + str(statusData.jobID) + "_" + str(gageID) + ".err"
        try:
            if statusData.jobRunType == 1 or statusData.jobRunType == 3 or statusData.jobRunType == 6:
                subprocess.call(cmd, shell=True)
            if statusData.jobRunType == 4 or statusData.jobRunType == 5:
                p = subprocess.Popen([cmd], shell=True)
        except:
            statusData.errMsg = "ERROR: Unable to launch WRF-Hydro job for gage: " + str(
                gageMeta.gage[basinNum])
            raise

        keyStatus = 0.5
        keySlot[basinNum] = 0.5