示例#1
0
def _summary_section(template, items):
    """
    Returns template filled with the number of items and their listing (via
    turbogo_helpers.list_str), or an empty string when items is empty so that
    empty categories are left out of a report.
    """
    if not items:
        return ''
    return template.format(len(items), turbogo_helpers.list_str(items))


def watch_jobs(jobs):
    """
    Monitors submitted jobs until none of them remain in the queue.

    Jobs whose status is 'Opt Submitted' or 'TS Submitted' are watched as
    running optimizations and jobs whose status is 'Freq Submitted' as running
    frequency calculations. Any other status is reported as a failed
    submission. The queue is polled every 10 minutes. A watched job that is no
    longer listed in the queue is handed to check_opt / check_freq, and the
    returned status decides whether the job keeps being watched (under
    job.jobid as read after the check), or is recorded as crashed, stuck or
    completed. write_stats is called for completed and stuck jobs.

    jobs: iterable of job objects exposing status, name and jobid (firstfreq
    is also read when a job is reported as stuck).

    Raises SystemExit when none of the jobs is in a watchable state.
    """
    orunning = list()
    frunning = list()
    ocrashed = list()
    fcrashed = list()
    crashed = list()
    completed = list()
    failed_submit = list()
    stuck = list()

    # jobid -> job object, for every job currently being watched
    jobdict = dict()

    starttime = time()

    for job in jobs:
        if job.status in ('Opt Submitted', 'TS Submitted'):
            orunning.append(job.jobid)
            jobdict[job.jobid] = job
        elif job.status == 'Freq Submitted':
            frunning.append(job.jobid)
            jobdict[job.jobid] = job
        else:
            failed_submit.append(job.name + ' - ' + job.status)

    logging.info('There are {} jobs being watched.'.format(
        len(jobdict)
        ))

    if failed_submit:
        logging.warning('There are {} jobs that failed to launch:\n{}'.format(
            len(failed_submit),
            turbogo_helpers.list_str(failed_submit)
            ))
    if not jobdict:
        # Nothing to watch. Raising SystemExit directly is what the
        # interactive-only exit() helper did, without relying on the site
        # module having injected that name.
        raise SystemExit

    loopcount = 0
    change = False
    # delay to ensure all jobs are in queue, and catch first moment fails
    sleep(60)

    while True:
        alljobs = turbogo_helpers.get_all_active_jobs()
        if len(alljobs) == 0 and (orunning or frunning):
            # possible fail at getting jobs from queue
            sleep(60)
            alljobs = turbogo_helpers.get_all_active_jobs()
            if len(alljobs) == 0:
                # One more try
                sleep(300)
                alljobs = turbogo_helpers.get_all_active_jobs()

        # Start from every watched jobid and strike off those still queued;
        # whatever is left has stopped running since the last poll.
        checkojobs = list(orunning)
        checkfjobs = list(frunning)

        for job in alljobs:
            if job in checkojobs:
                checkojobs.remove(job)
            elif job in checkfjobs:
                checkfjobs.remove(job)

        if checkojobs:
            # Some opt jobs not running
            for ojob in checkojobs:
                job = jobdict.pop(ojob)
                orunning.remove(job.jobid)
                # find out what happened to the job & deal with it
                status = check_opt(job)
                if status == 'freq':
                    # job.jobid is re-read here; presumably check_opt updated
                    # it for the freq submission - confirm against check_opt
                    frunning.append(job.jobid)
                    jobdict[job.jobid] = job
                    logging.debug(
                        "Job {} submitted for freq with jobid {}.".format(
                        job.name, job.jobid
                    ))
                elif status == 'fcrashed':
                    fcrashed.append(job.name)
                    crashed.append(job.name)
                    logging.debug("Job {} crashed starting freq.".format(
                        job.name
                    ))
                elif status == 'ocrashed':
                    ocrashed.append(job.name)
                    crashed.append(job.name)
                    logging.debug("Job {} crashed in opt.".format(
                        job.name
                    ))
                else:
                    completed.append(job.name)
                    write_stats(job)
                    logging.debug("Job {} completed opt.".format(
                        job.name
                    ))
            change = True

        if checkfjobs:
            # some freq not running
            for fjob in checkfjobs:
                job = jobdict.pop(fjob)
                frunning.remove(job.jobid)
                # find out what happened to the job and deal with it
                status = check_freq(job)
                if status == 'opt':
                    # job was resubmitted with new geometry to avoid saddle
                    # point
                    orunning.append(job.jobid)
                    jobdict[job.jobid] = job
                    logging.debug(
                        "Job {} resubmitted for opt with jobid {}.".format(
                        job.name, job.jobid
                    ))
                elif status == 'fcrashed':
                    fcrashed.append(job.name)
                    crashed.append(job.name)
                    logging.debug("Job {} crashed starting freq.".format(
                        job.name
                    ))
                elif status == 'ocrashed':
                    ocrashed.append(job.name)
                    crashed.append(job.name)
                    logging.debug("Job {} crashed restarting opt.".format(
                        job.name
                    ))
                elif status == 'same' or status == 'imaginary':
                    stuck.append(job.name)
                    write_stats(job)
                    logging.info(
                        "Job {} stuck on transition state with freq {}.".format(
                            job.name, job.firstfreq))
                elif status == 'ts':
                    write_stats(job)
                    completed.append(job.name)
                    logging.debug("Job {} completed ts.".format(
                        job.name
                    ))
                else:
                    write_stats(job)
                    completed.append(job.name)
                    logging.debug("Job {} completed freq.".format(
                        job.name
                    ))
            change = True

        if not orunning and not frunning:
            # all jobs finished or crashed
            break

        if loopcount % (3*6) == 0 and change:
            # 3-Hourly status update (18 polls of 10 minutes) if a change
            # happened
            logstring = "\n----------------------------------------------" \
                        "------\n"
            logstring += "At {}:\n".format(strftime("%d/%m/%y %H:%M:%S"))
            logstring += _summary_section(
                "There are {} running opt jobs:\n{}\n", orunning)
            logstring += _summary_section(
                "There are {} running freq jobs:\n{}\n", frunning)
            logstring += _summary_section(
                "There are {} crashed jobs:\n{}\n", crashed)
            logstring += _summary_section(
                "There are {} stuck jobs:\n{}\n", stuck)
            logstring += _summary_section(
                "There are {} completed jobs:\n{}\n", completed)
            logstring += "-----------------------------------------------" \
                         "-----"
            logging.info(logstring)
            change = False
        loopcount += 1
        sleep(10*60)

    # after job finished/crashed logging
    elapsed = turbogo_helpers.time_readable(time()-starttime)

    # Report the jobs actually collected in `completed`; the previous counter
    # list was never filled, so this line always claimed 0 completed jobs.
    logging.warning("{} jobs completed. {} jobs crashed.".format(
        len(completed), len(crashed)))

    logstring = "\n----------------------------------------------------\n"
    logstring += "Completed at {} after {}:\n".format(
        strftime("%d/%m/%y %H:%M:%S"), elapsed)
    logstring += _summary_section(
        "There are {} completed jobs:\n{}\n", completed)
    logstring += _summary_section(
        "There are {} stuck jobs:\n{}\n", stuck)
    logstring += _summary_section(
        "There are {} crashed jobs:\n{}\n", crashed)
    logstring += _summary_section(
        "There are {} jobs that failed to start:\n{}\n", failed_submit)
    logstring += "----------------------------------------------------"
    logging.info(logstring)
示例#2
0
def write_stats(job):
    """
    Appends one formatted line describing job to 'stats.txt' in the current
    working directory, creating the file with a header row first when
    turbogo_helpers.check_files_exist reports it missing.

    job: object exposing name, indir, infile, otime, ftime and firstfreq.
    The step count and energy are read from the 'energy' file inside
    job.indir; when that file cannot be read or parsed both fields are
    written as '?'.

    File errors are logged as warnings instead of being raised.
    """
    if not turbogo_helpers.check_files_exist(['stats.txt']):
        # write the header to the file
        try:
            with open('stats.txt', 'w') as f:
                f.write("{name:^16}{directory:^20}{optsteps:^10}{opttime:^12}" \
                        "{freqtime:^12}{tottime:^12}{firstfreq:^16}{energy:^16}"
                .format(
                    name='Name',
                    directory = 'Directory',
                    optsteps = 'Opt Steps',
                    opttime = 'Opt Time',
                    freqtime = 'Freq Time',
                    tottime = 'Total Time',
                    firstfreq = '1st Frequency',
                    energy = 'Energy',
                ))
                f.write('\n')
        except IOError as e:
            logging.warning("Error preparing stats file: {}".format(e))
        except Exception as e:
            logging.warning("Unknown error {}".format(e))
    name = job.name
    directory = os.path.join(job.indir, job.infile)

    # Placeholders used whenever the energy file cannot be read or parsed, so
    # every failure path below still leaves both fields defined for the final
    # format call.
    optsteps = '?'
    energy = '?'
    try:
        with open(os.path.join(job.indir, 'energy')) as f:
            lines = f.readlines()
        # The values are taken from the second-to-last line (presumably the
        # last line is a terminator - confirm against the energy file format).
        # Indexing inside the try keeps a too-short file from escaping as an
        # uncaught IndexError.
        lastentry = lines[-2]
    except IOError as e:
        logging.warning("Error reading energy file for stats: {}".format(e))
    except IndexError:
        logging.warning(
            "Error reading energy file for stats: file has fewer than 2 lines")
    except Exception as e:
        logging.warning("Unknown error {}.".format(e))
    else:
        # fixed-width columns: first 6 characters, then the next 16
        optsteps = lastentry[:6].strip()
        energy = lastentry[6:22].strip()
    opttime = turbogo_helpers.time_readable(job.otime)
    freqtime = turbogo_helpers.time_readable(job.ftime)
    tottime = turbogo_helpers.time_readable(job.otime + job.ftime)
    firstfreq = job.firstfreq
    try:
        with open('stats.txt', 'a') as f:
            # the .N precision in each field truncates values to the column
            # width so the table stays aligned
            f.write("{name:^16.16}{directory:^20.20}{optsteps:^10.10}" \
                    "{opttime:^12.12}{freqtime:^12.12}{tottime:^12.12}" \
                    "{firstfreq:^16.16}{energy:^16.16}"
            .format(
                name=name,
                directory = directory,
                optsteps = optsteps,
                opttime = opttime,
                freqtime = freqtime,
                tottime = tottime,
                firstfreq = firstfreq,
                energy = energy,
            ))
            f.write('\n')
    except (OSError, IOError) as e:
        logging.warning("Error writing stats file: {}".format(e))