Example #1
    def test_pmlogextract(self, pmlogextractfn, adjustjobfn):

        pmlogextractfn.return_value = -10

        summarizejob(self.mockjob, self.mockconf, {}, [], [], self.mockoutput, self.mocklog, self.options)

        self.verify_errors(ProcessingError.PMLOGEXTRACT_ERROR, 'skipped_pmlogextract_error')
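The mock arguments these test methods receive come from patch decorators that the listing omits. A minimal sketch of that setup, assuming unittest.mock (the dotted patch targets below are hypothetical placeholders, not the project's real module paths):

from unittest.mock import patch

# Stacked @patch decorators inject mocks bottom-up: the decorator closest
# to the function becomes the first mock argument after self.
@patch('supremm.summarize_jobs.adjust_job')              # hypothetical -> adjustjobfn
@patch('supremm.summarize_jobs.extract_and_merge_logs')  # hypothetical -> pmlogextractfn
def test_pmlogextract(self, pmlogextractfn, adjustjobfn):
    pmlogextractfn.return_value = -10  # negative status, as in Example #1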
Example #2
    def test_jobtoolong(self, summaryclass, extract):
        extract.return_value = 0

        self.mockjob.configure_mock(walltime=99999999)

        summarizejob(self.mockjob, self.mockconf, {}, [], [], self.mockoutput, self.mocklog, self.options)

        self.verify_errors(ProcessingError.TIME_TOO_LONG, 'skipped_too_long')
Example #3
    def test_invalid_nodecount(self, summaryclass, extract):
        extract.return_value = 0

        self.mockjob.configure_mock(nodecount=0)

        summarizejob(self.mockjob, self.mockconf, {}, [], [], self.mockoutput, self.mocklog, self.options)

        self.verify_errors(ProcessingError.INVALID_NODECOUNT, 'skipped_invalid_nodecount')
Example #4
    def test_parallel_too_short(self, summaryclass, extract):
        extract.return_value = 0

        self.mockjob.configure_mock(walltime=599, nodecount=10)
        self.options['min_parallel_duration'] = 600

        summarizejob(self.mockjob, self.mockconf, {}, [], [], self.mockoutput, self.mocklog, self.options)

        self.verify_errors(ProcessingError.PARALLEL_TOO_SHORT, 'skipped_parallel_too_short')
Example #5
    def test_job_too_short(self, summaryclass, extract):
        extract.return_value = 0

        self.mockjob.configure_mock(walltime=128)
        self.options['min_duration'] = 129

        summarizejob(self.mockjob, self.mockconf, {}, [], [], self.mockoutput, self.mocklog, self.options)

        self.verify_errors(ProcessingError.TIME_TOO_SHORT, 'skipped_too_short')
Example #6
    def test_job_too_many_nodehours(self, summaryclass, extract):
        """ test the too many nodehours error """
        extract.return_value = 0

        self.mockjob.configure_mock(walltime=1000, nodecount=500)
        self.options['max_nodetime'] = 499999

        summarizejob(self.mockjob, self.mockconf, {}, [], [], self.mockoutput, self.mocklog, self.options)

        self.verify_errors(ProcessingError.JOB_TOO_MANY_NODEHOURS, 'skipped_job_nodehours')
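Examples #2 through #6 each trip exactly one of summarizejob's job sanity checks. The real checks live inside summarizejob; the sketch below is a hypothetical reconstruction inferred only from the thresholds the tests set (the import path and the MAX_WALLTIME cutoff are assumptions):

from supremm.errors import ProcessingError  # import path assumed

MAX_WALLTIME = 176400  # hypothetical cutoff; Example #2 only shows 99999999 is too long

def precheck_job(job, opts):
    # Return the first ProcessingError the job triggers, or None if it is summarizable.
    if job.nodecount < 1:
        return ProcessingError.INVALID_NODECOUNT
    if job.walltime < opts['min_duration']:
        return ProcessingError.TIME_TOO_SHORT
    if job.nodecount > 1 and job.walltime < opts['min_parallel_duration']:
        return ProcessingError.PARALLEL_TOO_SHORT
    if job.walltime > MAX_WALLTIME:
        return ProcessingError.TIME_TOO_LONG
    if job.walltime * job.nodecount > opts['max_nodetime']:
        return ProcessingError.JOB_TOO_MANY_NODEHOURS
    return None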
Example #7
    def test_pmlogextractfail2(self, popen, getextractcmdline, adjustjobfn):

        # Simulate pmlogextract dying with an out-of-memory write error on stderr
        popensettings = {
            'communicate.return_value': ("", "__pmLogPutResult2: write failed: returns 491416 expecting 1246712: Cannot allocate memory"),
            'returncode': 1
        }
        popen.return_value = Mock(**popensettings)

        configres = {'getsection.return_value': {'subdir_out_format': '%j', 'archive_out_dir': tempfile.mkdtemp()}}
        self.mockconf.configure_mock(**configres)

        summarizejob(self.mockjob, self.mockconf, self.mockresconf, [], [], self.mockoutput, self.mocklog, self.options)

        self.verify_errors(ProcessingError.PMLOGEXTRACT_ERROR, 'skipped_pmlogextract_error')
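The popensettings dict uses unittest.mock's dotted-key convention: a key such as 'communicate.return_value' configures a child attribute of the Mock in one constructor call. A standalone illustration of the pattern:

from unittest.mock import Mock

fake_proc = Mock(**{
    'communicate.return_value': ("", "__pmLogPutResult2: write failed: returns 491416 expecting 1246712: Cannot allocate memory"),
    'returncode': 1,
})
stdout, stderr = fake_proc.communicate()   # returns the configured (stdout, stderr) pair
assert stderr.endswith("Cannot allocate memory") and fake_proc.returncode == 1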
Example #8
    def test_pmlogextractfail1(self, popen, getextractcmdline, adjustjobfn):

        # Simulate pmlogextract exiting non-zero with only a warning on stderr
        popensettings = {
            'communicate.return_value': ("", "pmlogextract: Warning: no qualifying records found."),
            'returncode': 1
        }
        popen.return_value = Mock(**popensettings)

        configres = {'getsection.return_value': {'subdir_out_format': '%j', 'archive_out_dir': tempfile.mkdtemp()}}
        self.mockconf.configure_mock(**configres)

        summarizejob(self.mockjob, self.mockconf, self.mockresconf, [], [], self.mockoutput, self.mocklog, self.options)

        self.verify_errors(ProcessingError.PMLOGEXTRACT_ERROR, 'skipped_pmlogextract_error')
Example #9
    def test_pmlogextractfail0(self, popen, getextractcmdline, adjustjobfn):

        # Simulate pmlogextract failing with a disk-full write error on stderr
        popensettings = {
            'communicate.return_value': ("", "__pmLogPutResult2: write failed: returns 804876 expecting 954704: No space left on device"),
            'returncode': 1
        }
        popen.return_value = Mock(**popensettings)

        configres = {
            'getsection.return_value': {
                'subdir_out_format': '%j',
                'archive_out_dir': tempfile.mkdtemp()
            }
        }
        self.mockconf.configure_mock(**configres)

        _, mdata, _, error = summarizejob(self.mockjob, self.mockconf,
                                          self.mockresconf, [], [],
                                          self.options)

        self.verify_errors(ProcessingError.PMLOGEXTRACT_ERROR,
                           'skipped_pmlogextract_error', error, mdata)
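Note that Example #9 exercises a different summarizejob signature than Examples #1-#8: instead of writing through output and log handles, summarizejob returns a (summary, mdata, success, error) tuple, and verify_errors receives the error and mdata explicitly.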
Example #10
def process_resource(resconf, preprocs, plugins, config, opts):
    with outputter.factory(config, resconf, dry_run=opts["dry_run"]) as m:

        if resconf['batch_system'] == "XDMoD":
            dbif = XDMoDAcct(resconf['resource_id'], config)
        else:
            dbif = DbAcct(resconf['resource_id'], config)

        for job in get_jobs(opts, dbif):
            try:
                summarize_start = time.time()
                res = summarizejob(job, config, resconf, plugins, preprocs,
                                   opts)
                if res is None:
                    continue  # Extract-only mode
                s, mdata, success, s_err = res
                summarize_time = time.time() - summarize_start
                summary_dict = s.get()
            except Exception as e:
                logging.error(
                    "Failure for summarization of job %s %s. Error: %s %s",
                    job.job_id, job.jobdir, str(e), traceback.format_exc())
                clean_jobdir(opts, job)
                if opts["fail_fast"]:
                    raise
                else:
                    continue

            process_summary(m, dbif, opts, job, summarize_time,
                            (summary_dict, mdata, success, s_err))
            clean_jobdir(opts, job)
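The get_jobs helper is not shown in this example. Judging by the mode dispatch in Example #13 below, a sketch of what it presumably does (an assumption, not the project's actual helper):

def get_jobs(opts, dbif):
    # Dispatch mirrors the mode handling in Example #13 (assumed).
    if opts['mode'] == "single":
        return dbif.getbylocaljobid(opts['local_job_id'])
    if opts['mode'] == "timerange":
        return dbif.getbytimerange(opts['start'], opts['end'], opts)
    return dbif.get(None, None)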
Example #11
def process_job(config, dbif, job, m, opts, plugins, preprocs, resconf):
    try:
        summarize_start = time.time()
        summary, mdata, success, summarize_error = summarizejob(job, config, resconf, plugins, preprocs, opts)
        summarize_time = time.time() - summarize_start

        # TODO: change behavior so markasdone only happens if this is successful
        m.process(summary, mdata)

        if not opts['dry_run']:
            dbif.markasdone(job, success, summarize_time, summarize_error)

    except Exception as e:
        logging.error("Failure for job %s %s. Error: %s %s", job.job_id, job.jobdir, str(e), traceback.format_exc())

    finally:
        if opts['dodelete'] and job.jobdir is not None and os.path.exists(job.jobdir):
            # Clean up
            shutil.rmtree(job.jobdir)
Example #12
def do_summarize(args):
    """
    used in a separate process
    """
    job, config, resconf, plugins, preprocs, opts = args
    try:
        summarize_start = time.time()
        res = summarizejob(job, config, resconf, plugins, preprocs, opts)
        if res is None:
            return job, None, None  # Extract-only mode
        s, mdata, success, s_err = res
        summarize_time = time.time() - summarize_start
        # Ensure Summarize.get() is called on worker process since it is cpu-intensive
        summary_dict = s.get()
    except Exception as e:
        logging.error("Failure for summarization of job %s %s. Error: %s %s",
                      job.job_id, job.jobdir, str(e), traceback.format_exc())
        if opts["fail_fast"]:
            raise
        return job, None, None

    return job, (summary_dict, mdata, success, s_err), summarize_time
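The docstring notes that do_summarize runs in a separate process. A minimal sketch of a multiprocessing.Pool driver for it (the driver itself is an assumption, not taken from the project):

from multiprocessing import Pool

def summarize_all(jobs, config, resconf, plugins, preprocs, opts):
    # Fan jobs out to worker processes; yield only jobs that produced a summary.
    argiter = ((job, config, resconf, plugins, preprocs, opts) for job in jobs)
    with Pool(processes=opts['threads']) as pool:
        for job, result, summarize_time in pool.imap_unordered(do_summarize, argiter):
            if result is not None:
                yield job, result, summarize_time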
Example #13
def processjobs(config, opts, procid):
    """ main function that does the work. One run of this function per process """

    allpreprocs = loadpreprocessors()
    logging.debug("Loaded %s preprocessors", len(allpreprocs))

    allplugins = loadplugins()
    logging.debug("Loaded %s plugins", len(allplugins))

    for r, resconf in config.resourceconfigs():
        if opts['resource'] is None or opts['resource'] == r or opts['resource'] == str(resconf['resource_id']):
            logging.info("Processing resource %s", r)
        else:
            continue

        resconf = override_defaults(resconf, opts)

        preprocs, plugins = filter_plugins(resconf, allpreprocs, allplugins)

        logging.debug("Using %s preprocessors", len(preprocs))
        logging.debug("Using %s plugins", len(plugins))

        with outputter.factory(config, resconf) as m:

            if resconf['batch_system'] == "XDMoD":
                dbif = XDMoDAcct(resconf['resource_id'], config,
                                 opts['threads'], procid)
            else:
                dbif = DbAcct(resconf['resource_id'], config, opts['threads'],
                              procid)

            if opts['mode'] == "single":
                for job in dbif.getbylocaljobid(opts['local_job_id']):
                    summarizejob(job, config, resconf, plugins, preprocs, m,
                                 dbif, opts)
            elif opts['mode'] == "timerange":
                for job in dbif.getbytimerange(opts['start'], opts['end'],
                                               opts):
                    summarizejob(job, config, resconf, plugins, preprocs, m,
                                 dbif, opts)
            else:
                for job in dbif.get(None, None):
                    summarizejob(job, config, resconf, plugins, preprocs, m,
                                 dbif, opts)
Example #14
def processjobs(config, opts, procid, comm):
    """ main function that does the work. One run of this function per process """

    allpreprocs = loadpreprocessors()
    logging.debug("Loaded %s preprocessors", len(allpreprocs))

    allplugins = loadplugins()
    logging.debug("Loaded %s plugins", len(allplugins))

    for r, resconf in config.resourceconfigs():
        if opts['resource'] is None or opts['resource'] == r or opts['resource'] == str(resconf['resource_id']):
            logging.info("Processing resource %s", r)
        else:
            continue

        resconf = override_defaults(resconf, opts)

        preprocs, plugins = filter_plugins(resconf, allpreprocs, allplugins)

        logging.debug("Using %s preprocessors", len(preprocs))
        logging.debug("Using %s plugins", len(plugins))

        with outputter.factory(config, resconf) as m:

            if resconf['batch_system'] == "XDMoD":
                dbif = XDMoDAcct(resconf['resource_id'], config, None, None)
            else:
                dbif = DbAcct(resconf['resource_id'], config)

            list_procs = 0

            # Master
            if procid == 0:
                getjobs = {}
                if opts['mode'] == "single":
                    getjobs['cmd'] = dbif.getbylocaljobid
                    getjobs['opts'] = [opts['local_job_id'],]
                elif opts['mode'] == "timerange":
                    getjobs['cmd'] = dbif.getbytimerange
                    getjobs['opts'] = [opts['start'], opts['end'], opts]
                else:
                    getjobs['cmd'] = dbif.get
                    getjobs['opts'] = [None, None]

                logging.debug("MASTER STARTING")
                numworkers = opts['threads']-1
                numsent = 0
                numreceived = 0


                for job in getjobs['cmd'](*(getjobs['opts'])):
                    if numsent >= numworkers:
                        list_procs += 1
                        if opts['dump_proclist'] and (list_procs == 1 or list_procs == 1000):
                            # Once all ranks are going, dump the process list for debugging
                            logging.info("Dumping process list")
                            allpinfo = {}
                            for proc in psutil.process_iter():
                                try:
                                    pinfo = proc.as_dict()
                                except psutil.NoSuchProcess:
                                    pass
                                else:
                                    allpinfo[pinfo['pid']] = pinfo

                            with open("rank-{}_{}.proclist".format(procid, list_procs), 'w') as outfile:
                                json.dump(allpinfo, outfile, indent=2)

                        # Wait for a worker to be done and then send more work
                        process = comm.recv(source=MPI.ANY_SOURCE, tag=1)
                        numreceived += 1
                        comm.send(job, dest=process, tag=1)
                        numsent += 1
                        logging.debug("Sent new job: %d sent, %d received", numsent, numreceived)
                    else:
                        # Initial batch
                        comm.send(job, dest=numsent+1, tag=1)
                        numsent += 1
                        logging.debug("Initial Batch: %d sent, %d received", numsent, numreceived)

                logging.info("After all jobs sent: %d sent, %d received", numsent, numreceived)

                # Get leftover results
                while numsent > numreceived:
                    comm.recv(source=MPI.ANY_SOURCE, tag=1)
                    numreceived += 1
                    logging.debug("Getting leftovers. %d sent, %d received", numsent, numreceived)

                # Shut them down
                for worker in range(numworkers):
                    logging.debug("Shutting down: %d", worker+1)
                    comm.send(None, dest=worker+1, tag=1)

            # Worker
            else:
                sendtime = time.time()
                midtime = time.time()
                recvtime = time.time()
                logging.debug("WORKER %d STARTING", procid)
                while True:
                    recvtries = 0
                    while not comm.Iprobe(source=0, tag=1):
                        if recvtries < 1000:
                            recvtries += 1
                            continue
                        # Sleep so we can instrument how efficient we are
                        # Otherwise, workers spin on exit at the hidden mpi_finalize call.
                        # If you care about maximum performance and don't care about wasted cycles, remove the Iprobe/sleep loop
                        # Empirically, a tight loop with time.sleep(0.001) uses ~1% CPU
                        time.sleep(0.001)
                    job = comm.recv(source=0, tag=1)
                    recvtime = time.time()
                    mpisendtime = midtime-sendtime
                    mpirecvtime = recvtime-midtime
                    if (mpisendtime+mpirecvtime) > 2:
                        logging.warning("MPI send/recv took %s/%s", mpisendtime, mpirecvtime)
                    if job is not None:
                        logging.debug("Rank: %s, Starting: %s", procid, job.job_id)
                        summarizejob(job, config, resconf, plugins, preprocs, m, dbif, opts)
                        logging.debug("Rank: %s, Finished: %s", procid, job.job_id)
                        sendtime = time.time()
                        comm.send(procid, dest=0, tag=1)
                        midtime = time.time()

                        list_procs += 1
                        if opts['dump_proclist'] and (list_procs == 1 or list_procs == 10):
                            # Once all ranks are going, dump the process list for debugging
                            logging.info("Dumping process list")
                            allpinfo = {}
                            for proc in psutil.process_iter():
                                try:
                                    pinfo = proc.as_dict()
                                except psutil.NoSuchProcess:
                                    pass
                                else:
                                    allpinfo[pinfo['pid']] = pinfo

                            with open("rank-{}_{}.proclist".format(procid, list_procs), 'w') as outfile:
                                json.dump(allpinfo, outfile, indent=2)
                    else:
                        # Got shutdown message
                        break
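The master/worker exchange in Example #14 is easier to follow in isolation. The same handshake as a minimal mpi4py sketch (not the project's code; run with at least two ranks, e.g. mpiexec -n 4):

from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
numworkers = comm.Get_size() - 1

if rank == 0:
    work = list(range(10))  # stand-in for the job iterator
    numsent = numreceived = 0
    for item in work:
        if numsent < numworkers:
            comm.send(item, dest=numsent + 1, tag=1)           # initial batch
        else:
            who = comm.recv(source=MPI.ANY_SOURCE, tag=1)      # a worker finished
            numreceived += 1
            comm.send(item, dest=who, tag=1)                   # hand it the next job
        numsent += 1
    while numreceived < numsent:                               # drain leftover acks
        comm.recv(source=MPI.ANY_SOURCE, tag=1)
        numreceived += 1
    for worker in range(1, numworkers + 1):
        comm.send(None, dest=worker, tag=1)                    # shutdown sentinel
else:
    while True:
        item = comm.recv(source=0, tag=1)
        if item is None:
            break                                              # got shutdown message
        comm.send(rank, dest=0, tag=1)                         # "done, send more"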