示例#1
0
        # NOTE(review): fragment begins mid-function; the 'if' guarding this
        # line is outside the visible region.
        jobspec.update({'preemptable':True})
    # Record working directory and the command line (program + its arguments).
    jobspec.update({'cwd':opts['cwd'], 'command':command[0], 'args':command[1:]})

    # Validate and attach job dependencies, if any were requested.
    if opts['dependencies']:
        Cobalt.Util.check_dependencies(opts['dependencies'])
        jobspec['all_dependencies'] = opts['dependencies']
    # Load colon-separated submission filters from the [cqm] config section;
    # a missing option simply means no filters are applied.
    try:
        filters = CP.get('cqm', 'filters').split(':')
    except ConfigParser.NoOptionError:
        filters = []
    for filt in filters:
        Cobalt.Util.processfilter(filt, jobspec)

    # Hand the finished jobspec to the queue-manager component.
    try:
        cqm = ComponentProxy("queue-manager", defer=False)
        job = cqm.add_jobs([jobspec])
    except ComponentLookupError:
        print >> sys.stderr, "Failed to connect to queue manager"
        sys.exit(1)
    except xmlrpclib.Fault, flt:
        # A QueueError fault carries a user-facing message; report it verbatim.
        if flt.faultCode == QueueError.fault_code:
            logger.error(flt.faultString)
            sys.exit(1)
        else:
            # Unexpected fault: dump both codes to aid debugging.
            logger.error("Job submission failed")
            print repr(flt.faultCode)
            print repr(QueueError.fault_code)
            logger.error(flt)
            sys.exit(1)
    except:
        # NOTE(review): bare except swallows every other error without
        # exiting, unlike the branches above — confirm this is intentional.
        logger.error("Error submitting job")
示例#2
0
        # NOTE(review): fragment begins mid-'if'; presumably a command-line
        # flag lowers the log level to 10 (DEBUG) here — confirm upstream.
        level = 10
    # NOTE(review): 'level' computed above is ignored — level=0 is hard-coded
    # here; likely should be level=level. Confirm before changing.
    Cobalt.Logging.setup_logging('cmd', to_syslog=False, level=0)
    user = pwd.getpwuid(os.getuid())[0]
    try:
        pm = ComponentProxy("process-manager", defer=False)
    except ComponentLookupError:
        print >> sys.stderr, "Failed to connect to process manager"
        sys.exit(1)

    # Submit a single canned process group running the test script.
    r = pm.add_jobs([{
        'tag': 'process-group',
        'user': user,
        'args': [],
        'env': {},
        'executable': '/tmp/testscript',
        'size': 700,
        'cwd': '/tmp',
        'location': ['ANLR00'],
        'outputfile': '/tmp/test1-output',
        'errorfile': '/tmp/test1-error',
        'id': '*'
    }])
    # Backquotes are Python 2 shorthand for repr().
    print "jobs : " + ` len(r) `
    pgid = r[0]['id']
    # Poll the process manager until the group leaves the 'running' state.
    while True:
        r = pm.get_jobs([{'tag': 'process-group', 'id': pgid, 'state': '*'}])
        state = r[0]['state']
        if state == 'running':
            time.sleep(5)
            continue
        else:
示例#3
0
from Cobalt.Exceptions import ComponentLookupError
import Cobalt.Util

if __name__ == '__main__':
    level = 20
    if '-d' in sys.argv:
        level = 10
    Cobalt.Logging.setup_logging('cmd', to_syslog=False, level=0)
    user = pwd.getpwuid(os.getuid())[0]
    try:
        pm = ComponentProxy("process-manager", defer=False)
    except ComponentLookupError:
        print >> sys.stderr, "Failed to connect to process manager"
        sys.exit(1)

    r = pm.add_jobs([{'tag':'process-group', 'user':user, 'args':[], 'env':{}, 
                                'executable':'/tmp/testscript', 'size':700, 'cwd':'/tmp', 'location':['ANLR00'],
                                'outputfile':'/tmp/test1-output', 'errorfile':'/tmp/test1-error', 'id': '*'}])
    print "jobs : " + `len(r)`
    pgid = r[0]['id']
    while True:
        r = pm.get_jobs([{'tag':'process-group', 'id':pgid, 'state':'*'}])
        state = r[0]['state']
        if state == 'running':
            Cobalt.Util.sleep(5)
            continue
        else:
            break
    print "process group %s has completed" % (pgid)
        
示例#4
0
    # Optionally skip the script pre-boot phase for this job.
    if opts['disable_preboot']:
        jobspec['script_preboot'] = False

    # Validate and attach job dependencies, if any were requested.
    if opts['dependencies']:
        Cobalt.Util.check_dependencies(opts['dependencies'])
        jobspec['all_dependencies'] = opts['dependencies']
    # Load colon-separated submission filters from the [cqm] config section;
    # a missing option simply means no filters are applied.
    try:
        filters = CP.get('cqm', 'filters').split(':')
    except ConfigParser.NoOptionError:
        filters = []
    for filt in filters:
        Cobalt.Util.processfilter(filt, jobspec)

    # Hand the finished jobspec to the queue-manager component.
    try:
        cqm = ComponentProxy("queue-manager", defer=False)
        job = cqm.add_jobs([jobspec])
    except ComponentLookupError:
        print >> sys.stderr, "Failed to connect to queue manager"
        sys.exit(1)
    except xmlrpclib.Fault, flt:
        # A QueueError fault carries a user-facing message; report it verbatim.
        if flt.faultCode == QueueError.fault_code:
            logger.error(flt.faultString)
            sys.exit(1)
        else:
            # Unexpected fault: dump both codes to aid debugging.
            logger.error("Job submission failed")
            print repr(flt.faultCode)
            print repr(QueueError.fault_code)
            logger.error(flt)
            sys.exit(1)
    except:
        # NOTE(review): bare except swallows every other error without
        # exiting, unlike the branches above — confirm this is intentional.
        logger.error("Error submitting job")
示例#5
0
    def _start_test_job(self, cqm, simulator, nodes):
        """Submit a canned /bin/ls job, find it a partition, and start it.

        Returns the jobid of the now-running job; fails the test if the job
        cannot be placed or does not start.
        """
        jobs = cqm.add_jobs(
            [
                {
                    "queue": "default",
                    "mode": "co",
                    "command": "/bin/ls",
                    "outputdir": os.getcwd(),
                    "walltime": 4,
                    "nodes": nodes,
                    "procs": nodes,
                    "args": [],
                    "user": "******",
                    "jobid": "*",
                }
            ]
        )
        assert len(jobs) == 1

        job = jobs[0]
        jobid = job["jobid"]
        job_location_args = [
            {
                "jobid": jobid,
                "nodes": job["nodes"],
                "queue": job["queue"],
                "utility_score": 1,
                "threshold": 1,
                "walltime": job["walltime"],
                "attrs": {},
            }
        ]
        locations = simulator.find_job_location(job_location_args, [])
        # 'in' replaces the deprecated dict.has_key()
        assert jobid in locations

        cqm.run_jobs([{"jobid": jobid}], locations[jobid])

        r = cqm.get_jobs([{"jobid": jobid, "state": "*", "is_active": True}])
        if not r:
            assert False, "the job didn't start"
        return jobid

    def test_something(self):
        """End-to-end exercise of the queue-manager component.

        1. add a queue; 2. verify that adding a job to a nonexistent queue
        faults; 3. register the simulator's partitions; 4. run a job to
        completion; 5. run a second job and kill it before it finishes.
        """
        logging.basicConfig()

        try:
            cqm = ComponentProxy("queue-manager")
        except ComponentLookupError:
            assert False, "failed to connect to queue manager"

        # add a queue
        queues = cqm.add_queues([{"tag": "queue", "name": "default"}])
        assert len(queues) == 1

        # try adding a job to a queue that doesn't exist
        try:
            jobs = cqm.add_jobs([{"tag": "job", "queue": "jonx"}])
        except xmlrpclib.Fault:
            # trying to add a job to a queue that doesn't exist results in an xmlrpc Fault
            pass
        else:
            assert False, "Adding job to non-existent queue should raise xmlrpclib.Fault"

        # get the list of available partitions and add them to the pool of managed partitions
        try:
            simulator = ComponentProxy("system")
        except ComponentLookupError:
            assert False, "failed to connect to simulator"

        for part_name in self.system._partitions:
            partitions = simulator.add_partitions([{"tag": "partition", "name": part_name, "queue": "default"}])
            assert len(partitions) == 1
            partitions = simulator.set_partitions(
                [{"tag": "partition", "name": part_name}], {"functional": True, "scheduled": True}
            )
            assert len(partitions) == 1

        partitions = simulator.get_partitions([{"name": "*", "size": "*", "queue": "*"}])
        assert len(partitions) > 0

        # run a real job to completion:
        # submit, place, start, confirm it's running, then poll until done
        nodes = partitions[0]["size"]
        jobid = self._start_test_job(cqm, simulator, nodes)

        time.sleep(20)

        r = cqm.get_jobs([{"jobid": jobid, "state": "*", "is_active": True}])
        if len(r) != 1:
            assert False, "the job has stopped running prematurely"

        start_time = time.time()
        while True:
            r = cqm.get_jobs([{"jobid": jobid, "state": "*", "is_active": True}])
            if not r:
                break
            if time.time() - start_time > 240:
                assert False, "the job seems to have run overtime"
            time.sleep(5)

        # this time, we'll add a job to the queue, start the job, sleep for a bit
        # and then try to kill the job before it has finished
        nodes = partitions[0]["size"]
        jobid = self._start_test_job(cqm, simulator, nodes)

        time.sleep(20)

        r = cqm.get_jobs([{"jobid": jobid, "is_active": True}])
        if len(r) != 1:
            assert False, "the job has stopped running prematurely"

        cqm.del_jobs([{"jobid": jobid}])

        start_time = time.time()
        while True:
            r = cqm.get_jobs([{"jobid": jobid, "is_active": True, "state": "*"}])
            if not r:
                break
            if time.time() - start_time > 30:
                assert False, "the job didn't die when asked to"
            time.sleep(1)
示例#6
0
    def _start_test_job(self, cqm, simulator, nodes):
        """Submit a canned /bin/ls job, find it a partition, and start it.

        Returns the jobid of the now-running job; fails the test if the job
        cannot be placed or does not start.
        """
        jobs = cqm.add_jobs([{
            'queue': "default",
            'mode': "co",
            'command': "/bin/ls",
            'outputdir': os.getcwd(),
            'walltime': 4,
            'nodes': nodes,
            'procs': nodes,
            'args': [],
            'user': "******",
            'jobid': "*"
        }])
        assert len(jobs) == 1

        job = jobs[0]
        jobid = job['jobid']
        job_location_args = [{
            'jobid': jobid,
            'nodes': job['nodes'],
            'queue': job['queue'],
            'utility_score': 1,
            'threshold': 1,
            'walltime': job['walltime'],
            'attrs': {}
        }]
        locations = simulator.find_job_location(job_location_args, [])
        # 'in' replaces the deprecated dict.has_key()
        assert jobid in locations

        cqm.run_jobs([{'jobid': jobid}], locations[jobid])

        r = cqm.get_jobs([{'jobid': jobid, 'state': "*", 'is_active': True}])
        if not r:
            assert False, "the job didn't start"
        return jobid

    def test_something(self):
        """End-to-end exercise of the queue-manager component.

        1. add a queue; 2. verify that adding a job to a nonexistent queue
        faults; 3. register the simulator's partitions; 4. run a job to
        completion; 5. run a second job and kill it before it finishes.
        """
        logging.basicConfig()

        try:
            cqm = ComponentProxy("queue-manager")
        except ComponentLookupError:
            assert False, "failed to connect to queue manager"

        # add a queue
        queues = cqm.add_queues([{'tag': "queue", 'name': "default"}])
        assert len(queues) == 1

        # try adding a job to a queue that doesn't exist
        try:
            jobs = cqm.add_jobs([{'tag': "job", 'queue': "jonx"}])
        except xmlrpclib.Fault:
            # trying to add a job to a queue that doesn't exist results in an xmlrpc Fault
            pass
        else:
            assert False, "Adding job to non-existent queue should raise xmlrpclib.Fault"

        # get the list of available partitions and add them to the pool of managed partitions
        try:
            simulator = ComponentProxy("system")
        except ComponentLookupError:
            assert False, "failed to connect to simulator"

        for part_name in self.system._partitions:
            partitions = simulator.add_partitions([{
                'tag': "partition",
                'name': part_name,
                'queue': "default"
            }])
            assert len(partitions) == 1
            partitions = simulator.set_partitions([{
                'tag': "partition",
                'name': part_name
            }], {
                'functional': True,
                'scheduled': True
            })
            assert len(partitions) == 1

        partitions = simulator.get_partitions([{
            'name': "*",
            'size': "*",
            'queue': "*"
        }])
        assert len(partitions) > 0

        # run a real job to completion:
        # submit, place, start, confirm it's running, then poll until done
        nodes = partitions[0]['size']
        jobid = self._start_test_job(cqm, simulator, nodes)

        time.sleep(20)

        r = cqm.get_jobs([{'jobid': jobid, 'state': "*", 'is_active': True}])
        if len(r) != 1:
            assert False, "the job has stopped running prematurely"

        start_time = time.time()
        while True:
            r = cqm.get_jobs([{
                'jobid': jobid,
                'state': "*",
                'is_active': True
            }])
            if not r:
                break
            if time.time() - start_time > 240:
                assert False, "the job seems to have run overtime"
            time.sleep(5)

        # this time, we'll add a job to the queue, start the job, sleep for a bit
        # and then try to kill the job before it has finished
        nodes = partitions[0]['size']
        jobid = self._start_test_job(cqm, simulator, nodes)

        time.sleep(20)

        r = cqm.get_jobs([{'jobid': jobid, 'is_active': True}])
        if len(r) != 1:
            assert False, "the job has stopped running prematurely"

        cqm.del_jobs([{'jobid': jobid}])

        start_time = time.time()
        while True:
            r = cqm.get_jobs([{
                'jobid': jobid,
                'is_active': True,
                'state': "*"
            }])
            if not r:
                break
            if time.time() - start_time > 30:
                assert False, "the job didn't die when asked to"
            time.sleep(1)