示例#1
0
    def test__pilot_errors(self):
        """Check that pilots with unsatisfiable descriptions end up FAILED.

        Two pilots are submitted one after the other on ``local.localhost``:
        the first uses a sandbox path that does not exist, the second asks
        for an absurd number of cores.  Both are expected to be in state
        ``rp.FAILED`` once ``wait()`` returns.
        """
        session = rp.Session()

        try:
            pmgr = rp.PilotManager(session=session)

            # case 1: the sandbox directory does not exist
            bad_sandbox = rp.ComputePilotDescription()
            bad_sandbox.resource = "local.localhost"
            bad_sandbox.cores = 1
            bad_sandbox.runtime = 1
            bad_sandbox.sandbox = "/non-/existing/directory..."
            bad_sandbox.cleanup = True

            first = pmgr.submit_pilots(descriptions=bad_sandbox)
            first.wait(timeout=300)
            assert first.state == rp.FAILED, \
                "State is '%s' instead of 'Failed'." % first.state

            # case 2: far more cores than any local machine can offer
            too_many_cores = rp.ComputePilotDescription()
            too_many_cores.resource = "local.localhost"
            too_many_cores.cores = 100000000000
            too_many_cores.runtime = 1
            too_many_cores.sandbox = "/tmp/rp.sandbox.unittests"
            too_many_cores.cleanup = True

            second = pmgr.submit_pilots(descriptions=too_many_cores)
            second.wait(timeout=300)
            assert second.state == rp.FAILED, \
                "state should be %s and not %s" % (rp.FAILED, second.state)

        finally:
            # the session is closed no matter how the checks above went
            session.close()
    def test__pilotmanager_wait(self):
        """Test if wait_pilots() waits until all (2) pilots reached 'DONE'.

        Two single-core pilots with runtimes 1 and 2 are submitted to the
        same pilot manager.  After ``wait_pilots()`` returns, every pilot
        must be in state ``rp.DONE`` and carry both a start and a stop time.
        """
        session = rp.Session()

        try:
            pmgr = rp.PilotManager(session=session)

            cpd1 = rp.ComputePilotDescription()
            cpd1.resource = "local.localhost"
            cpd1.cores = 1
            cpd1.runtime = 1
            cpd1.sandbox = "/tmp/rp.sandbox.unittests"
            cpd1.cleanup = True

            cpd2 = rp.ComputePilotDescription()
            cpd2.resource = "local.localhost"
            cpd2.cores = 1
            cpd2.runtime = 2
            cpd2.sandbox = "/tmp/rp.sandbox.unittests"
            cpd2.cleanup = True

            pilots = pmgr.submit_pilots([cpd1, cpd2])

            pmgr.wait_pilots(timeout=300)

            for pilot in pilots:
                assert pilot.state == rp.DONE, \
                    "Expected state 'Done' but state is %s" % pilot.state
                assert pilot.stop_time is not None
                assert pilot.start_time is not None

        finally:
            # close the session even when one of the assertions fails, so a
            # failing test does not leave the session open
            session.close()
示例#3
0
    def test__unitmanager_pilot_assoc(self):
        """ Test if unit manager <-> pilot association works as expected.

        Covers adding a pilot, adding the same pilot twice, removing a pilot
        by uid, and adding/removing several pilots.  ``list_pilots()`` is
        compared against pilot uids throughout.
        """
        session = rp.Session()

        try:
            pm = rp.PilotManager(session=session)

            cpd = rp.ComputePilotDescription()
            cpd.resource = "local.localhost"
            cpd.cores = 1
            cpd.runtime = 1
            cpd.sandbox = "/tmp/rp.sandbox.unittests"
            cpd.cleanup = True

            p1 = pm.submit_pilots(descriptions=cpd)

            um = rp.UnitManager(session=session, scheduler='round_robin')
            assert um.list_pilots() == [], "Wrong list of pilots"

            um.add_pilots(p1)
            assert um.list_pilots() == [p1.uid], "Wrong list of pilots"

            # adding the same pilot twice should be ignored
            um.add_pilots(p1)
            assert um.list_pilots() == [p1.uid], "Wrong list of pilots"

            um.remove_pilots(p1.uid)
            assert um.list_pilots() == [], "Wrong list of pilots"

            pilot_list = []
            for _ in range(0, 2):
                cpd = rp.ComputePilotDescription()
                cpd.resource = "local.localhost"
                cpd.cores = 1
                cpd.runtime = 1
                cpd.sandbox = "/tmp/rp.sandbox.unittests"
                cpd.cleanup = True
                p = pm.submit_pilots(descriptions=cpd)
                um.add_pilots(p)
                pilot_list.append(p)

            pl = um.list_pilots()
            assert len(pl) == 2, "Wrong number of associated pilots"
            for pilot in pilot_list:
                # check the submitted pilot against what the unit manager
                # reports (a list of uids); comparing the pilot against the
                # list it is iterated from could never fail
                assert pilot.uid in pl, "Unknown pilot in list"
                um.remove_pilots(pilot.uid)

            assert um.list_pilots() == [], "Wrong list of pilots"

        finally:
            # release the session also when an assertion fails
            session.close()
示例#4
0
def rp_setup_state(request):
    """Set up a session with one local pilot attached to a unit manager.

    A session is opened against ``db_url``; a pilot manager and a
    direct-submission unit manager (four input and four output transfer
    workers) are created in it.  One single-core pilot with runtime 20 is
    submitted to ``local.localhost``, gets ``pilot_state_cb`` registered and
    is added to the unit manager.  Closing the session is registered as a
    finalizer on *request*.

    Returns the tuple ``(pilot, pmgr, umgr)``.  Any exception raised during
    setup is reported with a short message and re-raised.
    """
    session = rp.Session(database_url=db_url)

    try:
        pmgr = rp.PilotManager(session=session)
        umgr = rp.UnitManager(session=session,
                              scheduler=rp.SCHED_DIRECT_SUBMISSION,
                              output_transfer_workers=4,
                              input_transfer_workers=4)

        descr = rp.ComputePilotDescription()
        descr.resource = "local.localhost"
        descr.runtime = 20
        descr.cores = 1
        descr.cleanup = True

        pilot = pmgr.submit_pilots(descr)
        pilot.register_callback(pilot_state_cb)

        umgr.add_pilots(pilot)

    except Exception:
        print('test failed')
        raise

    def _close_session():
        # teardown: only the session needs to be released here
        print('closing session')
        session.close()

    request.addfinalizer(_close_session)

    return pilot, pmgr, umgr
示例#5
0
    def test_03_multiple_pilots(self):
        """Test multiple pilots.

        One pilot description is built per entry of a hard-coded resource
        list, using the per-resource settings in ``self.config``.  All
        pilots are submitted and added to ``self.umgr``, ``self.n`` units
        running ``/bin/date`` are executed, and the test asserts that every
        unit ended in state ``"DONE"`` and that as many pilots were returned
        as resources were requested.
        """

        # Have to hard-code list of resources
        # TODO: get real list of resources
        resources = ['local.localhost']

        # Create multiple pilot descriptions, one for each resource
        pilot_descriptions = list()
        resource_count = len(resources)
        for resource in resources:
            pd_init = {
                'resource': resource,
                'runtime': 15,  # pilot runtime (min)
                'exit_on_error': True,
                'project': self.config[resource]['project'],
                'queue': self.config[resource]['queue'],
                'access_schema': self.config[resource]['schema'],
                'cores': self.config[resource]['cores'],
            }
            pilot_descriptions.append(rp.ComputePilotDescription(pd_init))

        # Launch the pilots.
        pilots = self.pmgr.submit_pilots(pilot_descriptions)
        pilot_count = len(pilots)

        self.umgr.add_pilots(pilots)

        # Create a workload of ComputeUnits.
        # Each compute unit runs '/bin/date'.
        cuds = list()
        for _ in range(0, self.n):
            # create a new CU description, and fill it.
            # Here we don't use dict initialization.
            cud = rp.ComputeUnitDescription()
            cud.executable = '/bin/date'
            cuds.append(cud)

        # Submit the previously created ComputeUnit descriptions to the
        # unit manager. This will trigger the selected scheduler to start
        # assigning ComputeUnits to the ComputePilots.
        units = self.umgr.submit_units(cuds)

        # Wait for all compute units to reach a final state (DONE, CANCELED or
        # FAILED).
        self.umgr.wait_units()

        # Verify that 100% of the units came back with 'DONE' status
        done_units = 0
        for unit in units:
            if unit.state == "DONE":
                done_units += 1

        # assertEqual is used instead of the deprecated assertEquals alias
        self.assertEqual(
            (float(done_units) / float(self.n)), 1.0,
            "Only {0}% of CUs were DONE.".format(
                str((float(done_units) / float(self.n)) * 100.00)))

        # Finally assert that the number of requested vs submitted pilots are
        # the same
        self.assertEqual(resource_count, pilot_count)
示例#6
0
def rp_setup_short(request):
    """Set up a session with a short-lived local pilot and a unit manager.

    Opens a session against ``db_url``, creates a pilot manager and a
    direct-submission unit manager, and submits one single-core pilot with
    runtime 1 on ``local.localhost``.  ``pilot_state_cb`` is registered on
    the pilot before it is added to the unit manager.  The finalizer
    registered on *request* cancels all pilots, waits for them and then
    closes the session.

    Returns the tuple ``(pilot, pmgr, umgr)``.  Setup errors are reported
    and re-raised.
    """
    session = rp.Session(database_url=db_url)

    try:
        pmgr = rp.PilotManager(session=session)
        umgr = rp.UnitManager(session=session,
                              scheduler=rp.SCHED_DIRECT_SUBMISSION)

        descr = rp.ComputePilotDescription()
        descr.resource = "local.localhost"
        descr.runtime = 1
        descr.cores = 1
        descr.sandbox = "/tmp/radical.pilot.sandbox.unittests"
        descr.cleanup = True

        pilot = pmgr.submit_pilots(descr)
        pilot.register_callback(pilot_state_cb)

        umgr.add_pilots(pilot)

    except Exception:
        print('test failed')
        raise

    def _teardown():
        # stop the pilots first, then release the session
        pmgr.cancel_pilots()
        pmgr.wait_pilots()

        print('closing session')
        session.close()

    request.addfinalizer(_teardown)

    return pilot, pmgr, umgr
示例#7
0
    def test__pilot_cancel(self):
        """Check that a running pilot can be canceled.

        A single-core local pilot is submitted and must have neither a
        start nor a stop time right after submission.  Once it is active it
        is canceled, and the test expects it to end in ``rp.CANCELED`` with
        a stop time set.
        """
        session = rp.Session()

        try:
            pmgr = rp.PilotManager(session=session)

            descr = rp.ComputePilotDescription()
            descr.resource = "local.localhost"
            descr.cores = 1
            descr.runtime = 1
            descr.sandbox = "/tmp/rp.sandbox.unittests"
            descr.cleanup = True

            pilot = pmgr.submit_pilots(descriptions=descr)

            # freshly submitted: exists, but has no timing information yet
            assert pilot is not None
            assert pilot.start_time is None
            assert pilot.stop_time is None

            # block until the pilot is either active or has failed
            pilot.wait(state=[rp.PMGR_ACTIVE, rp.FAILED], timeout=300)
            assert pilot.submission_time is not None
            assert pilot.state == rp.PMGR_ACTIVE
            assert pilot.start_time is not None

            # cancel the active pilot and wait for it to reach a final state
            pilot.cancel()
            pilot.wait(timeout=300)

            assert pilot.state == rp.CANCELED
            assert pilot.stop_time is not None

        finally:
            session.close()
    def test__pilotmanager_list_pilots_after_reconnect(self):
        """ Test if listing pilots after a reconnect works as expected.

        Two pilot managers are created in one session and each receives two
        pilots.  Both the original managers and the manager objects obtained
        again through ``session.get_pilot_managers()`` must list two pilots.
        """
        session = rp.Session()

        try:
            pm1 = rp.PilotManager(session=session)
            assert len(pm1.list_pilots()) == 0, \
                "Wrong number of pilots returned."

            pm2 = rp.PilotManager(session=session)
            assert len(pm2.list_pilots()) == 0, \
                "Wrong number of pilots returned."

            for _ in range(0, 2):
                cpd = rp.ComputePilotDescription()
                cpd.resource = "local.localhost"
                cpd.cores = 1
                cpd.runtime = 1
                cpd.sandbox = "/tmp/rp.sandbox.unittests"
                cpd.cleanup = True

                # the same description is submitted to both managers
                pm1.submit_pilots(descriptions=cpd)
                pm2.submit_pilots(descriptions=cpd)

            assert len(pm1.list_pilots()) == 2, \
                "Wrong number of pilots returned."
            assert len(pm2.list_pilots()) == 2, \
                "Wrong number of pilots returned."

            # look the managers up again via the session
            pm1_r = session.get_pilot_managers(pilot_manager_ids=pm1.uid)
            pm2_r = session.get_pilot_managers(pilot_manager_ids=pm2.uid)

            assert len(pm1_r.list_pilots()) == 2, \
                "Wrong number of pilots returned."
            assert len(pm2_r.list_pilots()) == 2, \
                "Wrong number of pilots returned."

        finally:
            # close the session even when an assertion fails
            session.close()
示例#9
0
    def test__issue_114_part_3(self):
        """ https://github.com/radical-cybertools/radical.pilot/issues/114

        Submits one local pilot with runtime 1, waits for it to become
        active and then waits again without a target state, expecting the
        pilot to end up ``rp.DONE``.
        """
        session = rp.Session(database_url=DBURL, database_name=DBNAME)

        pmgr = rp.PilotManager(session=session)

        descr = rp.ComputePilotDescription()
        descr.resource = "local.localhost"
        descr.cores = 1
        descr.runtime = 1
        descr.sandbox = "/tmp/radical.pilot.sandbox.unittests"
        descr.cleanup = True

        pilot = pmgr.submit_pilots(pilot_descriptions=descr)

        umgr = rp.UnitManager(session=session,
                              scheduler=rp.SCHED_DIRECT_SUBMISSION)
        umgr.add_pilots(pilot)

        # first wait: any of active / done / failed ends the wait
        reached = pmgr.wait_pilots(
            state=[rp.PMGR_ACTIVE, rp.DONE, rp.FAILED],
            timeout=10 * 60)

        assert reached == [rp.PMGR_ACTIVE], 'state      : %s' % reached
        assert pilot.state == rp.PMGR_ACTIVE, 'pilot state: %s' % pilot.state

        # second wait: no explicit target state is given
        reached = pmgr.wait_pilots(timeout=3 * 60)

        assert reached == [rp.DONE], 'state      : %s' % reached
        assert pilot.state == rp.DONE, 'pilot state: %s' % pilot.state

        session.close()
    def test__add_resource_config_2(self):
        """ Test if a pilot can be submitted to a custom resource config.

        A resource configuration named ``mylocalhost`` is created, added to
        the session, and a single-core pilot is submitted against it.  The
        test passes when submit, wait and cancel complete without raising.
        """
        session = rp.Session()

        try:
            rc = rp.ResourceConfig("mylocalhost")
            rc.task_launch_method   = "LOCAL"
            rc.mpi_launch_method    = "MPIRUN"
            rc.job_manager_endpoint = "fork://localhost"
            rc.filesystem_endpoint  = "file://localhost/"
            rc.bootstrapper         = "default_bootstrapper.sh"

            pm = rp.PilotManager(session=session)
            session.add_resource_config(rc)

            pd = rp.ComputePilotDescription()
            pd.resource = "mylocalhost"  # the config registered above
            pd.cores    = 1
            pd.runtime  = 1
            pd.sandbox = "/tmp/rp.sandbox.unittests"
            pd.cleanup = True

            pilot = pm.submit_pilots(pd)
            pilot.wait(timeout=300)
            pilot.cancel()

        finally:
            # close the session also when one of the calls above raises
            session.close()
示例#11
0
    def test_01_unit_details(self):
        """Test unit details, units has all details accessible via api

        Runs ``self.n`` units executing ``/bin/date`` on a pilot created
        from ``self.pd_init`` and checks, for every unit, that each key of
        ``unit.as_dict()`` is one of the expected detail keys and that no
        value is ``None``.
        """

        # Detail keys to be checked in unit dictionary
        expected_detail_keys = [
            'type',
            'umgr',
            'uid',
            'name',
            'state',
            'exit_code',
            'stdout',
            'stderr',
            'pilot',
            'sandbox',
            'description',
        ]

        # Create description object from template description
        pilot_desc = rp.ComputePilotDescription(self.pd_init)

        # Launch the pilot.
        pilot = self.pmgr.submit_pilots(pilot_desc)

        self.umgr.add_pilots(pilot)

        # Create a workload of ComputeUnits.
        # Each compute unit runs '/bin/date'.
        cuds = list()
        for _ in range(self.n):
            # create a new CU description, and fill it.
            # Here we don't use dict initialization.
            cud = rp.ComputeUnitDescription()
            cud.executable = '/bin/date'
            cuds.append(cud)

        # Submit the previously created ComputeUnit descriptions to the
        # unit manager. This will trigger the selected scheduler to start
        # assigning ComputeUnits to the ComputePilots.
        units = self.umgr.submit_units(cuds)

        # Wait for all compute units to reach a final state (DONE, CANCELED or
        # FAILED).
        self.umgr.wait_units()

        # Not asserting for 100% completion, that is not the idea here...

        # Check that all items in the dictionary
        # match the expected keys and that all
        # values are *not NONE*
        for unit in units:
            unit_dict = unit.as_dict()
            # items() instead of the Python-2-only iteritems(), so the loop
            # also runs on Python 3
            for key, val in unit_dict.items():
                self.assertIn(key, expected_detail_keys)
                self.assertIsNotNone(val,
                                     msg="'{0}' unexpectedly None".format(key))
示例#12
0
 def desc(self):
     """Return an ``rp.ComputePilotDescription`` built from this object.

     ``exit_on_error``, ``project`` and ``access_schema`` are taken from
     ``self.resource``; the remaining fields come from attributes of
     ``self`` directly.
     """
     res = self.resource
     return rp.ComputePilotDescription({
         'resource': self.rp_resource,
         'runtime': self.runtime,
         'exit_on_error': res.exit_on_error,
         'project': res.project,
         'queue': self.queue,
         'access_schema': res.access_schema,
         'cores': self.cores
     })
def test_pass_issue_57():
    """Run twice as many one-core units as pilot cores on xsede.stampede.

    The scenario is repeated for pilots of 16, 32 and 64 cores, each in its
    own session.  Waiting for the units is best-effort: a failure while
    waiting is reported but does not prevent the pilots from being canceled
    and the session from being closed.
    """

    for i in [16, 32, 64]:

        session = rp.Session(database_url=db_url)

        try:

            c = rp.Context('ssh')
            c.user_id = CONFIG["xsede.stampede"]["user_id"]
            session.add_context(c)

            pmgr = rp.PilotManager(session=session)
            umgr = rp.UnitManager(session=session,
                                  scheduler=rp.SCHED_ROUND_ROBIN)

            pdesc = rp.ComputePilotDescription()
            pdesc.resource = "xsede.stampede"
            pdesc.project = CONFIG["xsede.stampede"]["project"]
            pdesc.cores = i
            pdesc.runtime = 20
            pdesc.cleanup = False

            pilots = pmgr.submit_pilots(pdesc)

            umgr.add_pilots(pilots)

            # two single-core units per pilot core
            unit_descrs = []
            for _ in range(0, i * 2):
                cu = rp.ComputeUnitDescription()
                cu.cores = 1
                cu.executable = "/bin/date"
                unit_descrs.append(cu)

            units = umgr.submit_units(unit_descrs)

            try:
                umgr.wait_units()

                for unit in units:
                    unit.wait()
            except Exception as e:
                # still best-effort, but no longer silent, and
                # KeyboardInterrupt / SystemExit are not swallowed any more
                print("waiting for units failed: %s" % e)

            pmgr.cancel_pilots()
            pmgr.wait_pilots()

        except Exception:
            print("TEST FAILED")
            raise

        finally:
            session.close()
示例#14
0
 def desc(self):
     """Return an ``rp.ComputePilotDescription`` built from own attributes.

     Every field of the description is read from an attribute of the same
     name on ``self`` (``access_schema`` included).
     """
     # question: splitting these settings off from the resource does not
     # make sense to me; I would say they should be coupled with the
     # resource definition
     fields = {
         'resource': self.resource,
         'runtime': self.runtime,
         'exit_on_error': self.exit_on_error,
         'project': self.project,
         'queue': self.queue,
         'access_schema': self.access_schema,
         'cores': self.cores,
     }
     description = rp.ComputePilotDescription(fields)
     return description
示例#15
0
    def test__issue_262(self):
        """ https://github.com/radical-cybertools/radical.pilot/issues/18

        Runs one unit on a single-core local pilot and checks that every
        log entry of the pilot and of the unit exposes ``timestamp`` and
        ``message`` and formats to a ``unicode`` string.

        NOTE(review): the URL points at issue 18 while the test is named
        after issue 262 -- confirm which one is meant.
        """
        session = rp.Session()
        pmgr = rp.PilotManager(session=session)

        # a minimal local pilot: one core, runtime 1
        pd = rp.ComputePilotDescription()
        pd.resource = "local.localhost"
        pd.cores = 1
        pd.runtime = 1

        pilot = pmgr.submit_pilots(pd)

        umgr = rp.UnitManager(session=session,
                              scheduler=rp.SCHED_DIRECT_SUBMISSION)
        umgr.add_pilots(pilot)

        # one unit which sleeps for a while and stages a file in
        cud = rp.ComputeUnitDescription()
        cud.executable = "/bin/sleep"
        cud.arguments = ["10"]
        cud.cores = 1
        cud.input_staging = ["/etc/group"]

        unit = umgr.submit_units(cud)
        umgr.wait_units()

        def _check_log(entries):
            # each entry must offer both keys and render as unicode text
            for entry in entries:
                fields = entry.as_dict()
                assert "timestamp" in fields
                assert "message" in fields

                rendered = "%s" % entry
                assert type(rendered) == unicode

        _check_log(pilot.log)
        _check_log(unit.log)

        session.close()
示例#16
0
def start_pilot(cr=None):
    """
    In order to start a pilot on the newly created CR, we need to define
    a resource description for that CR.  To do so, we programmatically create
    a clone of the local.localhost description, and replace the job submission
    URL with an ssh:// URL pointing to the CR.

    :param cr: object whose ``access`` attribute is used as job manager and
               filesystem endpoint.  When not given, a placeholder pointing
               to ``ssh://remote.host.net:1234/`` is used.
    :return:   whatever ``PilotManager.submit_pilots()`` returns for the
               single pilot description submitted here.
    :raises KeyError: if the ``EC2_KEYPAIR`` environment variable is unset.
    """

    if not cr:

        class _CR(object):
            def __init__(self):
                self.access = 'ssh://remote.host.net:1234/'

        cr = _CR()

    # get the local resource config
    session = rp.Session()
    cfg = session.get_resource_config('local.localhost')

    # create a new config based on the local one, and add it back
    new_cfg = rp.ResourceConfig('ec2.vm', cfg)
    new_cfg.schemas = ['ssh']
    new_cfg['ssh']['job_manager_endpoint'] = cr.access
    new_cfg['ssh']['filesystem_endpoint'] = cr.access

    # the new config needs to make sure we can bootstrap on the VM
    new_cfg['pre_bootstrap_1'] = [
        'sudo apt-get update',
        'sudo apt-get install -y python-virtualenv python-dev dnsutils bc'
    ]
    session.add_resource_config(new_cfg)

    # use the *same* ssh key for ssh access to the VM
    ssh_ctx = rs.Context('SSH')
    ssh_ctx.user_id = 'admin'
    ssh_ctx.user_key = os.environ['EC2_KEYPAIR']
    session.contexts.append(ssh_ctx)

    # submit a pilot to it.
    pd = rp.ComputePilotDescription()
    pd.resource = 'ec2.vm'
    pd.runtime = 10
    pd.cores = 1
    # a plain boolean: the former trailing comma turned this into the
    # one-element tuple (True,)
    pd.exit_on_error = True

    pmgr = rp.PilotManager(session=session)
    return pmgr.submit_pilots(pd)
示例#17
0
def setup_gordon(request):
    """Set up a session with one 16-core pilot on ``xsede.gordon``.

    An ssh context carrying the configured user id is added to a fresh
    session.  A pilot manager and a direct-submission unit manager are
    created, a pilot with runtime 30 is submitted under the configured
    project, ``pilot_state_cb`` is registered on it and it is added to the
    unit manager.  The finalizer registered on *request* cancels the pilots,
    waits for them, closes the session and then sleeps for five seconds.

    Returns ``(session, pilot, pmgr, umgr, "xsede.gordon")``.
    """
    session1 = rp.Session()

    print("session id gordon: {0}".format(session1.uid))

    ssh_ctx = rp.Context('ssh')
    ssh_ctx.user_id = CONFIG["xsede.gordon"]["user_id"]
    session1.add_context(ssh_ctx)

    try:
        pmgr1 = rp.PilotManager(session=session1)

        print("pm id gordon: {0}".format(pmgr1.uid))

        umgr1 = rp.UnitManager(session=session1,
                               scheduler=rp.SCHEDULER_DIRECT_SUBMISSION)

        descr = rp.ComputePilotDescription()
        descr.resource = "xsede.gordon"
        descr.project = CONFIG["xsede.gordon"]["project"]
        descr.runtime = 30
        descr.cores = 16
        descr.cleanup = False

        pilot1 = pmgr1.submit_pilots(descr)
        pilot1.register_callback(pilot_state_cb)

        umgr1.add_pilots(pilot1)

    except Exception:
        print('test failed')
        raise

    def _teardown():
        print("finalizing...")
        pmgr1.cancel_pilots()
        pmgr1.wait_pilots()

        print('closing session')
        session1.close()
        # short pause after the session has been closed
        time.sleep(5)

    request.addfinalizer(_teardown)

    return session1, pilot1, pmgr1, umgr1, "xsede.gordon"
示例#18
0
def setup_stampede_two(request):
    """Set up a session with a double-size pilot on ``xsede.stampede``.

    The pilot asks for twice the number of cores configured for
    ``xsede.stampede`` and has runtime 20.  It gets ``pilot_state_cb``
    registered and is added to a direct-submission unit manager.  The
    finalizer registered on *request* cancels the pilots, waits for them
    and closes the session.

    Returns ``(session, pilot, pmgr, umgr, "xsede.stampede")``.
    """
    session3 = rp.Session()

    print("session id stampede: {0}".format(session3.uid))

    ssh_ctx = rp.Context('ssh')
    ssh_ctx.user_id = CONFIG["xsede.stampede"]["user_id"]
    session3.add_context(ssh_ctx)

    try:
        pmgr3 = rp.PilotManager(session=session3)

        print("pm id stampede: {0}".format(pmgr3.uid))

        umgr3 = rp.UnitManager(session=session3,
                               scheduler=rp.SCHED_DIRECT_SUBMISSION)

        descr = rp.ComputePilotDescription()
        descr.resource = "xsede.stampede"
        descr.project = CONFIG["xsede.stampede"]["project"]
        descr.runtime = 20
        # twice the configured core count
        descr.cores = int(CONFIG["xsede.stampede"]["cores"]) * 2
        descr.cleanup = False

        pilot3 = pmgr3.submit_pilots(descr)
        pilot3.register_callback(pilot_state_cb)

        umgr3.add_pilots(pilot3)

    except Exception:
        print('test failed')
        raise

    def _teardown():
        print("finalizing...")
        pmgr3.cancel_pilots()
        pmgr3.wait_pilots()

        print('closing session')
        session3.close()

    request.addfinalizer(_teardown)

    return session3, pilot3, pmgr3, umgr3, "xsede.stampede"
示例#19
0
    def test__issue_114_part_1(self):
        """ https://github.com/radical-cybertools/radical.pilot/issues/114

        Waits for a local pilot to become active, submits two sleeping
        units and checks the states reported by two consecutive
        ``wait_units()`` calls.
        """
        session = rp.Session(database_url=DBURL, database_name=DBNAME)

        pmgr = rp.PilotManager(session=session)

        descr = rp.ComputePilotDescription()
        descr.resource = "local.localhost"
        descr.cores = 1
        descr.runtime = 5
        descr.sandbox = "/tmp/radical.pilot.sandbox.unittests"
        descr.cleanup = True

        pilot = pmgr.submit_pilots(pilot_descriptions=descr)
        pmgr.wait_pilots(state=[rp.PMGR_ACTIVE, rp.DONE, rp.FAILED],
                         timeout=5 * 60)

        assert (pilot.state == rp.PMGR_ACTIVE), "pilot state: %s" % pilot.state

        umgr = rp.UnitManager(session=session,
                              scheduler=rp.SCHED_DIRECT_SUBMISSION)
        umgr.add_pilots(pilot)

        def _sleep_unit():
            # single-core unit sleeping for 60 seconds
            cud = rp.ComputeUnitDescription()
            cud.cores = 1
            cud.executable = "/bin/sleep"
            cud.arguments = ['60']
            return cud

        all_tasks = [_sleep_unit(), _sleep_unit()]
        umgr.submit_units(all_tasks)

        # first wait: units are expected to be seen in SCHEDULING
        states = umgr.wait_units(state=[rp.SCHEDULING, rp.AGENT_EXECUTING],
                                 timeout=2 * 60)
        assert rp.SCHEDULING in states, "states: %s" % states

        # second wait: units are expected to be seen in AGENT_EXECUTING
        states = umgr.wait_units(state=[rp.AGENT_EXECUTING, rp.DONE],
                                 timeout=1 * 60)
        assert rp.AGENT_EXECUTING in states, "states: %s" % states

        session.close()
示例#20
0
def test_rp_basic_task(rp_config):
    """Run a single ``/bin/date`` task on a local pilot and check it succeeds.

    *rp_config* is used as a mapping: ``rp_config['rp']`` provides the
    radical.pilot module, ``rp_config['config']`` the per-resource settings.
    The test asserts that the task exits with code 0 and that the session
    reports itself closed after the ``with`` block.
    """
    rp = rp_config['rp']

    # Note: Session creation will fail with a FileNotFound error unless venv is explicitly `activate`d.
    # TODO: Figure out what `activate` does that `rp-venv/bin/python` doesn't do.
    with rp.Session() as session:
        # Based on `radical.pilot/examples/config.json`
        # TODO: Does the Session have a default spec for 'local.localhost'? Can/should we reference it?
        # See also https://github.com/radical-cybertools/radical.pilot/issues/2181
        resource = 'local.localhost'
        resource_config = {resource: {}}
        if resource in rp_config['config']:
            # index the mapping the same way as everywhere else in this
            # function; attribute access (`rp_config.config`) does not work
            # on a mapping
            resource_config[resource].update(rp_config['config'][resource])
        # these values deliberately override anything taken from the config
        resource_config[resource].update({
            'project': None,
            'queue': None,
            'schema': None,
            'cores': 1,
            'gpus': 0
        })

        pilot_description = dict(resource=resource,
                                 runtime=30,
                                 exit_on_error=True,
                                 project=resource_config[resource]['project'],
                                 queue=resource_config[resource]['queue'],
                                 cores=resource_config[resource]['cores'],
                                 gpus=resource_config[resource]['gpus'])

        task_description = {
            'executable': '/bin/date',
            'cpu_processes': 1,
        }

        pmgr = rp.PilotManager(session=session)
        umgr = rp.UnitManager(session=session)
        pilot = pmgr.submit_pilots(
            rp.ComputePilotDescription(pilot_description))
        task = umgr.submit_units(rp.ComputeUnitDescription(task_description))

        umgr.add_pilots(pilot)
        umgr.wait_units()

        assert task.exit_code == 0
    assert session.closed
示例#21
0
    def test_02_failing_units(self):
        """Test failing units, about ~50% of the units will fail

        Odd-numbered units run ``/bin/date``; even-numbered units point at
        ``/bin/data``, which is meant not to exist.  The test requires at
        least half of the ``self.n`` units to end in state ``"DONE"``.
        """

        # Build the pilot from the template description and start it.
        pilot = self.pmgr.submit_pilots(
            rp.ComputePilotDescription(self.pd_init))
        self.umgr.add_pilots(pilot)

        # Build the workload; every second unit gets a broken executable.
        cuds = list()
        for idx in range(1, self.n + 1):
            cud = rp.ComputeUnitDescription()
            cud.executable = '/bin/date' if idx % 2 else '/bin/data'
            cuds.append(cud)

        # Hand the descriptions to the unit manager, which schedules them
        # onto the pilot, then wait for all units to reach a final state.
        units = self.umgr.submit_units(cuds)
        self.umgr.wait_units()

        # Verify that >= 50% of the units came back with 'DONE' status
        # TODO: better checks for failures...
        done_units = len([u for u in units if u.state == "DONE"])
        done_ratio = float(done_units) / float(self.n)

        self.assertGreaterEqual(
            done_ratio, 0.50,
            "Only {0}% of CUs were DONE.".format(str(done_ratio * 100.00)))
示例#22
0
def setup_comet(request):
    """Set up a session with one 24-core pilot on ``xsede.comet``.

    An ssh context with the configured user id is added to a new session.
    A pilot manager and a direct-submission unit manager are created, a
    pilot with runtime 30 is submitted under the configured project, gets
    ``pilot_state_cb`` registered and is added to the unit manager.  The
    finalizer registered on *request* cancels the pilots, waits for them
    and closes the session.

    Returns ``(session, pilot, pmgr, umgr, "xsede.comet")``.  Setup errors
    are reported and re-raised.
    """
    session2 = rp.Session()

    print("session id comet: {0}".format(session2.uid))

    c = rp.Context('ssh')
    c.user_id = CONFIG["xsede.comet"]["user_id"]
    session2.add_context(c)

    try:
        pmgr2 = rp.PilotManager(session=session2)

        # label the pilot manager id with the resource this setup targets;
        # the message used to say "gordon" (copy-paste leftover)
        print("pm id comet: {0}".format(pmgr2.uid))

        umgr2 = rp.UnitManager(session=session2,
                               scheduler=rp.SCHED_DIRECT_SUBMISSION)

        pdesc2 = rp.ComputePilotDescription()
        pdesc2.resource = "xsede.comet"
        pdesc2.project = CONFIG["xsede.comet"]["project"]
        pdesc2.runtime = 30
        pdesc2.cores = 24
        pdesc2.cleanup = False

        pilot2 = pmgr2.submit_pilots(pdesc2)
        pilot2.register_callback(pilot_state_cb)

        umgr2.add_pilots(pilot2)

    except Exception:
        print('test failed')
        raise

    def fin():
        print("finalizing...")
        pmgr2.cancel_pilots()
        pmgr2.wait_pilots()

        print('closing session')
        session2.close()

    request.addfinalizer(fin)

    return session2, pilot2, pmgr2, umgr2, "xsede.comet"
def setup_stampede_683(request):
    """Set up a session with one 683-core pilot on ``xsede.stampede``.

    The unit manager uses ``rp.SCHEDULER_BACKFILLING``.  The pilot has
    runtime 40, gets ``pilot_state_cb`` registered and is added to the unit
    manager.  The finalizer registered on *request* cancels the pilots,
    waits for them and closes the session.

    Returns ``(session, pilot, pmgr, umgr, "xsede.stampede")``.
    """
    session = rp.Session()

    print("session id stampede: {0}".format(session.uid))

    ssh_ctx = rp.Context('ssh')
    ssh_ctx.user_id = CONFIG["xsede.stampede"]["user_id"]
    session.add_context(ssh_ctx)

    try:
        pmgr = rp.PilotManager(session=session)

        umgr = rp.UnitManager(session=session,
                              scheduler=rp.SCHEDULER_BACKFILLING)

        descr = rp.ComputePilotDescription()
        descr.resource = "xsede.stampede"
        descr.project = CONFIG["xsede.stampede"]["project"]
        descr.runtime = 40
        descr.cores = 683
        descr.cleanup = False

        pilot = pmgr.submit_pilots(descr)
        pilot.register_callback(pilot_state_cb)

        umgr.add_pilots(pilot)

    except Exception:
        print('test failed')
        raise

    def _teardown():
        print("finalizing...")
        pmgr.cancel_pilots()
        pmgr.wait_pilots()

        print('closing session')
        session.close()

    request.addfinalizer(_teardown)

    return session, pilot, pmgr, umgr, "xsede.stampede"
示例#24
0
    def _pilots_backfill(self, requests):
        '''
        Build pilot descriptions for backfill slots.

        For every entry of `requests` the 'backfill' key is removed, the
        policy file `policies/<policy>.json` next to this module is read,
        and `get_backfill()` is asked for slots on the request's partition,
        limited by the policy's `max_cores` / `max_walltime` (module
        defaults apply when the policy does not set them).  Each reported
        slot becomes one `rp.ComputePilotDescription`.

        Note that the request dicts are modified in place.  Returns the
        list of pilot descriptions.
        '''

        self._rep.info('\nrequesting backfilled pilots\n')
        descriptions = list()

        for request in requests:

            del request['backfill']

            policy_name = request['policy']
            partition = request['partition']

            here = os.path.dirname(__file__)
            policy = ru.read_json('%s/policies/%s.json' % (here, policy_name))

            max_cores = policy.get('max_cores', MAX_CORES)
            max_walltime = policy.get('max_walltime', MAX_WALLTIME)

            self._rep.info('\nrequesting backfill pilots\n')
            slots = get_backfill(partition, max_cores, max_walltime)

            # each slot is a (partition, cores, walltime) triple; only
            # cores and walltime are used below
            for [_slot_partition, cores, walltime] in slots:
                pd = {
                    'resource': request.get('resource', 'local.localhost'),
                    'project': request.get('project'),
                    'queue': request.get('queue'),
                    'cores': cores,
                    'runtime': walltime
                }
                summary = (pd['resource'], pd['cores'], pd['runtime'],
                           pd['queue'], pd['project'])
                self._rep.ok(
                    'backfill @ %s [%5dcores * %4dmin] @ %10s(%10s)]\n' %
                    summary)
                descriptions.append(rp.ComputePilotDescription(pd))

        return descriptions
示例#25
0
    def test__issue_114_part_2(self):
        """ https://github.com/radical-cybertools/radical.pilot/issues/114

        Submits one sleeping unit to an active local pilot and checks that
        ``wait_units()`` first reports ``rp.EXECUTING`` and, on a second
        call without a target state, ``rp.DONE``.
        """
        session = rp.Session(database_url=DBURL, database_name=DBNAME)

        pmgr = rp.PilotManager(session=session)

        descr = rp.ComputePilotDescription()
        descr.resource = "local.localhost"
        descr.cores = 1
        descr.runtime = 5
        descr.sandbox = "/tmp/radical.pilot.sandbox.unittests"
        descr.cleanup = True

        pilot = pmgr.submit_pilots(pilot_descriptions=descr)

        umgr = rp.UnitManager(session=session,
                              scheduler=rp.SCHED_DIRECT_SUBMISSION)
        umgr.add_pilots(pilot)

        # the pilot has to be active before the unit is submitted
        pmgr.wait_pilots(state=[rp.ACTIVE, rp.DONE, rp.FAILED],
                         timeout=5 * 60)

        assert (pilot.state == rp.ACTIVE), "pilot state: %s" % pilot.state

        # one single-core unit sleeping for 60 seconds
        sleeper = rp.ComputeUnitDescription()
        sleeper.cores = 1
        sleeper.executable = "/bin/sleep"
        sleeper.arguments = ['60']

        cu = umgr.submit_units(sleeper)

        reached = umgr.wait_units(state=[rp.EXECUTING], timeout=60)
        assert reached == [rp.EXECUTING], 'state   : %s' % reached
        assert cu.state == rp.EXECUTING, 'cu state: %s' % cu.state

        reached = umgr.wait_units(timeout=2 * 60)
        assert reached == [rp.DONE], 'state   : %s' % reached
        assert cu.state == rp.DONE, 'cu state: %s' % cu.state

        session.close()
示例#26
0
def createWorkload(options, config, withOrte, nodes, nthreads, runtime):
    """Build the pilot description and the compute unit list for one run.

    Args:
        options: not used by this function.
        config: mapping keyed by resource label; the entry for
            "ornl.titan_lib" must provide 'project', 'queue' and 'schema'.
        withOrte: not used by this function.
        nodes: number of nodes; multiplied by 16 to size both the unit
            count and the pilot core count.
        nthreads: passed to `CUDef.createTAUGromacsCU` for every unit, and
            used as divisor for the number of units.
        runtime: pilot runtime in minutes.

    Returns:
        A tuple `(pilots, cuList)`: a one-element list holding the
        `rp.ComputePilotDescription`, and the list of created compute units.
    """
    resource = "ornl.titan_lib"

    # Floor division keeps the count an integer: range() rejects the float
    # that `/` produces on Python 3.
    unit_count = nodes * 16 // nthreads
    cuList = [CUDef.createTAUGromacsCU(nthreads) for _ in range(unit_count)]
    print(len(cuList))

    resource_cfg = config[resource]
    pd_init = {
        'resource': resource,
        'runtime': runtime,  # pilot runtime (min)
        'exit_on_error': True,
        'project': resource_cfg['project'],
        'queue': resource_cfg['queue'],
        'access_schema': resource_cfg['schema'],
        # Additional 16 cores are for ORTE.
        # NOTE(review): this used to read `nnodes`, which is not defined in
        # this function; the `nodes` argument is assumed to be what was
        # meant - confirm no module-level `nnodes` was intended.
        'cores': 16 * nodes + 16,
    }

    pilots = [rp.ComputePilotDescription(pd_init)]
    return (pilots, cuList)
示例#27
0
def setup_local_1(request):
    """Set up a session with one local pilot and register its teardown.

    Creates a session, a pilot manager and a unit manager, submits a
    single-core pilot on "local.localhost" (runtime 30, no cleanup),
    registers `pilot_state_cb` on it and adds it to the unit manager.

    Args:
        request: object providing `addfinalizer`; the teardown function is
            registered on it (presumably a pytest fixture request - verify
            against the caller).

    Returns:
        The tuple `(session, pilot, pilot manager, unit manager,
        "local.localhost")`.

    Raises:
        Any exception raised during setup is re-raised after printing
        'test failed'; no finalizer is registered in that case.
    """

    session1 = rp.Session()

    # Single-argument print(...) calls produce the same output on Python 2
    # and Python 3.
    print("session id local_1: {0}".format(session1.uid))

    try:
        pmgr1 = rp.PilotManager(session=session1)

        print("pm id local_1: {0}".format(pmgr1.uid))

        umgr1 = rp.UnitManager(session=session1,
                               scheduler=rp.SCHED_DIRECT_SUBMISSION)

        pdesc1 = rp.ComputePilotDescription()
        pdesc1.resource = "local.localhost"
        pdesc1.runtime = 30
        pdesc1.cores = 1
        pdesc1.cleanup = False

        pilot1 = pmgr1.submit_pilots(pdesc1)
        pilot1.register_callback(pilot_state_cb)

        umgr1.add_pilots(pilot1)

    except Exception:
        print('test failed')
        raise

    def fin():
        # Teardown: cancel and drain the pilots before closing the session.
        print("finalizing...")
        pmgr1.cancel_pilots()
        pmgr1.wait_pilots()

        print('closing session')
        session1.close()
        time.sleep(5)

    request.addfinalizer(fin)

    return session1, pilot1, pmgr1, umgr1, "local.localhost"
    def test__pilotmanager_get_pilots(self):
        """Test that each pilot manager returns the pilots submitted to it.

        Two pilot managers share one session; two pilots are submitted to
        each.  Every pilot listed by a manager must carry a uid recorded at
        submission time for that manager, and each manager must return two
        pilots in total.
        """
        session = rp.Session()

        # Close the session even if an assertion below fails, so a failing
        # run does not leave the session open.
        try:
            pm1 = rp.PilotManager(session=session)
            assert len(pm1.list_pilots()) == 0, \
                "Wrong number of pilots returned."

            pm2 = rp.PilotManager(session=session)
            assert len(pm2.list_pilots()) == 0, \
                "Wrong number of pilots returned."

            pm1_pilot_uids = []
            pm2_pilot_uids = []

            for _ in range(0, 2):
                cpd = rp.ComputePilotDescription()
                cpd.resource = "local.localhost"
                cpd.cores = 1
                cpd.runtime = 1
                cpd.sandbox = "/tmp/rp.sandbox.unittests"
                cpd.cleanup = True

                # The same description is submitted once per manager.
                pilot_pm1 = pm1.submit_pilots(descriptions=cpd)
                pm1_pilot_uids.append(pilot_pm1.uid)

                pilot_pm2 = pm2.submit_pilots(descriptions=cpd)
                pm2_pilot_uids.append(pilot_pm2.uid)

            for pilot_uid in pm1.list_pilots():
                pilot = pm1.get_pilots(pilot_uid)
                assert pilot.uid in pm1_pilot_uids, \
                    "Wrong pilot ID %s (not in %s)" % (pilot.uid,
                                                       pm1_pilot_uids)

            assert len(pm1.get_pilots()) == 2, "Wrong number of pilots."

            for pilot_uid in pm2.list_pilots():
                pilot = pm2.get_pilots(pilot_uid)
                assert pilot.uid in pm2_pilot_uids, \
                    "Wrong pilot ID %s" % pilot.uid

            assert len(pm2.get_pilots()) == 2, "Wrong number of pilots."

        finally:
            session.close()
示例#29
0
    def test_00_getting_started(self):
        """Test a standard pilot run.

        Submits one pilot built from `self.pd_init`, runs `self.n` compute
        units that each execute '/bin/date', and asserts that all of them
        end in state "DONE".
        """

        # Create description object from template description
        pilot_desc = rp.ComputePilotDescription(self.pd_init)

        # Launch the pilot.
        pilot = self.pmgr.submit_pilots(pilot_desc)

        self.umgr.add_pilots(pilot)

        # Create a workload of ComputeUnits.
        # Each compute unit runs '/bin/date'.
        cuds = list()
        for _ in range(0, self.n):
            # create a new CU description, and fill it.
            # Here we don't use dict initialization.
            cud = rp.ComputeUnitDescription()
            cud.executable = '/bin/date'
            cuds.append(cud)

        # Submit the previously created ComputeUnit descriptions to the
        # UnitManager. This will trigger the selected scheduler to start
        # assigning ComputeUnits to the ComputePilots.
        units = self.umgr.submit_units(cuds)

        # Wait for all compute units to reach a final state (DONE, CANCELED or
        # FAILED).
        self.umgr.wait_units()

        # Verify that 100% of the units came back with 'DONE' status
        done_units = sum(1 for unit in units if unit.state == "DONE")
        done_ratio = float(done_units) / float(self.n)

        # assertEqual replaces the deprecated alias assertEquals, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(
            done_ratio, 1.0,
            "Only {0}% of CUs were DONE.".format(str(done_ratio * 100.00)))
示例#30
0
    def _pilots_queue(self, requests):
        '''
        Build one pilot description per request for dedicated pilots.

        Each request is a mapping: 'cores' and 'walltime' are required,
        'resource' falls back to 'local.localhost', and 'project' / 'queue'
        fall back to None.  Every description is reported through
        `self._rep.ok` before it is collected.  Nothing is submitted here;
        the list of `rp.ComputePilotDescription` objects is returned in
        request order.
        '''

        self._rep.info('\nrequesting dedicated pilots\n')

        descriptions = []
        for req in requests:
            spec = dict()
            spec['resource'] = req.get('resource', 'local.localhost')
            spec['project'] = req.get('project')
            spec['queue'] = req.get('queue')
            spec['cores'] = req['cores']
            spec['runtime'] = req['walltime']

            summary = 'provision on %s [%5dcores * %4dmin] @ %10s(%10s)]\n' % (
                spec['resource'], spec['cores'], spec['runtime'],
                spec['queue'], spec['project'])
            self._rep.ok(summary)

            descriptions.append(rp.ComputePilotDescription(spec))

        return descriptions