예제 #1
0
def test_get_global_nodelist_frm_wrklst_file():
    """Check that a ``node_list`` file in the run directory overrides env variables.

    Three situations are exercised against ``Resources.get_global_nodelist``:

    1. An empty ``node_list`` file must raise ``ResourcesException``.
    2. A populated file, with no env variables specified, is returned verbatim.
    3. A populated file still wins when a Cobalt-style env variable is supplied.

    The ``node_list`` file is removed even when an assertion fails, so a
    failure here cannot leak the file into tests that run afterwards.
    """
    # node_list file should override env variables
    os.environ["LIBE_RESOURCES_TEST_NODE_LIST"] = "20-22,137-139,1234"  # Should not be this
    exp_out = ['knl-0019', 'knl-0021', 'knl-0022', 'knl-0137', 'knl-0138', 'knl-0139', 'knl-2345']  # Should be this

    try:
        # An empty node_list file must be rejected.
        with open('node_list', 'w'):
            pass
        try:
            Resources.get_global_nodelist(rundir=os.getcwd())
        except ResourcesException as e:
            assert e.args[0] == 'Error. global_nodelist is empty'
        else:
            # Explicit raise rather than ``assert 0`` so the check survives ``python -O``.
            raise AssertionError("Expected ResourcesException for an empty node_list file")

        with open('node_list', 'w') as f:
            for node in exp_out:
                f.write(node + '\n')

        # Do not specify env vars.
        global_nodelist1 = Resources.get_global_nodelist(rundir=os.getcwd())
        assert global_nodelist1 == exp_out, "global_nodelist returned does not match expected"

        # Specify env vars - should ignore
        env_resources = EnvResources(nodelist_env_slurm="THIS_ENV_VARIABLE_IS_DEF_NOT_SET",
                                     nodelist_env_cobalt="LIBE_RESOURCES_TEST_NODE_LIST",
                                     nodelist_env_lsf="THIS_ENV_VARIABLE_IS_DEF_NOT_SET",
                                     nodelist_env_lsf_shortform="THIS_ENV_VARIABLE_IS_DEF_NOT_SET")
        global_nodelist2 = Resources.get_global_nodelist(rundir=os.getcwd(), env_resources=env_resources)
        assert global_nodelist2 == exp_out, "global_nodelist returned does not match expected"
    finally:
        # Guarded so a failure before the file exists is not masked by the cleanup.
        if os.path.isfile('node_list'):
            os.remove('node_list')
예제 #2
0
def test_get_local_nodelist_central_mode_remove_libE_proc():
    """Check worker node assignment in central mode when the libE node is in the list.

    A ``node_list`` file is written containing eight ``knl-*`` nodes with the
    current hostname inserted after the fourth entry. The hostname is then
    registered as a libE node via ``add_comm_info``; every expected worker
    nodelist below excludes it.

    The ``node_list`` file is removed even if an assertion fails.
    """
    mynode = socket.gethostname()
    nodelist_in = ['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036', 'knl-0137', 'knl-0138', 'knl-0139', 'knl-1234']

    # (num_workers, expected nodelist for workers 1..num_workers)
    cases = [
        (8, [['knl-0020'], ['knl-0021'], ['knl-0022'], ['knl-0036'],
             ['knl-0137'], ['knl-0138'], ['knl-0139'], ['knl-1234']]),
        (4, [['knl-0020', 'knl-0021'], ['knl-0022', 'knl-0036'],
             ['knl-0137', 'knl-0138'], ['knl-0139', 'knl-1234']]),
        (1, [['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036',
              'knl-0137', 'knl-0138', 'knl-0139', 'knl-1234']]),
        # Test the best_split algorithm (eight nodes over three workers)
        (3, [['knl-0020', 'knl-0021', 'knl-0022'],
             ['knl-0036', 'knl-0137', 'knl-0138'],
             ['knl-0139', 'knl-1234']]),
    ]

    try:
        with open('node_list', 'w') as f:
            for i, node in enumerate(nodelist_in):
                f.write(node + '\n')
                if i == 3:
                    f.write(mynode + '\n')

        resources = Resources(central_mode=True)
        resources.add_comm_info(libE_nodes=[mynode])

        # Spoof current process as each worker and check nodelist.
        for num_workers, exp_out in cases:
            for workerID, exp_nodes in enumerate(exp_out, start=1):
                local_nodelist = WorkerResources.get_local_nodelist(num_workers, workerID, resources)
                assert local_nodelist == exp_nodes, "local_nodelist returned does not match expected"
    finally:
        # Guarded so a failure before the file exists is not masked by the cleanup.
        if os.path.isfile('node_list'):
            os.remove('node_list')
예제 #3
0
def test_remove_libE_nodes():
    """Verify ``Resources.remove_nodes`` strips the local host wherever it appears.

    The expected result in both cases is the original ``knl-*`` list, in order,
    with no trace of the hostname.
    """
    libE_host = socket.gethostname()
    expected = ['knl-0019', 'knl-0021', 'knl-0022', 'knl-0137', 'knl-0138', 'knl-0139', 'knl-2345']
    mismatch_msg = "nodelist returned does not match expected"

    # Case 1: host placed at the head of the list.
    head_inserted = [libE_host, *expected]
    assert Resources.remove_nodes(head_inserted, libE_host) == expected, mismatch_msg

    # Case 2: host inserted after entries 1 and 4 (middle) and 6 (the end).
    insert_after = (1, 4, 6)
    interleaved = []
    for position, name in enumerate(expected):
        interleaved.append(name)
        if position in insert_after:
            interleaved.append(libE_host)
    assert Resources.remove_nodes(interleaved, libE_host) == expected, mismatch_msg
예제 #4
0
def test_get_global_nodelist_standalone():
    """Expect the global nodelist to be just the current host.

    Every scheduler env-variable name handed to ``EnvResources`` is a
    placeholder that is expected not to exist in the environment.
    """
    local_host = socket.gethostname()
    unset_var = "THIS_ENV_VARIABLE_IS_DEF_NOT_SET"
    env_resources = EnvResources(
        nodelist_env_slurm=unset_var,
        nodelist_env_cobalt=unset_var,
        nodelist_env_lsf=unset_var,
        nodelist_env_lsf_shortform=unset_var,
    )
    nodes_found = Resources.get_global_nodelist(rundir=os.getcwd(), env_resources=env_resources)
    assert nodes_found == [local_host], "global_nodelist returned does not match expected"
예제 #5
0
def test_get_global_nodelist_frm_lsf_shortform():
    """Expect an LSF short-form host string to yield only its compute hosts.

    The input string is ``'batch5 1 g06n02 42 h21n18 42'``; the expected
    nodelist contains ``g06n02`` and ``h21n18`` only.
    """
    unset_var = "THIS_ENV_VARIABLE_IS_DEF_NOT_SET"
    test_var = "LIBE_RESOURCES_TEST_NODE_LIST"
    os.environ[test_var] = 'batch5 1 g06n02 42 h21n18 42'

    env_resources = EnvResources(
        nodelist_env_slurm=unset_var,
        nodelist_env_cobalt=unset_var,
        nodelist_env_lsf=unset_var,
        nodelist_env_lsf_shortform=test_var,
    )
    nodes_found = Resources.get_global_nodelist(rundir=os.getcwd(), env_resources=env_resources)
    assert nodes_found == ['g06n02', 'h21n18'], "global_nodelist returned does not match expected"
예제 #6
0
def test_get_global_nodelist_frm_cobalt():
    """Expect a Cobalt-style range string to expand into individual node ids.

    ``"20-22,137-139,1234"`` should become
    ``['20', '21', '22', '137', '138', '139', '1234']``.
    """
    unset_var = "THIS_ENV_VARIABLE_IS_DEF_NOT_SET"
    test_var = "LIBE_RESOURCES_TEST_NODE_LIST"
    os.environ[test_var] = "20-22,137-139,1234"

    env_resources = EnvResources(
        nodelist_env_slurm=unset_var,
        nodelist_env_cobalt=test_var,
        nodelist_env_lsf=unset_var,
        nodelist_env_lsf_shortform=unset_var,
    )
    nodes_found = Resources.get_global_nodelist(rundir=os.getcwd(), env_resources=env_resources)
    expected = ['20', '21', '22', '137', '138', '139', '1234']
    assert nodes_found == expected, "global_nodelist returned does not match expected"
예제 #7
0
def test_get_local_nodelist_distrib_mode():
    """Check worker node assignment in distributed mode (``central_mode=False``).

    A ``node_list`` file is written with seven ``knl-*`` nodes and the current
    hostname inserted after the fourth entry (eight nodes in total). Selected
    ``(num_workers, workerID)`` pairs are then checked, including the case of
    more workers than nodes (sub-node workers).

    The ``node_list`` file is removed even if an assertion fails.
    """
    mynode = socket.gethostname()
    exp_node = mynode  # sname(mynode)
    nodelist_in = ['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036', 'knl-0137', 'knl-0138', 'knl-0139']

    # (num_workers, workerID, expected local nodelist)
    # Note: an earlier check that a workerID without the current host in its
    # nodelist raises was removed - that situation should now work.
    cases = [
        (8, 5, [exp_node]),
        (1, 1, ['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036', exp_node, 'knl-0137', 'knl-0138', 'knl-0139']),
        (4, 3, [exp_node, 'knl-0137']),
        # Sub-node workers
        (16, 9, [exp_node]),
        (16, 10, [exp_node]),
    ]

    try:
        with open('node_list', 'w') as f:
            for i, node in enumerate(nodelist_in):
                f.write(node + '\n')
                if i == 3:
                    f.write(mynode + '\n')

        resources = Resources(central_mode=False)

        # Spoof current process as each worker and check nodelist.
        for num_workers, workerID, exp_out in cases:
            local_nodelist = WorkerResources.get_local_nodelist(num_workers, workerID, resources)
            assert local_nodelist == exp_out, "local_nodelist returned does not match expected"
    finally:
        # Guarded so a failure before the file exists is not masked by the cleanup.
        if os.path.isfile('node_list'):
            os.remove('node_list')
예제 #8
0
def test_get_local_nodelist_central_mode():
    """Check how eight nodes from a Slurm-style list are shared out in central mode.

    The nodelist comes from the ``LIBE_RESOURCES_TEST_NODE_LIST`` env variable,
    and each worker count below is checked for every worker ID in turn.
    """
    os.environ["LIBE_RESOURCES_TEST_NODE_LIST"] = "knl-[0020-0022,0036,0137-0139,1234]"
    resources = Resources(nodelist_env_slurm="LIBE_RESOURCES_TEST_NODE_LIST",
                          central_mode=True)

    all_nodes = ['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036',
                 'knl-0137', 'knl-0138', 'knl-0139', 'knl-1234']

    # Expected per-worker nodelists, in the order the worker counts are tried.
    partitions = [
        # 8 workers: one node each
        [[name] for name in all_nodes],
        # 4 workers: two nodes each
        [all_nodes[0:2], all_nodes[2:4], all_nodes[4:6], all_nodes[6:8]],
        # 1 worker: everything
        [list(all_nodes)],
        # 3 workers: exercises the best_split algorithm (3 + 3 + 2)
        [all_nodes[0:3], all_nodes[3:6], all_nodes[6:8]],
    ]

    # Spoof current process as each worker and check nodelist.
    for per_worker in partitions:
        num_workers = len(per_worker)
        for workerID, wanted in enumerate(per_worker, start=1):
            got = WorkerResources.get_local_nodelist(num_workers, workerID, resources)
            assert got == wanted, "local_nodelist returned does not match expected"
예제 #9
0
def test_get_local_nodelist_distrib_mode_host_not_in_list():
    """Distributed mode should work when the nodelist is made up of ``knl-*`` names only.

    With four workers over the eight-node list, worker 2 is expected to get
    ``knl-0022`` and ``knl-0036``.
    """
    node_var = "LIBE_RESOURCES_TEST_NODE_LIST"
    os.environ[node_var] = "knl-[0020-0022,0036,0137-0139,1234]"
    resources = Resources(nodelist_env_slurm=node_var, central_mode=False)

    # Spoof the current process as worker 2 of 4.
    total_workers, this_worker = 4, 2
    assigned = WorkerResources.get_local_nodelist(total_workers, this_worker, resources)

    # Running distributed mode without the current host in the list should now work.
    assert assigned == ['knl-0022', 'knl-0036'], "local_nodelist returned does not match expected"
예제 #10
0
def test_worker_resources():
    """Check the attributes of ``WorkerResources`` objects for three worker layouts.

    Eight nodes are supplied through a Slurm-style env variable in central
    mode. For each layout a ``Fake_comm`` sized to the worker count is created
    and every worker ID is constructed and inspected.
    """
    os.environ["LIBE_RESOURCES_TEST_NODE_LIST"] = "knl-[0020-0022,0036,0137-0139,1234]"
    resources = Resources(nodelist_env_slurm="LIBE_RESOURCES_TEST_NODE_LIST",
                          central_mode=True)

    single_node_lists = [['knl-0020'], ['knl-0021'], ['knl-0022'], ['knl-0036'],
                         ['knl-0137'], ['knl-0138'], ['knl-0139'], ['knl-1234']]
    four_node_lists = [['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036'],
                       ['knl-0137', 'knl-0138', 'knl-0139', 'knl-1234']]

    # (worker count, expected nodelist per worker, expected node count, expected workers per node)
    layouts = [
        # One worker per node
        (8, single_node_lists, 1, 1),
        # Multiple nodes per worker
        (2, four_node_lists, 4, 1),
        # Multiple workers per node: consecutive worker pairs share a node
        (16, [single_node_lists[idx // 2] for idx in range(16)], 1, 2),
    ]

    for worker_total, nodelists, node_count, per_node in layouts:
        comm = Fake_comm(worker_total)
        for workerID, wanted_nodes in enumerate(nodelists, start=1):
            worker = WorkerResources(workerID, comm, resources)
            assert worker.num_workers == worker_total, 'worker.num_workers does not match'
            assert worker.workerID == workerID, 'worker.workerID does not match'
            assert worker.local_nodelist == wanted_nodes, 'worker.local_nodelist does not match'
            assert worker.local_node_count == node_count, 'worker.local_node_count does not match'
            assert worker.workers_per_node == per_node, 'worker.workers_per_node does not match'
예제 #11
0
def test_get_local_nodelist_distrib_mode_uneven_split():
    """Check distributed-mode assignment when nodes do not divide evenly among workers.

    A ``node_list`` file is written with eight ``knl-*`` nodes and the current
    hostname inserted after the fifth entry, giving nine nodes for two workers.
    Worker 2 is expected to receive the last four entries, with the current
    host in second position rather than at the head.

    The ``node_list`` file is removed even if the assertion fails.
    """
    mynode = socket.gethostname()
    exp_node = mynode  # sname(mynode)
    nodelist_in = ['knl-0020', 'knl-0021', 'knl-0022', 'knl-0036', 'knl-0137', 'knl-0138', 'knl-0139', 'knl-1234']

    try:
        with open('node_list', 'w') as f:
            for i, node in enumerate(nodelist_in):
                f.write(node + '\n')
                if i == 4:
                    f.write(mynode + '\n')

        resources = Resources(central_mode=False)
        num_workers = 2

        # May not be at head of list - should perhaps be warning or enforced
        workerID = 2
        exp_out = ['knl-0137', exp_node, 'knl-0138', 'knl-0139']
        local_nodelist = WorkerResources.get_local_nodelist(num_workers, workerID, resources)
        assert local_nodelist == exp_out, "local_nodelist returned does not match expected"
    finally:
        # Guarded so a failure before the file exists is not masked by the cleanup.
        if os.path.isfile('node_list'):
            os.remove('node_list')
예제 #12
0
 def set_worker_info(self, comm, workerid=None):
     """Record this executor's worker ID and set up its worker resources.

     A default ``Resources`` object is created first when the executor has
     none; ``set_worker_resources`` is then called on it with the worker ID
     and ``comm``.
     """
     self.workerID = workerid
     no_resources_yet = not self.resources
     if no_resources_yet:
         self.resources = Resources()
     worker_resources_owner = self.resources
     worker_resources_owner.set_worker_resources(self.workerID, comm)
예제 #13
0
class SerialExecutor(Executor):
    """Executor whose ``submit`` runs a registered application's command line as-is.

    The runline is built from the application's ``full_path`` plus optional
    ``app_args``; nothing else is prepended here. Launching is delegated to
    ``MPIExecutor._launch_with_retries``.
    """

    def __init__(self):
        """Initialise the base executor and the launch-retry settings."""
        super().__init__()
        # NOTE(review): these look like the settings read by
        # MPIExecutor._launch_with_retries (attempt limit and timing values) -
        # confirm against that implementation.
        self.max_launch_attempts = 5
        self.fail_time = 2
        self.retry_delay_incr = 5

    def _launch_with_retries(self, *args, **kwargs):
        """Delegate to ``MPIExecutor._launch_with_retries`` with this executor as ``self``.

        Storing the function taken from the class directly on the instance
        does not bind it, so a call through the instance would pass the task
        in the ``self`` position. Delegating from a real method keeps the
        arguments aligned.

        NOTE(review): assumes ``MPIExecutor._launch_with_retries`` is a regular
        instance method (first parameter ``self``) - confirm.
        """
        return MPIExecutor._launch_with_retries(self, *args, **kwargs)

    def add_comm_info(self, libE_nodes, serial_setup):
        """Adds comm-specific information to executor.

        Forwards ``libE_nodes`` to ``self.resources.add_comm_info``.

        Args:
            libE_nodes: passed through as the ``libE_nodes`` keyword argument.
            serial_setup: currently unused by this executor.
        """
        self.resources.add_comm_info(libE_nodes=libE_nodes)

    def set_worker_info(self, comm, workerid=None):
        """Sets info for this executor.

        Stores ``workerid`` as ``self.workerID``, creates a default
        ``Resources`` object if the executor has none, and then calls
        ``set_worker_resources`` with the worker ID and ``comm``.
        """
        self.workerID = workerid
        if not self.resources:
            self.resources = Resources()
        self.resources.set_worker_resources(self.workerID, comm)

    def submit(self,
               calc_type=None,
               app_name=None,
               app_args=None,
               stdout=None,
               stderr=None,
               dry_run=False,
               wait_on_run=False):
        """Create a task for a registered application and launch it.

        Args:
            calc_type: used to look up the default application when
                ``app_name`` is not given.
            app_name: name of a registered application; takes precedence
                over ``calc_type``.
            app_args: optional string of arguments; split on whitespace and
                appended to the runline.
            stdout: passed to the ``Task`` constructor.
            stderr: passed to the ``Task`` constructor.
            dry_run: when true, the runline is logged and the task is marked
                complete without launching anything.
            wait_on_run: forwarded to the launcher.

        Returns:
            The ``Task`` created for this submission (also appended to
            ``self.list_of_tasks``).

        Raises:
            ExecutorException: if neither ``app_name`` nor ``calc_type`` is set.
        """
        if app_name is not None:
            app = self.get_app(app_name)
        elif calc_type is not None:
            app = self.default_app(calc_type)
        else:
            raise ExecutorException("Either app_name or calc_type must be set")

        default_workdir = os.getcwd()
        task = Task(app, app_args, default_workdir, stdout, stderr,
                    self.workerID)

        # The runline is the application's command line plus any arguments.
        runline = task.app.full_path.split()
        if task.app_args is not None:
            runline.extend(task.app_args.split())
        task.runline = ' '.join(runline)  # Allow to be queried

        if dry_run:
            task.dry_run = True
            logger.info('Test (No submit) Runline: %s', task.runline)
            task._set_complete(dry_run=True)
        else:
            # Launch Task
            self._launch_with_retries(task,
                                      runline,
                                      subgroup_launch=False,
                                      wait_on_run=wait_on_run)
            # Only start timing here if the timer is not already running.
            if not task.timer.timing:
                task.timer.start()
                task.submit_time = task.timer.tstart  # Time not date - may not need if using timer.

        self.list_of_tasks.append(task)
        return task