def test_slurmenv_api_resources():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'],
                         {'wdir': str(tmpdir)})

        api_res = m.resources()

        assert all(('total_nodes' in api_res, 'total_cores' in api_res))
        assert all((api_res['total_nodes'] == resources.total_nodes,
                    api_res['total_cores'] == resources.total_cores))

        aux_dir = find_single_aux_dir(str(tmpdir))

        assert all((exists(join(tmpdir, '.qcgpjm-client', 'api.log')),
                    exists(join(aux_dir, 'service.log'))))
    finally:
        if m:
            m.finish()
            # stopManager uses the 'terminate' method on the service process, which is not
            # the best option when running under pytest and gathering code coverage
            # m.stopManager()
            m.cleanup()

    rmtree(tmpdir)


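# Hypothetical helper, not used by the tests in this module: most tests below repeat
# the same lifecycle (create a LocalManager in a fresh shared tmpdir, assert, then
# finish and cleanup the manager and remove the directory). A sketch like this could
# factor that pattern out; it assumes only the LocalManager API already exercised above.
from contextlib import contextmanager

@contextmanager
def managed_local_manager():
    """Yield a (manager, tmpdir) pair and guarantee teardown, mirroring the try/finally blocks below."""
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))
    manager = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'],
                           {'wdir': str(tmpdir)})
    try:
        yield manager, tmpdir
    finally:
        manager.finish()
        manager.cleanup()
        rmtree(tmpdir)

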
def test_slurmenv_simple_resources():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    get_slurm_resources()


def test_slurmenv_api_submit_simple():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'],
                         {'wdir': str(tmpdir)})

        jobs = Jobs(). \
            add_std({'name': 'host',
                     'execution': {
                         'exec': '/bin/hostname',
                         'args': ['--fqdn'],
                         'stdout': 'std.out',
                         'stderr': 'std.err'
                     }})
        assert submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
    finally:
        if m:
            m.finish()
            # m.stopManager()
            m.cleanup()

    rmtree(tmpdir)


def test_slurmenv_api_submit_many_cores():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'],
                         {'wdir': str(tmpdir)})

        jobs = Jobs(). \
            add_std({'name': 'host',
                     'execution': {
                         'exec': '/bin/hostname',
                         'args': ['--fqdn'],
                         'stdout': 'out',
                     },
                     'resources': {
                         'numCores': {'exact': resources.total_cores}
                     }})
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')

        # check that the job's working directory is inside the working directory of the service
        assert tmpdir == jinfos['host'].wdir, str(jinfos['host'].wdir)
        assert all((len(jinfos['host'].nodes) == resources.total_nodes,
                    jinfos['host'].total_cores == resources.total_cores)), str(jinfos['host'])
    finally:
        if m:
            m.finish()
            # m.stopManager()
            m.cleanup()

    rmtree(tmpdir)


def test_slurmenv_api_cancel_kill_nl():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))
    print(f'tmpdir: {tmpdir}')

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'],
                         {'wdir': str(tmpdir)})

        iters = 10
        ids = m.submit(Jobs().
                       add(script='trap "" SIGTERM; sleep 30s', iteration=iters,
                           stdout='sleep.out.${it}', stderr='sleep.err.${it}', numCores=1))
        jid = ids[0]
        assert len(m.list()) == 1

        list_jid = list(m.list().keys())[0]
        assert list_jid == jid

        # wait for the job to start executing
        sleep(2)

        m.cancel([jid])

        # wait for the SIGTERM phase of job cancellation
        sleep(2)

        jinfos = m.info_parsed(ids)
        assert all((len(jinfos) == 1, jid in jinfos, jinfos[jid].status == 'QUEUED'))

        # wait for the SIGKILL phase of job cancellation (~ExecutionJob.SIG_KILL_TIMEOUT)
        sleep(ExecutionJob.SIG_KILL_TIMEOUT)

        jinfos = m.info_parsed(ids, withChilds=True)
        assert all((len(jinfos) == 1, jid in jinfos, jinfos[jid].status == 'CANCELED'))

        # canceled iterations are counted in the 'failed' entry of the job statistics;
        # the CANCELED status itself is reported in the 'childs/state' entries
        assert all((jinfos[jid].iterations,
                    jinfos[jid].iterations.get('start', -1) == 0,
                    jinfos[jid].iterations.get('stop', 0) == iters,
                    jinfos[jid].iterations.get('total', 0) == iters,
                    jinfos[jid].iterations.get('finished', 0) == iters,
                    jinfos[jid].iterations.get('failed', -1) == iters))
        assert len(jinfos[jid].childs) == iters

        for iteration in range(iters):
            job_it = jinfos[jid].childs[iteration]
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jid, iteration),
                        job_it.status == 'CANCELED')), str(job_it)

        m.remove(jid)
    finally:
        if m:
            m.finish()
            m.cleanup()


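# Hypothetical helper, not part of this suite: the cancel test above depends on fixed
# sleep() intervals, which can be flaky on loaded systems. A polling sketch like this
# could wait for a target status instead; it assumes only the manager's info_parsed()
# call already used above.
import time as _time

def _wait_for_job_status(manager, jid, status, timeout=60, interval=1):
    """Poll the manager until job `jid` reaches `status`, failing after `timeout` seconds."""
    deadline = _time.monotonic() + timeout
    while _time.monotonic() < deadline:
        jinfo = manager.info_parsed([jid])[jid]
        if jinfo.status == status:
            return jinfo
        _time.sleep(interval)
    raise TimeoutError('job {} did not reach state {} within {}s'.format(jid, status, timeout))

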
def test_slurmenv_api_iteration_simple():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'],
                         {'wdir': str(tmpdir)})

        its = 2
        jobs = Jobs(). \
            add_std({'name': 'host',
                     'iteration': {'stop': its},
                     'execution': {
                         'exec': 'hostname',
                         'args': ['--fqdn'],
                         'stdout': 'out'
                     },
                     'resources': {'numCores': {'exact': 1}}})
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
        assert jinfos
        jinfo = jinfos['host']
        print('jinfo: {}'.format(jinfo))
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0))

        its = 2
        jobs = Jobs(). \
            add_std({'name': 'host2',
                     'iteration': {'stop': its},
                     'execution': {
                         'exec': 'hostname',
                         'args': ['--fqdn'],
                         'stdout': 'out'
                     },
                     'resources': {'numCores': {'exact': 1}}})
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos['host2']
        print('jinfo: {}'.format(jinfo))
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0))

        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format('host2', iteration),
                        job_it.wdir == tmpdir,
                        job_it.total_cores == 1))
    finally:
        if m:
            m.finish()
            # m.stopManager()
            m.cleanup()

    rmtree(tmpdir)


def test_slurmenv_api_submit_exceed_total_cores():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'],
                         {'wdir': str(tmpdir)})

        jobs = Jobs(). \
            add_std({'name': 'date',
                     'execution': {'exec': '/bin/date'},
                     'resources': {'numCores': {'exact': resources.total_cores + 1}}})
        with pytest.raises(ConnectionError, match=r".*Not enough resources.*"):
            m.submit(jobs)
        assert len(m.list()) == 0

        jobs = Jobs(). \
            add_std({'name': 'date',
                     'execution': {'exec': '/bin/date'},
                     'resources': {'numNodes': {'exact': resources.total_nodes + 1}}})
        with pytest.raises(ConnectionError, match=r".*Not enough resources.*"):
            m.submit(jobs)
        assert len(m.list()) == 0

        jobs = Jobs(). \
            add_std({'name': 'date',
                     'execution': {
                         'exec': '/bin/date',
                         'stdout': 'std.out',
                     },
                     'resources': {'numCores': {'exact': resources.total_cores}}})
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
        assert jinfos['date'].total_cores == resources.total_cores
    finally:
        if m:
            m.finish()
            # m.stopManager()
            m.cleanup()

    rmtree(tmpdir)


def test_slurmenv_simple_resources_binding():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    print('SLURM_NODELIST: {}'.format(environ.get('SLURM_NODELIST', None)))
    print('SLURM_JOB_CPUS_PER_NODE: {}'.format(environ.get('SLURM_JOB_CPUS_PER_NODE', None)))
    print('SLURM_CPU_BIND_LIST: {}'.format(environ.get('SLURM_CPU_BIND_LIST', None)))
    print('SLURM_CPU_BIND_TYPE: {}'.format(environ.get('SLURM_CPU_BIND_TYPE', None)))
    print('CUDA_VISIBLE_DEVICES: {}'.format(environ.get('CUDA_VISIBLE_DEVICES', None)))

    get_slurm_resources_binded()


def test_slurmenv_api_std_streams_many_cores():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'],
                         {'wdir': str(tmpdir)})

        jobs = Jobs(). \
            add_std({'name': 'host',
                     'execution': {
                         'exec': 'cat',
                         'stdin': '/etc/system-release',
                         'stdout': 'out',
                         'stderr': 'err'
                     },
                     'resources': {'numCores': {'exact': 2}}})
        assert submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')

        assert all((exists(join(tmpdir, 'out')), exists(join(tmpdir, 'err'))))

        with open(join(tmpdir, 'out'), 'rt') as out_f:
            out = out_f.read()
        with open('/etc/system-release', 'rt') as sr_f:
            system_release = sr_f.read()

        assert system_release in out
    finally:
        if m:
            m.finish()
            # m.stopManager()
            m.cleanup()

    rmtree(tmpdir)


def test_slurm_partition_resources():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()
    set_pythonpath_to_qcg_module()

    governor_dir = tempfile.mkdtemp(dir=SHARED_PATH)
    print('governor work dir: {}'.format(governor_dir))

    governor_id = 'm-governor'
    governor_args = ['--log', 'debug', '--wd', governor_dir, '--report-format', 'json',
                     '--id', governor_id, '--system-core', '--slurm-partition-nodes', '1']
    governor_process, governor_address = fork_manager(governor_args)
    time.sleep(10)

    try:
        resources_reply = send_request_valid(governor_address, {'request': 'resourcesInfo'})
        print('got total resource reply: {}'.format(str(resources_reply)))
        assert all((resources_reply['code'] == 0, 'data' in resources_reply))

        # one core is reserved for each partition manager (one manager per node), so the
        # number of used cores equals the number of partition managers, i.e. the number of nodes
        assert all((resources_reply['data'].get('total_nodes', 0) == resources.total_nodes,
                    resources_reply['data'].get('total_cores', 0) == resources.total_cores,
                    resources_reply['data'].get('used_cores', -1) == resources.total_nodes,
                    resources_reply['data'].get('free_cores', 0) == resources.total_cores - resources.total_nodes)), \
            str(resources_reply)

        send_request_valid(governor_address, {'request': 'finish'})
        governor_process.join(30)
        assert governor_process.exitcode == 0
    finally:
        if governor_process:
            governor_process.terminate()

    rmtree(governor_dir)


def test_slurmenv_launcher_agents():
    pytest.skip("somehow this test doesn't properly close the event loop")

    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    # with tempfile.TemporaryDirectory(dir=SHARED_PATH) as tmpdir:
    set_pythonpath_to_qcg_module()
    tmpdir = tempfile.mkdtemp(dir=SHARED_PATH)
    print('tmpdir: {}'.format(tmpdir))

    try:
        auxdir = join(tmpdir, 'qcg')
        print('aux directory set to: {}'.format(auxdir))
        mkdir(auxdir)

        if asyncio.get_event_loop() and asyncio.get_event_loop().is_closed():
            asyncio.set_event_loop(asyncio.new_event_loop())

        LauncherExecutionJob.start_agents(tmpdir, auxdir, resources.nodes, resources.binding)

        try:
            assert len(LauncherExecutionJob.launcher.agents) == resources.total_nodes

            # there should be exactly one launcher per node, so check against the
            # 'node' entry in agent['data']['slurm']
            node_names = set(node.name for node in resources.nodes)
            for agent_name, agent in LauncherExecutionJob.launcher.agents.items():
                print('found agent {}: {}'.format(agent_name, str(agent)))
                assert all(('process' in agent, 'data' in agent,
                            'options' in agent.get('data', {}))), str(agent)
                assert all(('slurm' in agent['data'],
                            'node' in agent.get('data', {}).get('slurm', {}))), str(agent)
                assert agent['data']['slurm']['node'] in node_names
                assert all(('binding' in agent['data']['options'],
                            agent['data']['options']['binding'] is True)), str(agent)
                node_names.remove(agent['data']['slurm']['node'])

            assert len(node_names) == 0

            # launching the agents once more should raise an exception
            with pytest.raises(Exception):
                LauncherExecutionJob.start_agents(tmpdir, auxdir, resources.nodes, resources.binding)
        finally:
            asyncio.get_event_loop().run_until_complete(
                asyncio.ensure_future(LauncherExecutionJob.stop_agents()))
            time.sleep(1)
            asyncio.get_event_loop().close()
    finally:
        rmtree(tmpdir)


def test_slurmenv_simple_job():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()
    resources_node_names = set(n.name for n in resources.nodes)

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))
    file_path = join(tmpdir, 'jobs.json')
    print('tmpdir: {}'.format(tmpdir))

    jobName = 'mdate'
    jobs = [job.to_dict() for job in [
        Job(jobName,
            JobExecution('date',
                         wd=abspath(join(tmpdir, 'date.sandbox')),
                         stdout='date.out',
                         stderr='date.err'),
            JobResources(numCores=ResourceSize(1)))]]
    reqs = [{'request': 'submit', 'jobs': jobs},
            {'request': 'control', 'command': 'finishAfterAllTasksDone'}]
    save_reqs_to_file(reqs, file_path)
    print('jobs saved to file_path: {}'.format(str(file_path)))

    sys.argv = ['QCG-PilotJob', '--log', 'debug', '--file', '--file-path', str(file_path),
                '--wd', tmpdir, '--report-format', 'json']
    QCGPMService().start()

    jobEntries = check_job_status_in_json([jobName], workdir=tmpdir, dest_state='SUCCEED')
    assert all((isdir(abspath(join(tmpdir, 'date.sandbox'))),
                exists(join(abspath(join(tmpdir, 'date.sandbox')), 'date.out')),
                exists(join(abspath(join(tmpdir, 'date.sandbox')), 'date.err')),
                stat(join(abspath(join(tmpdir, 'date.sandbox')), 'date.out')).st_size > 0))
    # there can be some debugging messages in stderr, so its size is not checked
    # stat(join(abspath(join(tmpdir, 'date.sandbox')), 'date.err')).st_size == 0))

    for jname, jentry in jobEntries.items():
        assert all(('runtime' in jentry, 'allocation' in jentry.get('runtime', {})))

        jalloc = jentry['runtime']['allocation']
        for jalloc_node in jalloc.split(','):
            node_name = jalloc_node[:jalloc_node.index('[')]
            print('{} in available nodes ({})'.format(node_name, ','.join(resources_node_names)))
            assert node_name in resources_node_names, '{} not in nodes ({})'.format(
                node_name, ','.join(resources_node_names))

    with pytest.raises(ValueError):
        check_job_status_in_json([jobName + 'xxx'], workdir=tmpdir, dest_state='SUCCEED')

    rmtree(tmpdir)


def test_slurmenv_many_nodes_many_cores():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()
    resources_node_names = set(n.name for n in resources.nodes)

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))
    file_path = join(tmpdir, 'jobs.json')
    print('tmpdir: {}'.format(tmpdir))

    jobName = 'hostname'
    jobwdir_base = 'hostname.sandbox'
    cores_num = resources.nodes[0].free
    nodes_num = resources.total_nodes
    jobs = [job.to_dict() for job in [
        Job(jobName,
            JobExecution(exec='mpirun',
                         args=['--allow-run-as-root', 'hostname'],
                         wd=abspath(join(tmpdir, jobwdir_base)),
                         stdout='hostname.out',
                         stderr='hostname.err',
                         modules=['mpi/openmpi-x86_64']),
            JobResources(numCores=ResourceSize(cores_num), numNodes=ResourceSize(nodes_num)))]]
    reqs = [{'request': 'submit', 'jobs': jobs},
            {'request': 'control', 'command': 'finishAfterAllTasksDone'}]
    save_reqs_to_file(reqs, file_path)
    print('jobs saved to file_path: {}'.format(str(file_path)))

    sys.argv = ['QCG-PilotJob', '--log', 'debug', '--file', '--file-path', str(file_path),
                '--wd', tmpdir, '--report-format', 'json']
    QCGPMService().start()

    jobEntries = check_job_status_in_json([jobName], workdir=tmpdir, dest_state='SUCCEED')
    assert all((isdir(abspath(join(tmpdir, jobwdir_base))),
                exists(join(abspath(join(tmpdir, jobwdir_base)), 'hostname.out')),
                exists(join(abspath(join(tmpdir, jobwdir_base)), 'hostname.err')),
                stat(join(abspath(join(tmpdir, jobwdir_base)), 'hostname.out')).st_size > 0))

    job_nodes = []
    allocated_cores = 0
    for jname, jentry in jobEntries.items():
        assert all(('runtime' in jentry, 'allocation' in jentry.get('runtime', {})))

        jalloc = jentry['runtime']['allocation']
        for jalloc_node in jalloc.split(','):
            node_name = jalloc_node[:jalloc_node.index('[')]
            job_nodes.append(node_name)
            print('{} in available nodes ({})'.format(node_name, ','.join(resources_node_names)))
            assert node_name in resources_node_names, '{} not in nodes ({})'.format(
                node_name, ','.join(resources_node_names))

            ncores = len(jalloc_node[jalloc_node.index('[') + 1:-1].split(':'))
            print('#{} cores on node {}'.format(ncores, node_name))
            allocated_cores += ncores

    assert len(job_nodes) == nodes_num, str(job_nodes)
    assert allocated_cores == nodes_num * cores_num, allocated_cores

    # check that stdout contains one hostname line per launched process
    with open(abspath(join(tmpdir, join(jobwdir_base, 'hostname.out'))), 'rt') as stdout_file:
        stdout_content = [line.rstrip() for line in stdout_file.readlines()]
    assert len(stdout_content) == nodes_num * cores_num, str(stdout_content)
    assert all(hostname in job_nodes for hostname in stdout_content), str(stdout_content)

    with pytest.raises(ValueError):
        check_job_status_in_json([jobName + 'xxx'], workdir=tmpdir, dest_state='SUCCEED')

    rmtree(tmpdir)


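# The two tests above parse the job's 'runtime/allocation' entry, which has the form
# 'node1[c0:c1:...],node2[...]': one comma-separated group per node, with the node name
# followed by the allocated core identifiers in brackets. A small hypothetical helper
# (not used by the suite) capturing that parsing:
def _parse_allocation(jalloc):
    """Return a {node_name: allocated_core_count} dict for a job allocation string."""
    nodes = {}
    for node_alloc in jalloc.split(','):
        bracket = node_alloc.index('[')
        cores = node_alloc[bracket + 1:-1].split(':')
        nodes[node_alloc[:bracket]] = len(cores)
    return nodes

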
def test_slurmenv_api_iteration_node_scheduling():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    # TODO: it's hard to write comprehensive iteration node scheduling tests on only
    # two nodes (in slurm's development docker)

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'],
                         {'wdir': str(tmpdir)})

        # in this case 'split-into' defaults to the number of iterations, so the total
        # available resources should be split into two partitions and each iteration
        # should run in its own partition
        jname = 'host'
        its = 2
        jobs = Jobs(). \
            add_std({'name': jname,
                     'iteration': {'stop': its},
                     'execution': {
                         'exec': 'sleep',
                         'args': ['2s'],
                         'stdout': 'out_${it}',
                         'stderr': 'err_${it}'
                     },
                     'resources': {
                         'numCores': {'exact': resources.nodes[0].total},
                         'numNodes': {'min': 1,
                                      'scheduler': {'name': 'split-into'}}
                     }})
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores == resources.nodes[0].total,
                        len(job_it.nodes) == 1)), str(job_it)

        # all iterations have been scheduled across all nodes
        assert sum([len(child.nodes) for child in jinfo.childs]) == resources.total_nodes
        # the iterations should execute on different nodes
        assert list(jinfo.childs[0].nodes)[0] != list(jinfo.childs[1].nodes)[0]

        # explicitly setting the 'split-into' parameter to 2 should give the same
        # behavior as in the previous example
        jname = 'host2'
        its = 2
        jobs = Jobs(). \
            add_std({'name': jname,
                     'iteration': {'stop': its},
                     'execution': {
                         'exec': 'sleep',
                         'args': ['2s'],
                         'stdout': 'out'
                     },
                     'resources': {
                         'numCores': {'exact': resources.nodes[0].total},
                         'numNodes': {'min': 1,
                                      'scheduler': {'name': 'split-into',
                                                    'params': {'parts': 2}}}
                     }})
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores == resources.nodes[0].total,
                        len(job_it.nodes) == 1)), str(job_it)

        # all iterations have been scheduled across all nodes
        assert sum([len(child.nodes) for child in jinfo.childs]) == resources.total_nodes
        # the iterations should execute on different nodes
        assert list(jinfo.childs[0].nodes)[0] != list(jinfo.childs[1].nodes)[0]

        # the 'maximum-iters' scheduler tries to launch as many iterations as possible
        # at the same time on all available resources
        jname = 'host3'
        its = 4
        jobs = Jobs(). \
            add_std({'name': jname,
                     'iteration': {'stop': its},
                     'execution': {
                         'exec': 'sleep',
                         'args': ['2s'],
                         'stdout': 'out'
                     },
                     'resources': {
                         'numCores': {'exact': resources.nodes[0].total},
                         'numNodes': {'min': 1,
                                      'scheduler': {'name': 'maximum-iters'}}
                     }})
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores == resources.nodes[0].total,
                        len(job_it.nodes) == 1)), str(job_it)

        assert sum([len(child.nodes) for child in jinfo.childs]) == its
    finally:
        if m:
            m.finish()
            # m.stopManager()
            m.cleanup()

    rmtree(tmpdir)


def test_slurmenv_api_iteration_core_scheduling():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'],
                         {'wdir': str(tmpdir)})

        # in this case 'split-into' defaults to the number of iterations, so the total
        # available resources should be split into two partitions and each iteration
        # should run in its own partition
        jname = 'host'
        its = 2
        jobs = Jobs(). \
            add_std({'name': jname,
                     'iteration': {'stop': its},
                     'execution': {
                         'exec': 'hostname',
                         'args': ['--fqdn'],
                         'stdout': 'out'
                     },
                     'resources': {
                         'numCores': {'min': 1,
                                      'scheduler': {'name': 'split-into'}}
                     }})
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1,
                        job_it.total_cores < resources.total_cores)), str(job_it)

        # all iterations have been scheduled across all resources
        assert sum([child.total_cores for child in jinfo.childs]) == resources.total_cores
        assert all(child.total_cores == resources.total_cores / its for child in jinfo.childs)

        # explicitly setting the 'split-into' parameter to 2 should give the same
        # behavior as in the previous example
        jname = 'host2'
        its = 2
        jobs = Jobs(). \
            add_std({'name': jname,
                     'iteration': {'stop': its},
                     'execution': {
                         'exec': 'hostname',
                         'args': ['--fqdn'],
                         'stdout': 'out'
                     },
                     'resources': {
                         'numCores': {'min': 1,
                                      'scheduler': {'name': 'split-into',
                                                    'params': {'parts': 2}}}
                     }})
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1,
                        job_it.total_cores < resources.total_cores)), str(job_it)

        # all iterations have been scheduled across all resources
        assert sum([child.total_cores for child in jinfo.childs]) == resources.total_cores
        assert all(child.total_cores == resources.total_cores / 2 for child in jinfo.childs)

        # with 'split-into' set to 4, the two iterations should be scheduled on half
        # of the available resources
        jname = 'host3'
        its = 2
        jobs = Jobs(). \
            add_std({'name': jname,
                     'iteration': {'stop': its},
                     'execution': {
                         'exec': 'hostname',
                         'args': ['--fqdn'],
                         'stdout': 'out'
                     },
                     'resources': {
                         'numCores': {'min': 1,
                                      'scheduler': {'name': 'split-into',
                                                    'params': {'parts': 4}}}
                     }})
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1,
                        job_it.total_cores < resources.total_cores)), str(job_it)

        # the two iterations together occupy only half of the available resources
        assert sum([child.total_cores for child in jinfo.childs]) == resources.total_cores / 2
        assert all(child.total_cores == resources.total_cores / 4 for child in jinfo.childs)

        # 'split-into' is set to 2, but there are more iterations than concurrently
        # available partitions, so they should be executed serially (in parts)
        jname = 'host4'
        its = 10
        jobs = Jobs(). \
            add_std({'name': jname,
                     'iteration': {'stop': its},
                     'execution': {
                         'exec': 'hostname',
                         'args': ['--fqdn'],
                         'stdout': 'out'
                     },
                     'resources': {
                         'numCores': {'min': 1,
                                      'scheduler': {'name': 'split-into',
                                                    'params': {'parts': 2}}}
                     }})
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1,
                        job_it.total_cores < resources.total_cores)), str(job_it)

        assert all(child.total_cores == resources.total_cores / 2 for child in jinfo.childs)

        # the 'maximum-iters' scheduler tries to launch as many iterations as possible
        # at the same time on all available resources
        jname = 'host5'
        its = 2
        jobs = Jobs(). \
            add_std({'name': jname,
                     'iteration': {'stop': its},
                     'execution': {
                         'exec': 'sleep',
                         'args': ['2s'],
                         'stdout': 'out'
                     },
                     'resources': {
                         'numCores': {'min': 1,
                                      'scheduler': {'name': 'maximum-iters'}}
                     }})
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1,
                        job_it.total_cores < resources.total_cores)), str(job_it)

        assert sum([child.total_cores for child in jinfo.childs]) == resources.total_cores

        # with as many iterations as cores, 'maximum-iters' should still launch all
        # iterations at the same time, one core each
        jname = 'host6'
        its = resources.total_cores
        jobs = Jobs(). \
            add_std({'name': jname,
                     'iteration': {'stop': its},
                     'execution': {
                         'exec': 'sleep',
                         'args': ['2s'],
                         'stdout': 'out'
                     },
                     'resources': {
                         'numCores': {'min': 1,
                                      'scheduler': {'name': 'maximum-iters'}}
                     }})
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1,
                        job_it.total_cores < resources.total_cores)), str(job_it)

        assert sum([child.total_cores for child in jinfo.childs]) == resources.total_cores

        # when the number of iterations exceeds the available resources, the
        # 'maximum-iters' scheduler splits the iterations into 'steps', minimizing their
        # number, and allocates as many resources as possible for each iteration in a step
        jname = 'host7'
        its = resources.total_cores
        jobs = Jobs(). \
            add_std({'name': jname,
                     'iteration': {'stop': its},
                     'execution': {
                         'exec': 'sleep',
                         'args': ['2s'],
                         'stdout': 'out'
                     },
                     'resources': {
                         'numCores': {'min': 1,
                                      'scheduler': {'name': 'maximum-iters'}}
                     }})
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1,
                        job_it.total_cores < resources.total_cores)), str(job_it)

        assert all(child.total_cores == 1 for child in jinfo.childs)
        assert sum([child.total_cores for child in jinfo.childs]) == resources.total_cores

        # when the number of iterations exceeds the available resources, the
        # 'maximum-iters' scheduler splits the iterations into 'steps', minimizing their
        # number, and allocates as many resources as possible for each iteration in a step
        jname = 'host8'
        its = resources.total_cores * 2
        jobs = Jobs(). \
            add_std({'name': jname,
                     'iteration': {'stop': its},
                     'execution': {
                         'exec': 'sleep',
                         'args': ['2s'],
                         'stdout': 'out'
                     },
                     'resources': {
                         'numCores': {'min': 1,
                                      'scheduler': {'name': 'maximum-iters'}}
                     }})
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1,
                        job_it.total_cores < resources.total_cores)), str(job_it)

        assert all(child.total_cores == 1 for child in jinfo.childs)
        assert sum([child.total_cores for child in jinfo.childs]) == resources.total_cores * 2

        # when the number of iterations exceeds the available resources, the
        # 'maximum-iters' scheduler splits the iterations into 'steps', minimizing their
        # number, and allocates as many resources as possible for each iteration in a step
        jname = 'host9'
        its = resources.total_cores + 1
        jobs = Jobs(). \
            add_std({'name': jname,
                     'iteration': {'stop': its},
                     'execution': {
                         'exec': 'sleep',
                         'args': ['2s'],
                         'stdout': 'out'
                     },
                     'resources': {
                         'numCores': {'min': 1,
                                      'scheduler': {'name': 'maximum-iters'}}
                     }})
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1)), str(job_it)

        # the iterations are split into two 'steps', and the iterations assigned to each
        # step should use the maximum available resources; with this uneven split not
        # every iteration gets exactly one core, so only the aggregate is checked
        assert sum([child.total_cores for child in jinfo.childs]) == resources.total_cores * 2

        # two such iterations cannot fit on the resources at once, so all iterations
        # should be scheduled serially, each on all available resources
        jname = 'host10'
        its = resources.total_nodes
        jobs = Jobs(). \
            add_std({'name': jname,
                     'iteration': {'stop': its},
                     'execution': {
                         'exec': 'sleep',
                         'args': ['2s'],
                         'stdout': 'out'
                     },
                     'resources': {
                         'numCores': {'min': resources.total_cores - 1,
                                      'scheduler': {'name': 'maximum-iters'}}
                     }})
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores == resources.total_cores,
                        len(job_it.nodes) == resources.total_nodes)), str(job_it)
    finally:
        if m:
            m.finish()
            # m.stopManager()
            m.cleanup()

    rmtree(tmpdir)


def test_slurm_partition_submit():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()
    set_pythonpath_to_qcg_module()

    governor_dir = tempfile.mkdtemp(dir=SHARED_PATH)
    print('governor work dir: {}'.format(governor_dir))

    governor_id = 'm-governor'
    governor_args = ['--log', 'debug', '--wd', str(governor_dir), '--report-format', 'json',
                     '--id', governor_id, '--system-core', '--slurm-partition-nodes', '1']
    governor_process, governor_address = fork_manager(governor_args)
    time.sleep(10)

    try:
        resources_reply = send_request_valid(governor_address, {'request': 'resourcesInfo'})
        print('got total resource reply: {}'.format(str(resources_reply)))
        assert all((resources_reply['code'] == 0, 'data' in resources_reply))

        # one core is reserved for each partition manager (one manager per node), so the
        # number of used cores equals the number of partition managers, i.e. the number of nodes
        assert all((resources_reply['data'].get('total_nodes', 0) == resources.total_nodes,
                    resources_reply['data'].get('total_cores', 0) == resources.total_cores,
                    resources_reply['data'].get('used_cores', -1) == resources.total_nodes,
                    resources_reply['data'].get('free_cores', 0) == resources.total_cores - resources.total_nodes)), \
            str(resources_reply)

        # submit a bunch of jobs
        njobs = 100
        jname = 'date_iter'
        wdir_base = 'env.sandbox'
        submit_reply = send_request_valid(governor_address, {
            'request': 'submit',
            'jobs': [{
                'name': jname,
                'iteration': {'start': 0, 'stop': njobs},
                'execution': {
                    'exec': '/usr/bin/env',
                    'args': ['date'],
                    'wd': abspath(join(governor_dir, '{}_$${{it}}'.format(wdir_base))),
                    'stdout': 'env.stdout',
                    'stderr': 'env.stderr'
                },
                'resources': {'numCores': {'exact': 1}}
            }]
        })
        assert all((submit_reply['code'] == 0, 'data' in submit_reply))
        assert all((submit_reply['data'].get('submitted', 0) == 1,
                    len(submit_reply['data'].get('jobs', [])) == 1,
                    jname in submit_reply['data'].get('jobs', [])))

        status_reply = send_request_valid(governor_address, {
            'request': 'jobStatus', 'jobNames': [jname]})
        assert all((status_reply['code'] == 0, 'data' in status_reply))
        assert 'jobs' in status_reply['data']
        assert len(status_reply['data']['jobs']) == 1
        assert jname in status_reply['data']['jobs']

        jstatus = status_reply['data']['jobs'][jname]
        assert all((jstatus['status'] == 0,
                    jstatus['data']['jobName'] == jname,
                    jstatus['data']['status'] in [JobState.QUEUED.name, JobState.SUCCEED.name]))

        status_reply = send_request_valid(governor_address,
                                          {'request': 'control', 'command': 'finishAfterAllTasksDone'})
        assert status_reply['code'] == 0

        # wait up to 20 seconds for the governor to finish all jobs and exit
        governor_process.join(20)
        assert governor_process.exitcode == 0

        for i in range(njobs):
            job_wd = join(governor_dir, '{}_{}'.format(wdir_base, i))
            assert isdir(job_wd), job_wd

            # check stdout & stderr files
            assert all((isfile(join(job_wd, 'env.stdout')),
                        isfile(join(job_wd, 'env.stderr'))))
            assert all((getsize(join(job_wd, 'env.stdout')) > 0,
                        getsize(join(job_wd, 'env.stderr')) == 0))
    finally:
        if governor_process:
            governor_process.terminate()

    rmtree(str(governor_dir))


def test_slurmenv_api_submit_resource_ranges():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'],
                         {'wdir': str(tmpdir)})

        jobs = Jobs(). \
            add_std({'name': 'host',
                     'execution': {
                         'exec': '/bin/hostname',
                         'args': ['--fqdn'],
                         'stdout': 'out',
                     },
                     'resources': {'numCores': {'min': 1}}})

        # the job should fail because of the missing 'max' parameter
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'FAILED')
        jinfo = jinfos['host']
        assert "Both core's range boundaries (min, max) must be defined" in jinfo.messages, str(jinfo)

        jobs = Jobs(). \
            add_std({'name': 'host2',
                     'execution': {
                         'exec': '/bin/hostname',
                         'args': ['--fqdn'],
                         'stdout': 'out',
                     },
                     'resources': {
                         'numNodes': {'exact': 1},
                         'numCores': {'min': 1, 'max': resources.nodes[0].total + 1}
                     }})

        # the job should run on a single node (the first free one) with all of its cores
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
        jinfo = jinfos['host2']
        assert all((len(jinfo.nodes) == 1,
                    jinfo.total_cores == resources.nodes[0].total)), str(jinfo)

        jobs = Jobs(). \
            add_std({'name': 'host3',
                     'execution': {
                         'exec': '/bin/hostname',
                         'args': ['--fqdn'],
                         'stdout': 'out',
                     },
                     'resources': {
                         'numCores': {'min': 1, 'max': resources.nodes[0].total + 1}
                     }})

        # the job should run on at least two nodes, with the requested maximum of cores in total
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
        jinfo = jinfos['host3']
        assert all((len(jinfo.nodes) == 2,
                    jinfo.total_cores == resources.nodes[0].total + 1)), str(jinfo)
    finally:
        if m:
            m.finish()
            # m.stopManager()
            m.cleanup()

    rmtree(tmpdir)