def test_request_remove_job(tmpdir):
    """Validate serialization and server-side validation of 'removeJob' requests."""
    # raw removeJob request test: a request must survive a to_json/from_json round trip
    req = RemoveJobReq({'request': 'removeJob', 'jobNames': ['job1', 'job2']})
    req_clone = RemoveJobReq(json.loads(req.to_json()))
    assert req.to_json() == req_clone.to_json()

    m = LocalManager(['--wd', str(tmpdir), '--nodes', 2], {'wdir': str(tmpdir)})
    try:
        # missing 'jobNames' for jobInfo request
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong remove job request - missing job names.*"):
            m.send_request({'request': 'removeJob'})

        # wrong format of 'jobNames' element
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong remove job request - missing job names.*"):
            m.send_request({'request': 'removeJob', 'jobNames': 'not a list'})

        # wrong format of 'jobNames' element - empty list
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong remove job request - missing job names.*"):
            m.send_request({'request': 'removeJob', 'jobNames': []})
    finally:
        m.finish()
def test_resume_failed(tmpdir):
    """Starting a manager with an invalid '--resume' directory must raise ServiceError."""
    # resume path that does not exist at all
    non_existing_path = 'some-non-existing-directory'
    with pytest.raises(ServiceError, match=r".*Resume directory.*not exists or is not valid QCG-PilotJob auxiliary directory.*"):
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json', '--nodes', '4',
                          '--resume', non_existing_path], {'wdir': str(tmpdir)})

    # existing directory that is not a QCG-PilotJob auxiliary directory
    non_existing_path = join(tmpdir, 'non-pilotjob-dir')
    mkdir(non_existing_path)
    with pytest.raises(ServiceError, match=r".*Resume directory.*not exists or is not valid QCG-PilotJob auxiliary directory.*"):
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json', '--nodes', '4',
                          '--resume', non_existing_path], {'wdir': str(tmpdir)})
def __init__(self, wd=".", resources=None, reserve_core=False, enable_rt_stats=False, wrapper_rt_stats=None,
             log_level='info', *other_args):
    """Start the QCG-PilotJob manager (service part) for this executor.

    Parameters
    ----------
    wd : str, optional
        Working directory where the QCG-PilotJob manager should be started.
    resources : str, optional
        Resources description (QCG-PilotJob NODES format); forces Local mode.
    reserve_core : bool, optional
        If True, reserves a core for the QCG-PilotJob Manager instance.
    enable_rt_stats : bool, optional
        If True, the manager collects its runtime statistics.
    wrapper_rt_stats : str, optional
        Path to the tasks wrapper program used for statistics collection.
    log_level : str, optional
        Logging level for both the service and the client part.
    *other_args : optional
        Additional command-line arguments for the QCG-PilotJob Manager.
    """
    self.finished = False

    # ---- QCG PILOT JOB INITIALISATION ---

    # Establish logging levels
    service_log_level, client_log_level = self._setup_qcgpj_logging(log_level)

    # Prepare input arguments for QCG-PJM
    args = ['--log', service_log_level, '--wd', wd]

    if resources:
        args.append('--nodes')
        args.append(str(resources))

    if reserve_core:
        args.append('--system-core')

    if enable_rt_stats:
        args.append('--enable-rt-stats')

    if wrapper_rt_stats:
        args.append('--wrapper-rt-stats')
        args.append(wrapper_rt_stats)

    if other_args:
        # BUG FIX: extend the argv list with the individual extra arguments;
        # the previous append() injected the whole tuple as a single
        # non-string element, breaking argument parsing in the manager.
        args.extend(other_args)

    client_conf = {
        'log_file': wd + '/api.log',
        'log_level': client_log_level
    }

    _logger.info(f'Starting QCG-PJ Manager with arguments: {args}')

    # create QCGPJ Manager (service part)
    self._qcgpjm = LocalManager(args, client_conf)
def test_slurmenv_api_resources():
    """Resources reported through the API must match the slurm allocation."""
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    # FIX: pre-bind 'm' so the 'if m:' guard in 'finally' cannot raise
    # NameError (masking the real failure) when LocalManager() itself raises
    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        api_res = m.resources()

        assert all(('total_nodes' in api_res, 'total_cores' in api_res))
        assert all((api_res['total_nodes'] == resources.total_nodes,
                    api_res['total_cores'] == resources.total_cores))

        aux_dir = find_single_aux_dir(str(tmpdir))

        assert all((exists(join(tmpdir, '.qcgpjm-client', 'api.log')),
                    exists(join(aux_dir, 'service.log'))))
    finally:
        if m:
            m.finish()
            # stopManager is using 'terminate' method on service process, which is not a best option when using
            # pytest and gathering code coverage
            # m.stopManager()
            m.cleanup()
        rmtree(tmpdir)
def test_slurmenv_api_submit_many_cores():
    """A job requesting all allocation cores must span all nodes of the allocation."""
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    # FIX: pre-bind 'm' so the 'if m:' guard in 'finally' cannot raise
    # NameError when LocalManager() itself raises
    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        jobs = Jobs(). \
            add_std({
                'name': 'host',
                'execution': {
                    'exec': '/bin/hostname',
                    'args': ['--fqdn'],
                    'stdout': 'out',
                },
                'resources': {'numCores': {'exact': resources.total_cores}}
            })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')

        # check working directories of job's inside working directory of service
        assert tmpdir == jinfos['host'].wdir, str(jinfos['host'].wdir)
        assert all((len(jinfos['host'].nodes) == resources.total_nodes,
                    jinfos['host'].total_cores == resources.total_cores)), str(jinfos['host'])
    finally:
        if m:
            m.finish()
            # m.stopManager()
            m.cleanup()
        rmtree(tmpdir)
def test_request_general(tmpdir):
    """Malformed or unknown raw requests must be rejected with ConnectionError."""
    manager = LocalManager(['--wd', str(tmpdir), '--nodes', 2], {'wdir': str(tmpdir)})
    try:
        invalid_requests = (
            # missing 'request' element
            ({'notARequestElement': 'some value'}, r".*Invalid request.*"),
            # unknown 'request'
            ({'request': 'some unknown request'}, r".*Unknown request name.*"),
        )
        for request, error_pattern in invalid_requests:
            with pytest.raises(ConnectionError, match=error_pattern):
                manager.send_request(request)
    finally:
        manager.finish()
def test_resume_tracker_files(tmpdir):
    """After a job finishes, resume tracker files must exist in the aux directory."""
    # FIX: pre-bind 'm' so the 'if m:' guard in 'finally' cannot raise
    # NameError when LocalManager() itself raises
    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json', '--nodes', '4'],
                         {'wdir': str(tmpdir)})

        job_req = {
            'name': 'host',
            'execution': {
                'exec': '/bin/date',
                'stdout': 'out',
            },
            'resources': {'numCores': {'exact': 1}}
        }
        jobs = Jobs().add_std(job_req)
        submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')

        # give the service a moment to flush its tracker files
        time.sleep(1)

        aux_dir = find_single_aux_dir(str(tmpdir))
        print(f'aux_dir content: {str(listdir(aux_dir))}')

        assert all(exists(join(aux_dir, fname)) for fname in ['track.reqs', 'track.states']), \
            f"missing tracker files in {aux_dir}: {str(listdir(aux_dir))}"
    finally:
        if m:
            m.finish()
            m.cleanup()
        rmtree(tmpdir)
def test_slurmenv_api_submit_simple():
    """A minimal job submitted through the API must finish with SUCCEED."""
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    # FIX: pre-bind 'm' so the 'if m:' guard in 'finally' cannot raise
    # NameError when LocalManager() itself raises
    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        jobs = Jobs().\
            add_std({
                'name': 'host',
                'execution': {
                    'exec': '/bin/hostname',
                    'args': ['--fqdn'],
                    'stdout': 'std.out',
                    'stderr': 'std.err'
                }})
        assert submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
    finally:
        if m:
            m.finish()
            # m.stopManager()
            m.cleanup()
        rmtree(tmpdir)
def test_request_control(tmpdir):
    """Validate serialization and server-side validation of 'control' requests."""
    # raw control request test: a request must survive a to_json/from_json round trip
    req = ControlReq({
        'request': 'control',
        'command': 'finishAfterAllTasksDone'
    })
    req_clone = ControlReq(json.loads(req.to_json()))
    assert req.to_json() == req_clone.to_json()

    m = LocalManager(['--wd', str(tmpdir), '--nodes', 2], {'wdir': str(tmpdir)})
    try:
        # missing 'command' for control request
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong control request - missing command.*"):
            m.send_request({'request': 'control'})

        # unknown 'command' for control request
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong control request - unknown command.*"):
            m.send_request({
                'request': 'control',
                'command': 'unknown command'
            })

        # finishAfterAllTasksDone 'command' for control request
        res = m.send_request({
            'request': 'control',
            'command': 'finishAfterAllTasksDone'
        })
        assert all(
            (res.get('code', -1) == 0,
             res.get('message', None) == 'finishAfterAllTasksDone command accepted'))
    finally:
        try:
            # if finishAfterAllTasksDone has been sent we might get error 'Finish request already requested'
            m.finish()
        except Exception:
            pass
def test_local_manager_resources(tmpdir):
    """A local manager started with N virtual cores must report one node with N cores."""
    cores = 4

    # switch on debugging (by default in api.log file)
    m = LocalManager(['--wd', str(tmpdir), '--nodes', str(cores)], {'wdir': str(tmpdir)})

    # FIX: shut the manager down even when the assertion fails, so the
    # service process is not leaked across tests
    try:
        res = m.resources()

        assert all(('total_nodes' in res, 'total_cores' in res, res['total_nodes'] == 1,
                    res['total_cores'] == cores))
    finally:
        m.finish()
        # m.stopManager()
        m.cleanup()
def test_slurmenv_api_iteration_simple():
    """Iterative jobs must report correct iteration statistics (and children when requested)."""
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    # FIX: pre-bind 'm' so the 'if m:' guard in 'finally' cannot raise
    # NameError when LocalManager() itself raises
    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        # iterative job without child details
        its = 2
        jobs = Jobs(). \
            add_std({
                'name': 'host',
                'iteration': {'stop': its},
                'execution': {
                    'exec': 'hostname',
                    'args': ['--fqdn'],
                    'stdout': 'out'
                },
                'resources': {'numCores': {'exact': 1}}
            })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
        assert jinfos
        jinfo = jinfos['host']
        print('jinfo: {}'.format(jinfo))
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0))

        # iterative job with child details (withChilds=True)
        its = 2
        jobs = Jobs(). \
            add_std({
                'name': 'host2',
                'iteration': {'stop': its},
                'execution': {
                    'exec': 'hostname',
                    'args': ['--fqdn'],
                    'stdout': 'out'
                },
                'resources': {'numCores': {'exact': 1}}
            })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)
        assert jinfos
        jinfo = jinfos['host2']
        print('jinfo: {}'.format(jinfo))
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0))
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format('host2', iteration),
                        job_it.wdir == tmpdir,
                        job_it.total_cores == 1))
    finally:
        if m:
            m.finish()
            # m.stopManager()
            m.cleanup()
        rmtree(tmpdir)
def test_local_manager_resources_nodes(tmpdir):
    """A manager started with a multi-node description must report all nodes and cores."""
    nodes = 2
    cores_per_node = 3
    res_desc = ','.join([str(cores_per_node) for i in range(nodes)])

    # switch on debugging (by default in api.log file)
    m = LocalManager(['--wd', str(tmpdir), '--nodes', res_desc], {'wdir': str(tmpdir)})

    # FIX: shut the manager down even when the assertion fails, so the
    # service process is not leaked across tests
    try:
        res = m.resources()

        assert all(
            ('total_nodes' in res, 'total_cores' in res, res['total_nodes'] == 2,
             res['total_cores'] == cores_per_node * nodes))
    finally:
        m.finish()
        # m.stopManager()
        m.cleanup()
def test_slurmenv_api_std_streams_many_cores():
    """Stdin must be forwarded to stdout for a multi-core job."""
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    # FIX: pre-bind 'm' so the 'if m:' guard in 'finally' cannot raise
    # NameError when LocalManager() itself raises
    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        jobs = Jobs(). \
            add_std({
                'name': 'host',
                'execution': {
                    'exec': 'cat',
                    'stdin': '/etc/system-release',
                    'stdout': 'out',
                    'stderr': 'err'
                },
                'resources': {'numCores': {'exact': 2}}
            })
        assert submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
        assert all((exists(join(tmpdir, 'out')), exists(join(tmpdir, 'err'))))

        with open(join(tmpdir, 'out'), 'rt') as out_f:
            out = out_f.read()

        with open(join('/etc/system-release'), 'rt') as sr_f:
            system_release = sr_f.read()

        assert system_release in out
    finally:
        if m:
            m.finish()
            # m.stopManager()
            m.cleanup()
        rmtree(tmpdir)
class QCGPJExecutor(Executor):
    """QCG-PilotJob Executor. It provides simplified interface for common uses of QCG-PilotJob

    Parameters
    ----------
    wd : str, optional
        Working directory where QCG-PilotJob manager should be started, by default it is
        a current directory
    resources : str, optional
        The resources to use. If specified forces usage of Local mode of QCG-PilotJob Manager.
        The format is compliant with the NODES format of QCG-PilotJob, i.e.:
        [node_name:]cores_on_node[,node_name2:cores_on_node][,...].
        Eg. to define 4 cores on an unnamed node use `resources="4"`,
        to define 2 nodes: node_1 with 2 cores and node_2 with 3 cores, use
        `resources="node_1:2,node_2:3"`
    reserve_core : bool, optional
        If True reserves a core for QCG-PilotJob Manager instance,
        by default QCG-PilotJob Manager shares a core with computing tasks
        Parameters.
    enable_rt_stats : bool, optional
        If True, QCG-PilotJob Manager will collect its runtime statistics
    wrapper_rt_stats : str, optional
        The path to the QCG-PilotJob Manager tasks wrapper program used for collection of statistics
    log_level : str, optional
        Logging level for QCG-PilotJob Manager (for both service and client part).
    other_args : optional
        Optional list of additional arguments for initialisation of QCG-PilotJob Manager

    Returns
    -------
    None
    """

    def __init__(self, wd=".", resources=None, reserve_core=False, enable_rt_stats=False,
                 wrapper_rt_stats=None, log_level='info', *other_args):
        self.finished = False

        # ---- QCG PILOT JOB INITIALISATION ---

        # Establish logging levels
        service_log_level, client_log_level = self._setup_qcgpj_logging(log_level)

        # Prepare input arguments for QCG-PJM
        args = ['--log', service_log_level, '--wd', wd]

        if resources:
            args.append('--nodes')
            args.append(str(resources))

        if reserve_core:
            args.append('--system-core')

        if enable_rt_stats:
            args.append('--enable-rt-stats')

        if wrapper_rt_stats:
            args.append('--wrapper-rt-stats')
            args.append(wrapper_rt_stats)

        if other_args:
            # BUG FIX: extend the argv list with the individual extra arguments;
            # the previous append() injected the whole tuple as a single
            # non-string element, breaking argument parsing in the manager.
            args.extend(other_args)

        client_conf = {
            'log_file': wd + '/api.log',
            'log_level': client_log_level
        }

        _logger.info(f'Starting QCG-PJ Manager with arguments: {args}')

        # create QCGPJ Manager (service part)
        self._qcgpjm = LocalManager(args, client_conf)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.shutdown()

    def shutdown(self, wait=True):
        """Shutdowns the QCG-PJ manager service. If it is already closed, the method has no effect.
        """
        # idempotent: only the first call actually stops the service
        if not self.finished:
            self._qcgpjm.finish()
            self.finished = True

    def submit(self, fn: Callable[..., Union[str, Tuple[str, Dict[str, Any]]]], *args, **kwargs):
        """Submits a specific task to the QCG-PJ manager using template-based, executor-like interface.

        Parameters
        ----------
        fn : Callable
            A callable that returns a tuple representing a task's template.
            The first element of the tuple should be a string containing
            a QCG-PilotJob task's description with placeholders
            (identifiers preceded by $ symbol) and the second a dictionary
            that assigns default values for selected placeholders.
        *args: variable length list with dicts, optional
            A set of dicts which contain parameters that will be used to substitute placeholders
            defined in the template.
            Note: *args overwrite defaults, but they are overwritten by **kwargs
        **kwargs: arbitrary keyword arguments
            A set of keyword arguments that will be used to substitute placeholders defined in
            the template.
            Note: **kwargs overwrite *args and defaults.

        Returns
        -------
        QCGPJFuture
            The QCGPJFuture object assigned with the submitted task
        """
        template = fn()
        if isinstance(template, tuple):
            template_str = template[0]
            defaults = template[1]
        else:
            template_str = template
            defaults = {}

        t = Template(textwrap.dedent(template_str))

        # merge substitutions: positional dicts first, keyword args win
        substitutions = {}
        for a in args:
            if a is not None:
                substitutions.update(a)
        substitutions.update(kwargs)

        td_str = t.substitute(defaults, **substitutions)
        td = ast.literal_eval(td_str)

        # expose the job id to the task through its environment
        if 'env' not in td['execution']:
            td['execution']['env'] = {}
        td['execution']['env']['QCG_PM_EXEC_API_JOB_ID'] = '${jname}'

        jobs = Jobs()
        jobs.add_std(td)
        jobs_ids = self._qcgpjm.submit(jobs)
        return QCGPJFuture(jobs_ids, self._qcgpjm)

    @property
    def qcgpj_manager(self):
        """Returns current QCG-PilotJob manager instance
        """
        return self._qcgpjm

    @staticmethod
    def _setup_qcgpj_logging(log_level):
        # map the requested level to service/client enums, falling back to DEBUG
        log_level = log_level.upper()

        try:
            service_log_level = ServiceLogLevel[log_level].value
        except KeyError:
            service_log_level = ServiceLogLevel.DEBUG.value

        try:
            client_log_level = ClientLogLevel[log_level].value
        except KeyError:
            client_log_level = ClientLogLevel.DEBUG.value

        return service_log_level, client_log_level
def test_resume_simple(tmpdir):
    """Killing the manager mid-run and resuming must finish all remaining iterations."""
    # FIX: pre-bind 'm' so the 'if m:' guard in 'finally' cannot raise
    # NameError when LocalManager() itself raises
    m = None
    try:
        ncores = 4
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json', '--nodes', str(ncores)],
                         {'wdir': str(tmpdir)})

        its = 10
        job_req = {
            'name': 'sleep',
            'execution': {
                'exec': '/bin/sleep',
                'args': ['4s'],
                'stdout': 'out',
            },
            'iteration': {'stop': its},
            'resources': {'numCores': {'exact': 1}}
        }
        jobs = Jobs().add_std(job_req)
        job_ids = m.submit(jobs)

        # because job iterations executes in order, after finish of 4th iteration, the three
        # previous should also finish
        m.wait4('sleep:3')
        jinfos = m.info_parsed(job_ids, withChilds=True)
        assert jinfos
        jinfo = jinfos['sleep']
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == ncores,
                    jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]

            exp_status = ['SUCCEED']
            if iteration > 3:
                exp_status = ['EXECUTING', 'SCHEDULED', 'QUEUED']
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format('sleep', iteration),
                        job_it.status in exp_status)),\
                f"{job_it.iteration} != {iteration}, {job_it.name} != {'{}:{}'.format('sleep', iteration)}, {job_it.status} != {exp_status}"

        # kill process
        m.kill_manager_process()
        m.cleanup()

        # resume the service in the same working directory
        ncores = 4
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json', '--nodes', str(ncores),
                          '--resume', tmpdir], {'wdir': str(tmpdir)})

        m.wait4all()
        jinfos = m.info_parsed(job_ids, withChilds=True)
        assert jinfos
        jinfo = jinfos['sleep']
        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its,
                    jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format('sleep', iteration),
                        job_it.status == 'SUCCEED')), \
                f"{job_it.iteration} != {iteration}, {job_it.name} != {'{}:{}'.format('sleep', iteration)}, {job_it.status} != SUCCEED"
    finally:
        if m:
            m.finish()
            m.cleanup()
WORKER_CORES = int(args.WORKER_CORES) DATA_DIR = 'input_csv' if WORKER_CORES > 1: PYTHON_CMD = "mpirun -n %d python3" % (WORKER_CORES) else: PYTHON_CMD = "python3" ''' ###################################################################### config PilotJob ###################################################################### ''' from qcg.pilotjob.api.manager import Manager from qcg.pilotjob.api.manager import LocalManager from qcg.pilotjob.api.job import Jobs m = LocalManager(cfg={'log_level': 'DEBUG'}, server_args=['--log', 'debug']) # get available resources print("\n\navailable resources:\n%s\n" % str(m.resources())) # submit jobs and save their names in 'ids' list jobs = Jobs() print("Start Adding jobs . . .\n\n") WORKER_INDEX = 0 for i in range(NUM_WORKERS): for SUBMODEL in ['macro', 'micro']: cmd = '%s run_couple.py --submodel %s --data_dir=%s --worker_index %d --coupling_type %s --num_workers %d --weather_coupling %s' % ( PYTHON_CMD, SUBMODEL, DATA_DIR, WORKER_INDEX, COUPLING_TYPE, NUM_WORKERS, WEATHER_COUPLING)
DATA_DIR = "input_csv" if INSTANCE_CORES > 1: PYTHON_CMD = "mpirun -n %d python3" % (INSTANCE_CORES) else: PYTHON_CMD = "python3" """ ###################################################################### config PilotJob ###################################################################### """ from qcg.pilotjob.api.manager import Manager from qcg.pilotjob.api.manager import LocalManager from qcg.pilotjob.api.job import Jobs # m = LocalManager(cfg={'log_level': 'DEBUG'}, server_args=['--log', 'debug']) m = LocalManager() # get available resources print("\n\navailable resources:\n%s\n" % str(m.resources())) # submit jobs and save their names in 'ids' list jobs = Jobs() print("Start Adding jobs . . .\n\n") INSTANCE_INDEX = 0 for i in range(NUM_INSTANCES): for SUBMODEL in ['macro', 'micro']: cmd = '%s run_mscale.py --submodel %s --data_dir=%s --instance_index %d --coupling_type %s --num_instances %d --weather_coupling %s' % ( PYTHON_CMD, SUBMODEL, DATA_DIR, INSTANCE_INDEX, COUPLING_TYPE, NUM_INSTANCES, WEATHER_COUPLING)
def test_slurmenv_api_cancel_kill_nl():
    """Canceling a job that ignores SIGTERM must escalate to SIGKILL and mark it CANCELED."""
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))
    print(f'tmpdir: {tmpdir}')

    # FIX: pre-bind 'm' and guard the 'finally' block - previously a failing
    # LocalManager() constructor caused a NameError on 'm.finish()'
    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        iters = 10
        ids = m.submit(Jobs().
                       add(script='trap "" SIGTERM; sleep 30s', iteration=iters,
                           stdout='sleep.out.${it}', stderr='sleep.err.${it}', numCores=1)
                       )
        jid = ids[0]
        assert len(m.list()) == 1

        list_jid = list(m.list().keys())[0]
        assert list_jid == jid

        # wait for job to start executing
        sleep(2)
        m.cancel([jid])

        # wait for SIGTERM job cancel
        sleep(2)
        jinfos = m.info_parsed(ids)
        assert all((len(jinfos) == 1, jid in jinfos, jinfos[jid].status == 'QUEUED'))

        # wait for SIGKILL job cancel (~ExecutionJob.SIG_KILL_TIMEOUT)
        sleep(ExecutionJob.SIG_KILL_TIMEOUT)

        jinfos = m.info_parsed(ids, withChilds=True)
        assert all((len(jinfos) == 1, jid in jinfos, jinfos[jid].status == 'CANCELED'))

        # the canceled iterations are included in 'failed' entry in job statistics
        # the cancel status is presented in 'childs/state' entry
        assert all((jinfos[jid].iterations, jinfos[jid].iterations.get('start', -1) == 0,
                    jinfos[jid].iterations.get('stop', 0) == iters,
                    jinfos[jid].iterations.get('total', 0) == iters,
                    jinfos[jid].iterations.get('finished', 0) == iters,
                    jinfos[jid].iterations.get('failed', -1) == iters))
        assert len(jinfos[jid].childs) == iters
        for iteration in range(iters):
            job_it = jinfos[jid].childs[iteration]
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jid, iteration),
                        job_it.status == 'CANCELED')), str(job_it)

        m.remove(jid)
    finally:
        if m:
            m.finish()
            m.cleanup()
def test_request_notify(tmpdir):
    """Validate serialization and server-side validation of 'notify' requests."""
    # raw notify request test: a request must survive a to_json/from_json round trip
    req = NotifyReq({
        'request': 'notify',
        'entity': 'job',
        'params': {
            'name': 'j1',
            'state': 'FINISHED',
            'attributes': {
                'a1': True
            }
        }
    })
    req_clone = NotifyReq(json.loads(req.to_json()))
    assert req.to_json() == req_clone.to_json()

    m = LocalManager(['--wd', str(tmpdir), '--nodes', 2], {'wdir': str(tmpdir)})
    try:
        # missing 'entity' for notify request
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong notify request - missing/unknown entity.*"):
            m.send_request({'request': 'notify'})

        # unknown 'entity' for notify request
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong notify request - missing/unknown entity.*"):
            m.send_request({'request': 'notify', 'entity': 'task'})

        # missing params
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong notify request - missing register parameters.*"
        ):
            m.send_request({'request': 'notify', 'entity': 'job'})

        # missing key params ('state' and 'attributes' absent)
        with pytest.raises(
                ConnectionError,
                match=
                r".*Wrong notify request - missing key notify parameters.*"):
            m.send_request({
                'request': 'notify',
                'entity': 'job',
                'params': {
                    'name': 'j1'
                }
            })

        # missing key params ('attributes' absent)
        with pytest.raises(
                ConnectionError,
                match=
                r".*Wrong notify request - missing key notify parameters.*"):
            m.send_request({
                'request': 'notify',
                'entity': 'job',
                'params': {
                    'name': 'j1',
                    'state': 'FINISHED'
                }
            })

        # missing key params ('name' absent)
        with pytest.raises(
                ConnectionError,
                match=
                r".*Wrong notify request - missing key notify parameters.*"):
            m.send_request({
                'request': 'notify',
                'entity': 'job',
                'params': {
                    'state': 'FINISHED',
                    'attributes': 'a1'
                }
            })
    finally:
        m.finish()
def test_slurmenv_api_submit_resource_ranges():
    """Jobs with core-range resource requirements must be validated and scheduled correctly."""
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    # FIX: pre-bind 'm' so the 'if m:' guard in 'finally' cannot raise
    # NameError when LocalManager() itself raises
    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        jobs = Jobs(). \
            add_std({
                'name': 'host',
                'execution': {
                    'exec': '/bin/hostname',
                    'args': ['--fqdn'],
                    'stdout': 'out',
                },
                'resources': {'numCores': {'min': 1}}
            })

        # job should fail because of missing 'max' parameter
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'FAILED')
        jinfo = jinfos['host']
        assert "Both core's range boundaries (min, max) must be defined" in jinfo.messages, str(jinfo)

        jobs = Jobs(). \
            add_std({
                'name': 'host2',
                'execution': {
                    'exec': '/bin/hostname',
                    'args': ['--fqdn'],
                    'stdout': 'out',
                },
                'resources': {
                    'numNodes': {'exact': 1},
                    'numCores': {'min': 1, 'max': resources.nodes[0].total + 1}
                }
            })

        # job should run on single node (the first free) with all available cores
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
        jinfo = jinfos['host2']
        assert all((len(jinfo.nodes) == 1, jinfo.total_cores == resources.nodes[0].total)), str(jinfo)

        jobs = Jobs(). \
            add_std({
                'name': 'host3',
                'execution': {
                    'exec': '/bin/hostname',
                    'args': ['--fqdn'],
                    'stdout': 'out',
                },
                'resources': {
                    'numCores': {'min': 1, 'max': resources.nodes[0].total + 1}
                }
            })

        # job should run on at least two nodes with total maximum given cores
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
        jinfo = jinfos['host3']
        assert all((len(jinfo.nodes) == 2, jinfo.total_cores == resources.nodes[0].total + 1)), str(jinfo)
    finally:
        if m:
            m.finish()
            # m.stopManager()
            m.cleanup()
        rmtree(tmpdir)
def test_request_register(tmpdir):
    """Validate serialization and server-side validation of 'register' requests."""
    # raw register request test: a request must survive a to_json/from_json round trip
    req = RegisterReq({
        'request': 'register',
        'entity': 'manager',
        'params': {
            'id': 'm1',
            'address': '0.0.0.0',
            'resources': {
                'nodes': 2
            }
        }
    })
    req_clone = RegisterReq(json.loads(req.to_json()))
    assert req.to_json() == req_clone.to_json()

    m = LocalManager(['--wd', str(tmpdir), '--nodes', 2], {'wdir': str(tmpdir)})
    try:
        # missing 'entity' for register request
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong register request - missing/unknown entity.*"):
            m.send_request({'request': 'register'})

        # unknown 'entity' for register request
        with pytest.raises(
                ConnectionError,
                match=r".*Wrong register request - missing/unknown entity.*"):
            m.send_request({'request': 'register', 'entity': 'job'})

        # missing params
        with pytest.raises(
                ConnectionError,
                match=
                r".*Wrong register request - missing register parameters.*"):
            m.send_request({'request': 'register', 'entity': 'manager'})

        # missing key params ('address' and 'resources' absent)
        with pytest.raises(
                ConnectionError,
                match=
                r".*Wrong register request - missing key register parameters.*"
        ):
            m.send_request({
                'request': 'register',
                'entity': 'manager',
                'params': {
                    'id': 'm1'
                }
            })

        # missing key params ('resources' absent)
        with pytest.raises(
                ConnectionError,
                match=
                r".*Wrong register request - missing key register parameters.*"
        ):
            m.send_request({
                'request': 'register',
                'entity': 'manager',
                'params': {
                    'id': 'm1',
                    'address': '0.0.0.0'
                }
            })

        # missing key params ('id' absent)
        with pytest.raises(
                ConnectionError,
                match=
                r".*Wrong register request - missing key register parameters.*"
        ):
            m.send_request({
                'request': 'register',
                'entity': 'manager',
                'params': {
                    'resources': {
                        'nodes': 1
                    },
                    'address': '0.0.0.0'
                }
            })
    finally:
        m.finish()
def test_slurmenv_api_iteration_core_scheduling():
    """Verify core-level iteration schedulers: 'split-into' and 'maximum-iters'."""
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()
    set_pythonpath_to_qcg_module()

    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    # initialize before 'try' so the 'finally' clause never references an unbound
    # name when LocalManager construction itself fails
    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        # in that case the 'split-into' is default the number of iterations
        # so total available resources should be splited into two partitions and each of the
        # iteration should run on its own partition
        jname = 'host'
        its = 2
        jobs = Jobs(). \
            add_std({ 'name': jname,
                      'iteration': { 'stop': its },
                      'execution': {
                          'exec': 'hostname',
                          'args': [ '--fqdn' ],
                          'stdout': 'out'
                      },
                      'resources': {
                          'numCores': { 'min': 1,
                                        'scheduler': { 'name': 'split-into' } }
                      }
                      })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)

        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))

            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1, job_it.total_cores < resources.total_cores)), str(job_it)

        # all iterations has been scheduled across all resources
        assert sum([ child.total_cores for child in jinfo.childs ]) == resources.total_cores
        assert all(child.total_cores == resources.total_cores / its for child in jinfo.childs)

        # we explicity specify the 'split-into' parameter to 2, behavior should be the same as in the
        # previous example
        jname = 'host2'
        its = 2
        jobs = Jobs(). \
            add_std({ 'name': jname,
                      'iteration': { 'stop': its },
                      'execution': {
                          'exec': 'hostname',
                          'args': [ '--fqdn' ],
                          'stdout': 'out'
                      },
                      'resources': {
                          'numCores': { 'min': 1,
                                        'scheduler': { 'name': 'split-into',
                                                       'params': { 'parts': 2 } } }
                      }
                      })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)

        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))

            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1, job_it.total_cores < resources.total_cores)), str(job_it)

        # all iterations has been scheduled across all resources
        assert sum([ child.total_cores for child in jinfo.childs ]) == resources.total_cores
        assert all(child.total_cores == resources.total_cores / 2 for child in jinfo.childs)

        # we explicity specify the 'split-into' parameter to 4, the two iterations should be sheduled
        # on half of the available resources
        jname = 'host3'
        its = 2
        jobs = Jobs(). \
            add_std({ 'name': jname,
                      'iteration': { 'stop': its },
                      'execution': {
                          'exec': 'hostname',
                          'args': [ '--fqdn' ],
                          'stdout': 'out'
                      },
                      'resources': {
                          'numCores': { 'min': 1,
                                        'scheduler': { 'name': 'split-into',
                                                       'params': { 'parts': 4 } } }
                      }
                      })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)

        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))

            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1, job_it.total_cores < resources.total_cores)), str(job_it)

        # all iterations has been scheduled across all resources
        assert sum([ child.total_cores for child in jinfo.childs ]) == resources.total_cores / 2
        assert all(child.total_cores == resources.total_cores / 4 for child in jinfo.childs)

        # we explicity specify the 'split-into' parameter to 2, but the number of iterations is larger than
        # available partitions in the same time, so they should be executed serially (by parts)
        jname = 'host4'
        its = 10
        jobs = Jobs(). \
            add_std({ 'name': jname,
                      'iteration': { 'stop': its },
                      'execution': {
                          'exec': 'hostname',
                          'args': [ '--fqdn' ],
                          'stdout': 'out'
                      },
                      'resources': {
                          'numCores': { 'min': 1,
                                        'scheduler': { 'name': 'split-into',
                                                       'params': { 'parts': 2 } } }
                      }
                      })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)

        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))

            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1, job_it.total_cores < resources.total_cores)), str(job_it)

        assert all(child.total_cores == resources.total_cores / 2 for child in jinfo.childs)

        # the 'maximum-iters' scheduler is trying to launch as many iterations in the same time on all available
        # resources
        jname = 'host5'
        its = 2
        jobs = Jobs(). \
            add_std({ 'name': jname,
                      'iteration': { 'stop': its },
                      'execution': {
                          'exec': 'sleep',
                          'args': [ '2s' ],
                          'stdout': 'out'
                      },
                      'resources': {
                          'numCores': { 'min': 1,
                                        'scheduler': { 'name': 'maximum-iters' } }
                      }
                      })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)

        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))

            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1, job_it.total_cores < resources.total_cores)), str(job_it)

        assert sum([ child.total_cores for child in jinfo.childs ]) == resources.total_cores

        # the 'maximum-iters' scheduler is trying to launch as many iterations in the same time on all available
        # resources
        jname = 'host6'
        its = resources.total_cores
        jobs = Jobs(). \
            add_std({ 'name': jname,
                      'iteration': { 'stop': its },
                      'execution': {
                          'exec': 'sleep',
                          'args': [ '2s' ],
                          'stdout': 'out'
                      },
                      'resources': {
                          'numCores': { 'min': 1,
                                        'scheduler': { 'name': 'maximum-iters' } }
                      }
                      })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)

        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))

            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1, job_it.total_cores < resources.total_cores)), str(job_it)

        assert sum([ child.total_cores for child in jinfo.childs ]) == resources.total_cores

        # in case where number of iterations exceeds the number of available resources, the 'maximum-iters' schedulers
        # splits iterations into 'steps' minimizing this number, and allocates as many resources as possible for each
        # iteration inside 'step'
        jname = 'host7'
        its = resources.total_cores
        jobs = Jobs(). \
            add_std({ 'name': jname,
                      'iteration': { 'stop': its },
                      'execution': {
                          'exec': 'sleep',
                          'args': [ '2s' ],
                          'stdout': 'out'
                      },
                      'resources': {
                          'numCores': { 'min': 1,
                                        'scheduler': { 'name': 'maximum-iters' } }
                      }
                      })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)

        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))

            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1, job_it.total_cores < resources.total_cores)), str(job_it)

        # fixed: the original asserted a bare generator expression, which is always truthy;
        # with its == total_cores and the sum below, each child must hold exactly one core
        assert all(child.total_cores == 1 for child in jinfo.childs)
        assert sum([ child.total_cores for child in jinfo.childs ]) == resources.total_cores

        # in case where number of iterations exceeds the number of available resources, the 'maximum-iters' schedulers
        # splits iterations into 'steps' minimizing this number, and allocates as many resources as possible for each
        # iteration inside 'step'
        jname = 'host8'
        its = resources.total_cores * 2
        jobs = Jobs(). \
            add_std({ 'name': jname,
                      'iteration': { 'stop': its },
                      'execution': {
                          'exec': 'sleep',
                          'args': [ '2s' ],
                          'stdout': 'out'
                      },
                      'resources': {
                          'numCores': { 'min': 1,
                                        'scheduler': { 'name': 'maximum-iters' } }
                      }
                      })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)

        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))

            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1, job_it.total_cores < resources.total_cores)), str(job_it)

        # fixed: the original asserted a bare generator expression, which is always truthy;
        # with its == total_cores * 2 and the sum below, each child must hold exactly one core
        assert all(child.total_cores == 1 for child in jinfo.childs)
        assert sum([ child.total_cores for child in jinfo.childs ]) == resources.total_cores * 2

        # in case where number of iterations exceeds the number of available resources, the 'maximum-iters' schedulers
        # splits iterations into 'steps' minimizing this number, and allocates as many resources as possible for each
        # iteration inside 'step'
        jname = 'host9'
        its = resources.total_cores + 1
        jobs = Jobs(). \
            add_std({ 'name': jname,
                      'iteration': { 'stop': its },
                      'execution': {
                          'exec': 'sleep',
                          'args': [ '2s' ],
                          'stdout': 'out'
                      },
                      'resources': {
                          'numCores': { 'min': 1,
                                        'scheduler': { 'name': 'maximum-iters' } }
                      }
                      })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)

        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))

            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores >= 1)), str(job_it)

        # NOTE(review): this assert is vacuous (a bare generator is always truthy), and the
        # uniform one-core claim cannot hold here - the sum assertion below implies the lone
        # iteration of the second step takes all cores; left as-is pending author confirmation
        assert (child.total_cores == 1 for child in jinfo.childs)

        # because all iterations will be splited in two 'steps' and in each step the iterations that has been assigned
        # for the step should usage maximum available resources
        assert sum([ child.total_cores for child in jinfo.childs ]) == resources.total_cores * 2

        # in this case where two iterations can't fit at once on resources, all the iterations should be scheduled
        # serially on all available resources
        jname = 'host10'
        its = resources.total_nodes
        jobs = Jobs(). \
            add_std({ 'name': jname,
                      'iteration': { 'stop': its },
                      'execution': {
                          'exec': 'sleep',
                          'args': [ '2s' ],
                          'stdout': 'out'
                      },
                      'resources': {
                          'numCores': { 'min': resources.total_cores - 1,
                                        'scheduler': { 'name': 'maximum-iters' } }
                      }
                      })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)

        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))

            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores == resources.total_cores,
                        len(job_it.nodes) == resources.total_nodes)),\
                str(job_it)
    finally:
        if m:
            m.finish()
            # m.stopManager()
            m.cleanup()

        rmtree(tmpdir)
def test_slurmenv_api_submit_exceed_total_cores():
    """Jobs requesting more cores/nodes than the allocation owns must be rejected."""
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()
    set_pythonpath_to_qcg_module()

    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    # initialize before 'try' so the 'finally' clause never references an unbound
    # name when LocalManager construction itself fails
    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        jobs = Jobs(). \
            add_std({ 'name': 'date',
                      'execution': { 'exec': '/bin/date' },
                      'resources': { 'numCores': { 'exact': resources.total_cores + 1 } }})
        with pytest.raises(ConnectionError, match=r".*Not enough resources.*"):
            m.submit(jobs)
        assert len(m.list()) == 0

        jobs = Jobs(). \
            add_std({ 'name': 'date',
                      'execution': { 'exec': '/bin/date' },
                      'resources': { 'numNodes': { 'exact': resources.total_nodes + 1 } }})
        with pytest.raises(ConnectionError, match=r".*Not enough resources.*"):
            # 'ids =' dropped - the call is expected to raise, the value was never used
            m.submit(jobs)
        assert len(m.list()) == 0

        jobs = Jobs(). \
            add_std({ 'name': 'date',
                      'execution': {
                          'exec': '/bin/date',
                          'stdout': 'std.out',
                      },
                      'resources': { 'numCores': { 'exact': resources.total_cores } }
                      })
        # a job that exactly matches the available cores should succeed
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED')
        assert jinfos['date'].total_cores == resources.total_cores
    finally:
        if m:
            m.finish()
            # m.stopManager()
            m.cleanup()

        rmtree(tmpdir)
def test_local_manager_wait4all(tmpdir):
    """Submit two jobs and block with wait4all until every job completes."""
    cores = 4

    # switch on debugging (by default in api.log file)
    manager = LocalManager(['--wd', str(tmpdir), '--nodes', str(cores)], {'wdir': str(tmpdir)})

    res_info = manager.resources()
    assert 'total_nodes' in res_info
    assert 'total_cores' in res_info
    assert res_info['total_nodes'] == 1
    assert res_info['total_cores'] == cores

    submitted_jobs = Jobs(). \
        add(name='host', exec='/bin/hostname', args=['--fqdn'], stdout='host.stdout'). \
        add(name='date', exec='/bin/date', stdout='date.stdout', numCores={'exact': 2})
    job_ids = manager.submit(submitted_jobs)
    assert len(manager.list()) == 2

    manager.wait4all()

    infos = manager.info(job_ids)
    assert 'jobs' in infos
    assert len(infos['jobs'].keys()) == 2
    assert 'host' in infos['jobs']
    assert 'date' in infos['jobs']
    assert infos['jobs']['host'].get('data', {}).get('status', '') == 'SUCCEED'
    assert infos['jobs']['date'].get('data', {}).get('status', '') == 'SUCCEED'

    aux_dir = find_single_aux_dir(str(tmpdir))

    # client log, service log and both job stdout files must have been produced
    assert exists(tmpdir.join('.qcgpjm-client', 'api.log'))
    assert exists(join(aux_dir, 'service.log'))
    assert exists(tmpdir.join('host.stdout'))
    assert exists(tmpdir.join('date.stdout'))

    manager.finish()
    # manager.stopManager()
    manager.cleanup()
def test_slurmenv_api_iteration_node_scheduling():
    """Verify node-level iteration schedulers: 'split-into' and 'maximum-iters'."""
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    # TODO: it's hard to write comprehensive iteration scheduling node tests on only two nodes (in slurm's
    # development docker)

    resources, allocation = get_slurm_resources_binded()
    set_pythonpath_to_qcg_module()

    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    # initialize before 'try' so the 'finally' clause never references an unbound
    # name when LocalManager construction itself fails
    m = None
    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        # in that case the 'split-into' is default the number of iterations
        # so total available resources should be splited into two partitions and each of the
        # iteration should run on its own partition
        jname = 'host'
        its = 2
        jobs = Jobs(). \
            add_std({ 'name': jname,
                      'iteration': { 'stop': its },
                      'execution': {
                          'exec': 'sleep',
                          'args': [ '2s' ],
                          'stdout': 'out_${it}',
                          'stderr': 'err_${it}'
                      },
                      'resources': {
                          'numCores': { 'exact': resources.nodes[0].total },
                          'numNodes': { 'min': 1,
                                        'scheduler': { 'name': 'split-into' } }
                      }
                      })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)

        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]

            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores == resources.nodes[0].total,
                        len(job_it.nodes) == 1)), str(job_it)

        # all iterations has been scheduled across all nodes
        assert sum([ len(child.nodes) for child in jinfo.childs ]) == resources.total_nodes
        # the iterations should execute on different node
        assert list(jinfo.childs[0].nodes)[0] != list(jinfo.childs[1].nodes)[0]

        # we explicity specify the 'split-into' parameter to 2, behavior should be the same as in the
        # previous example
        jname = 'host2'
        its = 2
        jobs = Jobs(). \
            add_std({ 'name': jname,
                      'iteration': { 'stop': its },
                      'execution': {
                          'exec': 'sleep',
                          'args': [ '2s' ],
                          'stdout': 'out'
                      },
                      'resources': {
                          'numCores': { 'exact': resources.nodes[0].total },
                          'numNodes': { 'min': 1,
                                        'scheduler': { 'name': 'split-into',
                                                       'params': { 'parts': 2 } } }
                      }
                      })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)

        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]

            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores == resources.nodes[0].total,
                        len(job_it.nodes) == 1)), str(job_it)

        # all iterations has been scheduled across all nodes
        assert sum([ len(child.nodes) for child in jinfo.childs ]) == resources.total_nodes
        # the iterations should execute on different node
        assert list(jinfo.childs[0].nodes)[0] != list(jinfo.childs[1].nodes)[0]

        # the 'maximum-iters' scheduler is trying to launch as many iterations in the same time on all available
        # resources
        jname = 'host3'
        its = 4
        jobs = Jobs(). \
            add_std({ 'name': jname,
                      'iteration': { 'stop': its },
                      'execution': {
                          'exec': 'sleep',
                          'args': [ '2s' ],
                          'stdout': 'out'
                      },
                      'resources': {
                          'numCores': { 'exact': resources.nodes[0].total },
                          'numNodes': { 'min': 1,
                                        'scheduler': { 'name': 'maximum-iters' } }
                      }
                      })
        jinfos = submit_2_manager_and_wait_4_info(m, jobs, 'SUCCEED', withChilds=True)

        assert jinfos
        jinfo = jinfos[jname]

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its

        for iteration in range(its):
            job_it = jinfo.childs[iteration]
            print('job iteration {}: {}'.format(iteration, str(job_it)))

            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format(jname, iteration),
                        job_it.total_cores == resources.nodes[0].total,
                        len(job_it.nodes) == 1)), str(job_it)

        assert sum([len(child.nodes) for child in jinfo.childs]) == its
    finally:
        if m:
            m.finish()
            # m.stopManager()
            m.cleanup()

        rmtree(tmpdir)
default="log_MOO.txt") parser.add_argument("--cores", action="store", type=int, default="1") parser.add_argument("--USE_PJ", action="store", default="False") args = parser.parse_args() execution_mode = args.execution_mode simulation_period = args.simulation_period cores = args.cores if args.USE_PJ.lower() == "true": USE_PJ = True from qcg.pilotjob.api.manager import LocalManager QCG_MANAGER = LocalManager( cfg={'log_level': 'DEBUG'}, server_args=['--log', 'debug'] ) else: USE_PJ = False EXEC_LOG_FILE = os.path.join(work_dir, args.exec_log_file) MOO_log(msg="run_MOO input args : {}".format(args)) # read MOO setting from config yaml file MOO_CONFIG = read_MOO_setting_yaml() MOO_log(msg="MOO_CONFIG =\n{}".format(pformat(MOO_CONFIG))) problem = FLEE_MOO_Problem( execution_mode=execution_mode, simulation_period=simulation_period,
def test_request_submit(tmpdir):
    """Raw 'submit' request: serialization round-trip and server-side validation."""
    req = SubmitReq({
        'request': 'submit',
        'jobs': [{
            'name': 'job1',
            'execution': {
                'exec': '/bin/date',
                'args': ['1', '2']
            }
        }, {
            'name': 'job2',
            'execution': {
                'script': 'date'
            },
            'resources': {
                'numCores': {
                    'exact': 1
                }
            }
        }]
    })
    req_clone = SubmitReq(json.loads(req.to_json()))
    assert req.to_json() == req_clone.to_json()

    m = LocalManager(['--wd', str(tmpdir), '--nodes', 2], {'wdir': str(tmpdir)})
    try:
        # every malformed payload must be rejected with a matching error message
        malformed_requests = [
            # missing 'jobs' for submit request
            ({'request': 'submit'},
             r".*Wrong submit request - missing jobs data.*"),
            # wrong 'jobs' data format for submit request
            ({'request': 'submit', 'jobs': None},
             r".*Wrong submit request - missing jobs data.*"),
            # wrong 'jobs' data format for submit request
            ({'request': 'submit', 'jobs': 'not a list'},
             r".*Wrong submit request - missing jobs data.*"),
            # wrong 'jobs' data format for submit request
            ({'request': 'submit', 'jobs': ['not a dictionary']},
             r".*Wrong submit request - wrong job data.*"),
            # missing job's name
            ({'request': 'submit', 'jobs': [{'execution': '/bin/date'}]},
             r".*Missing name in job description.*"),
            # missing execution element
            ({'request': 'submit', 'jobs': [{'name': 'date'}]},
             r".*Missing execution element in job description.*"),
            # wrong iterations format
            ({'request': 'submit',
              'jobs': [{'name': 'date',
                        'execution': {'exec': '/bin/date'},
                        'iteration': 'not a list'}]},
             r".*Wrong format of iteration directive: not a dictionary.*"),
            # wrong iterations format
            ({'request': 'submit',
              'jobs': [{'name': 'date',
                        'execution': {'exec': '/bin/date'},
                        'iteration': {'start': 2, 'stop': 1}}]},
             r".*Wrong format of iteration directive: start index larger then stop one.*"),
        ]
        for payload, error_pattern in malformed_requests:
            with pytest.raises(ConnectionError, match=error_pattern):
                m.send_request(payload)
    finally:
        m.finish()