def test_local_error_duplicate_name_job_separate_reqs(tmpdir):
    """Submit two jobs with the same name in two separate submit requests.

    Only the first job ('date') should execute; the second ('sleep') must be
    rejected because of the name clash, so its sandbox must never be created.
    """
    file_path = tmpdir.join('jobs.json')
    print('tmpdir: {}'.format(str(tmpdir)))

    job_name = 'mdate'

    # first submit request: a plain 'date' job
    date_jobs = [
        Job(job_name,
            JobExecution('date',
                         wd=abspath(tmpdir.join('date.sandbox')),
                         stdout='date.out',
                         stderr='date.err'),
            JobResources(numCores=ResourceSize(1))).to_dict()
    ]

    # second submit request: a 'sleep' job reusing the very same name
    sleep_jobs = [
        Job(job_name,
            JobExecution('sleep',
                         wd=abspath(tmpdir.join('sleep.sandbox')),
                         stdout='sleep.out',
                         stderr='sleep.err'),
            JobResources(numCores=ResourceSize(1))).to_dict()
    ]

    reqs = [
        {'request': 'submit', 'jobs': date_jobs},
        {'request': 'submit', 'jobs': sleep_jobs},
        {'request': 'control', 'command': 'finishAfterAllTasksDone'},
    ]
    save_reqs_to_file(reqs, file_path)
    print('jobs saved to file_path: {}'.format(str(file_path)))

    sys.argv = ['QCG-PilotJob', '--file', '--file-path', str(file_path),
                '--nodes', '2', '--wd', str(tmpdir),
                '--report-format', 'json']
    QCGPMService().start()

    # the first job (date) should execute
    check_job_status_in_json([job_name], workdir=str(tmpdir),
                             dest_state='SUCCEED')
    date_sandbox = abspath(tmpdir.join('date.sandbox'))
    assert all((isdir(date_sandbox),
                exists(join(date_sandbox, 'date.out')),
                exists(join(date_sandbox, 'date.err'))))

    # the second job (sleep) due to the name clash should not execute
    assert not isdir(abspath(tmpdir.join('sleep.sandbox')))
def test_local_simple_script_job(tmpdir):
    """Run a job defined by a multi-line shell script.

    The script runs /bin/date and /bin/hostname; stdout must be non-empty
    and stderr empty.  Also verifies that querying an unknown job name
    raises ValueError.
    """
    file_path = tmpdir.join('jobs.json')
    print('tmpdir: {}'.format(str(tmpdir)))

    job_name = 'mdate_script'
    jobs = [
        Job(job_name,
            JobExecution(script='/bin/date\n/bin/hostname\n',
                         wd=abspath(tmpdir.join('date.sandbox')),
                         stdout='date.out',
                         stderr='date.err'),
            JobResources(numCores=ResourceSize(1))).to_dict()
    ]
    reqs = [
        {'request': 'submit', 'jobs': jobs},
        {'request': 'control', 'command': 'finishAfterAllTasksDone'},
    ]
    save_reqs_to_file(reqs, file_path)
    print('jobs saved to file_path: {}'.format(str(file_path)))

    sys.argv = ['QCG-PilotJob', '--log', 'debug', '--file', '--file-path',
                str(file_path), '--nodes', '2', '--wd', str(tmpdir),
                '--report-format', 'json']
    QCGPMService().start()

    check_job_status_in_json([job_name], workdir=str(tmpdir),
                             dest_state='SUCCEED')

    sandbox = abspath(tmpdir.join('date.sandbox'))
    stdout_path = join(sandbox, 'date.out')
    stderr_path = join(sandbox, 'date.err')
    assert all((isdir(sandbox),
                exists(stdout_path),
                exists(stderr_path),
                stat(stdout_path).st_size > 0,
                stat(stderr_path).st_size == 0))

    # an unknown job name must not be present in the report
    with pytest.raises(ValueError):
        check_job_status_in_json([job_name + 'xxx'], workdir=str(tmpdir),
                                 dest_state='SUCCEED')
def test_local_workflows(tmpdir):
    """Verify dependency ordering in a three-job workflow.

    'second' depends on 'first'; 'third' depends on both.  Using the
    EXECUTING/SUCCEED timestamps from the job report, each dependent job
    must have started only after its predecessors finished.
    """
    file_path = tmpdir.join('jobs.json')
    print('tmpdir: {}'.format(str(tmpdir)))

    workflow = [
        Job('first',
            JobExecution('sleep', args=['2s'],
                         wd=abspath(tmpdir.join('first.sandbox')),
                         stdout='out', stderr='err'),
            JobResources(numCores=ResourceSize(1))),
        Job('second',
            JobExecution('sleep', args=['1s'],
                         wd=abspath(tmpdir.join('second.sandbox')),
                         stdout='out', stderr='err'),
            JobResources(numCores=ResourceSize(1)),
            dependencies=JobDependencies(after=['first'])),
        Job('third',
            JobExecution('date',
                         wd=abspath(tmpdir.join('third.sandbox')),
                         stdout='out', stderr='err'),
            JobResources(numCores=ResourceSize(1)),
            dependencies=JobDependencies(after=['first', 'second'])),
    ]
    jobs = [j.to_dict() for j in workflow]

    reqs = [
        {'request': 'submit', 'jobs': jobs},
        {'request': 'control', 'command': 'finishAfterAllTasksDone'},
    ]
    save_reqs_to_file(reqs, file_path)
    print('jobs saved to file_path: {}'.format(str(file_path)))

    # the amount of resources should be enough to theoretically start all
    # three jobs at once
    sys.argv = ['QCG-PilotJob', '--file', '--file-path', str(file_path),
                '--nodes', '4', '--wd', str(tmpdir),
                '--report-format', 'json']
    QCGPMService().start()

    jnames = ['first', 'second', 'third']
    check_job_status_in_json(jnames, workdir=str(tmpdir), dest_state='SUCCEED')
    for jname in jnames:
        sandbox = abspath(tmpdir.join('{}.sandbox'.format(jname)))
        assert all((isdir(sandbox),
                    exists(join(sandbox, 'out')),
                    exists(join(sandbox, 'err'))))

    with open(join(find_single_aux_dir(str(tmpdir)), 'jobs.report'), 'r') as f:
        job_stats = [json.loads(line) for line in f.readlines()]

    assert len(job_stats) == len(jnames)

    jstats = {}
    for job_entry in job_stats:
        print('readed job stats: {}'.format(str(job_entry)))

        t = datetime.strptime(job_entry['runtime']['rtime'], "%H:%M:%S.%f")
        rtime = timedelta(hours=t.hour, minutes=t.minute,
                          seconds=t.second, microseconds=t.microsecond)

        # find start executing time
        exec_states = [entry for entry in job_entry['history']
                       if entry['state'] == 'EXECUTING']
        assert len(exec_states) == 1

        # find finish executing time
        finish_states = [entry for entry in job_entry['history']
                         if entry['state'] == 'SUCCEED']
        assert len(finish_states) == 1

        start_time = datetime.strptime(exec_states[0]['date'],
                                       '%Y-%m-%dT%H:%M:%S.%f')
        finish_time = datetime.strptime(finish_states[0]['date'],
                                        '%Y-%m-%dT%H:%M:%S.%f')

        jstats[job_entry['name']] = {'r_time': rtime,
                                     's_time': start_time,
                                     'f_time': finish_time}

    # assert second job started after the first one
    assert jstats['second']['s_time'] > jstats['first']['f_time']

    # assert third job started after the first and second ones
    assert all((jstats['third']['s_time'] > jstats['first']['f_time'],
                jstats['third']['s_time'] > jstats['second']['f_time']))

    rmtree(str(tmpdir))
def test_profile_local_iter_scheduling_job_large(tmpdir):
    """Profile scheduling of a large iterative job (100 iterations).

    Submits a single iterative '/bin/sleep' job and checks that the parent
    job and every iteration reach SUCCEED and that each iteration produced
    its working directory with stdout/stderr files.
    """
    file_path = tmpdir.join('jobs.json')
    print('tmpdir: {}'.format(str(tmpdir)))

    jobName = "sleep-iter"
    nits = 100
    jobSleepTime = 2
    jobCores = 2
    availCores = 40
    # NOTE(review): the original also computed the expected number of
    # scheduling rounds and total execution time here, but never used
    # either value — removed as dead code.

    jobs = [{
        "name": jobName,
        "iteration": {"stop": nits},
        "execution": {
            "exec": "/bin/sleep",
            "args": ["{}s".format(str(jobSleepTime))],
            # '$${it}' is substituted by the iteration index at run time
            "wd": abspath(tmpdir.join("{}_$${{it}}".format(jobName))),
            "stdout": "sleep-iter.stdout",
            "stderr": "sleep-iter.stderr"
        },
        "resources": {
            "numCores": {"exact": jobCores}
        }
    }]
    reqs = [{'request': 'submit', 'jobs': jobs},
            {'request': 'control', 'command': 'finishAfterAllTasksDone'}]
    save_reqs_to_file(reqs, file_path)
    print('jobs saved to file_path: {}'.format(str(file_path)))

    sys.argv = ['QCG-PilotJob', '--file', '--file-path', str(file_path),
                '--nodes', str(availCores), '--wd', str(tmpdir),
                '--report-format', 'json']
    QCGPMService().start()

    # the parent job plus each iteration must reach SUCCEED
    check_job_status_in_json(
        [jobName] + ["{}:{}".format(jobName, i) for i in range(0, nits)],
        workdir=str(tmpdir), dest_state='SUCCEED')

    for i in range(0, nits):
        wd_path = abspath(tmpdir.join("{}_{}".format(jobName, i)))
        stdout_path = join(wd_path, 'sleep-iter.stdout')
        stderr_path = join(wd_path, 'sleep-iter.stderr')
        assert all((isdir(wd_path),
                    exists(stdout_path),
                    exists(stderr_path))), \
            "stdout({}) and/or stderr({}) doesn't exist".format(stdout_path,
                                                                stderr_path)

    rmtree(str(tmpdir))
def test_local_iter_scheduling_job_large(tmpdir):
    """Check scheduling of an iterative job against its expected duration.

    20 iterations x 2 cores on 10 available cores implies 4 scheduling
    rounds of ~2s each.  Verifies each iteration's runtime and that the
    whole scenario (first EXECUTING to last SUCCEED) fits the expected
    window.

    Bug fixed: the scenario duration was computed from ``finish_time`` —
    the finish time of whichever report entry happened to be processed
    last — instead of ``max_finish``, the latest finish over all
    iterations (which the loop already tracked but never used).
    """
    file_path = tmpdir.join('jobs.json')
    print('tmpdir: {}'.format(str(tmpdir)))

    jobName = "sleep-iter"
    nits = 20
    jobSleepTime = 2
    jobCores = 2
    availCores = 10
    rounds = nits * jobCores / availCores
    totalExecTime = rounds * jobSleepTime

    jobs = [{
        "name": jobName,
        "iteration": {"stop": nits},
        "execution": {
            "exec": "/bin/sleep",
            "args": ["{}s".format(str(jobSleepTime))],
            # '$${it}' is substituted by the iteration index at run time
            "wd": abspath(tmpdir.join("{}_$${{it}}".format(jobName))),
            "stdout": "sleep-iter.stdout",
            "stderr": "sleep-iter.stderr"
        },
        "resources": {
            "numCores": {"exact": jobCores}
        }
    }]
    reqs = [{'request': 'submit', 'jobs': jobs},
            {'request': 'control', 'command': 'finishAfterAllTasksDone'}]
    save_reqs_to_file(reqs, file_path)
    print('jobs saved to file_path: {}'.format(str(file_path)))

    sys.argv = ['QCG-PilotJob', '--log', 'debug', '--file', '--file-path',
                str(file_path), '--nodes', str(availCores), '--wd',
                str(tmpdir), '--report-format', 'json']
    QCGPMService().start()

    check_job_status_in_json(
        [jobName] + ["{}:{}".format(jobName, i) for i in range(0, nits)],
        workdir=str(tmpdir), dest_state='SUCCEED')

    for i in range(0, nits):
        wd_path = abspath(tmpdir.join("{}_{}".format(jobName, i)))
        stdout_path = join(wd_path, 'sleep-iter.stdout')
        stderr_path = join(wd_path, 'sleep-iter.stderr')
        assert all((isdir(wd_path),
                    exists(stdout_path),
                    exists(stderr_path))), \
            "stdout({}) and/or stderr({}) doesn't exist".format(stdout_path,
                                                                stderr_path)

    with open(join(find_single_aux_dir(str(tmpdir)), 'jobs.report'), 'r') as f:
        job_stats = [json.loads(line) for line in f.readlines()]

    # one report entry per iteration plus the parent job entry
    assert len(job_stats) == nits + 1

    min_start, max_finish = None, None
    for i in range(0, nits):
        job = job_stats[i]
        print('readed job stats: {}'.format(str(job)))

        t = datetime.strptime(job['runtime']['rtime'], "%H:%M:%S.%f")
        rtime = timedelta(hours=t.hour, minutes=t.minute, seconds=t.second,
                          microseconds=t.microsecond)
        # each iteration sleeps jobSleepTime seconds; allow 2s of overhead
        assert all((rtime.total_seconds() > jobSleepTime,
                    rtime.total_seconds() < jobSleepTime + 2)), \
            "job {} runtime exceeded assumed value {}s vs max {}s".format(
                i, rtime.total_seconds(), jobSleepTime + 2)

        # find start executing time
        exec_state = list(filter(lambda st_en: st_en['state'] == 'EXECUTING',
                                 job['history']))
        assert len(exec_state) == 1
        # find finish executing time
        finish_state = list(filter(lambda st_en: st_en['state'] == 'SUCCEED',
                                   job['history']))
        assert len(finish_state) == 1

        start_time = datetime.strptime(exec_state[0]['date'],
                                       '%Y-%m-%dT%H:%M:%S.%f')
        finish_time = datetime.strptime(finish_state[0]['date'],
                                        '%Y-%m-%dT%H:%M:%S.%f')

        if not min_start or start_time < min_start:
            min_start = start_time
        if not max_finish or finish_time > max_finish:
            max_finish = finish_time

    # was: all((min_start, finish_time)) — finish_time is only the last
    # processed entry's finish; the tracked extremes are what matter
    assert all((min_start, max_finish))

    # check if duration from executing first job till the end of last job
    # is about `rounds` rounds, each taking jobSleepTime
    scenario_duration = max_finish - min_start
    assert all((scenario_duration.total_seconds() > totalExecTime,
                scenario_duration.total_seconds() < totalExecTime + 4)), \
        "scenario duration runtime exceeded assumed value {}s vs max {}s".format(
            scenario_duration.total_seconds(), totalExecTime + 4)

    rmtree(str(tmpdir))
def test_local_simple_uneven_resources_many_iter_jobs(tmpdir):
    """Run two iterative echo jobs with different core requirements.

    A 2-core and a 1-core iterative '/bin/echo' job are scheduled on 3
    cores; every iteration of both jobs must succeed, and each iteration's
    stdout must contain its own iteration index.
    """
    file_path = tmpdir.join('jobs.json')
    print('tmpdir: {}'.format(str(tmpdir)))

    bigJobName = "big-echo-iter"
    smallJobName = "small-echo-iter"
    nits = 10

    def echo_job(name, ncores):
        # iterative /bin/echo job description requiring `ncores` cores
        return {
            "name": name,
            "iteration": {"start": 0, "stop": nits},
            "execution": {
                "exec": "/bin/echo",
                "args": ["iteration ${it}"],
                "wd": abspath(tmpdir.join("{}_$${{it}}".format(name))),
                "stdout": "echo-iter.stdout",
                "stderr": "echo-iter.stderr"
            },
            "resources": {"numCores": {"exact": ncores}}
        }

    jobs = [echo_job(bigJobName, 2), echo_job(smallJobName, 1)]
    reqs = [{'request': 'submit', 'jobs': jobs},
            {'request': 'control', 'command': 'finishAfterAllTasksDone'}]
    save_reqs_to_file(reqs, file_path)
    print('jobs saved to file_path: {}'.format(str(file_path)))

    sys.argv = ['QCG-PilotJob', '--log', 'debug', '--file', '--file-path',
                str(file_path), '--nodes', '3', '--wd', str(tmpdir),
                '--report-format', 'json']
    QCGPMService().start()

    expected = ([bigJobName]
                + ["{}:{}".format(bigJobName, i) for i in range(0, nits)]
                + [smallJobName]
                + ["{}:{}".format(smallJobName, i) for i in range(0, nits)])
    check_job_status_in_json(expected, workdir=str(tmpdir),
                             dest_state='SUCCEED')

    for jobName in (bigJobName, smallJobName):
        for i in range(0, nits):
            wd_path = abspath(tmpdir.join("{}_{}".format(jobName, i)))
            stdout_path = join(wd_path, 'echo-iter.stdout')
            stderr_path = join(wd_path, 'echo-iter.stderr')
            assert all((isdir(wd_path),
                        exists(stdout_path),
                        exists(stderr_path))), \
                "stdout({}) and/or stderr({}) doesn't exist".format(
                    stdout_path, stderr_path)
            with open(stdout_path, 'r') as f:
                assert f.read().strip() == \
                    Template("iteration ${it}").substitute(it=i)

    rmtree(str(tmpdir))
def test_slurmenv_simple_job():
    """Run a simple 'date' job inside a Slurm allocation.

    Skipped outside a Slurm allocation of at least 2 nodes.  Verifies the
    job succeeds, its sandbox and output files exist, and that every node
    reported in the job's allocation belongs to the Slurm allocation.

    Bug fixed: the failure message format string was missing its closing
    parenthesis ('{} not in nodes ({}').
    """
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip(
            'test not run in slurm allocation or allocation is smaller than 2 nodes'
        )

    resources, allocation = get_slurm_resources_binded()
    resources_node_names = set(n.name for n in resources.nodes)
    set_pythonpath_to_qcg_module()

    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))
    file_path = join(tmpdir, 'jobs.json')
    print('tmpdir: {}'.format(tmpdir))

    jobName = 'mdate'
    jobs = [job.to_dict() for job in [
        Job(jobName,
            JobExecution('date',
                         wd=abspath(join(tmpdir, 'date.sandbox')),
                         stdout='date.out',
                         stderr='date.err'),
            JobResources(numCores=ResourceSize(1)))
    ]]
    reqs = [{'request': 'submit', 'jobs': jobs},
            {'request': 'control', 'command': 'finishAfterAllTasksDone'}]
    save_reqs_to_file(reqs, file_path)
    print('jobs saved to file_path: {}'.format(str(file_path)))

    sys.argv = ['QCG-PilotJob', '--log', 'debug', '--file', '--file-path',
                str(file_path), '--wd', tmpdir, '--report-format', 'json']
    QCGPMService().start()

    jobEntries = check_job_status_in_json([jobName], workdir=tmpdir,
                                          dest_state='SUCCEED')
    sandbox = abspath(join(tmpdir, 'date.sandbox'))
    assert all((isdir(sandbox),
                exists(join(sandbox, 'date.out')),
                exists(join(sandbox, 'date.err')),
                stat(join(sandbox, 'date.out')).st_size > 0))
    # there can be some debugging messages in the stderr, so its size is
    # deliberately not checked

    for jname, jentry in jobEntries.items():
        assert all(('runtime' in jentry,
                    'allocation' in jentry.get('runtime', {})))

        # allocation entries look like 'node[...]' joined with commas —
        # TODO confirm exact format against the report writer
        jalloc = jentry['runtime']['allocation']
        for jalloc_node in jalloc.split(','):
            node_name = jalloc_node[:jalloc_node.index('[')]
            print('{} in available nodes ({})'.format(
                node_name, ','.join(resources_node_names)))
            assert node_name in resources_node_names, \
                '{} not in nodes ({})'.format(
                    node_name, ','.join(resources_node_names))

    with pytest.raises(ValueError):
        check_job_status_in_json([jobName + 'xxx'], workdir=tmpdir,
                                 dest_state='SUCCEED')

    rmtree(tmpdir)
def test_slurmenv_many_nodes_many_cores():
    """Run an MPI 'hostname' job across all nodes and cores of an allocation.

    Skipped outside a Slurm allocation of at least 2 nodes.  Checks that the
    allocation reported for the job covers every node with the expected
    number of cores, and that stdout contains one hostname line per
    allocated core, each naming an allocated node.

    Bug fixed: the failure message format string was missing its closing
    parenthesis ('{} not in nodes ({}').
    """
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip(
            'test not run in slurm allocation or allocation is smaller than 2 nodes'
        )

    resources, allocation = get_slurm_resources_binded()
    resources_node_names = set(n.name for n in resources.nodes)
    set_pythonpath_to_qcg_module()

    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))
    file_path = join(tmpdir, 'jobs.json')
    print('tmpdir: {}'.format(tmpdir))

    jobName = 'hostname'
    jobwdir_base = 'hostname.sandbox'
    cores_num = resources.nodes[0].free
    nodes_num = resources.total_nodes
    jobs = [job.to_dict() for job in [
        Job(jobName,
            JobExecution(exec='mpirun',
                         args=['--allow-run-as-root', 'hostname'],
                         wd=abspath(join(tmpdir, jobwdir_base)),
                         stdout='hostname.out',
                         stderr='hostname.err',
                         modules=['mpi/openmpi-x86_64']),
            JobResources(numCores=ResourceSize(cores_num),
                         numNodes=ResourceSize(nodes_num)))
    ]]
    reqs = [{'request': 'submit', 'jobs': jobs},
            {'request': 'control', 'command': 'finishAfterAllTasksDone'}]
    save_reqs_to_file(reqs, file_path)
    print('jobs saved to file_path: {}'.format(str(file_path)))

    sys.argv = ['QCG-PilotJob', '--log', 'debug', '--file', '--file-path',
                str(file_path), '--wd', tmpdir, '--report-format', 'json']
    QCGPMService().start()

    jobEntries = check_job_status_in_json([jobName], workdir=tmpdir,
                                          dest_state='SUCCEED')
    sandbox = abspath(join(tmpdir, jobwdir_base))
    assert all((isdir(sandbox),
                exists(join(sandbox, 'hostname.out')),
                exists(join(sandbox, 'hostname.err')),
                stat(join(sandbox, 'hostname.out')).st_size > 0))

    job_nodes = []
    allocated_cores = 0
    for jname, jentry in jobEntries.items():
        assert all(('runtime' in jentry,
                    'allocation' in jentry.get('runtime', {})))

        # allocation entries look like 'node[core:core:...]' joined with
        # commas — TODO confirm exact format against the report writer
        jalloc = jentry['runtime']['allocation']
        for jalloc_node in jalloc.split(','):
            node_name = jalloc_node[:jalloc_node.index('[')]
            job_nodes.append(node_name)
            print('{} in available nodes ({})'.format(
                node_name, ','.join(resources_node_names)))
            assert node_name in resources_node_names, \
                '{} not in nodes ({})'.format(
                    node_name, ','.join(resources_node_names))

            ncores = len(jalloc_node[jalloc_node.index('[') + 1:-1].split(':'))
            print('#{} cores on node {}'.format(ncores, node_name))
            allocated_cores += ncores

    assert len(job_nodes) == nodes_num, str(job_nodes)
    assert allocated_cores == nodes_num * cores_num, allocated_cores

    # check if hostname appears in stdout once per allocated core
    with open(abspath(join(tmpdir, join(jobwdir_base, 'hostname.out'))),
              'rt') as stdout_file:
        stdout_content = [line.rstrip() for line in stdout_file.readlines()]
    assert len(stdout_content) == nodes_num * cores_num, str(stdout_content)
    assert all(hostname in job_nodes
               for hostname in stdout_content), str(stdout_content)

    with pytest.raises(ValueError):
        check_job_status_in_json([jobName + 'xxx'], workdir=tmpdir,
                                 dest_state='SUCCEED')

    rmtree(tmpdir)