示例#1
0
def test_local_error_duplicate_name_job_separate_reqs(tmpdir):
    """Check the outcome of reusing a job name in a later submit request.

    Two separate submit requests each carry one job with the same name: the
    first runs ``date`` and the second runs ``sleep``.  After the service has
    processed the request file, the job must be reported as SUCCEED, the
    ``date`` sandbox with its output files must exist, and the ``sleep``
    sandbox must not have been created.
    """
    file_path = tmpdir.join('jobs.json')

    print('tmpdir: {}'.format(str(tmpdir)))

    job_name = 'mdate'
    date_wd = abspath(tmpdir.join('date.sandbox'))
    sleep_wd = abspath(tmpdir.join('sleep.sandbox'))

    date_job = Job(
        job_name,
        JobExecution('date', wd=date_wd, stdout='date.out',
                     stderr='date.err'),
        JobResources(numCores=ResourceSize(1)))
    jobs1 = [date_job.to_dict()]

    sleep_job = Job(
        job_name,
        JobExecution('sleep', wd=sleep_wd, stdout='sleep.out',
                     stderr='sleep.err'),
        JobResources(numCores=ResourceSize(1)))
    jobs2 = [sleep_job.to_dict()]

    reqs = [
        {'request': 'submit', 'jobs': jobs1},
        {'request': 'submit', 'jobs': jobs2},
        {'request': 'control', 'command': 'finishAfterAllTasksDone'},
    ]
    save_reqs_to_file(reqs, file_path)
    print('jobs saved to file_path: {}'.format(str(file_path)))

    sys.argv = ['QCG-PilotJob',
                '--file',
                '--file-path', str(file_path),
                '--nodes', '2',
                '--wd', str(tmpdir),
                '--report-format', 'json']
    QCGPMService().start()

    # the job submitted first (date) is expected to have run to completion
    check_job_status_in_json([job_name],
                             workdir=str(tmpdir),
                             dest_state='SUCCEED')
    assert isdir(date_wd)
    assert exists(join(date_wd, 'date.out'))
    assert exists(join(date_wd, 'date.err'))

    # the name clash must have prevented the second job (sleep) from running
    assert not isdir(sleep_wd)
示例#2
0
def test_local_simple_script_job(tmpdir):
    """Run a single job defined by an inline script and verify its outputs.

    The job's script calls ``/bin/date`` and ``/bin/hostname``.  After the
    service finishes, the job must be reported as SUCCEED, its sandbox must
    contain a non-empty stdout file and an empty stderr file, and looking up
    an unknown job name must raise ``ValueError``.
    """
    file_path = tmpdir.join('jobs.json')

    print('tmpdir: {}'.format(str(tmpdir)))

    job_name = 'mdate_script'
    sandbox = abspath(tmpdir.join('date.sandbox'))

    script_job = Job(
        job_name,
        JobExecution(script='/bin/date\n/bin/hostname\n',
                     wd=sandbox,
                     stdout='date.out',
                     stderr='date.err'),
        JobResources(numCores=ResourceSize(1)))
    jobs = [script_job.to_dict()]

    reqs = [
        {'request': 'submit', 'jobs': jobs},
        {'request': 'control', 'command': 'finishAfterAllTasksDone'},
    ]
    save_reqs_to_file(reqs, file_path)
    print('jobs saved to file_path: {}'.format(str(file_path)))

    sys.argv = ['QCG-PilotJob',
                '--log', 'debug',
                '--file',
                '--file-path', str(file_path),
                '--nodes', '2',
                '--wd', str(tmpdir),
                '--report-format', 'json']
    QCGPMService().start()

    check_job_status_in_json([job_name],
                             workdir=str(tmpdir),
                             dest_state='SUCCEED')

    out_path = join(sandbox, 'date.out')
    err_path = join(sandbox, 'date.err')
    # every check is evaluated before the assertion, exactly as a tuple
    # passed straight to all() would be
    checks = (
        isdir(sandbox),
        exists(out_path),
        exists(err_path),
        stat(out_path).st_size > 0,
        stat(err_path).st_size == 0,
    )
    assert all(checks)

    # a job name that was never submitted must not be found in the report
    with pytest.raises(ValueError):
        check_job_status_in_json([job_name + 'xxx'],
                                 workdir=str(tmpdir),
                                 dest_state='SUCCEED')
示例#3
0
def test_local_workflows(tmpdir):
    """Verify that job dependencies are honoured in a three-job workflow.

    ``second`` depends on ``first`` and ``third`` depends on both.  After the
    service finishes, every job must be reported as SUCCEED with its sandbox
    and output files present, and the timestamps recorded in ``jobs.report``
    must show each dependent job entering EXECUTING only after the jobs it
    depends on reached SUCCEED.
    """
    file_path = tmpdir.join('jobs.json')

    print('tmpdir: {}'.format(str(tmpdir)))

    def sandbox(name):
        # absolute path of the working directory used by job `name`
        return abspath(tmpdir.join('{}.sandbox'.format(name)))

    workflow = [
        Job('first',
            JobExecution('sleep', args=['2s'], wd=sandbox('first'),
                         stdout='out', stderr='err'),
            JobResources(numCores=ResourceSize(1))),
        Job('second',
            JobExecution('sleep', args=['1s'], wd=sandbox('second'),
                         stdout='out', stderr='err'),
            JobResources(numCores=ResourceSize(1)),
            dependencies=JobDependencies(after=['first'])),
        Job('third',
            JobExecution('date', wd=sandbox('third'),
                         stdout='out', stderr='err'),
            JobResources(numCores=ResourceSize(1)),
            dependencies=JobDependencies(after=['first', 'second'])),
    ]
    jobs = [job.to_dict() for job in workflow]

    reqs = [
        {'request': 'submit', 'jobs': jobs},
        {'request': 'control', 'command': 'finishAfterAllTasksDone'},
    ]
    save_reqs_to_file(reqs, file_path)
    print('jobs saved to file_path: {}'.format(str(file_path)))

    # the amount of resources should be enough to theoretically start all
    # three jobs at once, so any ordering observed comes from dependencies
    sys.argv = ['QCG-PilotJob',
                '--file',
                '--file-path', str(file_path),
                '--nodes', '4',
                '--wd', str(tmpdir),
                '--report-format', 'json']
    QCGPMService().start()

    jnames = ['first', 'second', 'third']
    check_job_status_in_json(jnames, workdir=str(tmpdir), dest_state='SUCCEED')
    for jname in jnames:
        job_wd = sandbox(jname)
        assert isdir(job_wd)
        assert exists(join(job_wd, 'out'))
        assert exists(join(job_wd, 'err'))

    # the report holds one JSON document per line
    report_path = join(find_single_aux_dir(str(tmpdir)), 'jobs.report')
    with open(report_path, 'r') as report:
        job_stats = [json.loads(line) for line in report]

    assert len(job_stats) == len(jnames)

    date_format = '%Y-%m-%dT%H:%M:%S.%f'
    jstats = {}
    for job in job_stats:
        print('readed job stats: {}'.format(str(job)))

        parsed = datetime.strptime(job['runtime']['rtime'], "%H:%M:%S.%f")
        rtime = timedelta(hours=parsed.hour,
                          minutes=parsed.minute,
                          seconds=parsed.second,
                          microseconds=parsed.microsecond)

        # exactly one history entry must mark the start of execution
        started = [entry for entry in job['history']
                   if entry['state'] == 'EXECUTING']
        assert len(started) == 1

        # exactly one history entry must mark successful completion
        finished = [entry for entry in job['history']
                    if entry['state'] == 'SUCCEED']
        assert len(finished) == 1

        jstats[job['name']] = {
            'r_time': rtime,
            's_time': datetime.strptime(started[0]['date'], date_format),
            'f_time': datetime.strptime(finished[0]['date'], date_format),
        }

    first, second, third = (jstats[name] for name in jnames)

    # the second job must have started after the first one finished
    assert second['s_time'] > first['f_time']

    # the third job must have started after both predecessors finished
    assert third['s_time'] > first['f_time']
    assert third['s_time'] > second['f_time']

    rmtree(str(tmpdir))
示例#4
0
def test_profile_local_iter_scheduling_job_large(tmpdir):
    """Run a 100-iteration sleep job and verify every iteration's outputs.

    A single iterative job (``nits`` iterations, ``jobCores`` cores each,
    sleeping ``jobSleepTime`` seconds) is submitted through a request file,
    with ``availCores`` passed to the service as ``--nodes``.  The test only
    checks that the parent job and every iteration reach SUCCEED and that
    each iteration's working directory holds its stdout and stderr files; no
    timing assertions are made.
    """
    file_path = tmpdir.join('jobs.json')

    print('tmpdir: {}'.format(str(tmpdir)))

    jobName = "sleep-iter"
    nits = 100
    jobSleepTime = 2
    jobCores = 2
    availCores = 40
    jobs = [{
        "name": jobName,
        "iteration": {
            "stop": nits
        },
        "execution": {
            "exec": "/bin/sleep",
            "args": ["{}s".format(str(jobSleepTime))],
            # after formatting the path ends with the literal "$${it}"
            # placeholder; the per-iteration directories checked below are
            # named "<jobName>_<iteration index>"
            "wd": abspath(tmpdir.join("{}_$${{it}}".format(jobName))),
            "stdout": "sleep-iter.stdout",
            "stderr": "sleep-iter.stderr"
        },
        "resources": {
            "numCores": {
                "exact": jobCores,
            }
        }
    }]
    reqs = [{
        'request': 'submit',
        'jobs': jobs
    }, {
        'request': 'control',
        'command': 'finishAfterAllTasksDone'
    }]
    save_reqs_to_file(reqs, file_path)
    print('jobs saved to file_path: {}'.format(str(file_path)))

    sys.argv = [
        'QCG-PilotJob', '--file', '--file-path',
        str(file_path), '--nodes',
        str(availCores), '--wd',
        str(tmpdir), '--report-format', 'json'
    ]
    QCGPMService().start()

    # the parent job and each "<jobName>:<i>" iteration must have succeeded
    check_job_status_in_json(
        [jobName] + ["{}:{}".format(jobName, i) for i in range(nits)],
        workdir=str(tmpdir),
        dest_state='SUCCEED')
    for i in range(nits):
        wd_path = abspath(tmpdir.join("{}_{}".format(jobName, i)))
        stdout_path = join(wd_path, 'sleep-iter.stdout')
        stderr_path = join(wd_path, 'sleep-iter.stderr')
        assert all((isdir(wd_path), exists(stdout_path), exists(stderr_path)
                    )), "stdout({}) and/or stderr({}) doesn't exist".format(
                        stdout_path, stderr_path)

    rmtree(str(tmpdir))
示例#5
0
def test_local_iter_scheduling_job_large(tmpdir):
    """Run a 20-iteration sleep job and verify outputs and overall timing.

    A single iterative job (``nits`` iterations, ``jobCores`` cores each,
    sleeping ``jobSleepTime`` seconds) is submitted with ``availCores``
    passed to the service as ``--nodes``.  The test checks that:

    * the parent job and every iteration reach SUCCEED,
    * every iteration's working directory holds its stdout/stderr files,
    * each inspected report entry has a runtime between ``jobSleepTime`` and
      ``jobSleepTime + 2`` seconds,
    * the time from the earliest EXECUTING entry to the latest SUCCEED entry
      lies between ``totalExecTime`` and ``totalExecTime + 4`` seconds.
    """
    file_path = tmpdir.join('jobs.json')

    print('tmpdir: {}'.format(str(tmpdir)))

    jobName = "sleep-iter"
    nits = 20
    jobSleepTime = 2
    jobCores = 2
    availCores = 10
    # number of rounds needed when availCores / jobCores iterations fit at
    # once, and the resulting lower bound on the whole scenario's duration
    rounds = nits * jobCores / availCores
    totalExecTime = rounds * jobSleepTime
    jobs = [{
        "name": jobName,
        "iteration": {
            "stop": nits
        },
        "execution": {
            "exec": "/bin/sleep",
            "args": ["{}s".format(str(jobSleepTime))],
            # after formatting the path ends with the literal "$${it}"
            # placeholder; the per-iteration directories checked below are
            # named "<jobName>_<iteration index>"
            "wd": abspath(tmpdir.join("{}_$${{it}}".format(jobName))),
            "stdout": "sleep-iter.stdout",
            "stderr": "sleep-iter.stderr"
        },
        "resources": {
            "numCores": {
                "exact": jobCores,
            }
        }
    }]
    reqs = [{
        'request': 'submit',
        'jobs': jobs
    }, {
        'request': 'control',
        'command': 'finishAfterAllTasksDone'
    }]
    save_reqs_to_file(reqs, file_path)
    print('jobs saved to file_path: {}'.format(str(file_path)))

    sys.argv = [
        'QCG-PilotJob', '--log', 'debug', '--file', '--file-path',
        str(file_path), '--nodes',
        str(availCores), '--wd',
        str(tmpdir), '--report-format', 'json'
    ]
    QCGPMService().start()

    check_job_status_in_json(
        [jobName] + ["{}:{}".format(jobName, i) for i in range(nits)],
        workdir=str(tmpdir),
        dest_state='SUCCEED')

    for i in range(nits):
        wd_path = abspath(tmpdir.join("{}_{}".format(jobName, i)))
        stdout_path = join(wd_path, 'sleep-iter.stdout')
        stderr_path = join(wd_path, 'sleep-iter.stderr')
        assert all((isdir(wd_path), exists(stdout_path), exists(stderr_path)
                    )), "stdout({}) and/or stderr({}) doesn't exist".format(
                        stdout_path, stderr_path)

    with open(join(find_single_aux_dir(str(tmpdir)), 'jobs.report'), 'r') as f:
        job_stats = [json.loads(line) for line in f]

    # one report entry per iteration plus one additional entry
    assert len(job_stats) == nits + 1

    min_start, max_finish = None, None

    # NOTE(review): only the first `nits` entries are inspected, which
    # presumably relies on the extra (parent job) entry being reported
    # last - confirm against the report writer.
    for i, job in enumerate(job_stats[:nits]):
        print('readed job stats: {}'.format(str(job)))
        t = datetime.strptime(job['runtime']['rtime'], "%H:%M:%S.%f")
        rtime = timedelta(hours=t.hour,
                          minutes=t.minute,
                          seconds=t.second,
                          microseconds=t.microsecond)

        assert all((rtime.total_seconds() > jobSleepTime, rtime.total_seconds() < jobSleepTime + 2)), \
            "job {} runtime exceeded assumed value {}s vs max {}s".format(i, rtime.total_seconds(), jobSleepTime + 2)

        # find start executing time
        exec_state = [entry for entry in job['history']
                      if entry['state'] == 'EXECUTING']
        assert len(exec_state) == 1

        # find finish executing time
        finish_state = [entry for entry in job['history']
                        if entry['state'] == 'SUCCEED']
        assert len(finish_state) == 1

        start_time = datetime.strptime(exec_state[0]['date'],
                                       '%Y-%m-%dT%H:%M:%S.%f')
        finish_time = datetime.strptime(finish_state[0]['date'],
                                        '%Y-%m-%dT%H:%M:%S.%f')

        if min_start is None or start_time < min_start:
            min_start = start_time

        if max_finish is None or finish_time > max_finish:
            max_finish = finish_time

    assert min_start is not None and max_finish is not None

    # The scenario spans from the earliest start to the LATEST finish.  Using
    # the finish time of whichever entry happened to be processed last would
    # under-measure the duration whenever the report is not ordered by
    # completion time.  The expected duration is `rounds` rounds, each
    # lasting jobSleepTime seconds.
    scenario_duration = max_finish - min_start
    assert all((scenario_duration.total_seconds() > totalExecTime,
                scenario_duration.total_seconds() < totalExecTime + 4)), \
            "scenario duration runtime exceeded assumed value {}s vs max {}s".format(scenario_duration.total_seconds(),
                                                                                     totalExecTime + 4)

    rmtree(str(tmpdir))
示例#6
0
def test_local_simple_uneven_resources_many_iter_jobs(tmpdir):
    """Run two iterative echo jobs with different core requirements.

    A "big" job (2 cores per iteration) and a "small" job (1 core per
    iteration), each with ``nits`` iterations, are submitted together.  After
    the service finishes, both parent jobs and all iterations must be
    reported as SUCCEED, every iteration's working directory must hold its
    stdout and stderr files, and the stdout must read ``iteration <index>``.
    """
    file_path = tmpdir.join('jobs.json')

    print('tmpdir: {}'.format(str(tmpdir)))

    big_name = "big-echo-iter"
    small_name = "small-echo-iter"
    nits = 10

    def echo_job(name, cores):
        # description of an iterative echo job needing `cores` cores per
        # iteration; key order matches the submitted JSON document
        return {
            "name": name,
            "iteration": {
                "start": 0,
                "stop": nits
            },
            "execution": {
                "exec": "/bin/echo",
                "args": ["iteration ${it}"],
                "wd": abspath(tmpdir.join("{}_$${{it}}".format(name))),
                "stdout": "echo-iter.stdout",
                "stderr": "echo-iter.stderr"
            },
            "resources": {
                "numCores": {
                    "exact": cores,
                }
            }
        }

    jobs = [echo_job(big_name, 2), echo_job(small_name, 1)]
    reqs = [
        {'request': 'submit', 'jobs': jobs},
        {'request': 'control', 'command': 'finishAfterAllTasksDone'},
    ]
    save_reqs_to_file(reqs, file_path)
    print('jobs saved to file_path: {}'.format(str(file_path)))

    sys.argv = ['QCG-PilotJob',
                '--log', 'debug',
                '--file',
                '--file-path', str(file_path),
                '--nodes', '3',
                '--wd', str(tmpdir),
                '--report-format', 'json']
    QCGPMService().start()

    # each parent job name is followed by its "<name>:<i>" iteration names
    expected_names = []
    for name in (big_name, small_name):
        expected_names.append(name)
        expected_names.extend("{}:{}".format(name, it) for it in range(nits))
    check_job_status_in_json(expected_names,
                             workdir=str(tmpdir),
                             dest_state='SUCCEED')

    for name in (big_name, small_name):
        for it in range(nits):
            wd_path = abspath(tmpdir.join("{}_{}".format(name, it)))
            stdout_path = join(wd_path, 'echo-iter.stdout')
            stderr_path = join(wd_path, 'echo-iter.stderr')
            present = (isdir(wd_path), exists(stdout_path),
                       exists(stderr_path))
            assert all(present), \
                "stdout({}) and/or stderr({}) doesn't exist".format(
                    stdout_path, stderr_path)

            with open(stdout_path, 'r') as out_file:
                produced = out_file.read().strip()
            expected = Template("iteration ${it}").substitute(it=it)
            assert produced == expected

    rmtree(str(tmpdir))
示例#7
0
def test_slurmenv_simple_job():
    """Run a single ``date`` job inside a Slurm allocation.

    The test is skipped unless it runs in a Slurm allocation with at least
    two nodes.  After the service finishes, the job must be reported as
    SUCCEED, its sandbox must hold a non-empty stdout file and a stderr file,
    every node named in the job's reported allocation must belong to the
    Slurm resources, and looking up an unknown job name must raise
    ``ValueError``.
    """
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip(
            'test not run in slurm allocation or allocation is smaller than 2 nodes'
        )

    resources, _ = get_slurm_resources_binded()
    resources_node_names = {n.name for n in resources.nodes}

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    file_path = join(tmpdir, 'jobs.json')
    print('tmpdir: {}'.format(tmpdir))

    jobName = 'mdate'
    job_wd = abspath(join(tmpdir, 'date.sandbox'))
    jobs = [
        job.to_dict() for job in [
            Job(
                jobName,
                JobExecution('date',
                             wd=job_wd,
                             stdout='date.out',
                             stderr='date.err'),
                JobResources(numCores=ResourceSize(1)))
        ]
    ]
    reqs = [{
        'request': 'submit',
        'jobs': jobs
    }, {
        'request': 'control',
        'command': 'finishAfterAllTasksDone'
    }]
    save_reqs_to_file(reqs, file_path)
    print('jobs saved to file_path: {}'.format(str(file_path)))

    sys.argv = [
        'QCG-PilotJob', '--log', 'debug', '--file', '--file-path',
        str(file_path), '--wd', tmpdir, '--report-format', 'json'
    ]
    QCGPMService().start()

    jobEntries = check_job_status_in_json([jobName],
                                          workdir=tmpdir,
                                          dest_state='SUCCEED')
    # stderr is only required to exist: there can be some debugging messages
    # in it, so its size is deliberately not checked
    assert all(
        (isdir(job_wd),
         exists(join(job_wd, 'date.out')),
         exists(join(job_wd, 'date.err')),
         stat(join(job_wd, 'date.out')).st_size > 0))

    for jentry in jobEntries.values():
        assert all(('runtime' in jentry, 'allocation'
                    in jentry.get('runtime', {})))

        # the allocation is a comma-separated list of "<node>[...]" items
        jalloc = jentry['runtime']['allocation']
        for jalloc_node in jalloc.split(','):
            node_name = jalloc_node[:jalloc_node.index('[')]
            print('{} in available nodes ({})'.format(
                node_name, ','.join(resources_node_names)))
            assert node_name in resources_node_names, '{} not in nodes ({})'.format(
                node_name, ','.join(resources_node_names))

    # a job name that was never submitted must not be found in the report
    with pytest.raises(ValueError):
        check_job_status_in_json([jobName + 'xxx'],
                                 workdir=tmpdir,
                                 dest_state='SUCCEED')

    rmtree(tmpdir)
示例#8
0
def test_slurmenv_many_nodes_many_cores():
    """Run an MPI ``hostname`` job spanning all nodes of a Slurm allocation.

    The test is skipped unless it runs in a Slurm allocation with at least
    two nodes.  The job requests ``resources.total_nodes`` nodes with as many
    cores as are free on the first node.  After the service finishes the test
    checks that:

    * the job reached SUCCEED and produced a non-empty stdout file,
    * the reported allocation names exactly ``nodes_num`` nodes, all of which
      belong to the Slurm resources, with ``nodes_num * cores_num`` cores in
      total,
    * stdout contains one line per allocated core, each naming an allocated
      node,
    * looking up an unknown job name raises ``ValueError``.
    """
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip(
            'test not run in slurm allocation or allocation is smaller than 2 nodes'
        )

    resources, _ = get_slurm_resources_binded()
    resources_node_names = {n.name for n in resources.nodes}

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))

    file_path = join(tmpdir, 'jobs.json')
    print('tmpdir: {}'.format(tmpdir))

    jobName = 'hostname'
    jobwdir_base = 'hostname.sandbox'
    job_wd = abspath(join(tmpdir, jobwdir_base))
    cores_num = resources.nodes[0].free
    nodes_num = resources.total_nodes
    jobs = [
        job.to_dict() for job in [
            Job(
                jobName,
                JobExecution(exec='mpirun',
                             args=['--allow-run-as-root', 'hostname'],
                             wd=job_wd,
                             stdout='hostname.out',
                             stderr='hostname.err',
                             modules=['mpi/openmpi-x86_64']),
                JobResources(numCores=ResourceSize(cores_num),
                             numNodes=ResourceSize(nodes_num)))
        ]
    ]
    reqs = [{
        'request': 'submit',
        'jobs': jobs
    }, {
        'request': 'control',
        'command': 'finishAfterAllTasksDone'
    }]
    save_reqs_to_file(reqs, file_path)
    print('jobs saved to file_path: {}'.format(str(file_path)))

    sys.argv = [
        'QCG-PilotJob', '--log', 'debug', '--file', '--file-path',
        str(file_path), '--wd', tmpdir, '--report-format', 'json'
    ]
    QCGPMService().start()

    jobEntries = check_job_status_in_json([jobName],
                                          workdir=tmpdir,
                                          dest_state='SUCCEED')
    stdout_path = join(job_wd, 'hostname.out')
    assert all(
        (isdir(job_wd),
         exists(stdout_path),
         exists(join(job_wd, 'hostname.err')),
         stat(stdout_path).st_size > 0))

    job_nodes = []
    allocated_cores = 0
    for jentry in jobEntries.values():
        assert all(('runtime' in jentry, 'allocation'
                    in jentry.get('runtime', {})))

        # the allocation is a comma-separated list of "<node>[<c>:<c>:...]"
        # items, with the node's cores separated by ':' inside the brackets
        jalloc = jentry['runtime']['allocation']
        for jalloc_node in jalloc.split(','):
            bracket = jalloc_node.index('[')
            node_name = jalloc_node[:bracket]
            job_nodes.append(node_name)
            print('{} in available nodes ({})'.format(
                node_name, ','.join(resources_node_names)))
            assert node_name in resources_node_names, '{} not in nodes ({})'.format(
                node_name, ','.join(resources_node_names))

            ncores = len(jalloc_node[bracket + 1:-1].split(':'))
            print('#{} cores on node {}'.format(ncores, node_name))
            allocated_cores += ncores
    assert len(job_nodes) == nodes_num, str(job_nodes)
    assert allocated_cores == nodes_num * cores_num, allocated_cores

    # stdout must hold one hostname line per allocated core, and every
    # reported hostname must be one of the nodes allocated to the job
    with open(stdout_path, 'rt') as stdout_file:
        stdout_content = [line.rstrip() for line in stdout_file]
    assert len(stdout_content) == nodes_num * cores_num, str(stdout_content)
    assert all(hostname in job_nodes
               for hostname in stdout_content), str(stdout_content)

    # a job name that was never submitted must not be found in the report
    with pytest.raises(ValueError):
        check_job_status_in_json([jobName + 'xxx'],
                                 workdir=tmpdir,
                                 dest_state='SUCCEED')

    rmtree(tmpdir)