예제 #1
0
def test_slurmenv_api_cancel_kill_nl():
    if not in_slurm_allocation() or get_num_slurm_nodes() < 2:
        pytest.skip('test not run in slurm allocation or allocation is smaller than 2 nodes')

    resources, allocation = get_slurm_resources_binded()

    set_pythonpath_to_qcg_module()
    tmpdir = str(tempfile.mkdtemp(dir=SHARED_PATH))
    print(f'tmpdir: {tmpdir}')

    try:
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json'], {'wdir': str(tmpdir)})

        iters=10
        ids = m.submit(Jobs().
                       add(script='trap "" SIGTERM; sleep 30s', iteration=iters, stdout='sleep.out.${it}',
                           stderr='sleep.err.${it}', numCores=1)
                       )
        jid = ids[0]
        assert len(m.list()) == 1

        list_jid = list(m.list().keys())[0]
        assert list_jid == jid

        # wait for job to start executing
        sleep(2)

        m.cancel([jid])

        # wait for SIGTERM job cancel
        sleep(2)

        jinfos = m.info_parsed(ids)
        assert all((len(jinfos) == 1, jid in jinfos, jinfos[jid].status  == 'QUEUED'))

        # wait for SIGKILL job cancel (~ExecutionJob.SIG_KILL_TIMEOUT)
        sleep(ExecutionJob.SIG_KILL_TIMEOUT)

        jinfos = m.info_parsed(ids, withChilds=True)
        assert all((len(jinfos) == 1, jid in jinfos, jinfos[jid].status  == 'CANCELED'))

        # the canceled iterations are included in 'failed' entry in job statistics
        # the cancel status is presented in 'childs/state' entry
        assert all((jinfos[jid].iterations, jinfos[jid].iterations.get('start', -1) == 0,
                    jinfos[jid].iterations.get('stop', 0) == iters, jinfos[jid].iterations.get('total', 0) == iters,
                    jinfos[jid].iterations.get('finished', 0) == iters, jinfos[jid].iterations.get('failed', -1) == iters))
        assert len(jinfos[jid].childs) == iters
        for iteration in range(iters):
            job_it = jinfos[jid].childs[iteration]
            assert all((job_it.iteration == iteration, job_it.name == '{}:{}'.format(jid, iteration),
                        job_it.status == 'CANCELED')), str(job_it)

        m.remove(jid)

    finally:
        m.finish()
        m.cleanup()
예제 #2
0
def test_resume_simple(tmpdir):
    try:
        ncores = 4
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json', '--nodes', str(ncores)],
                         {'wdir': str(tmpdir)})

        its = 10
        job_req = {
            'name': 'sleep',
            'execution': {
                'exec': '/bin/sleep',
                'args': [ '4s' ],
                'stdout': 'out',
            },
            'iteration': { 'stop': its },
            'resources': { 'numCores': { 'exact': 1 } }
        }
        jobs = Jobs().add_std(job_req)
        job_ids = m.submit(jobs)

        # because job iterations executes in order, after finish of 4th iteration, the three previous should also finish
        m.wait4('sleep:3')
        jinfos = m.info_parsed(job_ids, withChilds=True)
        assert jinfos
        jinfo = jinfos['sleep']

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == ncores, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]

            exp_status = ['SUCCEED']
            if iteration > 3:
                exp_status = ['EXECUTING', 'SCHEDULED', 'QUEUED']
            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format('sleep', iteration),
                        job_it.status in exp_status)),\
                f"{job_it.iteration} != {iteration}, {job_it.name} != {'{}:{}'.format('sleep', iteration)}, {job_it.status} != {exp_status}"

        # kill process
        m.kill_manager_process()
        m.cleanup()

        ncores = 4
        m = LocalManager(['--log', 'debug', '--wd', tmpdir, '--report-format', 'json', '--nodes', str(ncores),
                          '--resume', tmpdir],
                         {'wdir': str(tmpdir)})

        m.wait4all()
        jinfos = m.info_parsed(job_ids, withChilds=True)
        assert jinfos
        jinfo = jinfos['sleep']

        assert all((jinfo.iterations, jinfo.iterations.get('start', -1) == 0,
                    jinfo.iterations.get('stop', 0) == its, jinfo.iterations.get('total', 0) == its,
                    jinfo.iterations.get('finished', 0) == its, jinfo.iterations.get('failed', -1) == 0)), str(jinfo)
        assert len(jinfo.childs) == its
        for iteration in range(its):
            job_it = jinfo.childs[iteration]

            assert all((job_it.iteration == iteration,
                        job_it.name == '{}:{}'.format('sleep', iteration),
                        job_it.status == 'SUCCEED')), \
                f"{job_it.iteration} != {iteration}, {job_it.name} != {'{}:{}'.format('sleep', iteration)}, {job_it.status} != SUCCEED"
    finally:
        if m:
            m.finish()
            m.cleanup()