示例#1
0
def test_job_bulk_resubmit(driver, state, monkeypatch):
    """Check that ``bulk_resubmit`` resets only the jobs it is given.

    Three jobs are created: the first is marked FAILED and loses its output
    directory, the other two are submitted through a mocked ``sbatch``, marked
    COMPLETED and given a stdout file.  A fourth, unrelated job is marked
    COMPLETED.  With ``submit``, ``sacct``, ``bulk_kill``, ``os.remove`` and
    ``os.makedirs`` patched, ``bulk_resubmit`` is expected to call ``submit``
    once per job, remove the stdout files, request the output directories and
    leave the three jobs in CREATED, while the unrelated job must not end up
    in CREATED.
    """
    failing_command = "echo 'begin'; sleep 0.2 ; echo 'end' ; exit 1"
    root = Folder.get_root()

    jobs = []
    for _ in range(3):
        jobs.append(driver.create_job(command=failing_command, folder=root))

    # Bystander job that is never handed to bulk_resubmit.
    other_job = driver.create_job(command=failing_command, folder=root)
    other_job.status = Job.Status.COMPLETED
    other_job.save()

    failed_job = jobs[0]
    failed_job.status = Job.Status.FAILED
    failed_job.save()

    # Only the two remaining jobs go through the (mocked) batch submission.
    fake_sbatch = Mock(side_effect=[1, 2, 3])
    monkeypatch.setattr(driver.slurm, "sbatch", fake_sbatch)
    driver.bulk_submit(jobs[1:])
    assert fake_sbatch.call_count == 2

    finished_jobs = jobs[1:]
    for finished in finished_jobs:
        finished.status = Job.Status.COMPLETED

        # Leave a stdout file behind so the resubmit has something to delete.
        with open(finished.data["stdout"], "w") as stdout_file:
            stdout_file.write("hurz")

        finished.save()

    shutil.rmtree(failed_job.data["output_dir"])

    # Keep the driver from really submitting; os.remove still runs for real
    # through the wrapping mock, os.makedirs is replaced by a plain mock.
    fake_submit = Mock()
    spy_remove = Mock(wraps=os.remove)
    fake_makedirs = Mock()
    with monkeypatch.context() as patcher:
        patcher.setattr(driver, "submit", fake_submit)
        patcher.setattr(driver.slurm, "sacct", Mock(return_value=[]))
        patcher.setattr(driver, "bulk_kill", Mock(side_effect=RuntimeError))
        patcher.setattr("os.remove", spy_remove)
        patcher.setattr("os.makedirs", fake_makedirs)
        driver.bulk_resubmit(jobs)

    assert fake_submit.call_count == len(jobs)

    expected_removals = [call(j.data["stdout"]) for j in finished_jobs]
    spy_remove.assert_has_calls(expected_removals, any_order=True)

    expected_makedirs = [call(j.data["output_dir"]) for j in finished_jobs]
    fake_makedirs.assert_has_calls(expected_makedirs, any_order=True)

    for resubmitted in jobs:
        resubmitted.reload()
        assert resubmitted.status == Job.Status.CREATED

    # Regression guard: an earlier bug reset *all* jobs to CREATED, including
    # ones that were not part of the resubmission.
    other_job.reload()
    assert other_job.status != Job.Status.CREATED
示例#2
0
def test_bulk_sync_status(driver, state, monkeypatch):
    """Check that ``bulk_sync_status`` applies mocked ``_sacct`` rows to jobs.

    Fifteen jobs are submitted through a mocked ``sbatch`` that returns the
    ids 1..15.  ``driver.slurm._sacct`` is then replaced twice: first every
    job is reported RUNNING, then the first six COMPLETED and the remainder
    FAILED.  After each sync the test verifies the job statuses and the
    keyword arguments of the single ``_sacct`` call.
    """
    num_jobs = 15
    num_completed = 6

    def install_sacct(state_names):
        # One pipe-separated row per job: id, state, exit code and four empty
        # columns (seven fields in total).
        rows = [
            "|".join([str(batch_id), state_name, "0:0"] + [""] * 4)
            for batch_id, state_name in enumerate(state_names, start=1)
        ]
        replacement = Mock(return_value=rows)
        monkeypatch.setattr(driver.slurm, "_sacct", replacement)
        return replacement

    def assert_queried_once(sacct_mock, synced_jobs):
        sacct_mock.assert_called_once_with(
            jobs=",".join(j.batch_job_id for j in synced_jobs),
            format="JobID,State,ExitCode,Submit,Start,End,NodeList",
            noheader=True,
            parsable2=True,
            starttime=ANY,
            _iter=True,
        )

    root = Folder.get_root()

    jobs = []
    for i in range(num_jobs):
        jobs.append(driver.create_job(folder=root, command=f"sleep 0.1; echo 'JOB{i}'"))

    assert len(jobs) == num_jobs
    for job in jobs:
        assert job.status == Job.Status.CREATED

    fake_sbatch = Mock(side_effect=list(range(1, len(jobs) + 1)))
    monkeypatch.setattr(driver.slurm, "sbatch", fake_sbatch)
    driver.bulk_submit(jobs)

    # Round one: pretend they're all running now.
    sacct = install_sacct(["RUNNING"] * len(jobs))
    jobs = driver.bulk_sync_status(jobs)
    assert_queried_once(sacct, jobs)

    for job in jobs:
        assert job.status == Job.Status.RUNNING

    # Round two: the first six finished fine, every other job failed.
    remaining = len(jobs) - num_completed
    sacct = install_sacct(["COMPLETED"] * num_completed + ["FAILED"] * remaining)
    jobs = driver.bulk_sync_status(jobs)
    assert_queried_once(sacct, jobs)

    for position, job in enumerate(jobs):
        if position < num_completed:
            assert job.status == Job.Status.COMPLETED
        else:
            assert job.status == Job.Status.FAILED
示例#3
0
def test_resubmit_job(driver, state, monkeypatch):
    """Walk a single job through submit and several ``resubmit`` attempts.

    The test checks that:

    * ``resubmit`` raises ``InvalidJobStatus`` while ``sacct`` returns nothing
      for the freshly submitted job,
    * once ``sacct`` reports the job as FAILED, ``resubmit`` succeeds even
      though ``kill`` raises, and the job receives the new batch job id,
    * with ``sync_status`` disabled the job stays SUBMITTED and ``resubmit``
      raises again,
    * a further FAILED report allows one more resubmission with a third id.
    """

    def install_sbatch(new_batch_id):
        # Swap in an sbatch stand-in that hands out the given batch job id.
        replacement = Mock(return_value=new_batch_id)
        monkeypatch.setattr(driver.slurm, "sbatch", replacement)
        return replacement

    def install_sacct(items):
        monkeypatch.setattr(driver.slurm, "sacct", Mock(return_value=items))

    def failed_report(for_job):
        # Accounting entry describing the job's current batch id as FAILED.
        return SlurmAccountingItem(for_job.batch_job_id, Job.Status.FAILED, 0, {})

    job = driver.create_job(command="sleep 1", folder=Folder.get_root())
    assert job.status == Job.Status.CREATED

    # Initial submission.
    first_id = 5_207_375
    sbatch = install_sbatch(first_id)
    driver.submit(job)
    sbatch.assert_called_once_with(job)

    assert job.status == Job.Status.SUBMITTED
    assert job.batch_job_id == str(first_id)

    # sacct returns nothing for the job: resubmitting is rejected.
    install_sacct([])
    with pytest.raises(InvalidJobStatus):
        driver.resubmit(job)

    # sacct now reports FAILED for the current batch job id.
    install_sacct([failed_report(job)])

    second_id = 42
    sbatch = install_sbatch(second_id)

    with monkeypatch.context() as patcher:
        # job errors on kill, resubmits anyway
        patcher.setattr(driver, "kill", Mock(side_effect=RuntimeError()))
        patcher.setattr("os.path.exists", Mock(side_effect=[True, False, False]))
        patcher.setattr("os.remove", Mock())
        job = driver.resubmit(job)

    sbatch.assert_called_once()
    assert job.status == Job.Status.SUBMITTED
    assert job.batch_job_id == str(second_id)  # gets new batch job id

    with monkeypatch.context() as patcher:
        patcher.setattr(driver, "sync_status", Mock())  # disable sync for a second
        with pytest.raises(InvalidJobStatus):
            driver.resubmit(job)  # stays in SUBMITTED, not accepted

    # With syncing back in place, a FAILED report for the new batch id allows
    # another resubmission.
    install_sacct([failed_report(job)])

    third_id = 99
    sbatch = install_sbatch(third_id)
    job = driver.resubmit(job)
    sbatch.assert_called_once()
    assert job.status == Job.Status.SUBMITTED
    assert job.batch_job_id == str(third_id)