Example #1
def test__roll_forward_paused_update_abort(client):
    """
    Create an update, pause it and then abort it
    """
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat.yaml"), "create job")
    key = res.key

    # Sleep for some time to let Peloton transition the workflow state
    # from INITIALIZED -> ROLLING_FORWARD, or leave it as is
    time.sleep(5)

    client.pause_job_update(key, "pause update")
    wait_for_update_status(
        client,
        key,
        {api.JobUpdateStatus.ROLLING_FORWARD},
        api.JobUpdateStatus.ROLL_FORWARD_PAUSED,
    )

    client.abort_job_update(key, "abort update")
    wait_for_update_status(
        client,
        key,
        {
            api.JobUpdateStatus.ROLLING_FORWARD,
            api.JobUpdateStatus.ROLL_FORWARD_PAUSED,
        },
        api.JobUpdateStatus.ABORTED,
    )
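Helpers such as wait_for_update_status, get_job_update_request, and the client wrapper are defined elsewhere in this test suite and are not shown on this page. As a rough illustration only, a polling helper like wait_for_update_status might look something like the sketch below (the polling interval, default timeout, and the use of get_job_update_details are assumptions, not the suite's actual implementation):

import time


def wait_for_update_status(client, key, allowed_intermediate_statuses,
                           target_status, timeout_secs=300):
    """Poll an update until it reaches target_status.

    Fail if the update is observed in a status that is neither the target
    nor one of the allowed intermediate statuses, or if the target is not
    reached within timeout_secs.
    """
    deadline = time.time() + timeout_secs
    while time.time() < deadline:
        res = client.get_job_update_details(None, api.JobUpdateQuery(key=key))
        status = res.detailsList[0].update.summary.state.status
        if status == target_status:
            return
        assert status in allowed_intermediate_statuses, (
            "unexpected update status: %s" % status)
        time.sleep(2)
    assert False, "timed out waiting for update status %s" % target_status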
Example #2
def test__abort_auto_rollback_and_update(client):
    """
    1. Create a job
    2. Start a bad update, wait for auto-rollback to kick-in
    3. Once auto-rollback starts, abort an update.
    4. Do a new good update and all the instances should converge to the new config.
    """
    start_job_update(
        client,
        "test_dc_labrat_large_job.yaml",
        "start job update test/dc/labrat_large_job",
    )

    # Add some wait time for the Lucene index to build
    time.sleep(10)

    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job_bad_config.yaml"),
        "rollout bad config",
    )

    # wait for auto-rollback to kick in
    wait_for_auto_rolling_back(client, res.key)

    client.abort_job_update(res.key, "abort update")
    wait_for_update_status(
        client,
        res.key,
        {api.JobUpdateStatus.ROLLING_BACK},
        api.JobUpdateStatus.ABORTED,
    )

    new_config = get_job_update_request(
        "test_dc_labrat_large_job_new_config.yaml"
    )
    res = client.start_job_update(new_config, "rollout good config")
    # Sleep for a while so that update gets triggered.
    time.sleep(5)
    wait_for_rolled_forward(client, res.key)

    res = client.get_tasks_without_configs(
        api.TaskQuery(
            jobKeys={res.key.job}, statuses={api.ScheduleStatus.RUNNING}
        )
    )
    assert len(res.tasks) == 10

    for t in res.tasks:
        assert len(t.assignedTask.task.metadata) == 1
        assert (
            list(t.assignedTask.task.metadata)[0].key
            == list(new_config.taskConfig.metadata)[0].key
        )
        assert (
            list(t.assignedTask.task.metadata)[0].value
            == list(new_config.taskConfig.metadata)[0].value
        )

        assert t.ancestorId
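wait_for_auto_rolling_back is likewise not shown here. A plausible sketch, under the same assumptions as the helper above (polling get_job_update_details until auto-rollback has started; the interval and timeout are assumptions):

def wait_for_auto_rolling_back(client, key, timeout_secs=300):
    """Poll until the update enters ROLLING_BACK, i.e. auto-rollback has started."""
    deadline = time.time() + timeout_secs
    while time.time() < deadline:
        res = client.get_job_update_details(None, api.JobUpdateQuery(key=key))
        status = res.detailsList[0].update.summary.state.status
        if status == api.JobUpdateStatus.ROLLING_BACK:
            return
        time.sleep(2)
    assert False, "timed out waiting for auto-rollback to start"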
Example #3
def test__failed_update(client):
    """
    update failed
    """
    res = client.start_job_update(
        get_job_update_request('test_dc_labrat_bad_config.yaml'),
        'rollout bad config')

    wait_for_update_status(client, res.key,
                           {api.JobUpdateStatus.ROLLING_FORWARD},
                           api.JobUpdateStatus.FAILED)
Example #4
def test__start_job_update_with_pulse(client):
    """
    Start a pulsed job update, verify it starts blocked awaiting a pulse,
    then pulse it and wait for it to roll forward.
    """
    req = get_job_update_request('test_dc_labrat_pulsed.yaml')
    res = client.start_job_update(req,
                                  'start pulsed job update test/dc/labrat')
    assert get_update_status(client, res.key) == \
        api.JobUpdateStatus.ROLL_FORWARD_AWAITING_PULSE

    client.pulse_job_update(res.key)
    wait_for_update_status(
        client, res.key, {
            api.JobUpdateStatus.ROLL_FORWARD_AWAITING_PULSE,
            api.JobUpdateStatus.ROLLING_FORWARD,
        }, api.JobUpdateStatus.ROLLED_FORWARD)
Example #5
def test__pulsed_update_abort(client):
    """
    Create a pulsed update and then abort it
    """
    req = get_job_update_request('test_dc_labrat_pulsed.yaml')
    res = client.start_job_update(req,
                                  'start pulsed job update test/dc/labrat')
    assert get_update_status(client, res.key) == \
        api.JobUpdateStatus.ROLL_FORWARD_AWAITING_PULSE
    key = res.key

    client.abort_job_update(key, 'abort update')
    wait_for_update_status(client, key,
                           {api.JobUpdateStatus.ROLL_FORWARD_AWAITING_PULSE},
                           api.JobUpdateStatus.ABORTED)
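get_update_status, used in the pulsed-update examples, could be a thin query helper along the following lines; this is a hypothetical sketch, not the suite's actual implementation:

def get_update_status(client, key):
    """Return the current JobUpdateStatus for the given update key."""
    res = client.get_job_update_details(None, api.JobUpdateQuery(key=key))
    assert len(res.detailsList) == 1
    return res.detailsList[0].update.summary.state.status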
Example #6
def test__rolling_forward_abort(client):
    """
    Create an update and then abort it
    """
    res = client.start_job_update(
        get_job_update_request('test_dc_labrat.yaml'), 'create job')
    key = res.key

    # Sleep for some time to let Peloton transition the workflow state
    # from INITIALIZED -> ROLLING_FORWARD, or leave it as is
    time.sleep(5)

    client.abort_job_update(key, 'abort update')
    wait_for_update_status(client, key, {api.JobUpdateStatus.ROLLING_FORWARD},
                           api.JobUpdateStatus.ABORTED)
Example #7
def test__deploy_on_aborted_update(client):
    """
    Deploy an update, and abort half-way. Then re-deploy
    same update. Updated instances should not restart again.
    """
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )

    # Let a few instances start
    time.sleep(5)

    client.abort_job_update(res.key, "abort update")
    wait_for_update_status(
        client,
        res.key,
        {api.JobUpdateStatus.ROLLING_FORWARD},
        api.JobUpdateStatus.ABORTED,
    )

    # Not all instances were created
    res = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={res.key.job})
    )
    assert len(res.tasks) < 10

    # deploy the same update; it should only impact the remaining instances
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job_diff_executor.yaml"),
        "start job update test/dc/labrat_large_job_diff_executor",
    )
    wait_for_rolled_forward(client, res.key)

    # All instances are created
    res = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={res.key.job})
    )
    assert len(res.tasks) == 10

    # No instance should have an ancestor id, validating that instances
    # created in the previous update were not restarted/redeployed
    for task in res.tasks:
        assert task.ancestorId is None
Example #8
def test__abort_auto_rollback_with_pinned_instances_and_update(client):
    """
    1. Create a job.
    2. Start a bad update (version 2) targeting subset of instances.
    3. Wait for the auto-rollback to kick-in.
    4. Once auto-rollback kicks in, abort the update.
    4. Start a new good update and wait for all instances to converge to that update.
    """
    # Create a job
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    res = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key},
                      statuses={api.ScheduleStatus.RUNNING}))
    assert len(res.tasks) == 10
    for t in res.tasks:
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        assert run_id == "1"
        assert len(t.assignedTask.task.metadata) == 2
        for m in t.assignedTask.task.metadata:
            if m.key == "test_key_1":
                assert m.value == "test_value_1"
            elif m.key == "test_key_2":
                assert m.value == "test_value_2"
            else:
                assert False, "unexpected metadata %s" % m

    # start a bad update with the updateOnlyTheseInstances parameter
    update_instances = [0, 2, 3, 7, 9]
    pinned_req = get_job_update_request(
        "test_dc_labrat_large_job_bad_config.yaml")
    pinned_req.settings.updateOnlyTheseInstances = set(
        [api.Range(first=i, last=i) for i in update_instances])
    pinned_req.settings.maxFailedInstances = 4

    res = client.start_job_update(
        pinned_req,
        "start a bad update test/dc/labrat_large_job with pinned instances",
    )
    # wait for auto-rollback to kick in
    wait_for_auto_rolling_back(client, res.key, timeout_secs=150)

    # abort the update
    client.abort_job_update(res.key, "abort update")
    wait_for_update_status(
        client,
        res.key,
        {api.JobUpdateStatus.ROLLING_BACK},
        api.JobUpdateStatus.ABORTED,
    )

    # start a new good update
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job_new_config.yaml"),
        "start job update test/dc/labrat_large_job with a good config",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    res = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key},
                      statuses={api.ScheduleStatus.RUNNING}))
    assert len(res.tasks) == 10
    for t in res.tasks:
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        assert len(t.assignedTask.task.metadata) == 1
        for m in t.assignedTask.task.metadata:
            if m.key == "test_key_12":
                assert m.value == "test_value_12"
            else:
                assert False, "unexpected metadata %s" % m

        if t.assignedTask.instanceId in update_instances:
            # only a few of the pinned instances might have rolled back
            assert run_id == "3" or run_id == "4"
        else:
            assert run_id == "2"
Example #9
def test__get_job_update_details__restart_jobmgr(client, jobmgr, resmgr,
                                                 hostmgr, mesos_master):
    """
    Start an update, call getJobUpdateDetails, restart jobmgr:
    1. Before recovery finishes, expect error
    2. After recovery finishes, expect getJobUpdateDetails to include the
       correct job
    """
    # start a pulsed job update that blocks awaiting a pulse
    req = get_job_update_request('test_dc_labrat_large_job.yaml')
    req.settings.updateGroupSize = 10
    req.settings.blockIfNoPulsesAfterMs = 604800000
    res = client.start_job_update(req,
                                  'start job update test/dc/labrat_large_job')

    job_update_key = res.key
    job_key = job_update_key.job
    wait_for_update_status(
        client,
        job_update_key,
        {api.JobUpdateStatus.ROLLING_FORWARD},
        api.JobUpdateStatus.ROLL_FORWARD_AWAITING_PULSE,
        120,
    )

    def check():
        res = client.get_job_update_details(None,
                                            api.JobUpdateQuery(jobKey=job_key))
        assert len(res.detailsList) == 1
        assert res.detailsList[0].update.summary.key.job == job_key

    def wait():
        start = time.time()
        while time.time() - start < 120:
            try:
                check()
                break
            except Exception as e:
                log.info('getJobUpdateDetails failed: %s, retrying...', e)
                time.sleep(0.5)
        else:
            assert False, \
                'Timed out waiting for getJobUpdateDetails endpoint to recover'

    # verify getJobUpdateDetailsResult
    check()

    # restart jobmgr
    jobmgr.restart()
    wait()

    # wait additional time before proceeding, so that jobmgr has a leader elected
    log.info('Waiting 5 seconds before proceeding')
    time.sleep(5)

    # resume update
    client.pulse_job_update(job_update_key)
    wait_for_rolled_forward(client, job_update_key)
    wait_for_running(client, job_key)

    # verify getJobUpdateDetailsResult
    check()

    # restart jobmgr
    jobmgr.restart()
    wait()

    # wait additional time before exiting, so that jobmgr has a leader elected
    log.info('Waiting 5 seconds before exiting')
    time.sleep(5)
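wait_for_rolled_forward, used in several of the examples above, can be expressed as a thin wrapper over wait_for_update_status; a sketch under the same assumptions as the earlier helpers:

def wait_for_rolled_forward(client, key, timeout_secs=300):
    """Wait until the update finishes rolling forward successfully."""
    wait_for_update_status(
        client,
        key,
        {api.JobUpdateStatus.ROLLING_FORWARD},
        api.JobUpdateStatus.ROLLED_FORWARD,
        timeout_secs,
    )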