def test__roll_forward_paused_update_abort(client):
    """
    Create an update, pause it and then abort it.
    """
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat.yaml"), "create job")
    key = res.key

    # Sleep for some time to let Peloton transition workflow state
    # from INITIALIZED -> ROLLING_FORWARD, or leave it as-is
    time.sleep(5)

    client.pause_job_update(key, "pause update")
    wait_for_update_status(
        client,
        key,
        {api.JobUpdateStatus.ROLLING_FORWARD},
        api.JobUpdateStatus.ROLL_FORWARD_PAUSED,
    )

    client.abort_job_update(key, "abort update")
    wait_for_update_status(
        client,
        key,
        {
            api.JobUpdateStatus.ROLLING_FORWARD,
            api.JobUpdateStatus.ROLL_FORWARD_PAUSED,
        },
        api.JobUpdateStatus.ABORTED,
    )


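# The wait_for_update_status helper used throughout these tests is imported
# from the shared test utilities and is not defined in this file. The sketch
# below is illustrative only (the name, polling interval, and default timeout
# are assumptions): it polls the update status until the target status is
# reached, failing fast on any status outside the allowed intermediate set.
def _wait_for_update_status_sketch(client, key, allowed_intermediate_statuses,
                                   target_status, timeout_secs=120):
    deadline = time.time() + timeout_secs
    while time.time() < deadline:
        status = get_update_status(client, key)
        if status == target_status:
            return
        # Any status outside the allowed intermediate set means the update
        # took an unexpected path (e.g. FAILED while waiting for ROLLED_FORWARD).
        assert status in allowed_intermediate_statuses, (
            "unexpected update status: %s" % status)
        time.sleep(2)
    assert False, "timed out waiting for update status %s" % target_status

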
def test__abort_auto_rollback_and_update(client):
    """
    1. Create a job.
    2. Start a bad update and wait for auto-rollback to kick in.
    3. Once auto-rollback starts, abort the update.
    4. Start a new good update; all instances should converge to the new config.
    """
    start_job_update(
        client,
        "test_dc_labrat_large_job.yaml",
        "start job update test/dc/labrat_large_job",
    )

    # Add some wait time for lucene index to build
    time.sleep(10)

    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job_bad_config.yaml"),
        "rollout bad config",
    )

    # wait for auto-rollback to kick in
    wait_for_auto_rolling_back(client, res.key)

    client.abort_job_update(res.key, "abort update")
    wait_for_update_status(
        client,
        res.key,
        {api.JobUpdateStatus.ROLLING_BACK},
        api.JobUpdateStatus.ABORTED,
    )

    new_config = get_job_update_request(
        "test_dc_labrat_large_job_new_config.yaml"
    )
    res = client.start_job_update(new_config, "rollout good config")

    # Sleep for a while so that the update gets triggered.
    time.sleep(5)

    wait_for_rolled_forward(client, res.key)
    res = client.get_tasks_without_configs(
        api.TaskQuery(
            jobKeys={res.key.job}, statuses={api.ScheduleStatus.RUNNING}
        )
    )
    assert len(res.tasks) == 10
    for t in res.tasks:
        assert len(t.assignedTask.task.metadata) == 1
        assert (
            list(t.assignedTask.task.metadata)[0].key
            == list(new_config.taskConfig.metadata)[0].key
        )
        assert (
            list(t.assignedTask.task.metadata)[0].value
            == list(new_config.taskConfig.metadata)[0].value
        )
        assert t.ancestorId


def test__failed_update(client):
    """
    Start an update with a bad config and verify that it fails.
    """
    res = client.start_job_update(
        get_job_update_request('test_dc_labrat_bad_config.yaml'),
        'rollout bad config')

    wait_for_update_status(
        client,
        res.key,
        {api.JobUpdateStatus.ROLLING_FORWARD},
        api.JobUpdateStatus.FAILED)


def test__start_job_update_with_pulse(client):
    """
    Start a pulsed update, pulse it, and wait for it to roll forward.
    """
    req = get_job_update_request('test_dc_labrat_pulsed.yaml')
    res = client.start_job_update(
        req, 'start pulsed job update test/dc/labrat')
    assert get_update_status(client, res.key) == \
        api.JobUpdateStatus.ROLL_FORWARD_AWAITING_PULSE

    client.pulse_job_update(res.key)
    wait_for_update_status(
        client,
        res.key,
        {
            api.JobUpdateStatus.ROLL_FORWARD_AWAITING_PULSE,
            api.JobUpdateStatus.ROLLING_FORWARD,
        },
        api.JobUpdateStatus.ROLLED_FORWARD)


def test__pulsed_update_abort(client):
    """
    Create a pulsed update, and then abort it.
    """
    req = get_job_update_request('test_dc_labrat_pulsed.yaml')
    res = client.start_job_update(
        req, 'start pulsed job update test/dc/labrat')
    assert get_update_status(client, res.key) == \
        api.JobUpdateStatus.ROLL_FORWARD_AWAITING_PULSE

    key = res.key
    client.abort_job_update(key, 'abort update')
    wait_for_update_status(
        client,
        key,
        {api.JobUpdateStatus.ROLL_FORWARD_AWAITING_PULSE},
        api.JobUpdateStatus.ABORTED)


def test__rolling_forward_abort(client):
    """
    Create an update and then abort it.
    """
    res = client.start_job_update(
        get_job_update_request('test_dc_labrat.yaml'), 'create job')
    key = res.key

    # Sleep for some time to let Peloton transition workflow state
    # from INITIALIZED -> ROLLING_FORWARD, or leave it as-is
    time.sleep(5)

    client.abort_job_update(key, 'abort update')
    wait_for_update_status(
        client,
        key,
        {api.JobUpdateStatus.ROLLING_FORWARD},
        api.JobUpdateStatus.ABORTED)


def test__deploy_on_aborted_update(client):
    """
    Deploy an update and abort it half-way, then re-deploy the same update.
    Instances that were already updated should not restart again.
    """
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )

    # Wait for a few instances to start
    time.sleep(5)

    client.abort_job_update(res.key, "abort update")
    wait_for_update_status(
        client,
        res.key,
        {api.JobUpdateStatus.ROLLING_FORWARD},
        api.JobUpdateStatus.ABORTED,
    )

    # Not all instances were created
    res = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={res.key.job})
    )
    assert len(res.tasks) < 10

    # Deploy the same update; it should only impact the remaining instances
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job_diff_executor.yaml"),
        "start job update test/dc/labrat_large_job_diff_executor",
    )
    wait_for_rolled_forward(client, res.key)

    # All instances are created
    res = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={res.key.job})
    )
    assert len(res.tasks) == 10

    # No instance should have an ancestor id, validating that instances
    # created in the previous update were not restarted/redeployed
    for task in res.tasks:
        assert task.ancestorId is None


def test__abort_auto_rollback_with_pinned_instances_and_update(client):
    """
    1. Create a job.
    2. Start a bad update (version 2) targeting a subset of instances.
    3. Wait for the auto-rollback to kick in.
    4. Once auto-rollback kicks in, abort the update.
    5. Start a new good update and wait for all instances to converge to it.
    """
    # Create a job
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    res = client.get_tasks_without_configs(
        api.TaskQuery(
            jobKeys={job_key}, statuses={api.ScheduleStatus.RUNNING}
        )
    )
    assert len(res.tasks) == 10
    for t in res.tasks:
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        assert run_id == "1"
        assert len(t.assignedTask.task.metadata) == 2
        for m in t.assignedTask.task.metadata:
            if m.key == "test_key_1":
                assert m.value == "test_value_1"
            elif m.key == "test_key_2":
                assert m.value == "test_value_2"
            else:
                assert False, "unexpected metadata %s" % m

    # start a bad update with updateOnlyTheseInstances parameter
    update_instances = [0, 2, 3, 7, 9]
    pinned_req = get_job_update_request(
        "test_dc_labrat_large_job_bad_config.yaml")
    pinned_req.settings.updateOnlyTheseInstances = set(
        [api.Range(first=i, last=i) for i in update_instances])
    pinned_req.settings.maxFailedInstances = 4

    res = client.start_job_update(
        pinned_req,
        "start a bad update test/dc/labrat_large_job with pinned instances",
    )
    # wait for auto-rollback to kick in
    wait_for_auto_rolling_back(client, res.key, timeout_secs=150)

    # abort the update
    client.abort_job_update(res.key, "abort update")
    wait_for_update_status(
        client,
        res.key,
        {api.JobUpdateStatus.ROLLING_BACK},
        api.JobUpdateStatus.ABORTED,
    )

    # start a new good update
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job_new_config.yaml"),
        "start job update test/dc/labrat_large_job with a good config",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    res = client.get_tasks_without_configs(
        api.TaskQuery(
            jobKeys={job_key}, statuses={api.ScheduleStatus.RUNNING}
        )
    )
    assert len(res.tasks) == 10
    for t in res.tasks:
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        assert len(t.assignedTask.task.metadata) == 1
        for m in t.assignedTask.task.metadata:
            if m.key == "test_key_12":
                assert m.value == "test_value_12"
            else:
                assert False, "unexpected metadata %s" % m

        if t.assignedTask.instanceId in update_instances:
            # only a few of the pinned instances might have rolled back
            assert run_id == "3" or run_id == "4"
        else:
            assert run_id == "2"


def test__get_job_update_details__restart_jobmgr(client, jobmgr, resmgr,
                                                 hostmgr, mesos_master):
    """
    Start an update, call getJobUpdateDetails, and restart jobmgr:
    1. Before recovery finishes, expect an error.
    2. After recovery finishes, expect getJobUpdateDetails to include the
       correct job.
    """
    # start job update paused
    req = get_job_update_request('test_dc_labrat_large_job.yaml')
    req.settings.updateGroupSize = 10
    req.settings.blockIfNoPulsesAfterMs = 604800000
    res = client.start_job_update(
        req, 'start job update test/dc/labrat_large_job')
    job_update_key = res.key
    job_key = job_update_key.job
    wait_for_update_status(
        client,
        job_update_key,
        {api.JobUpdateStatus.ROLLING_FORWARD},
        api.JobUpdateStatus.ROLL_FORWARD_AWAITING_PULSE,
        120,
    )

    def check():
        res = client.get_job_update_details(
            None, api.JobUpdateQuery(jobKey=job_key))
        assert len(res.detailsList) == 1
        assert res.detailsList[0].update.summary.key.job == job_key

    def wait():
        start = time.time()
        while time.time() - start < 120:
            try:
                check()
                break
            except Exception as e:
                log.info('getJobUpdateDetails failed: %s, retrying...', e)
                time.sleep(0.5)
        else:
            assert False, \
                'Timed out waiting for getJobUpdateDetails endpoint to recover'

    # verify getJobUpdateDetailsResult
    check()

    # restart jobmgr
    jobmgr.restart()
    wait()

    # wait additional time before proceeding, so that jobmgr has a leader elected
    log.info('Waiting 5 seconds before proceeding')
    time.sleep(5)

    # resume update
    client.pulse_job_update(job_update_key)
    wait_for_rolled_forward(client, job_update_key)
    wait_for_running(client, job_key)

    # verify getJobUpdateDetailsResult
    check()

    # restart jobmgr
    jobmgr.restart()
    wait()

    # wait additional time before exiting, so that jobmgr has a leader elected
    log.info('Waiting 5 seconds before exiting')
    time.sleep(5)