def there_are_num_which_tasks(context, num, which, state, exact):
    context.max_tasks = num
    app_id = which_id(context, which)
    # 120 * 0.5 = 60 seconds
    for _ in xrange(120):
        app = context.marathon_client.get_app(app_id, embed_tasks=True)
        happy_tasks = get_happy_tasks(app, context.service, "fake_nerve_ns", context.system_paasta_config)
        happy_count = len(happy_tasks)
        if state == "healthy":
            if exact:
                if happy_count == context.max_tasks:
                    return
            else:
                if happy_count >= context.max_tasks:
                    return
        elif state == "unhealthy":
            if exact:
                if len(app.tasks) - happy_count == context.max_tasks:
                    return
            else:
                if len(app.tasks) - happy_count >= context.max_tasks:
                    return
        time.sleep(0.5)
    raise Exception("timed out waiting for %d %s tasks on %s; there are %d" %
                    (context.max_tasks, state, app_id, len(app.tasks)))
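# This 120-iteration, 0.5-second polling loop recurs throughout these step
# definitions. Below is a hedged, self-contained sketch of the same
# retry-until-timeout pattern; `poll_until` and `PollTimeoutError` are
# illustrative names, not part of paasta_tools.
import time


class PollTimeoutError(Exception):
    pass


def poll_until(condition, attempts=120, delay=0.5):
    """Call `condition` until it returns truthy, sleeping `delay` seconds
    between attempts (120 * 0.5 = 60 seconds by default)."""
    for _ in range(attempts):
        if condition():
            return
        time.sleep(delay)
    raise PollTimeoutError("condition not met after %.1f seconds" % (attempts * delay))


# Hypothetical usage: poll_until(lambda: happy_task_count() >= expected_count)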
def when_deploy_service_initiated(context, bounce_method, drain_method):
    with contextlib.nested(
        mock.patch(
            'paasta_tools.bounce_lib.get_happy_tasks',
            autospec=True,
            # Wrap function call so we can select a subset of tasks or test
            # intermediate steps, like when an app is not completely up
            side_effect=lambda app, _, __, **kwargs: get_happy_tasks(
                app, context.service, "fake_nerve_ns")[:context.max_tasks],
        ),
        mock.patch('paasta_tools.bounce_lib.bounce_lock_zookeeper', autospec=True),
        mock.patch('paasta_tools.bounce_lib.create_app_lock', autospec=True),
        mock.patch('paasta_tools.bounce_lib.time.sleep', autospec=True),
        mock.patch('paasta_tools.setup_marathon_job.load_system_paasta_config', autospec=True),
    ) as (
        _,
        _,
        _,
        _,
        mock_load_system_paasta_config,
    ):
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value=context.cluster)
        setup_marathon_job.deploy_service(
            service=context.service,
            instance=context.instance,
            marathon_jobid=context.new_config['id'],
            config=context.new_config,
            client=context.marathon_client,
            bounce_method=bounce_method,
            drain_method_name=drain_method,
            drain_method_params={},
            nerve_ns=context.instance,
            bounce_health_params={},
            soa_dir=None,
        )
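# The patch above uses side_effect to delegate to the *real* get_happy_tasks
# and then truncate its result, so the test can simulate a partially-up app.
# A minimal runnable sketch of the same mock.patch technique, using
# os.path.exists as a stand-in target (the /etc special-casing is purely
# illustrative):
import os.path

import mock  # `from unittest import mock` on Python 3

_real_exists = os.path.exists


def _fake_exists(path):
    # Delegate to the real implementation, then override one case, the way
    # the step above delegates to get_happy_tasks and slices the result.
    return _real_exists(path) and path != '/etc'


with mock.patch('os.path.exists', side_effect=_fake_exists):
    assert not os.path.exists('/etc')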
def test_get_happy_tasks_check_haproxy_multiple_locations(self):
    """If we specify that a task should be in haproxy, don't call it happy unless it's in haproxy."""
    tasks = [mock.Mock(health_check_results=[mock.Mock(alive=True)]) for i in xrange(5)]
    fake_app = mock.Mock(tasks=tasks, health_checks=[])
    with contextlib.nested(
        mock.patch(
            'paasta_tools.bounce_lib.get_registered_marathon_tasks',
            side_effect=[tasks[2:3], tasks[3:]],
            autospec=True,
        ),
        mock.patch('paasta_tools.mesos_tools.get_mesos_slaves_grouped_by_attribute', autospec=True),
    ) as (
        get_registered_marathon_tasks_patch,
        get_mesos_slaves_grouped_by_attribute_patch,
    ):
        get_mesos_slaves_grouped_by_attribute_patch.return_value = {
            'fake_region': ['fake_host1'],
            'fake_other_region': ['fake_host2'],
        }
        assert bounce_lib.get_happy_tasks(fake_app, 'service', 'namespace', check_haproxy=True) == tasks[2:]
        get_registered_marathon_tasks_patch.assert_any_call(
            'fake_host1',
            DEFAULT_SYNAPSE_PORT,
            'service.namespace',
            tasks,
        )
        get_registered_marathon_tasks_patch.assert_any_call(
            'fake_host2',
            DEFAULT_SYNAPSE_PORT,
            'service.namespace',
            tasks,
        )
def get_old_happy_unhappy_draining_tasks(other_apps, drain_method, service, nerve_ns, bounce_health_params):
    """Split tasks from old apps into 3 categories:
      - live (not draining) and happy (according to get_happy_tasks)
      - live (not draining) and unhappy
      - draining
    """
    old_app_live_happy_tasks = {}
    old_app_live_unhappy_tasks = {}
    old_app_draining_tasks = {}

    for app in other_apps:
        tasks_by_state = {
            'happy': set(),
            'unhappy': set(),
            'draining': set(),
        }
        happy_tasks = bounce_lib.get_happy_tasks(app, service, nerve_ns, **bounce_health_params)
        for task in app.tasks:
            if drain_method.is_draining(task):
                state = 'draining'
            elif task in happy_tasks:
                state = 'happy'
            else:
                state = 'unhappy'
            tasks_by_state[state].add(task)

        old_app_live_happy_tasks[app.id] = tasks_by_state['happy']
        old_app_live_unhappy_tasks[app.id] = tasks_by_state['unhappy']
        old_app_draining_tasks[app.id] = tasks_by_state['draining']

    return old_app_live_happy_tasks, old_app_live_unhappy_tasks, old_app_draining_tasks
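# A hedged usage sketch of the three-way split above, with stub objects
# standing in for Marathon tasks and a drain_lib drain method; everything
# named Fake* here is hypothetical, not part of paasta_tools.
class FakeTask(object):
    def __init__(self, name, draining=False):
        self.name = name
        self.draining = draining


class FakeDrainMethod(object):
    def is_draining(self, task):
        return task.draining


def split_tasks(tasks, drain_method, happy_tasks):
    tasks_by_state = {'happy': set(), 'unhappy': set(), 'draining': set()}
    for task in tasks:
        if drain_method.is_draining(task):
            state = 'draining'
        elif task in happy_tasks:
            state = 'happy'
        else:
            state = 'unhappy'
        tasks_by_state[state].add(task)
    return tasks_by_state


_tasks = [FakeTask('a'), FakeTask('b'), FakeTask('c', draining=True)]
_states = split_tasks(_tasks, FakeDrainMethod(), happy_tasks=[_tasks[0]])
assert {t.name for t in _states['happy']} == {'a'}
assert {t.name for t in _states['unhappy']} == {'b'}
assert {t.name for t in _states['draining']} == {'c'}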
def test_get_happy_tasks_when_some_unhealthy(self):
    """Only tasks with a passing healthcheck should be happy"""
    fake_failing_healthcheck_results = [mock.Mock(alive=False)]
    fake_successful_healthcheck_results = [mock.Mock(alive=True)]
    tasks = [mock.Mock(health_check_results=fake_failing_healthcheck_results),
             mock.Mock(health_check_results=fake_failing_healthcheck_results),
             mock.Mock(health_check_results=fake_successful_healthcheck_results)]
    fake_app = mock.Mock(tasks=tasks, health_checks=[])
    assert bounce_lib.get_happy_tasks(fake_app, 'service', 'namespace') == tasks[-1:]
def test_get_happy_tasks_min_task_uptime_when_unhealthy(self):
    """If we specify a minimum task age, tasks newer than that should not be considered happy."""
    now = datetime.datetime(2000, 1, 1, 0, 0, 0)
    tasks = [mock.Mock(health_check_results=[mock.Mock(alive=False)],
                       started_at=(now - datetime.timedelta(minutes=i)))
             for i in xrange(5)]
    fake_app = mock.Mock(tasks=tasks, health_checks=[])

    with mock.patch('paasta_tools.bounce_lib.datetime.datetime', utcnow=lambda: now, autospec=True):
        assert bounce_lib.get_happy_tasks(fake_app, 'service', 'namespace', min_task_uptime=121) == []
def when_setup_service_initiated(context):
    config = {
        'master': '%s' % get_service_connection_string('mesosmaster'),
        'scheme': 'http',
        'response_timeout': 5,
    }
    with contextlib.nested(
        mock.patch(
            'paasta_tools.bounce_lib.get_happy_tasks',
            autospec=True,
            # Wrap function call so we can select a subset of tasks or test
            # intermediate steps, like when an app is not completely up
            side_effect=lambda app, _, __, ___, **kwargs: get_happy_tasks(
                app, context.service, "fake_nerve_ns",
                context.system_paasta_config)[:context.max_tasks],
        ),
        mock.patch('paasta_tools.bounce_lib.bounce_lock_zookeeper', autospec=True),
        mock.patch('paasta_tools.bounce_lib.create_app_lock', autospec=True),
        mock.patch('paasta_tools.bounce_lib.time.sleep', autospec=True),
        mock.patch('paasta_tools.setup_marathon_job.load_system_paasta_config', autospec=True),
        mock.patch('paasta_tools.setup_marathon_job._log', autospec=True),
        mock.patch('paasta_tools.marathon_tools.get_config_hash', autospec=True, return_value='confighash'),
        mock.patch('paasta_tools.marathon_tools.get_code_sha_from_dockerurl', autospec=True, return_value='newapp'),
        mock.patch('paasta_tools.marathon_tools.get_docker_url', autospec=True, return_value='busybox'),
        mock.patch('paasta_tools.paasta_maintenance.load_credentials', autospec=True),
        mock.patch.object(mesos.cli.master, 'CFG', config),
    ) as (
        _,
        _,
        _,
        _,
        mock_load_system_paasta_config,
        _,
        _,
        _,
        _,
        mock_load_credentials,
        _,
    ):
        mock_load_credentials.side_effect = paasta_maintenance.load_credentials(
            mesos_secrets='/etc/mesos-slave-secret')
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value=context.cluster)
        # 120 * 0.5 = 60 seconds
        for _ in xrange(120):
            try:
                (code, message) = setup_marathon_job.setup_service(
                    service=context.service,
                    instance=context.instance,
                    client=context.marathon_client,
                    service_marathon_config=context.new_marathon_service_config,
                    soa_dir='/nail/etc/services',
                )
                assert code == 0, message
                return
            except MarathonHttpError:
                time.sleep(0.5)
        raise Exception("Unable to acquire app lock for setup_marathon_job.setup_service")
def test_get_happy_tasks_min_task_uptime(self):
    """If we specify a minimum task age, tasks newer than that should not be considered happy."""
    now = datetime.datetime(2000, 1, 1, 0, 0, 0)
    tasks = [mock.Mock(health_check_results=[],
                       started_at=(now - datetime.timedelta(minutes=i)))
             for i in xrange(5)]
    fake_app = mock.Mock(tasks=tasks, health_checks=[])

    # I would have just mocked datetime.datetime.utcnow, but that's apparently difficult; I have to mock
    # datetime.datetime instead, and give it a utcnow attribute.
    with mock.patch('paasta_tools.bounce_lib.datetime.datetime', utcnow=lambda: now, autospec=True):
        assert bounce_lib.get_happy_tasks(fake_app, 'service', 'namespace', min_task_uptime=121) == tasks[3:]
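# A minimal sketch of the uptime rule the min_task_uptime tests encode: a task
# only counts as happy once it has been up for at least min_task_uptime
# seconds. This reimplements the check for illustration; it is not the
# bounce_lib source.
import datetime


def has_min_uptime(started_at, min_task_uptime, now):
    return (now - started_at).total_seconds() >= min_task_uptime


_now = datetime.datetime(2000, 1, 1, 0, 0, 0)
_started = [_now - datetime.timedelta(minutes=i) for i in range(5)]
# With min_task_uptime=121 seconds, only tasks started 3+ minutes ago pass,
# matching the `tasks[3:]` expectation in the test above.
assert [has_min_uptime(s, 121, _now) for s in _started] == [False, False, False, True, True]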
def test_get_happy_tasks_when_some_unhealthy(self):
    """Only tasks with a passing healthcheck should be happy"""
    fake_failing_healthcheck_results = [mock.Mock(alive=False)]
    fake_successful_healthcheck_results = [mock.Mock(alive=True)]
    tasks = [mock.Mock(health_check_results=fake_failing_healthcheck_results),
             mock.Mock(health_check_results=fake_failing_healthcheck_results),
             mock.Mock(health_check_results=fake_successful_healthcheck_results)]
    fake_app = mock.Mock(tasks=tasks, health_checks=[])
    actual = bounce_lib.get_happy_tasks(fake_app, 'service', 'namespace', self.fake_system_paasta_config())
    expected = tasks[-1:]
    assert actual == expected
def when_deploy_service_initiated(context, bounce_method, drain_method):
    with contextlib.nested(
        mock.patch(
            'paasta_tools.bounce_lib.get_happy_tasks',
            autospec=True,
            # Wrap function call so we can select a subset of tasks or test
            # intermediate steps, like when an app is not completely up
            side_effect=lambda app, _, __, ___, **kwargs: get_happy_tasks(
                app, context.service, "fake_nerve_ns",
                context.system_paasta_config)[:context.max_tasks],
        ),
        mock.patch('paasta_tools.bounce_lib.bounce_lock_zookeeper', autospec=True),
        mock.patch('paasta_tools.bounce_lib.create_app_lock', autospec=True),
        mock.patch('paasta_tools.bounce_lib.time.sleep', autospec=True),
        mock.patch('paasta_tools.setup_marathon_job.load_system_paasta_config', autospec=True),
        mock.patch('paasta_tools.setup_marathon_job._log', autospec=True),
    ) as (
        _,
        _,
        _,
        _,
        mock_load_system_paasta_config,
        _,
    ):
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value=context.cluster)
        # 120 * 0.5 = 60 seconds
        for _ in xrange(120):
            try:
                setup_marathon_job.deploy_service(
                    service=context.service,
                    instance=context.instance,
                    marathon_jobid=context.new_config['id'],
                    config=context.new_config,
                    client=context.marathon_client,
                    bounce_method=bounce_method,
                    drain_method_name=drain_method,
                    drain_method_params={},
                    nerve_ns=context.instance,
                    bounce_health_params={},
                    soa_dir=None,
                )
                return
            except MarathonHttpError:
                time.sleep(0.5)
        raise Exception("Unable to acquire app lock for setup_marathon_job.deploy_service")
def get_tasks_by_state_for_app(
    app: MarathonApp,
    drain_method: drain_lib.DrainMethod,
    service: str,
    nerve_ns: str,
    bounce_health_params: Dict[str, Any],
    system_paasta_config: SystemPaastaConfig,
    log_deploy_error: LogDeployError,
    draining_hosts: Collection[str],
) -> TasksByStateDict:
    tasks_by_state: TasksByStateDict = {
        "happy": set(),
        "unhappy": set(),
        "draining": set(),
        "at_risk": set(),
    }

    happy_tasks = bounce_lib.get_happy_tasks(app, service, nerve_ns, system_paasta_config, **bounce_health_params)

    async def categorize_task(task: MarathonTask) -> None:
        try:
            is_draining = await drain_method.is_draining(task)
        except Exception as e:
            log_deploy_error(
                f"Ignoring {type(e).__name__} exception during is_draining of task "
                f"{task.id} {e.args}. Treating task as 'unhappy'."
            )
            state = "unhappy"
        else:
            if is_draining is True:
                state = "draining"
            elif task in happy_tasks:
                if task.host in draining_hosts:
                    state = "at_risk"
                else:
                    state = "happy"
            else:
                state = "unhappy"
        tasks_by_state[state].add(task)

    if app.tasks:
        a_sync.block(
            asyncio.wait,
            [asyncio.ensure_future(categorize_task(task)) for task in app.tasks],
        )

    return tasks_by_state
def get_tasks_by_state_for_app(
    app: MarathonApp,
    drain_method: drain_lib.DrainMethod,
    service: str,
    nerve_ns: str,
    bounce_health_params: Dict[str, Any],
    system_paasta_config: SystemPaastaConfig,
    log_deploy_error: LogDeployError,
    draining_hosts: Collection[str],
) -> TasksByStateDict:
    tasks_by_state: TasksByStateDict = {
        'happy': set(),
        'unhappy': set(),
        'draining': set(),
        'at_risk': set(),
    }

    happy_tasks = bounce_lib.get_happy_tasks(app, service, nerve_ns, system_paasta_config, **bounce_health_params)

    async def categorize_task(task: MarathonTask) -> None:
        try:
            is_draining = await drain_method.is_draining(task)
        except Exception as e:
            log_deploy_error(
                "Ignoring exception during is_draining of task %s:"
                " %s. Treating task as 'unhappy'." % (task, e),
            )
            state = 'unhappy'
        else:
            if is_draining is True:
                state = 'draining'
            elif task in happy_tasks:
                if task.host in draining_hosts:
                    state = 'at_risk'
                else:
                    state = 'happy'
            else:
                state = 'unhappy'
        tasks_by_state[state].add(task)

    if app.tasks:
        a_sync.block(
            asyncio.wait,
            [asyncio.ensure_future(categorize_task(task)) for task in app.tasks],
        )

    return tasks_by_state
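# A self-contained asyncio sketch of the fan-out pattern above: one coroutine
# per task, awaited together, each writing into a shared dict.
# FakeAsyncDrainMethod is a stand-in, and plain asyncio.run replaces
# a_sync.block (which runs a coroutine from synchronous code).
import asyncio


class FakeAsyncDrainMethod:
    async def is_draining(self, task):
        return task.endswith('-draining')


async def categorize_all(tasks, drain_method, happy_tasks):
    tasks_by_state = {'happy': set(), 'unhappy': set(), 'draining': set()}

    async def categorize(task):
        if await drain_method.is_draining(task):
            state = 'draining'
        elif task in happy_tasks:
            state = 'happy'
        else:
            state = 'unhappy'
        tasks_by_state[state].add(task)

    if tasks:
        await asyncio.gather(*(categorize(t) for t in tasks))
    return tasks_by_state


_result = asyncio.run(categorize_all(['a', 'b-draining', 'c'], FakeAsyncDrainMethod(), happy_tasks={'a'}))
assert _result == {'happy': {'a'}, 'unhappy': {'c'}, 'draining': {'b-draining'}}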
def test_get_happy_tasks_when_some_unhealthy(self):
    """Only tasks with a passing healthcheck should be happy"""
    fake_failing_healthcheck_results = [mock.Mock(alive=False)]
    fake_successful_healthcheck_results = [mock.Mock(alive=True)]
    tasks = [
        mock.Mock(health_check_results=fake_failing_healthcheck_results),
        mock.Mock(health_check_results=fake_failing_healthcheck_results),
        mock.Mock(health_check_results=fake_successful_healthcheck_results),
    ]
    fake_app = mock.Mock(tasks=tasks, health_checks=[])
    actual = bounce_lib.get_happy_tasks(fake_app, 'service', 'namespace', self.fake_system_paasta_config())
    expected = tasks[-1:]
    assert actual == expected
def test_get_happy_tasks_min_task_uptime(self):
    """If we specify a minimum task age, tasks newer than that should not be considered happy."""
    now = datetime.datetime(2000, 1, 1, 0, 0, 0)
    tasks = [mock.Mock(health_check_results=[],
                       started_at=(now - datetime.timedelta(minutes=i)))
             for i in xrange(5)]
    fake_app = mock.Mock(tasks=tasks, health_checks=[])

    # I would have just mocked datetime.datetime.utcnow, but that's apparently difficult; I have to mock
    # datetime.datetime instead, and give it a utcnow attribute.
    with mock.patch('paasta_tools.bounce_lib.datetime.datetime', utcnow=lambda: now, autospec=True):
        actual = bounce_lib.get_happy_tasks(fake_app, 'service', 'namespace', self.fake_system_paasta_config(),
                                            min_task_uptime=121)
        expected = tasks[3:]
        assert actual == expected
def test_get_happy_tasks_check_haproxy_when_unhealthy(self):
    """If we specify that a task should be in haproxy, don't call it happy unless it's in haproxy."""
    tasks = [mock.Mock(health_check_results=[mock.Mock(alive=False)]) for i in range(5)]
    fake_app = mock.Mock(tasks=tasks, health_checks=[])
    with mock.patch(
        'paasta_tools.bounce_lib.get_registered_marathon_tasks',
        return_value=tasks[2:],
        autospec=True,
    ):
        actual = bounce_lib.get_happy_tasks(
            fake_app,
            'service',
            'namespace',
            self.fake_system_paasta_config(),
            check_haproxy=True,
        )
        expected = []
        assert actual == expected
def test_get_happy_tasks_check_haproxy_when_unhealthy(self):
    """If we specify that a task should be in haproxy, don't call it happy unless it's in haproxy."""
    tasks = [mock.Mock(health_check_results=[mock.Mock(alive=False)]) for i in xrange(5)]
    fake_app = mock.Mock(tasks=tasks, health_checks=[])
    with contextlib.nested(
        mock.patch('paasta_tools.bounce_lib.get_registered_marathon_tasks',
                   return_value=tasks[2:],
                   autospec=True),
        mock.patch('paasta_tools.mesos_tools.get_mesos_slaves_grouped_by_attribute',
                   return_value={'fake_region': ['fake_host']},
                   autospec=True),
    ) as (
        _,
        get_mesos_slaves_grouped_by_attribute_patch,
    ):
        assert bounce_lib.get_happy_tasks(fake_app, 'service', 'namespace', check_haproxy=True) == []
def test_get_happy_tasks_with_multiple_healthchecks_fail(self):
    """Only tasks with at least one passing healthcheck should be happy"""
    fake_failing_healthcheck_results = [
        mock.Mock(alive=False),
        mock.Mock(alive=False),
    ]
    tasks = [mock.Mock(health_check_results=fake_failing_healthcheck_results)]
    fake_app = mock.Mock(tasks=tasks, health_checks=[])
    assert (
        bounce_lib.get_happy_tasks(
            fake_app, "service", "namespace", self.fake_system_paasta_config()
        )
        == []
    )
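# A hedged sketch of the per-task happiness rule these healthcheck tests
# encode: a task with any health check results needs at least one alive
# result; a task with none is happy only if the app defines no health checks.
# This is a reimplementation for illustration, not the bounce_lib source.
from collections import namedtuple

HCResult = namedtuple('HCResult', 'alive')


def is_task_happy(health_check_results, app_has_health_checks):
    if not health_check_results:
        return not app_has_health_checks
    return any(result.alive for result in health_check_results)


assert is_task_happy([], app_has_health_checks=False)
assert not is_task_happy([], app_has_health_checks=True)
assert not is_task_happy([HCResult(False), HCResult(False)], app_has_health_checks=False)
assert is_task_happy([HCResult(False), HCResult(True)], app_has_health_checks=False)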
def when_setup_service_initiated(context):
    with mock.patch(
        'paasta_tools.bounce_lib.get_happy_tasks',
        autospec=True,
        # Wrap function call so we can select a subset of tasks or test
        # intermediate steps, like when an app is not completely up
        side_effect=lambda app, _, __, ___, **kwargs: get_happy_tasks(
            app,
            context.service,
            "fake_nerve_ns",
            context.system_paasta_config,
        )[:context.max_tasks],
    ), mock.patch(
        'paasta_tools.bounce_lib.bounce_lock_zookeeper',
        autospec=True,
    ), mock.patch(
        'paasta_tools.bounce_lib.time.sleep',
        autospec=True,
    ), mock.patch(
        'paasta_tools.setup_marathon_job.load_system_paasta_config',
        autospec=True,
    ) as mock_load_system_paasta_config, mock.patch(
        'paasta_tools.setup_marathon_job._log',
        autospec=True,
    ), mock.patch(
        'paasta_tools.marathon_tools.get_config_hash',
        autospec=True,
        return_value='confighash',
    ), mock.patch(
        'paasta_tools.marathon_tools.get_code_sha_from_dockerurl',
        autospec=True,
        return_value='newapp',
    ), mock.patch(
        'paasta_tools.utils.InstanceConfig.get_docker_url',
        autospec=True,
        return_value='busybox',
    ), mock.patch(
        'paasta_tools.mesos_maintenance.get_principal',
        autospec=True,
    ) as mock_get_principal, mock.patch(
        'paasta_tools.mesos_maintenance.get_secret',
        autospec=True,
    ) as mock_get_secret:
        credentials = mesos_maintenance.load_credentials(mesos_secrets='/etc/mesos-slave-secret')
        mock_get_principal.return_value = credentials.principal
        mock_get_secret.return_value = credentials.secret
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value=context.cluster)
        # 120 * 0.5 = 60 seconds
        for _ in range(120):
            try:
                marathon_apps = marathon_tools.get_all_marathon_apps(context.marathon_client, embed_tasks=True)
                (code, message, bounce_again) = setup_marathon_job.setup_service(
                    service=context.service,
                    instance=context.instance,
                    client=context.marathon_client,
                    marathon_apps=marathon_apps,
                    service_marathon_config=context.new_marathon_service_config,
                    soa_dir='/nail/etc/services',
                )
                assert code == 0, message
                return
            except MarathonHttpError:
                time.sleep(0.5)
        raise Exception("Unable to acquire app lock for setup_marathon_job.setup_service")
def test_get_happy_tasks_check_haproxy_multiple_locations(self):
    """If we specify that a task should be in haproxy, don't call it happy unless it's in haproxy."""
    tasks = [
        mock.Mock(health_check_results=[mock.Mock(alive=True)])
        for i in xrange(5)
    ]
    fake_app = mock.Mock(tasks=tasks, health_checks=[])
    with contextlib.nested(
        mock.patch(
            'paasta_tools.bounce_lib.get_registered_marathon_tasks',
            side_effect=[tasks[2:3], tasks[3:]],
            autospec=True,
        ),
        mock.patch('paasta_tools.mesos_tools.get_mesos_slaves_grouped_by_attribute', autospec=True),
    ) as (
        get_registered_marathon_tasks_patch,
        get_mesos_slaves_grouped_by_attribute_patch,
    ):
        get_mesos_slaves_grouped_by_attribute_patch.return_value = {
            'fake_region': ['fake_host1'],
            'fake_other_region': ['fake_host2'],
        }
        actual = bounce_lib.get_happy_tasks(
            fake_app, 'service', 'namespace', self.fake_system_paasta_config(), check_haproxy=True)
        expected = tasks[2:]
        assert actual == expected
        get_registered_marathon_tasks_patch.assert_any_call(
            'fake_host1',
            123456,
            utils.DEFAULT_SYNAPSE_HAPROXY_URL_FORMAT,
            'service.namespace',
            tasks,
        )
        get_registered_marathon_tasks_patch.assert_any_call(
            'fake_host2',
            123456,
            utils.DEFAULT_SYNAPSE_HAPROXY_URL_FORMAT,
            'service.namespace',
            tasks,
        )
def when_there_are_num_which_tasks(context, num, which, state):
    context.max_tasks = int(num)
    app_id = which_id(context, which)
    # 120 * 0.5 = 60 seconds
    for _ in xrange(120):
        app = context.marathon_client.get_app(app_id, embed_tasks=True)
        happy_count = len(get_happy_tasks(app, context.service, "fake_nerve_ns", context.system_paasta_config))
        if state == "healthy":
            if happy_count >= context.max_tasks:
                return
        elif state == "unhealthy":
            if len(app.tasks) - happy_count >= context.max_tasks:
                return
        time.sleep(0.5)
    raise Exception("timed out waiting for %d %s tasks on %s; there are %s" %
                    (context.max_tasks, state, app_id, app.tasks))
def when_there_are_num_which_tasks(context, num, which, state):
    context.max_tasks = int(num)
    app_id = which_id(context, which)
    # 120 * 0.5 = 60 seconds
    for _ in xrange(120):
        app = context.marathon_client.get_app(app_id, embed_tasks=True)
        happy_count = len(get_happy_tasks(app, context.service, "fake_nerve_ns"))
        if state == "healthy":
            if happy_count >= context.max_tasks:
                return
        elif state == "unhealthy":
            if len(app.tasks) - happy_count >= context.max_tasks:
                return
        time.sleep(0.5)
    # %d needs the task count, not the task list, or formatting raises a TypeError.
    raise Exception("timed out waiting for %d %s tasks on %s; there are %d" %
                    (context.max_tasks, state, app_id, len(app.tasks)))
def test_get_happy_tasks_min_task_uptime_when_unhealthy(self):
    """If we specify a minimum task age, tasks newer than that should not be considered happy."""
    now = datetime.datetime(2000, 1, 1, 0, 0, 0)
    tasks = [mock.Mock(
        health_check_results=[mock.Mock(alive=False)],
        started_at=(now - datetime.timedelta(minutes=i)),
    ) for i in range(5)]
    fake_app = mock.Mock(tasks=tasks, health_checks=[])

    with mock.patch('paasta_tools.bounce_lib.datetime.datetime', utcnow=lambda: now, autospec=True):
        actual = bounce_lib.get_happy_tasks(
            fake_app,
            'service',
            'namespace',
            self.fake_system_paasta_config(),
            min_task_uptime=121,
        )
        expected = []
        assert actual == expected
def get_old_happy_unhappy_draining_tasks_for_app(app, drain_method, service, nerve_ns, bounce_health_params):
    tasks_by_state = {
        'happy': set(),
        'unhappy': set(),
        'draining': set(),
    }
    happy_tasks = bounce_lib.get_happy_tasks(app, service, nerve_ns, **bounce_health_params)
    for task in app.tasks:
        if drain_method.is_draining(task):
            state = 'draining'
        elif task in happy_tasks:
            state = 'happy'
        else:
            state = 'unhappy'
        tasks_by_state[state].add(task)
    return tasks_by_state
def test_get_happy_tasks_check_haproxy(self):
    """If we specify that a task should be in haproxy, don't call it happy unless it's in haproxy."""
    tasks = [mock.Mock(health_check_results=[mock.Mock(alive=True)]) for i in xrange(5)]
    fake_app = mock.Mock(tasks=tasks, health_checks=[])
    with contextlib.nested(
        mock.patch('paasta_tools.bounce_lib.get_registered_marathon_tasks',
                   return_value=tasks[2:],
                   autospec=True),
        mock.patch('paasta_tools.bounce_lib.mesos_tools.get_mesos_slaves_grouped_by_attribute',
                   return_value={'fake_region': [{'hostname': 'fakehost'}]},
                   autospec=True),
        mock.patch('paasta_tools.mesos_tools.get_slaves', return_value=[], autospec=True),
    ) as (
        _,
        get_mesos_slaves_grouped_by_attribute_patch,
        __,
    ):
        actual = bounce_lib.get_happy_tasks(fake_app, 'service', 'namespace', self.fake_system_paasta_config(),
                                            check_haproxy=True)
        expected = tasks[2:]
        assert actual == expected
def get_tasks_by_state_for_app(
    app,
    drain_method,
    service,
    nerve_ns,
    bounce_health_params,
    system_paasta_config,
    log_deploy_error,
    draining_hosts,
):
    tasks_by_state = {
        'happy': set(),
        'unhappy': set(),
        'draining': set(),
        'at_risk': set(),
    }
    happy_tasks = bounce_lib.get_happy_tasks(app, service, nerve_ns, system_paasta_config, **bounce_health_params)
    for task in app.tasks:
        try:
            is_draining = drain_method.is_draining(task)
        except Exception as e:
            log_deploy_error(
                "Ignoring exception during is_draining of task %s:"
                " %s. Treating task as 'unhappy'." % (task, e),
            )
            state = 'unhappy'
        else:
            if is_draining is True:
                state = 'draining'
            elif task in happy_tasks:
                if task.host in draining_hosts:
                    state = 'at_risk'
                else:
                    state = 'happy'
            else:
                state = 'unhappy'
        tasks_by_state[state].add(task)
    return tasks_by_state
def test_get_happy_tasks_check_haproxy_multiple_locations(self):
    """If we specify that a task should be in haproxy, don't call it happy unless it's in haproxy."""
    tasks = [mock.Mock(health_check_results=[mock.Mock(alive=True)]) for i in xrange(5)]
    fake_app = mock.Mock(tasks=tasks, health_checks=[])
    with contextlib.nested(
        mock.patch(
            'paasta_tools.bounce_lib.get_registered_marathon_tasks',
            side_effect=[tasks[2:3], tasks[3:]],
            autospec=True,
        ),
        mock.patch('paasta_tools.mesos_tools.get_mesos_slaves_grouped_by_attribute', autospec=True),
        mock.patch('paasta_tools.mesos_tools.get_slaves', return_value=[], autospec=True),
    ) as (
        get_registered_marathon_tasks_patch,
        get_mesos_slaves_grouped_by_attribute_patch,
        _,
    ):
        get_mesos_slaves_grouped_by_attribute_patch.return_value = {
            'fake_region': [{'hostname': 'fake_host1'}],
            'fake_other_region': [{'hostname': 'fake_host2'}],
        }
        actual = bounce_lib.get_happy_tasks(fake_app, 'service', 'namespace', self.fake_system_paasta_config(),
                                            check_haproxy=True)
        expected = tasks[2:]
        assert actual == expected
        get_registered_marathon_tasks_patch.assert_any_call(
            'fake_host1',
            123456,
            utils.DEFAULT_SYNAPSE_HAPROXY_URL_FORMAT,
            'service.namespace',
            tasks,
        )
        get_registered_marathon_tasks_patch.assert_any_call(
            'fake_host2',
            123456,
            utils.DEFAULT_SYNAPSE_HAPROXY_URL_FORMAT,
            'service.namespace',
            tasks,
        )
def when_deploy_service_initiated(context, bounce_method, drain_method):
    with contextlib.nested(
        mock.patch(
            "paasta_tools.bounce_lib.get_happy_tasks",
            autospec=True,
            # Wrap function call so we can select a subset of tasks or test
            # intermediate steps, like when an app is not completely up
            side_effect=lambda app, _, __, ___, **kwargs: get_happy_tasks(
                app, context.service, "fake_nerve_ns", context.system_paasta_config
            )[: context.max_tasks],
        ),
        mock.patch("paasta_tools.bounce_lib.bounce_lock_zookeeper", autospec=True),
        mock.patch("paasta_tools.bounce_lib.create_app_lock", autospec=True),
        mock.patch("paasta_tools.bounce_lib.time.sleep", autospec=True),
        mock.patch("paasta_tools.setup_marathon_job.load_system_paasta_config", autospec=True),
        mock.patch("paasta_tools.setup_marathon_job._log", autospec=True),
    ) as (_, _, _, _, mock_load_system_paasta_config, _):
        mock_load_system_paasta_config.return_value.get_cluster = mock.Mock(return_value=context.cluster)
        # 120 * 0.5 = 60 seconds
        for _ in xrange(120):
            try:
                setup_marathon_job.deploy_service(
                    service=context.service,
                    instance=context.instance,
                    marathon_jobid=context.new_config["id"],
                    config=context.new_config,
                    client=context.marathon_client,
                    bounce_method=bounce_method,
                    drain_method_name=drain_method,
                    drain_method_params={},
                    nerve_ns=context.instance,
                    bounce_health_params={},
                    soa_dir=None,
                )
                return
            except MarathonHttpError:
                time.sleep(0.5)
        raise Exception("Unable to acquire app lock for setup_marathon_job.deploy_service")
def test_get_happy_tasks_when_running_with_healthchecks_defined(self):
    """All running tasks with no health check results are unhealthy if the app defines healthchecks"""
    now = datetime.datetime(2000, 1, 1, 0, 0, 0, tzinfo=pytz.utc)
    tasks = [
        mock.Mock(
            health_check_results=[],
            started_at=(now - datetime.timedelta(minutes=i)),
        )
        for i in range(5)
    ]
    fake_app = mock.Mock(
        tasks=tasks,
        health_checks=[
            mock.Mock(grace_period_seconds=1234, interval_seconds=4321)
        ],
    )
    with mock.patch(
        "paasta_tools.marathon_tools.datetime.datetime",
        now=lambda x: now,
        autospec=True,
    ):
        assert (bounce_lib.get_happy_tasks(
            fake_app, "service", "namespace", self.fake_system_paasta_config()) == [])
def get_tasks_by_state_for_app(app, drain_method, service, nerve_ns, bounce_health_params, system_paasta_config):
    tasks_by_state = {
        'happy': set(),
        'unhappy': set(),
        'draining': set(),
        'at_risk': set(),
    }
    happy_tasks = bounce_lib.get_happy_tasks(app, service, nerve_ns, system_paasta_config, **bounce_health_params)
    draining_hosts = get_draining_hosts()
    for task in app.tasks:
        if drain_method.is_draining(task):
            state = 'draining'
        elif task in happy_tasks:
            if task.host in draining_hosts:
                state = 'at_risk'
            else:
                state = 'happy'
        else:
            state = 'unhappy'
        tasks_by_state[state].add(task)
    return tasks_by_state
def test_get_happy_tasks_check_each_host(self):
    """If we specify that a task should be in haproxy, don't call it happy unless it's in haproxy."""
    tasks = [mock.Mock(health_check_results=[mock.Mock(alive=True)], host='fake_host1') for i in range(5)]
    fake_app = mock.Mock(tasks=tasks, health_checks=[])
    with mock.patch(
        'paasta_tools.bounce_lib.get_registered_marathon_tasks',
        side_effect=[([t] if i >= 2 else []) for i, t in enumerate(tasks)],
        autospec=True,
    ) as get_registered_marathon_tasks_patch:
        actual = bounce_lib.get_happy_tasks(
            fake_app,
            'service',
            'namespace',
            self.fake_system_paasta_config(),
            check_haproxy=True,
        )
        expected = tasks[2:]
        assert actual == expected
        for task in tasks:
            get_registered_marathon_tasks_patch.assert_any_call(
                'fake_host1',
                123456,
                utils.DEFAULT_SYNAPSE_HAPROXY_URL_FORMAT,
                'service.namespace',
                [task],
            )
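# A hedged sketch of the per-host lookup this test expects: each task is
# checked individually against the synapse/haproxy registry on its own host
# and kept only if registered there. The plain dict registry is a stand-in
# for get_registered_marathon_tasks.
def filter_registered_per_host(tasks, registered_by_host):
    """`registered_by_host` maps hostname -> set of task ids known to haproxy."""
    return [t for t in tasks if t['id'] in registered_by_host.get(t['host'], set())]


_tasks = [{'id': 'task%d' % i, 'host': 'fake_host1'} for i in range(5)]
_registry = {'fake_host1': {'task2', 'task3', 'task4'}}
assert filter_registered_per_host(_tasks, _registry) == _tasks[2:]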
def test_get_happy_tasks_when_all_healthy(self):
    """All tasks with only passing healthchecks should be happy"""
    tasks = [mock.Mock(health_check_results=[mock.Mock(alive=True)]) for _ in range(5)]
    fake_app = mock.Mock(tasks=tasks, health_checks=[])
    assert bounce_lib.get_happy_tasks(fake_app, 'service', 'namespace', self.fake_system_paasta_config()) == tasks
def test_get_happy_tasks_when_all_healthy(self):
    """All tasks with only passing healthchecks should be happy"""
    tasks = [mock.Mock(health_check_results=[mock.Mock(alive=True)]) for _ in xrange(5)]
    fake_app = mock.Mock(tasks=tasks, health_checks=[])
    assert bounce_lib.get_happy_tasks(fake_app, 'service', 'namespace', self.fake_system_paasta_config()) == tasks
def deploy_service(
    service,
    instance,
    marathon_jobid,
    config,
    client,
    bounce_method,
    drain_method_name,
    drain_method_params,
    nerve_ns,
    bounce_health_params,
    soa_dir,
):
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param client: A MarathonClient object
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :returns: A tuple of (status, output) to be used with send_sensu_event"""

    def log_deploy_error(errormsg, level='event'):
        return _log(
            service=service,
            line=errormsg,
            component='deploy',
            level='event',
            cluster=cluster,
            instance=instance
        )

    short_id = marathon_tools.format_job_id(service, instance)

    cluster = load_system_paasta_config().get_cluster()
    existing_apps = marathon_tools.get_matching_apps(service, instance, client, embed_failures=True)
    new_app_list = [a for a in existing_apps if a.id == '/%s' % config['id']]
    other_apps = [a for a in existing_apps if a.id != '/%s' % config['id']]
    serviceinstance = "%s.%s" % (service, instance)

    if new_app_list:
        new_app = new_app_list[0]
        if len(new_app_list) != 1:
            raise ValueError("Only expected one app per ID; found %d" % len(new_app_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(new_app, service, nerve_ns, **bounce_health_params)
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg)

    old_app_live_happy_tasks, old_app_live_unhappy_tasks, old_app_draining_tasks = get_old_happy_unhappy_draining_tasks(
        other_apps,
        drain_method,
        service,
        nerve_ns,
        bounce_health_params,
    )

    if new_app_running:
        protected_draining_tasks = set()
        if new_app.instances < config['instances']:
            client.scale_app(app_id=new_app.id, instances=config['instances'], force=True)
        elif new_app.instances > config['instances']:
            num_tasks_to_scale = max(min(len(new_app.tasks), new_app.instances) - config['instances'], 0)
            task_dict = get_old_happy_unhappy_draining_tasks_for_app(
                new_app,
                drain_method,
                service,
                nerve_ns,
                bounce_health_params,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])

            tasks_to_move_draining = min(len(scaling_app_draining_tasks), num_tasks_to_scale)
            old_app_draining_tasks[new_app.id] = set(scaling_app_draining_tasks[:tasks_to_move_draining])
            protected_draining_tasks.update(scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks), num_tasks_to_scale)
            old_app_live_unhappy_tasks[new_app.id] = set(scaling_app_unhappy_tasks[:tasks_to_move_unhappy])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_happy = min(len(scaling_app_happy_tasks), num_tasks_to_scale)
            old_app_live_happy_tasks[new_app.id] = set(scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]

        # If any tasks on the new app happen to be draining (e.g. someone reverts to an older version with
        # `paasta mark-for-deployment`), then we should undrain them.
        for task in new_app.tasks:
            if task not in protected_draining_tasks:
                drain_method.stop_draining(task)

    # Re-drain any already draining tasks on old apps
    for tasks in old_app_draining_tasks.values():
        for task in tasks:
            drain_method.drain(task)

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg)

        try:
            with bounce_lib.bounce_lock_zookeeper(short_id):
                do_bounce(
                    bounce_func=bounce_func,
                    drain_method=drain_method,
                    config=config,
                    new_app_running=new_app_running,
                    happy_new_tasks=happy_new_tasks,
                    old_app_live_happy_tasks=old_app_live_happy_tasks,
                    old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
                    old_app_draining_tasks=old_app_draining_tasks,
                    service=service,
                    bounce_method=bounce_method,
                    serviceinstance=serviceinstance,
                    cluster=cluster,
                    instance=instance,
                    marathon_jobid=marathon_jobid,
                    client=client,
                    soa_dir=soa_dir,
                )
        except bounce_lib.LockHeldException:
            log.error("Instance %s already being bounced. Exiting", short_id)
            return (1, "Instance %s is already being bounced." % short_id)
    except Exception:
        loglines = ['Exception raised during deploy of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            log_deploy_error(logline, level='debug')
        raise

    return (0, 'Service deployed.')
def test_get_happy_tasks_when_running_with_healthchecks_defined(self):
    """All running tasks with no health check results are unhealthy if the app defines healthchecks"""
    tasks = [mock.Mock(health_check_results=[]) for _ in range(5)]
    fake_app = mock.Mock(tasks=tasks, health_checks=["fake_healthcheck_definition"])
    assert bounce_lib.get_happy_tasks(fake_app, 'service', 'namespace', self.fake_system_paasta_config()) == []
def test_get_happy_tasks_with_multiple_healthchecks_fail(self):
    """Only tasks with at least one passing healthcheck should be happy"""
    fake_failing_healthcheck_results = [mock.Mock(alive=False), mock.Mock(alive=False)]
    tasks = [mock.Mock(health_check_results=fake_failing_healthcheck_results)]
    fake_app = mock.Mock(tasks=tasks, health_checks=[])
    assert bounce_lib.get_happy_tasks(fake_app, 'service', 'namespace') == []
def test_get_happy_tasks_when_running_without_healthchecks_defined(self):
    """All running tasks with no health check results are healthy if the app does not define healthchecks"""
    tasks = [mock.Mock(health_check_results=[]) for _ in xrange(5)]
    fake_app = mock.Mock(tasks=tasks, health_checks=[])
    assert bounce_lib.get_happy_tasks(fake_app, 'service', 'namespace') == tasks
def deploy_service(
    service,
    instance,
    marathon_jobid,
    config,
    client,
    marathon_apps,
    bounce_method,
    drain_method_name,
    drain_method_params,
    nerve_ns,
    bounce_health_params,
    soa_dir,
    bounce_margin_factor=1.0,
):
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param client: A MarathonClient object
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param bounce_margin_factor: the multiplication factor used to calculate the number of instances to be drained
    :returns: A tuple of (status, output) to be used with send_sensu_event"""

    def log_deploy_error(errormsg, level='event'):
        return _log(service=service, line=errormsg, component='deploy', level='event',
                    cluster=cluster, instance=instance)

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    existing_apps = marathon_tools.get_matching_apps(service, instance, marathon_apps)
    new_app_list = [a for a in existing_apps if a.id == '/%s' % config['id']]
    other_apps = [a for a in existing_apps if a.id != '/%s' % config['id']]
    serviceinstance = "%s.%s" % (service, instance)

    if new_app_list:
        new_app = new_app_list[0]
        if len(new_app_list) != 1:
            raise ValueError("Only expected one app per ID; found %d" % len(new_app_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(new_app, service, nerve_ns, system_paasta_config,
                                                     **bounce_health_params)
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg)

    (
        old_app_live_happy_tasks,
        old_app_live_unhappy_tasks,
        old_app_draining_tasks,
        old_app_at_risk_tasks,
    ) = get_tasks_by_state(
        other_apps,
        drain_method,
        service,
        nerve_ns,
        bounce_health_params,
        system_paasta_config,
    )

    if new_app_running:
        num_at_risk_tasks = get_num_at_risk_tasks(new_app)
        if new_app.instances < config['instances'] + num_at_risk_tasks:
            log.info("Scaling %s from %d to %d instances." %
                     (new_app.id, new_app.instances, config['instances'] + num_at_risk_tasks))
            client.scale_app(app_id=new_app.id, instances=config['instances'] + num_at_risk_tasks, force=True)
        # If we have more than the specified number of instances running, we will want to drain some of them.
        # We will start by draining any tasks running on at-risk hosts.
        elif new_app.instances > config['instances']:
            num_tasks_to_scale = max(min(len(new_app.tasks), new_app.instances) - config['instances'], 0)
            task_dict = get_tasks_by_state_for_app(
                new_app,
                drain_method,
                service,
                nerve_ns,
                bounce_health_params,
                system_paasta_config,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])
            scaling_app_at_risk_tasks = list(task_dict['at_risk'])

            tasks_to_move_draining = min(len(scaling_app_draining_tasks), num_tasks_to_scale)
            old_app_draining_tasks[new_app.id] = set(scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks), num_tasks_to_scale)
            old_app_live_unhappy_tasks[new_app.id] = set(scaling_app_unhappy_tasks[:tasks_to_move_unhappy])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_at_risk = min(len(scaling_app_at_risk_tasks), num_tasks_to_scale)
            old_app_at_risk_tasks[new_app.id] = set(scaling_app_at_risk_tasks[:tasks_to_move_at_risk])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_at_risk

            tasks_to_move_happy = min(len(scaling_app_happy_tasks), num_tasks_to_scale)
            old_app_live_happy_tasks[new_app.id] = set(scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]

        # TODO: don't take actions in deploy_service.
        undrain_tasks(
            to_undrain=new_app.tasks,
            leave_draining=old_app_draining_tasks.get(new_app.id, []),
            drain_method=drain_method,
            log_deploy_error=log_deploy_error,
        )

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg)

        do_bounce(
            bounce_func=bounce_func,
            drain_method=drain_method,
            config=config,
            new_app_running=new_app_running,
            happy_new_tasks=happy_new_tasks,
            old_app_live_happy_tasks=old_app_live_happy_tasks,
            old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
            old_app_draining_tasks=old_app_draining_tasks,
            old_app_at_risk_tasks=old_app_at_risk_tasks,
            service=service,
            bounce_method=bounce_method,
            serviceinstance=serviceinstance,
            cluster=cluster,
            instance=instance,
            marathon_jobid=marathon_jobid,
            client=client,
            soa_dir=soa_dir,
            bounce_margin_factor=bounce_margin_factor,
        )
    except bounce_lib.LockHeldException:
        logline = 'Failed to get lock to create marathon app for %s.%s' % (service, instance)
        log_deploy_error(logline, level='debug')
        return (0, "Couldn't get marathon lock, skipping until next time")
    except Exception:
        logline = 'Exception raised during deploy of service %s:\n%s' % (service, traceback.format_exc())
        log_deploy_error(logline, level='debug')
        raise

    return (0, 'Service deployed.')
def deploy_service(
    service,
    instance,
    marathon_jobid,
    config,
    client,
    marathon_apps,
    bounce_method,
    drain_method_name,
    drain_method_params,
    nerve_ns,
    bounce_health_params,
    soa_dir,
    bounce_margin_factor=1.0,
):
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param client: A MarathonClient object
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :param bounce_margin_factor: the multiplication factor used to calculate the number of instances to be drained
    :returns: A tuple of (status, output) to be used with send_sensu_event"""

    def log_deploy_error(errormsg, level='event'):
        return _log(
            service=service,
            line=errormsg,
            component='deploy',
            level='event',
            cluster=cluster,
            instance=instance
        )

    short_id = marathon_tools.format_job_id(service, instance)

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    existing_apps = marathon_tools.get_matching_apps(service, instance, marathon_apps)
    new_app_list = [a for a in existing_apps if a.id == '/%s' % config['id']]
    other_apps = [a for a in existing_apps if a.id != '/%s' % config['id']]
    serviceinstance = "%s.%s" % (service, instance)

    if new_app_list:
        new_app = new_app_list[0]
        if len(new_app_list) != 1:
            raise ValueError("Only expected one app per ID; found %d" % len(new_app_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(new_app, service, nerve_ns, system_paasta_config,
                                                     **bounce_health_params)
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg)

    (old_app_live_happy_tasks,
     old_app_live_unhappy_tasks,
     old_app_draining_tasks,
     old_app_at_risk_tasks,
     ) = get_tasks_by_state(
        other_apps,
        drain_method,
        service,
        nerve_ns,
        bounce_health_params,
        system_paasta_config,
    )

    if new_app_running:
        num_at_risk_tasks = get_num_at_risk_tasks(new_app)
        if new_app.instances < config['instances'] + num_at_risk_tasks:
            log.info("Scaling %s from %d to %d instances." %
                     (new_app.id, new_app.instances, config['instances'] + num_at_risk_tasks))
            client.scale_app(app_id=new_app.id, instances=config['instances'] + num_at_risk_tasks, force=True)
        # If we have more than the specified number of instances running, we will want to drain some of them.
        # We will start by draining any tasks running on at-risk hosts.
        elif new_app.instances > config['instances']:
            num_tasks_to_scale = max(min(len(new_app.tasks), new_app.instances) - config['instances'], 0)
            task_dict = get_tasks_by_state_for_app(
                new_app,
                drain_method,
                service,
                nerve_ns,
                bounce_health_params,
                system_paasta_config,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])
            scaling_app_at_risk_tasks = list(task_dict['at_risk'])

            tasks_to_move_draining = min(len(scaling_app_draining_tasks), num_tasks_to_scale)
            old_app_draining_tasks[new_app.id] = set(scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks), num_tasks_to_scale)
            old_app_live_unhappy_tasks[new_app.id] = set(scaling_app_unhappy_tasks[:tasks_to_move_unhappy])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_at_risk = min(len(scaling_app_at_risk_tasks), num_tasks_to_scale)
            old_app_at_risk_tasks[new_app.id] = set(scaling_app_at_risk_tasks[:tasks_to_move_at_risk])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_at_risk

            tasks_to_move_happy = min(len(scaling_app_happy_tasks), num_tasks_to_scale)
            old_app_live_happy_tasks[new_app.id] = set(scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]

        # TODO: don't take actions in deploy_service.
        undrain_tasks(
            to_undrain=new_app.tasks,
            leave_draining=old_app_draining_tasks.get(new_app.id, []),
            drain_method=drain_method,
            log_deploy_error=log_deploy_error,
        )

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg)

        try:
            with bounce_lib.bounce_lock_zookeeper(short_id):
                do_bounce(
                    bounce_func=bounce_func,
                    drain_method=drain_method,
                    config=config,
                    new_app_running=new_app_running,
                    happy_new_tasks=happy_new_tasks,
                    old_app_live_happy_tasks=old_app_live_happy_tasks,
                    old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
                    old_app_draining_tasks=old_app_draining_tasks,
                    old_app_at_risk_tasks=old_app_at_risk_tasks,
                    service=service,
                    bounce_method=bounce_method,
                    serviceinstance=serviceinstance,
                    cluster=cluster,
                    instance=instance,
                    marathon_jobid=marathon_jobid,
                    client=client,
                    soa_dir=soa_dir,
                    bounce_margin_factor=bounce_margin_factor,
                )
        except bounce_lib.LockHeldException:
            log.error("Instance %s already being bounced. Exiting", short_id)
            return (1, "Instance %s is already being bounced." % short_id)
    except Exception:
        logline = 'Exception raised during deploy of service %s:\n%s' % (service, traceback.format_exc())
        log_deploy_error(logline, level='debug')
        raise

    return (0, 'Service deployed.')
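# When scaling down, the deploy_service variants above hand surplus tasks back
# to the bounce in priority order: draining first, then unhappy, then at-risk,
# then happy. A compact, hedged sketch of that selection over plain lists:
def pick_tasks_to_scale_down(num_to_scale, draining, unhappy, at_risk, happy):
    picked = {}
    for state, tasks in (('draining', draining), ('unhappy', unhappy),
                         ('at_risk', at_risk), ('happy', happy)):
        take = min(len(tasks), num_to_scale)
        picked[state] = tasks[:take]
        num_to_scale -= take
    # Whatever happy tasks were not picked stay with the new app.
    return picked, happy[len(picked['happy']):]


_picked, _keep = pick_tasks_to_scale_down(
    3, draining=['d1'], unhappy=['u1'], at_risk=[], happy=['h1', 'h2'],
)
assert _picked == {'draining': ['d1'], 'unhappy': ['u1'], 'at_risk': [], 'happy': ['h1']}
assert _keep == ['h2']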
def test_get_happy_tasks_when_running_with_healthchecks_defined(self):
    """All running tasks with no health check results are unhealthy if the app defines healthchecks"""
    tasks = [mock.Mock(health_check_results=[]) for _ in xrange(5)]
    fake_app = mock.Mock(tasks=tasks, health_checks=["fake_healthcheck_definition"])
    assert bounce_lib.get_happy_tasks(fake_app, 'service', 'namespace', self.fake_system_paasta_config()) == []
def deploy_service(
    service,
    instance,
    marathon_jobid,
    config,
    client,
    bounce_method,
    drain_method_name,
    drain_method_params,
    nerve_ns,
    bounce_health_params,
    soa_dir,
):
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param client: A MarathonClient object
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :returns: A tuple of (status, output) to be used with send_sensu_event"""

    def log_deploy_error(errormsg, level="event"):
        return _log(
            service=service,
            line=errormsg,
            component="deploy",
            level="event",
            cluster=cluster,
            instance=instance
        )

    short_id = marathon_tools.format_job_id(service, instance)

    cluster = load_system_paasta_config().get_cluster()
    existing_apps = marathon_tools.get_matching_apps(service, instance, client, embed_failures=True)
    new_app_list = [a for a in existing_apps if a.id == "/%s" % config["id"]]
    other_apps = [a for a in existing_apps if a.id != "/%s" % config["id"]]
    serviceinstance = "%s.%s" % (service, instance)

    if new_app_list:
        new_app = new_app_list[0]
        if len(new_app_list) != 1:
            raise ValueError("Only expected one app per ID; found %d" % len(new_app_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(new_app, service, nerve_ns, **bounce_health_params)
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = "ERROR: drain_method not recognized: %s. Must be one of (%s)" % (
            drain_method_name,
            ", ".join(drain_lib.list_drain_methods()),
        )
        log_deploy_error(errormsg)
        return (1, errormsg)

    old_app_live_tasks, old_app_draining_tasks = get_old_live_draining_tasks(other_apps, drain_method)

    # Re-drain any already draining tasks on old apps
    for tasks in old_app_draining_tasks.values():
        for task in tasks:
            drain_method.drain(task)

    # If any tasks on the new app happen to be draining (e.g. someone reverts to an older version with
    # `paasta mark-for-deployment`), then we should undrain them.
    if new_app_running:
        for task in new_app.tasks:
            drain_method.stop_draining(task)

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = "ERROR: bounce_method not recognized: %s. Must be one of (%s)" % (
                bounce_method,
                ", ".join(bounce_lib.list_bounce_methods()),
            )
            log_deploy_error(errormsg)
            return (1, errormsg)

        try:
            with bounce_lib.bounce_lock_zookeeper(short_id):
                do_bounce(
                    bounce_func=bounce_func,
                    drain_method=drain_method,
                    config=config,
                    new_app_running=new_app_running,
                    happy_new_tasks=happy_new_tasks,
                    old_app_live_tasks=old_app_live_tasks,
                    old_app_draining_tasks=old_app_draining_tasks,
                    service=service,
                    bounce_method=bounce_method,
                    serviceinstance=serviceinstance,
                    cluster=cluster,
                    instance=instance,
                    marathon_jobid=marathon_jobid,
                    client=client,
                    soa_dir=soa_dir,
                )
        except bounce_lib.LockHeldException:
            log.error("Instance %s already being bounced. Exiting", short_id)
            return (1, "Instance %s is already being bounced." % short_id)
    except Exception:
        loglines = ["Exception raised during deploy of service %s:" % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            log_deploy_error(logline, level="debug")
        raise

    return (0, "Service deployed.")
def deploy_service( service: str, instance: str, marathon_jobid: str, config: marathon_tools.FormattedMarathonAppDict, clients: marathon_tools.MarathonClients, marathon_apps_with_clients: Collection[Tuple[MarathonApp, MarathonClient]], bounce_method: str, drain_method_name: str, drain_method_params: Dict[str, Any], nerve_ns: str, bounce_health_params: Dict[str, Any], soa_dir: str, job_config: marathon_tools.MarathonServiceConfig, bounce_margin_factor: float = 1.0, ) -> Tuple[int, str, Optional[float]]: """Deploy the service to marathon, either directly or via a bounce if needed. Called by setup_service when it's time to actually deploy. :param service: The name of the service to deploy :param instance: The instance of the service to deploy :param marathon_jobid: Full id of the marathon job :param config: The complete configuration dict to send to marathon :param clients: A MarathonClients object :param bounce_method: The bounce method to use, if needed :param drain_method_name: The name of the traffic draining method to use. :param nerve_ns: The nerve namespace to look in. :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks. :param bounce_margin_factor: the multiplication factor used to calculate the number of instances to be drained :returns: A tuple of (status, output, bounce_in_seconds) to be used with send_sensu_event""" def log_deploy_error(errormsg: str, level: str = 'event') -> None: return _log( service=service, line=errormsg, component='deploy', level='event', cluster=cluster, instance=instance, ) system_paasta_config = load_system_paasta_config() cluster = system_paasta_config.get_cluster() existing_apps_with_clients = marathon_tools.get_matching_apps_with_clients( service=service, instance=instance, marathon_apps_with_clients=marathon_apps_with_clients, ) new_client = clients.get_current_client_for_service(job_config) new_apps_with_clients_list: List[Tuple[MarathonApp, MarathonClient]] = [] other_apps_with_clients: List[Tuple[MarathonApp, MarathonClient]] = [] for a, c in existing_apps_with_clients: if a.id == '/%s' % config['id'] and c == new_client: new_apps_with_clients_list.append((a, c)) else: other_apps_with_clients.append((a, c)) serviceinstance = "%s.%s" % (service, instance) if new_apps_with_clients_list: new_app, new_client = new_apps_with_clients_list[0] if len(new_apps_with_clients_list) != 1: raise ValueError( "Only expected one app per ID per shard; found %d" % len(new_apps_with_clients_list)) new_app_running = True happy_new_tasks = bounce_lib.get_happy_tasks( new_app, service, nerve_ns, system_paasta_config, **bounce_health_params, ) else: new_app_running = False happy_new_tasks = [] try: drain_method = drain_lib.get_drain_method( drain_method_name, service=service, instance=instance, nerve_ns=nerve_ns, drain_method_params=drain_method_params, ) except KeyError: errormsg = 'ERROR: drain_method not recognized: %s. 
Must be one of (%s)' % \ (drain_method_name, ', '.join(drain_lib.list_drain_methods())) log_deploy_error(errormsg) return (1, errormsg, None) try: draining_hosts = get_draining_hosts() except ReadTimeout as e: errormsg = "ReadTimeout encountered trying to get draining hosts: %s" % e return (1, errormsg, 60) ( old_app_live_happy_tasks, old_app_live_unhappy_tasks, old_app_draining_tasks, old_app_at_risk_tasks, ) = get_tasks_by_state( other_apps_with_clients=other_apps_with_clients, drain_method=drain_method, service=service, nerve_ns=nerve_ns, bounce_health_params=bounce_health_params, system_paasta_config=system_paasta_config, log_deploy_error=log_deploy_error, draining_hosts=draining_hosts, ) # The first thing we need to do is take up the "slack" of old apps, to stop # them from launching new things that we are going to have to end up draining # and killing anyway. for a, c in other_apps_with_clients: marathon_tools.take_up_slack(app=a, client=c) num_at_risk_tasks = 0 if new_app_running: num_at_risk_tasks = get_num_at_risk_tasks( new_app, draining_hosts=draining_hosts) if new_app.instances < config['instances'] + num_at_risk_tasks: log.info("Scaling %s up from %d to %d instances." % (new_app.id, new_app.instances, config['instances'] + num_at_risk_tasks)) new_client.scale_app(app_id=new_app.id, instances=config['instances'] + num_at_risk_tasks, force=True) # If we have more than the specified number of instances running, we will want to drain some of them. # We will start by draining any tasks running on at-risk hosts. elif new_app.instances > config['instances']: num_tasks_to_scale = max( min(len(new_app.tasks), new_app.instances) - config['instances'], 0) task_dict = get_tasks_by_state_for_app( app=new_app, drain_method=drain_method, service=service, nerve_ns=nerve_ns, bounce_health_params=bounce_health_params, system_paasta_config=system_paasta_config, log_deploy_error=log_deploy_error, draining_hosts=draining_hosts, ) scaling_app_happy_tasks = list(task_dict['happy']) scaling_app_unhappy_tasks = list(task_dict['unhappy']) scaling_app_draining_tasks = list(task_dict['draining']) scaling_app_at_risk_tasks = list(task_dict['at_risk']) tasks_to_move_draining = min(len(scaling_app_draining_tasks), num_tasks_to_scale) old_app_draining_tasks[(new_app.id, new_client)] = set( scaling_app_draining_tasks[:tasks_to_move_draining]) num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks), num_tasks_to_scale) old_app_live_unhappy_tasks[(new_app.id, new_client)] = set( scaling_app_unhappy_tasks[:tasks_to_move_unhappy], ) num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy tasks_to_move_at_risk = min(len(scaling_app_at_risk_tasks), num_tasks_to_scale) old_app_at_risk_tasks[(new_app.id, new_client)] = set( scaling_app_at_risk_tasks[:tasks_to_move_at_risk]) num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_at_risk tasks_to_move_happy = min(len(scaling_app_happy_tasks), num_tasks_to_scale) old_app_live_happy_tasks[(new_app.id, new_client)] = set( scaling_app_happy_tasks[:tasks_to_move_happy]) happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:] # slack represents remaining the extra remaining instances that are configured # in marathon that don't have a launched task yet. When scaling down we want to # reduce this slack so marathon doesn't get a chance to launch a new task in # that space that we will then have to drain and kill again. 
marathon_tools.take_up_slack(client=new_client, app=new_app) # TODO: don't take actions in deploy_service. undrain_tasks( to_undrain=new_app.tasks, leave_draining=old_app_draining_tasks.get((new_app.id, new_client), []), drain_method=drain_method, log_deploy_error=log_deploy_error, ) # log all uncaught exceptions and raise them again try: try: bounce_func = bounce_lib.get_bounce_method_func(bounce_method) except KeyError: errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \ (bounce_method, ', '.join(bounce_lib.list_bounce_methods())) log_deploy_error(errormsg) return (1, errormsg, None) bounce_again_in_seconds = do_bounce( bounce_func=bounce_func, drain_method=drain_method, config=config, new_app_running=new_app_running, happy_new_tasks=happy_new_tasks, old_app_live_happy_tasks=old_app_live_happy_tasks, old_app_live_unhappy_tasks=old_app_live_unhappy_tasks, old_app_draining_tasks=old_app_draining_tasks, old_app_at_risk_tasks=old_app_at_risk_tasks, service=service, bounce_method=bounce_method, serviceinstance=serviceinstance, cluster=cluster, instance=instance, marathon_jobid=marathon_jobid, clients=clients, soa_dir=soa_dir, job_config=job_config, bounce_margin_factor=bounce_margin_factor, ) except bounce_lib.LockHeldException: logline = 'Failed to get lock to create marathon app for %s.%s' % ( service, instance) log_deploy_error(logline, level='debug') return (0, "Couldn't get marathon lock, skipping until next time", None) except Exception: logline = 'Exception raised during deploy of service %s:\n%s' % ( service, traceback.format_exc()) log_deploy_error(logline, level='debug') raise if num_at_risk_tasks: bounce_again_in_seconds = 60 elif new_app_running: if new_app.instances > config['instances']: bounce_again_in_seconds = 60 return (0, 'Service deployed.', bounce_again_in_seconds)
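# A minimal, self-contained sketch (not paasta_tools code) of the scale-down
# selection order implemented in the elif branch above: when the new app has
# more instances than configured, excess tasks are handed to the bounce for
# draining in the order draining -> unhappy -> at_risk -> happy, so the
# healthiest tasks are the last to be touched. The helper name and its
# plain-list inputs are illustrative assumptions, not part of the real module.
def pick_tasks_to_scale_down(draining, unhappy, at_risk, happy, excess):
    """Return up to `excess` tasks to hand over, preferring already-draining,
    then unhappy, then at-risk tasks, and taking happy tasks only as a last
    resort."""
    picked = []
    for bucket in (draining, unhappy, at_risk, happy):
        take = min(len(bucket), excess - len(picked))
        picked.extend(bucket[:take])
        if len(picked) == excess:
            break
    return picked

# Example: with 3 excess tasks, both draining tasks and the one unhappy task
# are chosen before any happy task is considered.
assert pick_tasks_to_scale_down(['d1', 'd2'], ['u1'], [], ['h1', 'h2'], 3) == ['d1', 'd2', 'u1']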
def test_get_happy_tasks_with_multiple_healthchecks_fail(self):
    """Tasks whose healthcheck results are all failing should not be happy"""
    fake_failing_healthcheck_results = [mock.Mock(alive=False), mock.Mock(alive=False)]
    tasks = [mock.Mock(health_check_results=fake_failing_healthcheck_results)]
    fake_app = mock.Mock(tasks=tasks, health_checks=[])
    assert bounce_lib.get_happy_tasks(fake_app, 'service', 'namespace', self.fake_system_paasta_config()) == []
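# The healthcheck tests in this file hinge on one rule: a task counts as happy
# if at least one of its health_check_results is alive. A rough sketch of that
# predicate, assuming the simple path with healthcheck results present (the
# real get_happy_tasks also handles haproxy registration, min_task_uptime, and
# tasks whose checks haven't run yet, all omitted here):
def is_task_happy(task):
    # Any single passing check makes the task happy; only a task whose checks
    # all fail is unhappy.
    return any(result.alive for result in task.health_check_results)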
def deploy_service(
    service,
    instance,
    marathon_jobid,
    config,
    client,
    bounce_method,
    drain_method_name,
    drain_method_params,
    nerve_ns,
    bounce_health_params,
    soa_dir,
):
    """Deploy the service to marathon, either directly or via a bounce if needed.
    Called by setup_service when it's time to actually deploy.

    :param service: The name of the service to deploy
    :param instance: The instance of the service to deploy
    :param marathon_jobid: Full id of the marathon job
    :param config: The complete configuration dict to send to marathon
    :param client: A MarathonClient object
    :param bounce_method: The bounce method to use, if needed
    :param drain_method_name: The name of the traffic draining method to use.
    :param nerve_ns: The nerve namespace to look in.
    :param bounce_health_params: A dictionary of options for bounce_lib.get_happy_tasks.
    :returns: A tuple of (status, output) to be used with send_sensu_event"""

    def log_deploy_error(errormsg, level='event'):
        return _log(
            service=service,
            line=errormsg,
            component='deploy',
            level=level,
            cluster=cluster,
            instance=instance,
        )

    short_id = marathon_tools.format_job_id(service, instance)

    cluster = load_system_paasta_config().get_cluster()
    existing_apps = marathon_tools.get_matching_apps(service, instance, client, embed_failures=True)
    new_app_list = [a for a in existing_apps if a.id == '/%s' % config['id']]
    other_apps = [a for a in existing_apps if a.id != '/%s' % config['id']]
    serviceinstance = "%s.%s" % (service, instance)

    if new_app_list:
        new_app = new_app_list[0]
        if len(new_app_list) != 1:
            raise ValueError("Only expected one app per ID; found %d" % len(new_app_list))
        new_app_running = True
        happy_new_tasks = bounce_lib.get_happy_tasks(new_app, service, nerve_ns, **bounce_health_params)
    else:
        new_app_running = False
        happy_new_tasks = []

    try:
        drain_method = drain_lib.get_drain_method(
            drain_method_name,
            service=service,
            instance=instance,
            nerve_ns=nerve_ns,
            drain_method_params=drain_method_params,
        )
    except KeyError:
        errormsg = 'ERROR: drain_method not recognized: %s. Must be one of (%s)' % \
            (drain_method_name, ', '.join(drain_lib.list_drain_methods()))
        log_deploy_error(errormsg)
        return (1, errormsg)

    old_app_live_happy_tasks, old_app_live_unhappy_tasks, old_app_draining_tasks = get_old_happy_unhappy_draining_tasks(
        other_apps,
        drain_method,
        service,
        nerve_ns,
        bounce_health_params,
    )

    if new_app_running:
        protected_draining_tasks = set()
        if new_app.instances < config['instances']:
            client.scale_app(app_id=new_app.id, instances=config['instances'], force=True)
        elif new_app.instances > config['instances']:
            num_tasks_to_scale = max(min(len(new_app.tasks), new_app.instances) - config['instances'], 0)
            task_dict = get_old_happy_unhappy_draining_tasks_for_app(
                new_app,
                drain_method,
                service,
                nerve_ns,
                bounce_health_params,
            )
            scaling_app_happy_tasks = list(task_dict['happy'])
            scaling_app_unhappy_tasks = list(task_dict['unhappy'])
            scaling_app_draining_tasks = list(task_dict['draining'])

            tasks_to_move_draining = min(len(scaling_app_draining_tasks), num_tasks_to_scale)
            old_app_draining_tasks[new_app.id] = set(scaling_app_draining_tasks[:tasks_to_move_draining])
            protected_draining_tasks.update(scaling_app_draining_tasks[:tasks_to_move_draining])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_draining

            tasks_to_move_unhappy = min(len(scaling_app_unhappy_tasks), num_tasks_to_scale)
            old_app_live_unhappy_tasks[new_app.id] = set(scaling_app_unhappy_tasks[:tasks_to_move_unhappy])
            num_tasks_to_scale = num_tasks_to_scale - tasks_to_move_unhappy

            tasks_to_move_happy = min(len(scaling_app_happy_tasks), num_tasks_to_scale)
            old_app_live_happy_tasks[new_app.id] = set(scaling_app_happy_tasks[:tasks_to_move_happy])
            happy_new_tasks = scaling_app_happy_tasks[tasks_to_move_happy:]
        # If any tasks on the new app happen to be draining (e.g. someone reverts to an older version with
        # `paasta mark-for-deployment`), then we should undrain them.
        for task in new_app.tasks:
            if task not in protected_draining_tasks:
                drain_method.stop_draining(task)

    # Re-drain any already draining tasks on old apps
    for tasks in old_app_draining_tasks.values():
        for task in tasks:
            drain_method.drain(task)

    # log all uncaught exceptions and raise them again
    try:
        try:
            bounce_func = bounce_lib.get_bounce_method_func(bounce_method)
        except KeyError:
            errormsg = 'ERROR: bounce_method not recognized: %s. Must be one of (%s)' % \
                (bounce_method, ', '.join(bounce_lib.list_bounce_methods()))
            log_deploy_error(errormsg)
            return (1, errormsg)

        try:
            with bounce_lib.bounce_lock_zookeeper(short_id):
                do_bounce(
                    bounce_func=bounce_func,
                    drain_method=drain_method,
                    config=config,
                    new_app_running=new_app_running,
                    happy_new_tasks=happy_new_tasks,
                    old_app_live_happy_tasks=old_app_live_happy_tasks,
                    old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
                    old_app_draining_tasks=old_app_draining_tasks,
                    service=service,
                    bounce_method=bounce_method,
                    serviceinstance=serviceinstance,
                    cluster=cluster,
                    instance=instance,
                    marathon_jobid=marathon_jobid,
                    client=client,
                    soa_dir=soa_dir,
                )
        except bounce_lib.LockHeldException:
            log.error("Instance %s already being bounced. Exiting", short_id)
            return (1, "Instance %s is already being bounced." % short_id)
    except Exception:
        loglines = ['Exception raised during deploy of service %s:' % service]
        loglines.extend(traceback.format_exc().rstrip().split("\n"))
        for logline in loglines:
            log_deploy_error(logline, level='debug')
        raise

    return (0, 'Service deployed.')
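# A tiny illustrative sketch (not paasta_tools code) of the undrain guard in
# the function above: tasks that the scale-down branch just selected for
# draining are "protected", and every other task on the new app gets
# stop_draining called on it. This matters when someone reverts to an older
# version with `paasta mark-for-deployment` and the old-now-new app still has
# tasks mid-drain. The helper name is an assumption for illustration only.
def tasks_to_undrain(new_app_tasks, protected_draining_tasks):
    return [task for task in new_app_tasks if task not in protected_draining_tasks]

# Example: t2 was just picked for draining by the scale-down, so only t1 and
# t3 would be undrained.
assert tasks_to_undrain(['t1', 't2', 't3'], {'t2'}) == ['t1', 't3']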
def test_get_happy_tasks_with_multiple_healthchecks_success(self): """All tasks with at least one passing healthcheck should be happy""" fake_successful_healthcheck_results = [mock.Mock(alive=True), mock.Mock(alive=False)] tasks = [mock.Mock(health_check_results=fake_successful_healthcheck_results)] fake_app = mock.Mock(tasks=tasks, health_checks=[]) assert bounce_lib.get_happy_tasks(fake_app, 'service', 'namespace') == tasks
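# The pass/fail healthcheck tests above differ only in their inputs; a hedged
# sketch of how they could be folded into one parametrized test. This assumes
# pytest is available and uses the older get_happy_tasks signature without a
# system_paasta_config argument, as in the success test above; adapt the call
# to whichever signature the surrounding test class actually targets.
import mock
import pytest

@pytest.mark.parametrize('alive_flags,expect_happy', [
    ([True, False], True),    # one passing check -> happy
    ([False, False], False),  # all failing checks -> not happy
])
def test_get_happy_tasks_healthcheck_results(alive_flags, expect_happy):
    results = [mock.Mock(alive=alive) for alive in alive_flags]
    tasks = [mock.Mock(health_check_results=results)]
    fake_app = mock.Mock(tasks=tasks, health_checks=[])
    happy = bounce_lib.get_happy_tasks(fake_app, 'service', 'namespace')
    assert happy == (tasks if expect_happy else [])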