def test_drain(
    mock_reserve_all_resources,
    mock_build_maintenance_schedule_payload,
    mock_get_schedule_client,
):
    """drain() builds a schedule payload, reserves resources, and POSTs it.

    Also verifies that a failed resource reservation does not prevent the
    schedule from being submitted a second time.
    """
    schedule = {'fake_schedule': 'fake_value'}
    mock_build_maintenance_schedule_payload.return_value = schedule

    drain(hostnames=['some-host'], start='some-start', duration='some-duration')

    assert mock_build_maintenance_schedule_payload.call_count == 1
    assert mock_build_maintenance_schedule_payload.call_args == mock.call(
        ['some-host'], 'some-start', 'some-duration', drain=True,
    )
    assert mock_reserve_all_resources.call_count == 1
    assert mock_reserve_all_resources.call_args == mock.call(['some-host'])
    assert mock_get_schedule_client.call_count == 1
    assert mock_get_schedule_client.return_value.call_count == 1
    assert mock_get_schedule_client.return_value.call_args == mock.call(
        method="POST", endpoint="", data=json.dumps(schedule),
    )

    # A reservation failure must not stop the drain request itself:
    # the schedule client is still invoked a second time.
    mock_reserve_all_resources.side_effect = HTTPError()
    drain(hostnames=['some-host'], start='some-start', duration='some-duration')
    assert mock_get_schedule_client.call_count == 2
def test_drain(
    mock_reserve_all_resources,
    mock_build_maintenance_schedule_payload,
    mock_operator_api,
):
    """drain() builds a payload, reserves resources, and submits via the operator API."""
    schedule = {'fake_schedule': 'fake_value'}
    mock_build_maintenance_schedule_payload.return_value = schedule
    hosts = ['some-host']

    drain(hostnames=hosts, start='some-start', duration='some-duration')

    assert mock_build_maintenance_schedule_payload.call_count == 1
    assert mock_build_maintenance_schedule_payload.call_args == mock.call(
        hosts, 'some-start', 'some-duration', drain=True,
    )
    assert mock_reserve_all_resources.call_count == 1
    assert mock_reserve_all_resources.call_args == mock.call(hosts)
    assert mock_operator_api.call_count == 1
    assert mock_operator_api.return_value.call_count == 1
    assert mock_operator_api.return_value.call_args == mock.call(data=schedule)

    # Reservation errors are tolerated: the schedule is still submitted.
    mock_reserve_all_resources.side_effect = HTTPError()
    drain(hostnames=hosts, start='some-start', duration='some-duration')
    assert mock_operator_api.call_count == 2
def gracefully_terminate_slave(resource, slave_to_kill, pool_settings, current_capacity, new_capacity, dry_run):
    """Drain a Mesos slave, shrink the spot fleet, then terminate the instance.

    On a capacity-update failure the host is undrained and the error re-raised;
    on a termination failure the spot fleet capacity is restored. The host is
    always undrained at the end (when not a dry run) via the ``finally`` block.

    :param resource: dict with at least 'id' (spot fleet request id) and 'region'
    :param slave_to_kill: dict with 'pid', 'hostname' and 'ip' keys
    :param pool_settings: dict; 'drain_timeout' overrides DEFAULT_DRAIN_TIMEOUT
    :param current_capacity: capacity before the change (used for rollback/logging)
    :param new_capacity: target capacity; floored to an int for the AWS API
    :param dry_run: if True, skip drain/undrain and pass dry_run through
    :raises HTTPError: if starting the drain fails
    :raises FailSetSpotCapacity: if the capacity update fails (after undraining)
    """
    sfr_id = resource['id']
    drain_timeout = pool_settings.get('drain_timeout', DEFAULT_DRAIN_TIMEOUT)
    # The start time of the maintenance window is the point at which
    # we giveup waiting for the instance to drain and mark it for termination anyway
    start = int(time.time() + drain_timeout) * 1000000000  # nanoseconds
    # Set the duration to an hour, this is fairly arbitrary as mesos doesn't actually
    # do anything at the end of the maintenance window.
    # NOTE(review): 600 s is 10 minutes, not an hour — comment and value disagree.
    duration = 600 * 1000000000  # nanoseconds
    log.info("Draining {0}".format(slave_to_kill['pid']))
    if not dry_run:
        try:
            # Maintenance hosts are addressed as "hostname|ip" strings.
            drain_host_string = "{0}|{1}".format(slave_to_kill['hostname'], slave_to_kill['ip'])
            drain([drain_host_string], start, duration)
        except HTTPError as e:
            log.error("Failed to start drain "
                      "on {0}: {1}\n Trying next host".format(
                          slave_to_kill['hostname'], e))
            raise
    log.info("Decreasing spot fleet capacity from {0} to: {1}".format(
        current_capacity, new_capacity))
    # Instance weights can be floats but the target has to be an integer
    # because this is all AWS allows on the API call to set target capacity
    new_capacity = int(floor(new_capacity))
    try:
        set_spot_fleet_request_capacity(sfr_id, new_capacity, dry_run, region=resource['region'])
    except FailSetSpotCapacity:
        # Roll back the drain before propagating the failure.
        log.error("Couldn't update spot fleet, stopping autoscaler")
        log.info("Undraining {0}".format(slave_to_kill['pid']))
        if not dry_run:
            undrain([drain_host_string])
        raise
    log.info("Waiting for instance to drain before we terminate")
    try:
        wait_and_terminate(slave_to_kill, drain_timeout, dry_run, region=resource['region'])
    except ClientError as e:
        # Termination failed: restore the original capacity so the pool
        # is left at its pre-call size.
        log.error("Failure when terminating: {0}: {1}".format(
            slave_to_kill['pid'], e))
        log.error(
            "Setting spot fleet capacity back to {0}".format(current_capacity))
        set_spot_fleet_request_capacity(sfr_id, current_capacity, dry_run, region=resource['region'])
    finally:
        # Whether termination succeeded or failed, lift the maintenance window.
        log.info("Undraining {0}".format(slave_to_kill['pid']))
        if not dry_run:
            undrain([drain_host_string])
def paasta_maintenance():
    """Manipulate the maintenance state of a PaaSTA host.

    :returns: None
    """
    args = parse_args()

    # Map verbosity count onto a log level: 2+ -> DEBUG, 1 -> INFO, else WARNING.
    if args.verbose >= 2:
        log_level = logging.DEBUG
    elif args.verbose == 1:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING
    logging.basicConfig(level=log_level)

    action = args.action
    hostnames = args.hostname
    # Every action except 'status' operates on explicit hosts.
    if action != 'status' and not hostnames:
        paasta_print("You must specify one or more hostnames")
        return

    start = args.start
    duration = args.duration

    # Actions that take the full hostname list and return nothing.
    multi_host_actions = {
        'undrain': mesos_maintenance.undrain,
        'down': mesos_maintenance.down,
        'up': mesos_maintenance.up,
    }
    # Read-only queries that take no host argument.
    no_arg_queries = {
        'status': mesos_maintenance.friendly_status,
        'cluster_status': mesos_maintenance.status,
        'schedule': mesos_maintenance.schedule,
    }
    # Predicates that inspect a single host (the first one given).
    single_host_queries = {
        'is_safe_to_drain': is_safe_to_drain,
        'is_safe_to_kill': is_safe_to_kill,
        'is_host_drained': mesos_maintenance.is_host_drained,
        'is_host_down': mesos_maintenance.is_host_down,
        'is_host_draining': mesos_maintenance.is_host_draining,
        'is_host_past_maintenance_start': mesos_maintenance.is_host_past_maintenance_start,
        'is_host_past_maintenance_end': mesos_maintenance.is_host_past_maintenance_end,
    }

    ret = "Done"
    if action == 'drain':
        mesos_maintenance.drain(hostnames, start, duration)
    elif action in multi_host_actions:
        multi_host_actions[action](hostnames)
    elif action in no_arg_queries:
        ret = no_arg_queries[action]()
    elif action in single_host_queries:
        ret = single_host_queries[action](hostnames[0])
    else:
        raise NotImplementedError("Action: '%s' is not implemented." % action)
    paasta_print(ret)
    return ret
def paasta_maintenance():
    """Manipulate the maintenance state of a PaaSTA host.

    :returns: None
    """
    args = parse_args()

    # Verbosity: 2+ -> DEBUG, 1 -> INFO, otherwise WARNING.
    if args.verbose >= 2:
        logging.basicConfig(level=logging.DEBUG)
    elif args.verbose == 1:
        logging.basicConfig(level=logging.INFO)
    else:
        logging.basicConfig(level=logging.WARNING)

    action = args.action
    hostnames = args.hostname
    # All actions but 'status' require at least one host.
    if action != 'status' and not hostnames:
        paasta_print("You must specify one or more hostnames")
        return

    start = args.start
    duration = args.duration
    ret = "Done"

    if action == 'drain':
        mesos_maintenance.drain(hostnames, start, duration)
    elif action in ('undrain', 'down', 'up'):
        # These take the full host list and report nothing back.
        getattr(mesos_maintenance, action)(hostnames)
    elif action in ('status', 'schedule'):
        ret = "%s" % getattr(mesos_maintenance, action)()
    elif action == 'is_safe_to_drain':
        ret = is_safe_to_drain(hostnames[0])
    elif action == 'is_safe_to_kill':
        ret = is_safe_to_kill(hostnames[0])
    elif action in (
        'is_host_drained',
        'is_host_down',
        'is_host_draining',
        'is_host_past_maintenance_start',
        'is_host_past_maintenance_end',
    ):
        # Single-host predicates living on the mesos_maintenance module.
        ret = getattr(mesos_maintenance, action)(hostnames[0])
    else:
        raise NotImplementedError("Action: '%s' is not implemented." % action)
    paasta_print(ret)
    return ret
def gracefully_terminate_slave(self, slave_to_kill, current_capacity, new_capacity): drain_timeout = self.pool_settings.get('drain_timeout', DEFAULT_DRAIN_TIMEOUT) # The start time of the maintenance window is the point at which # we giveup waiting for the instance to drain and mark it for termination anyway start = int(time.time() + drain_timeout) * 1000000000 # nanoseconds # Set the duration to an hour, this is fairly arbitrary as mesos doesn't actually # do anything at the end of the maintenance window. duration = 600 * 1000000000 # nanoseconds self.log.info("Draining {}".format(slave_to_kill.pid)) should_drain = self.should_drain(slave_to_kill) if should_drain: try: drain_host_string = "{}|{}".format(slave_to_kill.hostname, slave_to_kill.ip) drain([drain_host_string], start, duration) except HTTPError as e: self.log.error("Failed to start drain " "on {}: {}\n Trying next host".format( slave_to_kill.hostname, e)) raise self.log.info("Decreasing resource from {} to: {}".format( current_capacity, new_capacity)) # Instance weights can be floats but the target has to be an integer # because this is all AWS allows on the API call to set target capacity new_capacity = int(floor(new_capacity)) try: self.set_capacity(new_capacity) except FailSetResourceCapacity: self.log.error( "Couldn't update resource capacity, stopping autoscaler") self.log.info("Undraining {}".format(slave_to_kill.pid)) if should_drain: undrain([drain_host_string]) raise self.log.info("Waiting for instance to drain before we terminate") try: self.wait_and_terminate( slave=slave_to_kill, drain_timeout=drain_timeout, dry_run=self.dry_run, region=self.resource['region'], should_drain=should_drain, ) except ClientError as e: self.log.error("Failure when terminating: {}: {}".format( slave_to_kill.pid, e)) self.log.error("Setting resource capacity back to {}".format( current_capacity)) self.set_capacity(current_capacity) self.log.info("Undraining {}".format(slave_to_kill.pid)) if should_drain: 
undrain([drain_host_string])
def mark_host_at_risk(context, host):
    """Drain *host* for one hour starting now, and record it on the context.

    Patches get_principal/get_secret so that drain() authenticates with the
    credentials loaded from the local mesos-slave secret file.

    :param context: behave-style context object; gains ``at_risk_host``
    :param host: hostname to drain
    """
    start = mesos_maintenance.datetime_to_nanoseconds(mesos_maintenance.now())
    duration = mesos_maintenance.parse_timedelta("1h")
    # contextlib.nested was deprecated in Python 2.7 and removed in Python 3;
    # use a single multi-manager `with` statement instead.
    with mock.patch(
        "paasta_tools.mesos_maintenance.get_principal", autospec=True,
    ) as mock_get_principal, mock.patch(
        "paasta_tools.mesos_maintenance.get_secret", autospec=True,
    ) as mock_get_secret:
        credentials = mesos_maintenance.load_credentials(mesos_secrets="/etc/mesos-slave-secret")
        mock_get_principal.return_value = credentials.principal
        mock_get_secret.return_value = credentials.secret
        mesos_maintenance.drain([host], start, duration)
    context.at_risk_host = host
def mark_host_at_risk(context, host):
    """Drain *host* for one hour starting now, and record it on the context.

    get_principal/get_secret are patched so that drain() uses the credentials
    loaded from the local mesos-slave secret file.

    :param context: behave-style context object; gains ``at_risk_host``
    :param host: hostname to drain
    """
    window_start = mesos_maintenance.datetime_to_nanoseconds(mesos_maintenance.now())
    window_duration = mesos_maintenance.parse_timedelta('1h')
    with mock.patch(
        'paasta_tools.mesos_maintenance.get_principal', autospec=True,
    ) as patched_principal:
        with mock.patch(
            'paasta_tools.mesos_maintenance.get_secret', autospec=True,
        ) as patched_secret:
            creds = mesos_maintenance.load_credentials(mesos_secrets='/etc/mesos-slave-secret')
            patched_principal.return_value = creds.principal
            patched_secret.return_value = creds.secret
            mesos_maintenance.drain([host], window_start, window_duration)
    context.at_risk_host = host
def gracefully_terminate_slave(resource, slave_to_kill, pool_settings, current_capacity, new_capacity, dry_run):
    """Drain a Mesos slave, shrink the spot fleet, then terminate the instance.

    On a capacity-update failure the host is undrained and the error re-raised;
    on a termination failure the spot fleet capacity is restored. The host is
    always undrained at the end (when not a dry run) via the ``finally`` block.

    :param resource: dict with at least 'id' (spot fleet request id) and 'region'
    :param slave_to_kill: dict with 'pid', 'hostname' and 'ip' keys
    :param pool_settings: dict; 'drain_timeout' overrides DEFAULT_DRAIN_TIMEOUT
    :param current_capacity: capacity before the change (used for rollback/logging)
    :param new_capacity: target capacity; floored to an int for the AWS API
    :param dry_run: if True, skip drain/undrain and pass dry_run through
    :raises HTTPError: if starting the drain fails
    :raises FailSetSpotCapacity: if the capacity update fails (after undraining)
    """
    sfr_id = resource['id']
    drain_timeout = pool_settings.get('drain_timeout', DEFAULT_DRAIN_TIMEOUT)
    # The start time of the maintenance window is the point at which
    # we giveup waiting for the instance to drain and mark it for termination anyway
    start = int(time.time() + drain_timeout) * 1000000000  # nanoseconds
    # Set the duration to an hour, this is fairly arbitrary as mesos doesn't actually
    # do anything at the end of the maintenance window.
    # NOTE(review): 600 s is 10 minutes, not an hour — comment and value disagree.
    duration = 600 * 1000000000  # nanoseconds
    log.info("Draining {0}".format(slave_to_kill['pid']))
    if not dry_run:
        try:
            # Maintenance hosts are addressed as "hostname|ip" strings.
            drain_host_string = "{0}|{1}".format(slave_to_kill['hostname'], slave_to_kill['ip'])
            drain([drain_host_string], start, duration)
        except HTTPError as e:
            log.error("Failed to start drain "
                      "on {0}: {1}\n Trying next host".format(slave_to_kill['hostname'], e))
            raise
    log.info("Decreasing spot fleet capacity from {0} to: {1}".format(current_capacity, new_capacity))
    # Instance weights can be floats but the target has to be an integer
    # because this is all AWS allows on the API call to set target capacity
    new_capacity = int(floor(new_capacity))
    try:
        set_spot_fleet_request_capacity(sfr_id, new_capacity, dry_run, region=resource['region'])
    except FailSetSpotCapacity:
        # Roll back the drain before propagating the failure.
        log.error("Couldn't update spot fleet, stopping autoscaler")
        log.info("Undraining {0}".format(slave_to_kill['pid']))
        if not dry_run:
            undrain([drain_host_string])
        raise
    log.info("Waiting for instance to drain before we terminate")
    try:
        wait_and_terminate(slave_to_kill, drain_timeout, dry_run, region=resource['region'])
    except ClientError as e:
        # Termination failed: restore the original capacity so the pool
        # is left at its pre-call size.
        log.error("Failure when terminating: {0}: {1}".format(slave_to_kill['pid'], e))
        log.error("Setting spot fleet capacity back to {0}".format(current_capacity))
        set_spot_fleet_request_capacity(sfr_id, current_capacity, dry_run, region=resource['region'])
    finally:
        # Whether termination succeeded or failed, lift the maintenance window.
        log.info("Undraining {0}".format(slave_to_kill['pid']))
        if not dry_run:
            undrain([drain_host_string])
def test_drain(
    mock_reserve_all_resources,
    mock_build_maintenance_schedule_payload,
    mock_get_schedule_client,
):
    """drain() builds a schedule payload, reserves resources, and POSTs it once."""
    schedule = {'fake_schedule': 'fake_value'}
    mock_build_maintenance_schedule_payload.return_value = schedule

    drain(hostnames=['some-host'], start='some-start', duration='some-duration')

    assert mock_build_maintenance_schedule_payload.call_count == 1
    assert mock_build_maintenance_schedule_payload.call_args == mock.call(
        ['some-host'], 'some-start', 'some-duration', drain=True,
    )
    assert mock_reserve_all_resources.call_count == 1
    assert mock_reserve_all_resources.call_args == mock.call(['some-host'])
    assert mock_get_schedule_client.call_count == 1
    assert mock_get_schedule_client.return_value.call_count == 1
    assert mock_get_schedule_client.return_value.call_args == mock.call(
        method="POST", endpoint="", data=json.dumps(schedule),
    )
def test_drain(
    mock_reserve_all_resources,
    mock_build_maintenance_schedule_payload,
    mock_operator_api,
):
    """Cover the happy path, a reservation failure, and reserve_resources=False."""
    schedule = {"fake_schedule": "fake_value"}
    mock_build_maintenance_schedule_payload.return_value = schedule

    def run_drain(**extra):
        # Single entry point for all three drain invocations below.
        drain(
            hostnames=["some-host"],
            start="some-start",
            duration="some-duration",
            **extra,
        )

    # Happy path: payload built, resources reserved, schedule submitted once.
    run_drain()
    assert mock_build_maintenance_schedule_payload.call_count == 1
    assert mock_build_maintenance_schedule_payload.call_args == mock.call(
        ["some-host"], "some-start", "some-duration", drain=True,
    )
    assert mock_reserve_all_resources.call_count == 1
    assert mock_reserve_all_resources.call_args == mock.call(["some-host"])
    assert mock_operator_api.call_count == 1
    assert mock_operator_api.return_value.call_count == 1
    assert mock_operator_api.return_value.call_args == mock.call(data=schedule)

    # A reservation failure does not prevent the schedule submission.
    mock_reserve_all_resources.side_effect = HTTPError()
    run_drain()
    assert mock_operator_api.call_count == 2

    # With reserve_resources=False no reservation is attempted at all.
    mock_reserve_all_resources.reset_mock()
    mock_operator_api.reset_mock()
    run_drain(reserve_resources=False)
    assert mock_reserve_all_resources.call_count == 0
    assert mock_operator_api.return_value.call_count == 1
async def gracefully_terminate_slave(self, slave_to_kill, capacity_diff, timer):
    """
    Since this is async, it can be suspended at an `await` call. Because of
    this, we need to re-calculate the capacity each time we call `set_capacity`
    (as another coroutine could have set the capacity while this one was
    suspended). `set_capacity` stores the currently set capacity in the object,
    and then this function re-calculates that from the capacity_diff each time
    we call `set_capacity`

    :param slave_to_kill: slave object exposing ``.pid``, ``.hostname``, ``.ip``
    :param capacity_diff: delta applied to self.capacity (negated for rollback)
    :param timer: passed through to wait_and_terminate
    :raises HTTPError: if starting the drain fails
    :raises FailSetResourceCapacity: if the capacity update fails (after undraining)
    """
    drain_timeout = self.pool_settings.get('drain_timeout', DEFAULT_DRAIN_TIMEOUT)
    # The start time of the maintenance window is the point at which
    # we giveup waiting for the instance to drain and mark it for termination anyway
    start = int(time.time() + drain_timeout) * 1000000000  # nanoseconds
    # Set the duration to an hour, this is fairly arbitrary as mesos doesn't actually
    # do anything at the end of the maintenance window.
    # NOTE(review): 600 s is 10 minutes, not an hour — comment and value disagree.
    duration = 600 * 1000000000  # nanoseconds
    self.log.info("Draining {}".format(slave_to_kill.pid))
    should_drain = self.should_drain(slave_to_kill)
    if should_drain:
        try:
            # Maintenance hosts are addressed as "hostname|ip" strings.
            drain_host_string = "{}|{}".format(slave_to_kill.hostname, slave_to_kill.ip)
            drain([drain_host_string], start, duration)
        except HTTPError as e:
            self.log.error("Failed to start drain "
                           "on {}: {}\n Trying next host".format(
                               slave_to_kill.hostname, e))
            raise
    self.log.info("Decreasing resource from {} to: {}".format(
        self.capacity, self.capacity + capacity_diff))
    # Instance weights can be floats but the target has to be an integer
    # because this is all AWS allows on the API call to set target capacity
    try:
        # Re-read self.capacity here (not a cached value) — see docstring.
        self.set_capacity(self.capacity + capacity_diff)
    except FailSetResourceCapacity:
        # Roll back the drain before propagating the failure.
        self.log.error(
            "Couldn't update resource capacity, stopping autoscaler")
        self.log.info("Undraining {}".format(slave_to_kill.pid))
        if should_drain:
            undrain([drain_host_string])
        raise
    self.log.info("Waiting for instance to drain before we terminate")
    try:
        await self.wait_and_terminate(
            slave=slave_to_kill,
            drain_timeout=drain_timeout,
            dry_run=self.dry_run,
            timer=timer,
            region=self.resource['region'],
            should_drain=should_drain,
        )
    except ClientError as e:
        # Termination failed: undo the capacity change (again recomputed
        # from the live self.capacity, which may have moved meanwhile).
        self.log.error("Failure when terminating: {}: {}".format(
            slave_to_kill.pid, e))
        self.log.error(
            "Setting resource capacity back to {}".format(self.capacity - capacity_diff))
        self.set_capacity(self.capacity - capacity_diff)
    # NOTE(review): earlier versions wrap the undrain below in a ``finally:``;
    # here it runs after the try/except, so it is skipped if an unhandled
    # exception propagates — confirm that is intended.
    self.log.info("Undraining {}".format(slave_to_kill.pid))
    if should_drain:
        undrain([drain_host_string])