class TestTrafficSentinel(object):
    """Exercise the TrafficSentinel client against canned (mocked) replies.

    While ``mock_traffic_sentinel`` is true, ``urllib2.urlopen`` is replaced
    by a mock that returns a prepared reply. When it is false the tests only
    check that some value came back and skip the exact-value assertions.
    """

    def setup(self):
        """Build the client under test and remember the real ``urlopen``."""
        self.mock_traffic_sentinel = True
        self.traffic_sentinel = TrafficSentinel("fake", username="******", password="******")
        self.original_urlopen = urllib2.urlopen

    def patch_urllib(self, return_string):
        """Make ``urllib2.urlopen`` return a file-like wrapper of return_string."""
        canned_reply = StringIO(return_string)
        self.traffic_sentinel_string = canned_reply
        urllib2.urlopen = Mock(return_value=canned_reply)

    def teardown(self):
        """Put the real ``urlopen`` back so other tests are unaffected."""
        urllib2.urlopen = self.original_urlopen

    def test_get_metric_statistics(self):
        """Two samples for one host should yield their average."""
        # The host name may be overridden from the environment so that the
        # same test can be pointed at a real Traffic Sentinel, which will
        # report its own values rather than the canned ones.
        test_host = os.environ.get("TRAFFIC_SENTINEL_HOST", "fake.ts.host.tld")
        loads = [0.010, 0.020]
        test_reply = "".join("%s,%f\n" % (test_host, load) for load in loads)
        load_average = sum(loads) / float(len(loads))

        if self.mock_traffic_sentinel:
            self.patch_urllib(test_reply)

        period = 60
        start_time = datetime.now() - timedelta(days=1)
        end_time = datetime.now()
        metric_name = "load_five"
        statistics = Statistics.AVERAGE

        result = self.traffic_sentinel.get_metric_statistics(
            period, start_time, end_time, metric_name, statistics)

        assert len(result) > 0
        host_stats = result.get(test_host)
        assert host_stats
        assert host_stats.get(Statistics.AVERAGE)

        # Exact values are only known when the reply was canned.
        if self.mock_traffic_sentinel:
            assert abs(host_stats[Statistics.AVERAGE] - load_average) < 0.0000001

    def test_get_metric_statistics_app_attributes(self):
        """An ``app_attributes`` metric is reported per process id."""
        test_process = os.environ.get("TRAFFIC_SENTINEL_PROCESS", "fake.process")
        queue_length = 1
        ml = 1
        app_attributes = ["pid=%s&ql=%s&ml=%s" % (test_process, queue_length, ml)]
        test_reply = "%s\n" % (app_attributes[0])

        if self.mock_traffic_sentinel:
            self.patch_urllib(test_reply)

        period = 60
        start_time = datetime.now() - timedelta(days=1)
        end_time = datetime.now()
        metric_name = "app_attributes:ml"
        statistics = Statistics.AVERAGE
        dimensions = {"pid": [test_process]}

        result = self.traffic_sentinel.get_metric_statistics(
            period, start_time, end_time, metric_name, statistics, dimensions
        )

        assert len(result) > 0
        process_stats = result.get(test_process)
        assert process_stats
        assert process_stats.get(Statistics.AVERAGE)

        # Exact values are only known when the reply was canned.
        if self.mock_traffic_sentinel:
            assert abs(process_stats[Statistics.AVERAGE] - ml) < 0.0000001

    def test_build_script(self):
        """The generated query script carries interval, select and where clauses."""
        query_fields = ["first", "second"]
        query_type = "host"
        group = 60
        interval = "201209190101.01-201209200101.01"
        dimensions = {"hostname": ["somevm.cloud.tld", "someothervm.cloud.tld"]}

        script = self.traffic_sentinel._build_script(
            query_fields, query_type, interval, group, dimensions)

        expected_interval = 'interval = "%s"' % interval
        expected_select = 'select = "%s"' % ",".join(query_fields)
        expected_where = 'where = "%s"' % "(hostname = somevm.cloud.tld | hostname = someothervm.cloud.tld)"

        assert expected_interval in script
        assert expected_select in script
        assert expected_where in script
class SensorPolicy(IPolicy):
    """Scale a set of processes up or down based on a sensor metric.

    On every ``apply_policy`` call the configured metric is sampled from the
    sensor aggregator, averaged over the reporting hosts/processes, and
    compared against the scale-up and scale-down thresholds.
    """

    _SENSOR_PARAMS = ('metric', 'minimum_processes', 'maximum_processes',
                      'sample_period', 'sample_function', 'cooldown_period',
                      'scale_up_threshold', 'scale_up_n_processes',
                      'scale_down_threshold', 'scale_down_n_processes')

    def __init__(self, parameters=None, process_definition_id=None,
                 schedule_process_callback=None, terminate_process_callback=None,
                 process_state_callback=None, process_configuration=None,
                 aggregator_config=None, *args, **kwargs):
        """Set up the Policy

        @param parameters: The parameters used by this policy to determine the
        distribution and number of VMs. This policy expects a dictionary with
        the keys described on the ``parameters`` property.

        @param process_definition_id: The process definition id to send to the
        PD on launch

        @param schedule_process_callback: A callback to schedule a process to a
        PD. Must have signature: schedule(pd_name, process_definition_id), and
        return a upid as a string

        @param terminate_process_callback: A callback to terminate a process on
        a PD. Must have signature: terminate(upid)

        @param process_state_callback: A callback to get a process state from
        a PD. Must have signature: process_state(upid)

        @param aggregator_config: configuration dict of aggregator. For
        traffic sentinel, this should look like:
          config = {
              'type': 'trafficsentinel',
              'host': 'host.name.tld',
              'port': 1235,
              'username': '******',
              'password': '******'
          }

        Raises Exception when aggregator_config is missing or names an
        aggregator type other than 'trafficsentinel', and PolicyError when
        ``parameters`` is given but invalid.
        """
        self.schedule_process = schedule_process_callback or dummy_schedule_process_callback
        self.terminate_process = terminate_process_callback or dummy_terminate_process_callback
        self.process_state = process_state_callback or dummy_process_state_callback

        self._parameters = None
        if parameters:
            # Goes through the validating property setter below, which also
            # fills in self._schedule_kwargs.
            self.parameters = parameters
        else:
            self._schedule_kwargs = {}

        self.process_definition_id = process_definition_id
        self.previous_all_procs = {}
        self._status = HAState.PENDING
        self.minimum_n = 1
        # datetime.min means "never scaled", so the first apply_policy call
        # is never considered to be inside the cooldown window.
        self.last_scale_action = datetime.datetime.min

        if aggregator_config is None:
            raise Exception("Must provide an aggregator config")

        aggregator_type = aggregator_config.get('type', '').lower()
        if aggregator_type == 'trafficsentinel':
            host = aggregator_config.get('host')
            username = aggregator_config.get('username')
            password = aggregator_config.get('password')
            port = aggregator_config.get('port', 443)
            protocol = aggregator_config.get('protocol', 'https')
            self._sensor_aggregator = TrafficSentinel(host, username, password,
                                                      port=port, protocol=protocol)
            self.app_metrics = self._sensor_aggregator.app_metrics
            # NOTE(review): host_metrics is populated from app_metrics, which
            # looks like a copy/paste slip. Left unchanged because the
            # aggregator's attributes are not visible here -- confirm whether
            # this should read the aggregator's host metrics instead.
            self.host_metrics = self._sensor_aggregator.app_metrics
        else:
            raise Exception("Don't know what to do with %s aggregator type" % aggregator_type)

        if kwargs.get('name'):
            self.logprefix = "HA Agent (%s): " % kwargs['name']
        else:
            self.logprefix = ""

    @staticmethod
    def _coerce_parameter(parameters, key, cast, description, non_negative=False):
        """Replace parameters[key] with cast(parameters[key]) or raise PolicyError.

        A missing value (None), an unparseable value, or -- when non_negative
        is set -- a value below zero all produce the same PolicyError message:
        "<key> '<raw value>' is not <description>".
        """
        raw_value = parameters.get(key)
        try:
            value = cast(raw_value)
            if non_negative and value < 0:
                raise ValueError()
        except (TypeError, ValueError):
            # TypeError covers a missing parameter: int(None) / float(None).
            raise PolicyError("%s '%s' is not %s" % (key, raw_value, description))
        parameters[key] = value

    @property
    def parameters(self):
        """parameters

        a dictionary of parameters that looks like:

        metric: Name of Sensor Aggregator Metric to use for scaling decisions
        sample_period: Number of seconds of sample data to use (eg. if 3600,
            use sample data from 1 hour ago until present time
        sample_function: Statistical function to apply to sampled data. Choose
            from Average, Sum, SampleCount, Maximum, Minimum
        cooldown_period: Minimum time in seconds between scale up or scale
            down actions
        scale_up_threshold: If the sampled metric is above this value, scale
            up the number of processes
        scale_up_n_processes: Number of processes to scale up by
        scale_down_threshold: If the sampled metric is below this value, scale
            down the number of processes
        scale_down_n_processes: Number of processes to scale down by
        minimum_processes: Minimum number of processes to maintain
        maximum_processes: Maximum number of processes to maintain
        """
        return self._parameters

    @parameters.setter
    def parameters(self, new_parameters):
        """Merge new_parameters into the current ones, validating the result.

        Nothing is stored on the policy unless every check passes. Raises
        PolicyError for an unknown key or for any missing/invalid value.
        """
        allowed_keys = _SCHEDULE_PROCESS_KWARGS + self._SENSOR_PARAMS
        for key in new_parameters:
            if key not in allowed_keys:
                raise PolicyError("%s not a valid parameter for sensor" % key)

        # Validate a merged copy so a failed update leaves self._parameters
        # exactly as it was (in particular, still None if never set).
        parameters = dict(self._parameters or {})
        parameters.update(new_parameters)

        if parameters.get('metric') is None:
            msg = "a metric_name must be provided"
            raise PolicyError(msg)

        # "positive" in the messages below is historical wording; zero is accepted.
        self._coerce_parameter(parameters, 'sample_period', int,
                               "a positive integer", non_negative=True)

        if parameters.get('sample_function') not in Statistics.ALL:
            msg = "'%s' is not a known sample_function. Choose from %s" % (
                parameters.get('sample_function'), Statistics.ALL)
            raise PolicyError(msg)

        self._coerce_parameter(parameters, 'cooldown_period', int,
                               "a positive integer", non_negative=True)
        self._coerce_parameter(parameters, 'scale_up_threshold', float,
                               "a floating point number")
        self._coerce_parameter(parameters, 'scale_up_n_processes', int,
                               "an integer")
        self._coerce_parameter(parameters, 'scale_down_threshold', float,
                               "a floating point number")
        self._coerce_parameter(parameters, 'scale_down_n_processes', int,
                               "an integer")
        self._coerce_parameter(parameters, 'minimum_processes', int,
                               "a positive integer", non_negative=True)
        self._coerce_parameter(parameters, 'maximum_processes', int,
                               "a positive integer", non_negative=True)

        self._parameters = parameters
        self._schedule_kwargs = get_schedule_process_kwargs(new_parameters)

    def status(self):
        """Return the current HAState of this policy."""
        return self._status

    def apply_policy(self, all_procs, managed_upids):
        """Sample the metric and schedule/terminate processes accordingly.

        @param all_procs: dict whose values are lists of process dicts (each
        with at least a 'upid' key, optionally a 'hostname')
        @param managed_upids: list of upids managed by this policy

        Returns the (filtered) list of managed upids. Raises PolicyError when
        no parameters are set or the sensor aggregator request fails.
        """
        if self._parameters is None:
            raise PolicyError("No parameters set, unable to apply policy")

        # total_seconds() is required here: timedelta.seconds drops the days
        # component, which made the cooldown check depend on the time of day.
        time_since_last_scale = datetime.datetime.now() - self.last_scale_action
        if time_since_last_scale.total_seconds() < self._parameters['cooldown_period']:
            log.debug("Returning early from apply policy because we're in cooldown")
            self._set_status(0, managed_upids)
            return managed_upids

        managed_upids = self._filter_invalid_processes(all_procs, managed_upids)

        # Get numbers from metric
        hostnames = self._get_hostnames(all_procs, managed_upids)
        period = 60
        end_time = datetime.datetime.now()  # TODO: what TZ does TS use?
        seconds = self._parameters['sample_period']
        start_time = end_time - datetime.timedelta(seconds=seconds)
        metric_name = self._parameters['metric']
        sample_function = self._parameters['sample_function']
        statistics = [sample_function, ]

        # Application metrics are keyed by process id, host metrics by hostname.
        if metric_name in self.app_metrics or 'app_attributes' in metric_name:
            dimensions = {'pid': managed_upids}
        else:
            dimensions = {'hostname': hostnames}

        try:
            metric_per_host = self._sensor_aggregator.get_metric_statistics(
                period, start_time, end_time, metric_name, statistics, dimensions)
        except HTTPError as h:
            msg = "Problem getting metrics from sensor aggregator with url: '%s'" % h.filename
            log.exception(msg)
            raise PolicyError(msg)

        values = [metric_value[sample_function]
                  for metric_value in metric_per_host.values()]
        log.debug("got metrics %s for %s", metric_per_host, dimensions)

        if values:
            # float() guards against integer truncation under Python 2.
            average_metric = sum(values) / float(len(values))
        else:
            # TODO: this is really boneheaded. What we should do instead is
            # treat this situation specifically to scale to the minimum.
            # Users might want a metric that can go negative for example,
            # and this trick won't work
            average_metric = 0

        if average_metric > self._parameters['scale_up_threshold']:
            scale_by = self._parameters['scale_up_n_processes']
        elif average_metric < self._parameters['scale_down_threshold']:
            scale_by = - abs(self._parameters['scale_down_n_processes'])
        else:
            scale_by = 0

        # Clamp the target count to [minimum_processes, maximum_processes].
        wanted = len(managed_upids) + scale_by
        wanted = min(max(wanted, self._parameters['minimum_processes']),
                     self._parameters['maximum_processes'])
        scale_by = wanted - len(managed_upids)

        if scale_by < 0:  # remove excess
            log.info("%sSensor policy scaling down by %s", self.logprefix, scale_by)
            # Snapshot the victims first: this terminates distinct upids
            # whether or not the callback removes them from managed_upids.
            for upid in list(managed_upids[:-scale_by]):
                self.terminate_process(upid)
        elif scale_by > 0:  # Add processes
            log.info("%sSensor policy scaling up by %s", self.logprefix, scale_by)
            for _ in range(scale_by):
                pd_name = self._get_least_used_pd(all_procs)
                self.schedule_process(pd_name, self.process_definition_id,
                                      **self._schedule_kwargs)

        if scale_by != 0:
            self.last_scale_action = datetime.datetime.now()

        self._set_status(scale_by, managed_upids)
        self.previous_all_procs = all_procs
        return managed_upids

    def _set_status(self, to_rebalance, managed_upids):
        """Update self._status from the number of processes just rebalanced."""
        if self._status == HAState.FAILED:
            # If already in FAILED state, keep this state.
            # Requires human intervention
            self._status = HAState.FAILED
        elif to_rebalance == 0:
            self._status = HAState.STEADY
        elif len(managed_upids) >= self.minimum_n and self._parameters['minimum_processes'] > 0:
            self._status = HAState.READY
        else:
            self._status = HAState.PENDING

    def _get_hostnames(self, all_procs, upids):
        """get hostnames of eeagents that have managed processes

        Returns a de-duplicated list (in no particular order); processes
        without a 'hostname' entry are skipped.
        """
        hostnames = set()
        for procs in all_procs.values():
            for proc in procs:
                if proc['upid'] not in upids:
                    continue
                hostname = proc.get('hostname')
                if hostname is not None:
                    hostnames.add(hostname)
        return list(hostnames)