def configure(self, hosts):
    """Replace the child host list and restart health checking.

    Every entry of ``hosts`` is converted with ``ChildInfo.from_thrift`` and
    the converted entries become the new ``self._hosts``. A health checker
    left over from a previous configuration is stopped; when health
    checking is enabled, a fresh one is built for the new children and
    started. The scheduler is marked as initialized at the end.

    :param hosts: list of child hosts, in the form accepted by
                  ChildInfo.from_thrift
    """
    converted = []
    self._hosts = converted
    for thrift_host in hosts:
        converted.append(ChildInfo.from_thrift(thrift_host))
    self._coalesce_resources(self._hosts)

    previous_checker = self._health_checker
    if previous_checker:
        # The old checker was built for the previous set of children.
        previous_checker.stop()

    if self._enable_health_checker:
        # initialize health checker with the new set of children,
        # keyed by child id.
        config = common.services.get(ServiceName.AGENT_CONFIG)
        addresses = dict(
            (child.id, ServerAddress(child.address, child.port))
            for child in self._hosts)
        checker = HealthChecker(self._scheduler_id, addresses, config)
        self._health_checker = checker
        checker.start()

    self._configured = ConfigStates.INITIALIZED
def test_stop(self):
    """Make sure start() starts threads and stop() stops them"""
    # Attributes are looked up by name on every check rather than cached,
    # so each check sees whatever object the checker currently holds.
    thread_attrs = ("_heartbeater", "_reporter")
    checker = HealthChecker("id", {}, self.conf)

    # nothing is running before start()
    for attr in thread_attrs:
        self.assertFalse(getattr(checker, attr).is_alive())

    checker.start()
    for attr in thread_attrs:
        self.assertTrue(getattr(checker, attr).is_alive())

    checker.stop()
    for attr in thread_attrs:
        self.assertFalse(getattr(checker, attr).is_alive())
def test_heartbeat(self, client_class, time_class):
    """Test that sequence number and timestamp get updated correctly
    after sending heartbeat.

    The assertions below expect that a successful ping stores the current
    (sequence number, timestamp) pair for that child in ``_last_update``,
    while a failed ping leaves the child's previous pair untouched.

    :param client_class: mocked client class; returns the fake clients
                         registered in ``self._clients``
    :param time_class: mocked time source; its return value is the
                       timestamp the health checker observes
    """
    client_class.side_effect = self.create_fake_client
    bar_client = MagicMock()
    baz_client = MagicMock()
    self._clients["bar"] = bar_client
    self._clients["baz"] = baz_client
    children = {
        "bar": ServerAddress("bar", 1234),
        "baz": ServerAddress("baz", 1234),
    }

    def assert_state(seqnum, bar_update, baz_update):
        # Both children stay tracked regardless of their ping results.
        self.assertEqual(health_checker._seqnum, seqnum)
        self.assertEqual(len(health_checker._last_update), 2)
        self.assertEqual(health_checker._last_update["bar"], bar_update)
        self.assertEqual(health_checker._last_update["baz"], baz_update)

    # make sure things are initialized properly
    time_class.return_value = 0.0
    health_checker = HealthChecker("id", children, self.conf)
    assert_state(0, (0, 0.0), (0, 0.0))

    # send a ping
    time_class.return_value = 10.0
    health_checker._send_heartbeat()
    assert_state(1, (1, 10.0), (1, 10.0))

    # send another ping. ping to baz fails.
    time_class.return_value = 20.0
    baz_client.ping.side_effect = Exception()
    health_checker._send_heartbeat()
    assert_state(2, (2, 20.0), (1, 10.0))

    # send another ping. ping to bar fails.
    time_class.return_value = 30.0
    bar_client.ping.side_effect = Exception()
    baz_client.ping.side_effect = None
    health_checker._send_heartbeat()
    assert_state(3, (2, 20.0), (3, 30.0))
def test_heartbeat(self, client_class, time_class):
    """Test that sequence number and timestamp get updated correctly
    after sending heartbeat.

    The assertions below expect that a successful ping stores the current
    (sequence number, timestamp) pair for that child in ``_last_update``,
    while a failed ping leaves the child's previous pair untouched.

    :param client_class: mocked client class; returns the fake clients
                         registered in ``self._clients``
    :param time_class: mocked time source; its return value is the
                       timestamp the health checker observes
    """
    client_class.side_effect = self.create_fake_client
    bar_client = MagicMock()
    baz_client = MagicMock()
    self._clients["bar"] = bar_client
    self._clients["baz"] = baz_client
    children = {"bar": ServerAddress("bar", 1234),
                "baz": ServerAddress("baz", 1234)}

    def assert_state(seqnum, bar_update, baz_update):
        # Both children stay tracked regardless of their ping results.
        self.assertEqual(health_checker._seqnum, seqnum)
        self.assertEqual(len(health_checker._last_update), 2)
        self.assertEqual(health_checker._last_update["bar"], bar_update)
        self.assertEqual(health_checker._last_update["baz"], baz_update)

    # make sure things are initialized properly
    time_class.return_value = 0.0
    health_checker = HealthChecker("id", children, self.conf)
    assert_state(0, (0, 0.0), (0, 0.0))

    # send a ping
    time_class.return_value = 10.0
    health_checker._send_heartbeat()
    assert_state(1, (1, 10.0), (1, 10.0))

    # send another ping. ping to baz fails.
    time_class.return_value = 20.0
    baz_client.ping.side_effect = Exception()
    health_checker._send_heartbeat()
    assert_state(2, (2, 20.0), (1, 10.0))

    # send another ping. ping to bar fails.
    time_class.return_value = 30.0
    bar_client.ping.side_effect = Exception()
    baz_client.ping.side_effect = None
    health_checker._send_heartbeat()
    assert_state(3, (2, 20.0), (3, 30.0))
class LeafScheduler(BaseScheduler):
    """Leaf scheduler manages child hosts."""

    def __init__(self, scheduler_id, ut_ratio, enable_health_checker=True):
        """Create a new leaf scheduler.

        :param scheduler_id: scheduler id
        :type scheduler_id: str
        :param ut_ratio: value handed to DefaultScorer, which picks the
                         best place response
        :param enable_health_checker: enables health checking of children.
        :type enable_health_checker: bool
        """
        self._logger = logging.getLogger(__name__)
        # Pass the argument to the logger instead of pre-formatting, so the
        # interpolation only happens when the record is actually emitted.
        self._logger.info("Creating leaf scheduler: %s", scheduler_id)
        self.lock = threading.RLock()
        self._latch = CountUpDownLatch()
        self._place_strategy = RandomSubsetStrategy(PLACE_FAN_OUT_RATIO,
                                                    MIN_PLACE_FAN_OUT,
                                                    MAX_PLACE_FAN_OUT)
        self._scheduler_id = scheduler_id
        self._hosts = []
        self._scorer = DefaultScorer(ut_ratio)
        self._threadpool = None
        self._initialize_services(scheduler_id)
        # Created lazily by configure(), once the children are known.
        self._health_checker = None
        self._enable_health_checker = enable_health_checker
        self._configured = ConfigStates.UNINITIALIZED

    def _initialize_services(self, scheduler_id):
        """Look up the shared thread pool and create the scheduler client.

        :param scheduler_id: scheduler id (not used by this implementation)
        """
        self._threadpool = common.services.get(ThreadPoolExecutor)
        self._scheduler_client = SchedulerClient()

    @locked
    def configure(self, hosts):
        """Configure the leaf scheduler.

        Replaces the child host list, stops any health checker from a
        previous configuration and, when health checking is enabled,
        starts a new one for the new children.

        :param hosts: list of child hosts, in the form accepted by
                      ChildInfo.from_thrift
        """
        # Convert every child to the scheduler's own representation.
        self._hosts = []

        for host in hosts:
            self._hosts.append(ChildInfo.from_thrift(host))

        self._coalesce_resources(self._hosts)

        if self._health_checker:
            self._health_checker.stop()
        if self._enable_health_checker:
            # initialize health checker with the new set of children.
            agent_config = common.services.get(ServiceName.AGENT_CONFIG)
            children = dict((host.id, ServerAddress(host.address, host.port))
                            for host in self._hosts)
            self._health_checker = HealthChecker(self._scheduler_id, children,
                                                 agent_config)
            self._health_checker.start()
        self._configured = ConfigStates.INITIALIZED

    @locked
    def _get_hosts(self):
        """
        Get the list of hosts for this scheduler.

        The returned list is a shallow copy of the host list, so a
        subsequent call to configure the host is not stepping on calls in
        flight: the list itself is new, the host objects in it are shared.
        Assumes the host is configured as a leaf scheduler.

        :rtype: list of the child objects built by configure()
        """
        return list(self._hosts)

    def mark_pending(func):
        """
        Decorator for bumping up the pending count for calls that are
        inflight.

        The latch is counted up before the wrapped call and counted down in
        a ``finally`` clause, so it is released even if the call raises.
        """
        @log_request(log_level=logging.debug)
        def nested(self, *args, **kwargs):
            self._latch.count_up()
            self._logger.debug("latch counted up to: {0}".format(
                self._latch.count))
            try:
                return func(self, *args, **kwargs)
            finally:
                self._latch.count_down()
                self._logger.debug("latch counted down to: {0}".format(
                    self._latch.count))
        return nested

    @mark_pending
    def find(self, request):
        """Find the specified resource.

        The request is sent to every child host in parallel; the first
        completed response with result OK is returned.

        :type request: FindRequest
        :rtype: FindResponse
        :raise: InvalidScheduler
        """
        if self._configured == ConfigStates.UNINITIALIZED:
            raise InvalidScheduler()

        # Host service only has a single scheduler
        request.scheduler_id = None

        futures = []
        for agent in self._get_hosts():
            future = self._threadpool.submit(
                self._find_worker, agent.address, agent.port,
                agent.id, request)
            futures.append(future)

        done, not_done = concurrent.futures.wait(futures, timeout=FIND_TIMEOUT)
        self._logger.info("Find responses received: %d, timed out: %d",
                          len(done), len(not_done))

        for future in done:
            # NOTE(review): Future.result() re-raises an exception raised
            # by the worker, and unlike place() nothing catches it here.
            # Confirm that _find_worker never lets an exception escape.
            response = future.result()
            if response.result == FindResultCode.OK:
                return response

        return FindResponse(FindResultCode.NOT_FOUND)

    @mark_pending
    def place(self, request):
        """Place the specified resources.

        The request is sent to the selected child hosts and the successful
        responses are scored. When no response is usable, the most specific
        failure reported by any child is returned, checked in the order:
        CPU, memory, datastore capacity, no such resource. If none of
        those was reported, SYSTEM_ERROR is returned.

        :type request: PlaceRequest
        :rtype: PlaceResponse
        :raise: InvalidScheduler
        """
        if self._configured == ConfigStates.UNINITIALIZED:
            raise InvalidScheduler()

        request.scheduler_id = None

        constraints = self._collect_constraints(request.resource)
        selected = self._placement_hosts(request, constraints)
        if len(selected) == 0:
            return PlaceResponse(PlaceResultCode.NO_SUCH_RESOURCE)

        selected = self._filter_missing_hosts(selected)
        done = self._execute_placement(selected, request)

        responses = []
        no_such_resource = False
        not_enough_memory_resource = False
        not_enough_cpu_resource = False
        not_enough_datastore_capacity = False

        for future in done:
            # Best effort: one failing child must not fail the whole
            # placement, so any error for this child is logged and skipped.
            try:
                response = future.result()
                result = response.result
                if result == PlaceResultCode.OK:
                    responses.append(response)
                elif result == PlaceResultCode.NOT_ENOUGH_CPU_RESOURCE:
                    not_enough_cpu_resource = True
                elif result == PlaceResultCode.NOT_ENOUGH_MEMORY_RESOURCE:
                    not_enough_memory_resource = True
                elif result == PlaceResultCode.NOT_ENOUGH_DATASTORE_CAPACITY:
                    not_enough_datastore_capacity = True
                elif result == PlaceResultCode.NO_SUCH_RESOURCE:
                    no_such_resource = True
            except Exception as e:
                self._logger.warning(
                    "Caught exception while sending "
                    "place request: %s", str(e))

        best_response = self._scorer.score(responses)

        if best_response is not None:
            return best_response
        elif not_enough_cpu_resource:
            return PlaceResponse(PlaceResultCode.NOT_ENOUGH_CPU_RESOURCE)
        elif not_enough_memory_resource:
            return PlaceResponse(PlaceResultCode.NOT_ENOUGH_MEMORY_RESOURCE)
        elif not_enough_datastore_capacity:
            return PlaceResponse(PlaceResultCode.NOT_ENOUGH_DATASTORE_CAPACITY)
        elif no_such_resource:
            return PlaceResponse(PlaceResultCode.NO_SUCH_RESOURCE)
        else:
            return PlaceResponse(PlaceResultCode.SYSTEM_ERROR)
def test_slow_heartbeater(self, client_class, time_class, chairman):
    """Don't report missing if the current sequence number is equal to
    the sequence number of the last successful ping.

    :param client_class: mocked client class; returns the fake clients
                         registered in ``self._clients``
    :param time_class: mocked time source
    :param chairman: mocked chairman client class; reports are sent
                     through ``chairman.return_value``
    """
    client_class.side_effect = self.create_fake_client
    bar_client = MagicMock()
    self._clients["bar"] = bar_client
    children = {"bar": ServerAddress("bar", 1234)}

    # send a ping. bar should get reported resurrected.
    health_checker = HealthChecker("id", children, self.conf)
    time_class.return_value = 0.0
    chairman.return_value.report_resurrected.return_value = \
        ReportResurrectedResponse(result=0)
    health_checker._send_heartbeat()
    health_checker._send_report()
    req = ReportResurrectedRequest(hosts=['bar'], schedulers=None,
                                   scheduler_id='id')
    chairman.return_value.report_resurrected.assert_called_once_with(req)
    self.assertFalse(chairman.return_value.report_missing.called)
    self.assertEqual(health_checker._resurrected_children, set(["bar"]))
    self.assertEqual(health_checker._missing_children, set())

    # call _send_report() again after 100 seconds. bar shouldn't get
    # reported missing since the heartbeater hasn't sent another ping.
    time_class.return_value = 100.0
    chairman.reset_mock()
    health_checker._send_report()
    self.assertFalse(chairman.return_value.report_missing.called)
    self.assertFalse(chairman.return_value.report_resurrected.called)

    # ping fails. now the reporter should report bar missing.
    bar_client.ping.side_effect = Exception()
    chairman.return_value.report_missing.return_value = \
        ReportMissingResponse(result=0)
    health_checker._send_heartbeat()
    health_checker._send_report()
    req = ReportMissingRequest(hosts=['bar'], schedulers=None,
                               scheduler_id='id')
    chairman.return_value.report_missing.assert_called_once_with(req)
    self.assertFalse(chairman.return_value.report_resurrected.called)
def test_chairman_failure(self, client_class, time_class, chairman):
    """Reporter should retry reporting if chairman fails."""
    client_class.side_effect = self.create_fake_client
    fake_bar = MagicMock()
    self._clients["bar"] = fake_bar
    child_addresses = {"bar": ServerAddress("bar", 1234)}

    def assert_resurrected_reported_once():
        # Exactly one resurrected report for bar, and no missing report.
        chairman.return_value.report_resurrected.assert_called_once_with(
            expected_request)
        self.assertFalse(chairman.return_value.report_missing.called)

    # report_resurrected returns a non-zero value
    checker = HealthChecker("id", child_addresses, self.conf)
    time_class.return_value = 0.0
    checker._send_heartbeat()
    chairman.return_value.report_resurrected.return_value = \
        ReportResurrectedResponse(result=1)
    checker._send_report()
    expected_request = ReportResurrectedRequest(
        hosts=['bar'], schedulers=None, scheduler_id='id')
    assert_resurrected_reported_once()

    # report_resurrected throws an exception
    chairman.reset_mock()
    chairman.return_value.report_resurrected.side_effect = Exception()
    checker._send_report()
    assert_resurrected_reported_once()

    # report succeeds
    chairman.reset_mock()
    chairman.return_value.report_resurrected.side_effect = None
    chairman.return_value.report_resurrected.return_value = \
        ReportResurrectedResponse(result=0)
    checker._send_report()
    assert_resurrected_reported_once()

    # report doesn't get called anymore.
    chairman.reset_mock()
    checker._send_report()
    self.assertFalse(chairman.return_value.report_resurrected.called)
    self.assertFalse(chairman.return_value.report_missing.called)
def test_report(self, client_class, time_class, chairman):
    """Test that resurrected and missing hosts get reported correctly

    :param client_class: mocked client class; returns the fake clients
                         registered in ``self._clients``
    :param time_class: mocked time source
    :param chairman: mocked chairman client class; reports are sent
                     through ``chairman.return_value``
    """
    client_class.side_effect = self.create_fake_client
    chairman.return_value.report_resurrected.return_value = \
        ReportResurrectedResponse(result=0)
    bar_client = MagicMock()
    baz_client = MagicMock()
    self._clients["bar"] = bar_client
    self._clients["baz"] = baz_client
    children = {
        "bar": ServerAddress("bar", 1234),
        "baz": ServerAddress("baz", 1234),
    }

    # first ping succeeds for bar and baz. they get reported resurrected.
    health_checker = HealthChecker("id", children, self.conf)
    time_class.return_value = 0.0
    health_checker._send_heartbeat()
    health_checker._send_report()
    req = ReportResurrectedRequest(hosts=['bar', 'baz'], schedulers=None,
                                   scheduler_id='id')
    chairman.return_value.report_resurrected.assert_called_once_with(req)
    self.assertFalse(chairman.return_value.report_missing.called)
    self.assertEqual(health_checker._resurrected_children,
                     set(["bar", "baz"]))
    self.assertEqual(health_checker._missing_children, set())

    # call _send_report again. this time nothing should get reported.
    chairman.reset_mock()
    health_checker._send_report()
    self.assertFalse(chairman.return_value.report_missing.called)
    self.assertFalse(chairman.return_value.report_resurrected.called)

    # bar goes missing.
    bar_client.ping.side_effect = Exception()
    health_checker._send_heartbeat()
    time_class.return_value = 100.0
    chairman.return_value.report_missing.return_value = \
        ReportMissingResponse(result=0)
    health_checker._send_report()
    req = ReportMissingRequest(hosts=['bar'], schedulers=None,
                               scheduler_id='id')
    chairman.return_value.report_missing.assert_called_once_with(req)
    self.assertFalse(chairman.return_value.report_resurrected.called)

    # bar comes back
    chairman.reset_mock()
    bar_client.ping.side_effect = None
    time_class.return_value = 200.0
    health_checker._send_heartbeat()
    chairman.return_value.report_resurrected.return_value = \
        ReportResurrectedResponse(result=0)
    health_checker._send_report()
    req = ReportResurrectedRequest(hosts=['bar'], schedulers=None,
                                   scheduler_id='id')
    self.assertFalse(chairman.return_value.report_missing.called)
    chairman.return_value.report_resurrected.assert_called_once_with(req)
class LeafScheduler(BaseScheduler):
    """Leaf scheduler manages child hosts."""

    def __init__(self, scheduler_id, ut_ratio, enable_health_checker=True):
        """Create a new leaf scheduler.

        :param scheduler_id: scheduler id
        :type scheduler_id: str
        :param ut_ratio: value handed to DefaultScorer, which picks the
                         best place response
        :param enable_health_checker: enables health checking of children.
        :type enable_health_checker: bool
        """
        self._logger = logging.getLogger(__name__)
        # Pass the argument to the logger instead of pre-formatting, so the
        # interpolation only happens when the record is actually emitted.
        self._logger.info("Creating leaf scheduler: %s", scheduler_id)
        self.lock = threading.RLock()
        self._latch = CountUpDownLatch()
        self._place_strategy = RandomSubsetStrategy(PLACE_FAN_OUT_RATIO,
                                                    MIN_PLACE_FAN_OUT,
                                                    MAX_PLACE_FAN_OUT)
        self._scheduler_id = scheduler_id
        self._hosts = []
        self._scorer = DefaultScorer(ut_ratio)
        self._threadpool = None
        self._initialize_services(scheduler_id)
        # Created lazily by configure(), once the children are known.
        self._health_checker = None
        self._enable_health_checker = enable_health_checker
        self._configured = ConfigStates.UNINITIALIZED

    def _initialize_services(self, scheduler_id):
        """Look up the shared thread pool and create the scheduler client.

        :param scheduler_id: scheduler id (not used by this implementation)
        """
        self._threadpool = common.services.get(ThreadPoolExecutor)
        self._scheduler_client = SchedulerClient()

    @locked
    def configure(self, hosts):
        """Configure the leaf scheduler.

        Replaces the child host list, stops any health checker from a
        previous configuration and, when health checking is enabled,
        starts a new one for the new children.

        :param hosts: list of child hosts, in the form accepted by
                      ChildInfo.from_thrift
        """
        # Convert every child to the scheduler's own representation.
        self._hosts = []

        for host in hosts:
            self._hosts.append(ChildInfo.from_thrift(host))

        self._coalesce_resources(self._hosts)

        if self._health_checker:
            self._health_checker.stop()
        if self._enable_health_checker:
            # initialize health checker with the new set of children.
            agent_config = common.services.get(ServiceName.AGENT_CONFIG)
            children = dict((host.id, ServerAddress(host.address, host.port))
                            for host in self._hosts)
            self._health_checker = HealthChecker(self._scheduler_id, children,
                                                 agent_config)
            self._health_checker.start()
        self._configured = ConfigStates.INITIALIZED

    @locked
    def _get_hosts(self):
        """
        Get the list of hosts for this scheduler.

        The returned list is a shallow copy of the host list, so a
        subsequent call to configure the host is not stepping on calls in
        flight: the list itself is new, the host objects in it are shared.
        Assumes the host is configured as a leaf scheduler.

        :rtype: list of the child objects built by configure()
        """
        return list(self._hosts)

    def mark_pending(func):
        """
        Decorator for bumping up the pending count for calls that are
        inflight.

        The latch is counted up before the wrapped call and counted down in
        a ``finally`` clause, so it is released even if the call raises.
        """
        @log_request(log_level=logging.debug)
        def nested(self, *args, **kwargs):
            self._latch.count_up()
            self._logger.debug(
                "latch counted up to: {0}".format(self._latch.count))
            try:
                return func(self, *args, **kwargs)
            finally:
                self._latch.count_down()
                self._logger.debug(
                    "latch counted down to: {0}".format(self._latch.count))
        return nested

    @mark_pending
    def find(self, request):
        """Find the specified resource.

        The request is sent to every child host in parallel; the first
        completed response with result OK is returned.

        :type request: FindRequest
        :rtype: FindResponse
        :raise: InvalidScheduler
        """
        if self._configured == ConfigStates.UNINITIALIZED:
            raise InvalidScheduler()

        # Host service only has a single scheduler
        request.scheduler_id = None

        futures = []
        for agent in self._get_hosts():
            future = self._threadpool.submit(
                self._find_worker, agent.address, agent.port,
                agent.id, request)
            futures.append(future)

        done, not_done = concurrent.futures.wait(futures, timeout=FIND_TIMEOUT)
        self._logger.info("Find responses received: %d, timed out: %d",
                          len(done), len(not_done))

        for future in done:
            # NOTE(review): Future.result() re-raises an exception raised
            # by the worker, and unlike place() nothing catches it here.
            # Confirm that _find_worker never lets an exception escape.
            response = future.result()
            if response.result == FindResultCode.OK:
                return response

        return FindResponse(FindResultCode.NOT_FOUND)

    @mark_pending
    def place(self, request):
        """Place the specified resources.

        The request is sent to the selected child hosts and the successful
        responses are scored. When no response is usable, the most specific
        failure reported by any child is returned, checked in the order:
        CPU, memory, datastore capacity, no such resource. If none of
        those was reported, SYSTEM_ERROR is returned.

        :type request: PlaceRequest
        :rtype: PlaceResponse
        :raise: InvalidScheduler
        """
        if self._configured == ConfigStates.UNINITIALIZED:
            raise InvalidScheduler()

        request.scheduler_id = None

        constraints = self._collect_constraints(request.resource)
        selected = self._placement_hosts(request, constraints)
        if len(selected) == 0:
            return PlaceResponse(PlaceResultCode.NO_SUCH_RESOURCE)

        selected = self._filter_missing_hosts(selected)
        done = self._execute_placement(selected, request)

        responses = []
        no_such_resource = False
        not_enough_memory_resource = False
        not_enough_cpu_resource = False
        not_enough_datastore_capacity = False

        for future in done:
            # Best effort: one failing child must not fail the whole
            # placement, so any error for this child is logged and skipped.
            try:
                response = future.result()
                result = response.result
                if result == PlaceResultCode.OK:
                    responses.append(response)
                elif result == PlaceResultCode.NOT_ENOUGH_CPU_RESOURCE:
                    not_enough_cpu_resource = True
                elif result == PlaceResultCode.NOT_ENOUGH_MEMORY_RESOURCE:
                    not_enough_memory_resource = True
                elif result == PlaceResultCode.NOT_ENOUGH_DATASTORE_CAPACITY:
                    not_enough_datastore_capacity = True
                elif result == PlaceResultCode.NO_SUCH_RESOURCE:
                    no_such_resource = True
            except Exception as e:
                self._logger.warning(
                    "Caught exception while sending "
                    "place request: %s", str(e))

        best_response = self._scorer.score(responses)

        if best_response is not None:
            return best_response
        elif not_enough_cpu_resource:
            return PlaceResponse(PlaceResultCode.NOT_ENOUGH_CPU_RESOURCE)
        elif not_enough_memory_resource:
            return PlaceResponse(PlaceResultCode.NOT_ENOUGH_MEMORY_RESOURCE)
        elif not_enough_datastore_capacity:
            return PlaceResponse(PlaceResultCode.NOT_ENOUGH_DATASTORE_CAPACITY)
        elif no_such_resource:
            return PlaceResponse(PlaceResultCode.NO_SUCH_RESOURCE)
        else:
            return PlaceResponse(PlaceResultCode.SYSTEM_ERROR)
def test_report(self, client_class, time_class, chairman):
    """Test that resurrected and missing hosts get reported correctly

    :param client_class: mocked client class; returns the fake clients
                         registered in ``self._clients``
    :param time_class: mocked time source
    :param chairman: mocked chairman client class; reports are sent
                     through ``chairman.return_value``
    """
    client_class.side_effect = self.create_fake_client
    chairman.return_value.report_resurrected.return_value = \
        ReportResurrectedResponse(result=0)
    bar_client = MagicMock()
    baz_client = MagicMock()
    self._clients["bar"] = bar_client
    self._clients["baz"] = baz_client
    children = {"bar": ServerAddress("bar", 1234),
                "baz": ServerAddress("baz", 1234)}

    # first ping succeeds for bar and baz. they get reported resurrected.
    health_checker = HealthChecker("id", children, self.conf)
    time_class.return_value = 0.0
    health_checker._send_heartbeat()
    health_checker._send_report()
    req = ReportResurrectedRequest(hosts=['bar', 'baz'], schedulers=None,
                                   scheduler_id='id')
    chairman.return_value.report_resurrected.assert_called_once_with(req)
    self.assertFalse(chairman.return_value.report_missing.called)
    self.assertEqual(health_checker._resurrected_children,
                     set(["bar", "baz"]))
    self.assertEqual(health_checker._missing_children, set())

    # call _send_report again. this time nothing should get reported.
    chairman.reset_mock()
    health_checker._send_report()
    self.assertFalse(chairman.return_value.report_missing.called)
    self.assertFalse(chairman.return_value.report_resurrected.called)

    # bar goes missing.
    bar_client.ping.side_effect = Exception()
    health_checker._send_heartbeat()
    time_class.return_value = 100.0
    chairman.return_value.report_missing.return_value = \
        ReportMissingResponse(result=0)
    health_checker._send_report()
    req = ReportMissingRequest(hosts=['bar'], schedulers=None,
                               scheduler_id='id')
    chairman.return_value.report_missing.assert_called_once_with(req)
    self.assertFalse(chairman.return_value.report_resurrected.called)

    # bar comes back
    chairman.reset_mock()
    bar_client.ping.side_effect = None
    time_class.return_value = 200.0
    health_checker._send_heartbeat()
    chairman.return_value.report_resurrected.return_value = \
        ReportResurrectedResponse(result=0)
    health_checker._send_report()
    req = ReportResurrectedRequest(hosts=['bar'], schedulers=None,
                                   scheduler_id='id')
    self.assertFalse(chairman.return_value.report_missing.called)
    chairman.return_value.report_resurrected.assert_called_once_with(req)