def wrapped(*args, **kwargs): start_time = monotonic_time() rc = fn(*args, **kwargs) end_time = monotonic_time() time_taken = end_time - start_time assert_true(time_taken < time_limit, "Test took too long") return rc
def test_mainline(self): a = monotonic_time() time.sleep(0.01) b = monotonic_time() self.assertTrue(b >= a + 0.01, msg="Monotonic time did not increase as " "expected: %s, %s" % (a, b))
def _on_update_from_driver(self, msg): """ Called when the driver sends us a key/value pair update. After the initial handshake, the stream of events consists entirely of updates unless something happens to change the state of the driver. :param dict msg: The message received from the driver. """ assert self.configured.is_set(), "Received update before config" # The driver starts polling immediately, make sure we block until # everyone else is ready to receive updates. self.begin_polling.wait() # Unpack the message. key = msg[MSG_KEY_KEY] value = msg[MSG_KEY_VALUE] _log.debug("Update from driver: %s -> %s", key, value) # Output some very coarse stats. self.read_count += 1 if self.read_count % 1000 == 0: now = monotonic_time() delta = now - self.last_rate_log_time _log.info("Processed %s updates from driver " "%.1f/s", self.read_count, 1000.0 / delta) self.last_rate_log_time = now # Wrap the update in an EtcdEvent object so we can dispatch it via the # PathDispatcher. n = EtcdEvent("set" if value is not None else "delete", key, value) self.dispatcher.handle_event(n)
def maybe_schedule(self, caller): """ Schedule this Actor to run iff it is not already running/scheduled. :param str caller: A (debug) tag to pass to the greenlet to identify the caller """ # This method uses very low-level gevent APIs for performance, it is # partially cribbed from gevent's Waiter class, which is a simple # application of the low-level APIs. _log.debug("Checking whether we need to schedule %s", self) # We use the lack of yield points in this code to ensure that no-one # changes self._scheduled between the next two lines. Do not add # logging between the test and set of self._scheduled. if not self._scheduled: self._scheduled = True # Calculate the scheduling delay. If this Actor hasn't been # scheduled for a long time, we'll schedule it straight away, # otherwise we back off to the batch_delay to encourage work to be # batched up when we're under load. now = monotonic_time() if self._last_scheduled is not None: time_since_last_schedule = now - self._last_scheduled delay = max(self.batch_delay - time_since_last_schedule, MIN_DELAY) else: delay = MIN_DELAY # We can't switch directly to the Actor's greenlet because that # prevents gevent from doing its scheduling. Instead, we ask the # gevent event loop to switch to the greenlet. t = self._gevent_loop.timer(delay) t.start(self._switch, caller) self._last_scheduled = now _log.debug("Scheduled %s", self)
def __init__(self, config, hosts_ipset): super(EtcdAPI, self).__init__(config.ETCD_ADDRS, etcd_scheme=config.ETCD_SCHEME, etcd_key=config.ETCD_KEY_FILE, etcd_cert=config.ETCD_CERT_FILE, etcd_ca=config.ETCD_CA_FILE) self._config = config # Timestamp storing when the EtcdAPI started. This info is needed # in order to report uptime to etcd. self._start_time = monotonic_time() # Create an Actor to report per-endpoint status into etcd. self.status_reporter = EtcdStatusReporter(config) # Start up the main etcd-watching greenlet. It will wait for an # event from us before doing anything. self._watcher = _FelixEtcdWatcher(config, self, self.status_reporter, hosts_ipset) self._watcher.link(self._on_worker_died) self._watcher.start() # Start up a greenlet to trigger periodic resyncs. self._resync_greenlet = gevent.spawn(self._periodically_resync) self._resync_greenlet.link_exception(self._on_worker_died) # Start up greenlet to report felix's liveness into etcd. self.done_first_status_report = False self._status_reporting_greenlet = gevent.spawn( self._periodically_report_status) self._status_reporting_greenlet.link_exception(self._on_worker_died) # Start the status reporter. self.status_reporter.start() self.status_reporter.greenlet.link(self._on_worker_died)
def _update_felix_status(self, ttl): """ Writes two keys to etcd: * uptime in secs * felix status in JSON - containing current time in ISO 8601 Zulu format :param: ttl int: time to live in sec - lifetime of the status report """ time_formatted = iso_utc_timestamp() uptime = monotonic_time() - self._start_time status = { "time": time_formatted, "uptime": uptime, "first_update": not self.done_first_status_report, } status_value = json.dumps(status) _log.debug("Reporting felix status/uptime (%.1fs) using hostname %s", uptime, self._config.HOSTNAME) status_key = key_for_last_status(self._config.HOSTNAME) self.client.set(status_key, status_value) status_key = key_for_status(self._config.HOSTNAME) self.client.set(status_key, status_value, ttl=ttl) self.done_first_status_report = True
def _on_update_from_driver(self, msg): """ Called when the driver sends us a key/value pair update. After the initial handshake, the stream of events consists entirely of updates unless something happens to change the state of the driver. :param dict msg: The message received from the driver. """ assert self.configured.is_set(), "Received update before config" # The driver starts polling immediately, make sure we block until # everyone else is ready to receive updates. self.begin_polling.wait() # Unpack the message. key = msg[MSG_KEY_KEY] value = msg[MSG_KEY_VALUE] _log.debug("Update from driver: %s -> %s", key, value) # Output some very coarse stats. self.read_count += 1 if self.read_count % 1000 == 0: now = monotonic_time() delta = now - self.last_rate_log_time _log.info("Processed %s updates from driver " "%.1f/s", self.read_count, 1000.0 / delta) self.last_rate_log_time = now # Wrap the update in an EtcdEvent object so we can dispatch it via the # PathDispatcher. n = EtcdEvent("set" if value is not None else "delete", key, value) self.dispatcher.handle_event(n)
def _update_felix_status(self, ttl): """ Writes two keys to etcd: * uptime in secs * felix status in JSON - containing current time in ISO 8601 Zulu format :param: ttl int: time to live in sec - lifetime of the status report """ time_formatted = iso_utc_timestamp() uptime = monotonic_time() - self._start_time status = { "time": time_formatted, "uptime": uptime, "first_update": not self.done_first_status_report, } status_value = json.dumps(status) _log.debug("Reporting felix status/uptime (%.1fs) using hostname %s", uptime, self._config.HOSTNAME) status_key = key_for_last_status(self._config.HOSTNAME) self.client.set(status_key, status_value) status_key = key_for_status(self._config.HOSTNAME) self.client.set(status_key, status_value, ttl=ttl) self.done_first_status_report = True
def __init__(self, config, hosts_ipset): super(EtcdAPI, self).__init__(config.ETCD_ADDR) self._config = config # Timestamp storing when the EtcdAPI started. This info is needed # in order to report uptime to etcd. self._start_time = monotonic_time() # Create an Actor to report per-endpoint status into etcd. self.status_reporter = EtcdStatusReporter(config) # Start up the main etcd-watching greenlet. It will wait for an # event from us before doing anything. self._watcher = _FelixEtcdWatcher(config, self, self.status_reporter, hosts_ipset) self._watcher.link(self._on_worker_died) self._watcher.start() # Start up a greenlet to trigger periodic resyncs. self._resync_greenlet = gevent.spawn(self._periodically_resync) self._resync_greenlet.link_exception(self._on_worker_died) # Start up greenlet to report felix's liveness into etcd. self.done_first_status_report = False self._status_reporting_greenlet = gevent.spawn(self._periodically_report_status) self._status_reporting_greenlet.link_exception(self._on_worker_died) # Start the status reporter. self.status_reporter.start() self.status_reporter.greenlet.link(self._on_worker_died)
def _maybe_log_resync_thread_stats(self): now = monotonic_time() if now - self._last_resync_stat_log_time > STATS_LOG_INTERVAL: for stat in self._resync_stats: _log.info("STAT: Resync thread %s", stat) stat.reset() self._last_resync_stat_log_time = now
def _maybe_log_resync_thread_stats(self): now = monotonic_time() if now - self._last_resync_stat_log_time > STATS_LOG_INTERVAL: for stat in self._resync_stats: _log.info("STAT: Resync thread %s", stat) stat.reset() self._last_resync_stat_log_time = now
def maybe_schedule(self, caller): """ Schedule this Actor to run iff it is not already running/scheduled. :param str caller: A (debug) tag to pass to the greenlet to identify the caller """ # This method uses very low-level gevent APIs for performance, it is # partially cribbed from gevent's Waiter class, which is a simple # application of the low-level APIs. _log.debug("Checking whether we need to schedule %s", self) # We use the lack of yield points in this code to ensure that no-one # changes self._scheduled between the next two lines. Do not add # logging between the test and set of self._scheduled. if not self._scheduled: self._scheduled = True # Calculate the scheduling delay. If this Actor hasn't been # scheduled for a long time, we'll schedule it straight away, # otherwise we back off to the batch_delay to encourage work to be # batched up when we're under load. now = monotonic_time() if self._last_scheduled is not None: time_since_last_schedule = now - self._last_scheduled delay = max(self.batch_delay - time_since_last_schedule, MIN_DELAY) else: delay = MIN_DELAY # We can't switch directly to the Actor's greenlet because that # prevents gevent from doing its scheduling. Instead, we ask the # gevent event loop to switch to the greenlet. t = self._gevent_loop.timer(delay) t.start(self._switch, caller) self._last_scheduled = now _log.debug("Scheduled %s", self)
def __init__(self, config, hosts_ipset): super(EtcdAPI, self).__init__(config.ETCD_ADDRS, etcd_scheme=config.ETCD_SCHEME, etcd_key=config.ETCD_KEY_FILE, etcd_cert=config.ETCD_CERT_FILE, etcd_ca=config.ETCD_CA_FILE) self._config = config # Timestamp storing when the EtcdAPI started. This info is needed # in order to report uptime to etcd. self._start_time = monotonic_time() # Create an Actor to report per-endpoint status into etcd. We defer # startup of this and our other workers until we get started. self.status_reporter = EtcdStatusReporter(config) # Create the main etcd-watching greenlet. self._watcher = _FelixEtcdWatcher(config, self, self.status_reporter, hosts_ipset) self._watcher.link(self._on_worker_died) # Create a greenlet to trigger periodic resyncs. self._resync_greenlet = gevent.Greenlet(self._periodically_resync) self._resync_greenlet.link_exception(self._on_worker_died) # Create a greenlet to report felix's liveness into etcd. self.done_first_status_report = False self._status_reporting_greenlet = gevent.Greenlet( self._periodically_report_status ) self._status_reporting_greenlet.link_exception(self._on_worker_died) self.status_reporter.greenlet.link(self._on_worker_died)
def __init__(self, felix_sck): # Wrap the socket with our protocol reader/writer objects. self._msg_reader = MessageReader(felix_sck) self._msg_writer = MessageWriter(felix_sck) # Global stop event used to signal to all threads to stop. self._stop_event = Event() # Threads to own the connection from/to Felix. The resync thread # is responsible for doing resyncs and merging updates from the # watcher thread (which it manages). self._reader_thread = Thread(target=self._read_from_socket, name="reader-thread") self._reader_thread.daemon = True self._resync_thread = Thread(target=self._resync_and_merge, name="resync-thread") self._resync_thread.daemon = True self._watcher_thread = None # Created on demand self._watcher_stop_event = None self._watcher_start_index = None # High-water mark cache. Owned by resync thread. self._hwms = HighWaterTracker() self._first_resync = True self._resync_http_pool = None self._cluster_id = None # Resync thread stats. self._snap_keys_processed = RateStat("snapshot keys processed") self._event_keys_processed = RateStat("event keys processed") self._felix_updates_sent = RateStat("felix updates sent") self._resync_stats = [ self._snap_keys_processed, self._event_keys_processed, self._felix_updates_sent, ] self._last_resync_stat_log_time = monotonic_time() # Set by the reader thread once the init message has been received # from Felix. self._init_received = Event() # Initial config, received in the init message. self._etcd_base_url = None self._etcd_other_urls = [] # Lock for the etcd url fields: this is the only lock, and no thread # ever recursively acquires it, so it cannot deadlock. Must be locked # to access the _etcd_base_url and _etcd_other_urls fields (after they # are initialized). self._etcd_url_lock = Lock() self._hostname = None # Set by the reader thread once the logging config has been received # from Felix. Triggers the first resync. self._config_received = Event() # Flag to request a resync. Set by the reader thread, polled by the # resync and merge thread. self._resync_requested = False
def update_felix_status(self): """Sends Felix's status to the backend driver.""" time_formatted = iso_utc_timestamp() uptime = monotonic_time() - self._start_time envelope = felixbackend_pb2.FromDataplane() payload = envelope.process_status_update payload.iso_timestamp = time_formatted payload.uptime = uptime self._writer.send_message(envelope)
def __init__(self, config, message_writer): super(DatastoreWriter, self).__init__() self._config = config self._start_time = monotonic_time() self._writer = message_writer self._endpoint_status = {IPV4: {}, IPV6: {}} self.config_resolved = False self._dirty_endpoints = set() self._reporting_allowed = True self._status_reporting_greenlet = None
def _on_ipset_delta_msg_from_driver(self, msg): _log.debug("IP set delta updates: %s", msg) # Output some very coarse stats. self.ip_upd_count += 1 if self.ip_upd_count % 1000 == 0: now = monotonic_time() delta = now - self.last_ip_upd_log_time _log.info("Processed %s IP updates from driver " "%.1f/s", self.ip_upd_count, 1000.0 / delta) self.last_ip_upd_log_time = now self.splitter.on_ipset_delta_update(msg.id, msg.added_members or [], msg.removed_members or [])
def __init__(self, config, msg_reader, datastore_writer, hosts_ipset): super(DatastoreReader, self).__init__() self._config = config self.hosts_ipset = hosts_ipset self._msg_reader = msg_reader self._datastore_writer = datastore_writer # Whether we've been in sync with etcd at some point. self._been_in_sync = False # Keep track of the config loaded from etcd so we can spot if it # changes. self.last_global_config = None self.last_host_config = None # Events triggered by the DatastoreAPI Actor to tell us to load the # config and start polling. These are one-way flags. self.load_config = Event() self.begin_polling = Event() # Event that we trigger once the config is loaded. self.configured = Event() # Polling state initialized at poll start time. self.splitter = None # Next-hop IP addresses of our hosts, if populated in etcd. self.ipv4_by_hostname = {} # Forces a resync after the current poll if set. Safe to set from # another thread. Automatically reset to False after the resync is # triggered. self.resync_requested = False # True if we've been shut down. self.killed = False # Stats. self.read_count = 0 self.ip_upd_count = 0 self.ip_remove_count = 0 self.msgs_processed = 0 self.last_rate_log_time = monotonic_time() self.last_ip_upd_log_time = monotonic_time() self.last_ip_remove_log_time = monotonic_time()
def __init__(self, config, pipe_from_parent, pipe_to_parent, hosts_ipset): super(DatastoreAPI, self).__init__() self._config = config self.pipe_from_parent = pipe_from_parent self.pipe_to_parent = pipe_to_parent self.hosts_ipset = hosts_ipset # Timestamp storing when the DatastoreAPI started. This info is needed # in order to report uptime to etcd. self._start_time = monotonic_time() # The main etcd-watching greenlet. self._reader = None # One-way flag indicating we're being shut down. self.killed = False
def __init__(self, config, etcd_api, status_reporter, hosts_ipset): super(_FelixEtcdWatcher, self).__init__() self._config = config self._etcd_api = etcd_api self._status_reporter = status_reporter self.hosts_ipset = hosts_ipset # Whether we've been in sync with etcd at some point. self._been_in_sync = False # Keep track of the config loaded from etcd so we can spot if it # changes. self.last_global_config = None self.last_host_config = None self.my_config_dir = dir_for_per_host_config(self._config.HOSTNAME) # Events triggered by the EtcdAPI Actor to tell us to load the config # and start polling. These are one-way flags. self.load_config = Event() self.begin_polling = Event() # Event that we trigger once the config is loaded. self.configured = Event() # Polling state initialized at poll start time. self.splitter = None # Next-hop IP addresses of our hosts, if populated in etcd. self.ipv4_by_hostname = {} # Forces a resync after the current poll if set. Safe to set from # another thread. Automatically reset to False after the resync is # triggered. self.resync_requested = False self.dispatcher = PathDispatcher() # The Popen object for the driver. self._driver_process = None # Stats. self.read_count = 0 self.msgs_processed = 0 self.last_rate_log_time = monotonic_time() # Register for events when values change. self._register_paths() self._usage_report_greenlet = gevent.Greenlet( self._periodically_usage_report)
def __init__(self, config, etcd_api, status_reporter, hosts_ipset): super(_FelixEtcdWatcher, self).__init__() self._config = config self._etcd_api = etcd_api self._status_reporter = status_reporter self.hosts_ipset = hosts_ipset # Whether we've been in sync with etcd at some point. self._been_in_sync = False # Keep track of the config loaded from etcd so we can spot if it # changes. self.last_global_config = None self.last_host_config = None self.my_config_dir = dir_for_per_host_config(self._config.HOSTNAME) # Events triggered by the EtcdAPI Actor to tell us to load the config # and start polling. These are one-way flags. self.load_config = Event() self.begin_polling = Event() # Event that we trigger once the config is loaded. self.configured = Event() # Polling state initialized at poll start time. self.splitter = None # Next-hop IP addresses of our hosts, if populated in etcd. self.ipv4_by_hostname = {} # Forces a resync after the current poll if set. Safe to set from # another thread. Automatically reset to False after the resync is # triggered. self.resync_requested = False self.dispatcher = PathDispatcher() # The Popen object for the driver. self._driver_process = None # Stats. self.read_count = 0 self.msgs_processed = 0 self.last_rate_log_time = monotonic_time() # Register for events when values change. self._register_paths()
def time_since_start(self): now = monotonic_time() time_since_start = now - self.start_time return time_since_start
def watch_etcd(self, next_index, event_queue, stop_event): """ Thread: etcd watcher thread. Watches etcd for changes and sends them over the queue to the resync thread, which owns the socket to Felix. Dies if it receives an error from etcd. Note: it is important that we pass the index, queue and event as parameters to ensure that each watcher thread only touches the versions of those values that were created for it as opposed to a later-created watcher thread. :param int next_index: The etcd index to start watching from. :param Queue event_queue: Queue of updates back to the resync thread. :param Event stop_event: Event used to stop this thread when it is no longer needed. """ _log.info("Watcher thread started with next index %s", next_index) last_log_time = monotonic_time() req_end_time = None non_req_time_stat = AggregateStat("processing time", "ms") etcd_response_time = None etcd_response_time_stat = AggregateStat("etcd response time", "ms") stats = [etcd_response_time_stat, non_req_time_stat] http = None try: while not self._stop_event.is_set() and not stop_event.is_set(): if not http: _log.info("No HTTP pool, creating one...") http = self.get_etcd_connection() req_start_time = monotonic_time() if req_end_time is not None: # Calculate the time since the end of the previous request, # i.e. the time we spent processing the response. Note: # start and end are flipped because we just read the start # time but we have the end time from the last loop. non_req_time = req_start_time - req_end_time non_req_time_stat.store_reading(non_req_time * 1000) _log.debug("Waiting on etcd index %s", next_index) try: try: resp = self._etcd_request(http, VERSION_DIR, recursive=True, wait_index=next_index, timeout=90) finally: # Make sure the time is available to both exception and # mainline code paths. req_end_time = monotonic_time() etcd_response_time = req_end_time - req_start_time if resp.status != 200: _log.warning("etcd watch returned bad HTTP status to" "poll on index %s: %s", next_index, resp.status) self._check_cluster_id(resp) resp_body = resp.data # Force read inside try block. except urllib3.exceptions.ReadTimeoutError: # 100% expected when there are no events. _log.debug("Watch read timed out, restarting watch at " "index %s", next_index) # Workaround urllib3 bug #718. After a ReadTimeout, the # connection is incorrectly recycled. http = None continue except (urllib3.exceptions.HTTPError, httplib.HTTPException, socket.error) as e: # Not so expected but still possible to recover: etcd # being restarted, for example. assert etcd_response_time is not None if etcd_response_time < REQ_TIGHT_LOOP_THRESH: # Failed fast, make sure we don't tight loop. delay = REQ_TIGHT_LOOP_THRESH - etcd_response_time _log.warning("Connection to etcd failed with %r, " "restarting watch at index %s after " "delay %.3f", e, next_index, delay) time.sleep(delay) else: _log.warning("Connection to etcd failed with %r, " "restarting watch at index %s " "immediately", e, next_index) # Workaround urllib3 bug #718. After a ReadTimeout, the # connection is incorrectly recycled. http = None continue # If we get to this point, we've got an etcd response to # process; try to parse it. try: etcd_resp = json.loads(resp_body) if "errorCode" in etcd_resp: _log.error("Error from etcd for index %s: %s; " "triggering a resync.", next_index, etcd_resp) break node = etcd_resp["node"] key = node["key"] action = ACTION_MAPPING[etcd_resp["action"]] is_dir = node.get("dir", False) value = node.get("value") dir_creation = False if is_dir: if action == "delete": if key.rstrip("/") in (VERSION_DIR, ROOT_DIR): # Special case: if the whole keyspace is # deleted, that implies the ready flag is gone # too. Break out of the loop to trigger a # resync. This avoids queuing up a bunch of # events that would be discarded by the # resync thread. _log.warning("Whole %s deleted, resyncing", VERSION_DIR) break else: # Just ignore sets to directories, we only track # leaves. _log.debug("Skipping non-delete to dir %s", key) dir_creation = True modified_index = node["modifiedIndex"] except (KeyError, TypeError, ValueError): _log.exception("Unexpected format for etcd response to" "index %s: %r; triggering a resync.", next_index, resp_body) break else: # We successfully parsed the response, hand it off to the # resync thread. Now we know that we got a response, # we record that in the stat. etcd_response_time_stat.store_reading(etcd_response_time * 1000) if not dir_creation: # The resync thread doesn't need to know about # directory creations so we skip them. (It does need # to know about deletions in order to clean up # sub-keys.) event_queue.put((modified_index, key, value)) next_index = modified_index + 1 # Opportunistically log stats. now = monotonic_time() if now - last_log_time > STATS_LOG_INTERVAL: for stat in stats: _log.info("STAT: Watcher %s", stat) stat.reset() _log.info("STAT: Watcher queue length: %s", event_queue.qsize()) last_log_time = now except DriverShutdown: _log.warning("Watcher thread stopping due to driver shutdown.") except: _log.exception("Exception finishing watcher thread.") raise finally: # Signal to the resync thread that we've exited. event_queue.put(None) # Make sure we get some stats output from the watcher. for stat in stats: _log.info("STAT: Final watcher %s", stat) _log.info("Watcher thread finished. Signalled to resync thread. " "Was at index %s. Queue length is %s.", next_index, event_queue.qsize())
def test_mainline(self): a = monotonic_time() time.sleep(0.01) b = monotonic_time() self.assertTrue(b >= a+0.01, msg="Monotonic time did not increase as " "expected: %s, %s" % (a, b))
def _reset_resync_thread_stats(self): for stat in self._resync_stats: stat.reset() self._last_resync_stat_log_time = monotonic_time()
def _resync_and_merge(self): """ Thread: Resync-and-merge thread. Loads the etcd snapshot, merges it with the events going on concurrently and sends the event stream to Felix. """ _log.info("Resync thread started, waiting for config to be loaded...") self._init_received.wait() _log.info("Config loaded; continuing.") while not self._stop_event.is_set(): _log.info("Stop event not set, starting new resync...") self._reset_resync_thread_stats() loop_start = monotonic_time() try: # Start with a fresh HTTP pool just in case it got into a bad # state. self._resync_http_pool = self.get_etcd_connection() # Before we get to the snapshot, Felix needs the configuration. self._send_status(STATUS_WAIT_FOR_READY) self._wait_for_ready() self._preload_config() # Wait for config if we have not already received it. self._wait_for_config() # Kick off the snapshot request as far as the headers. self._send_status(STATUS_RESYNC) resp, snapshot_index = self._start_snapshot_request() # Before reading from the snapshot, start the watcher thread. self._ensure_watcher_running(snapshot_index) # Incrementally process the snapshot, merging in events from # the queue. self._process_snapshot_and_events(resp, snapshot_index) # We're now in-sync. Tell Felix. self._send_status(STATUS_IN_SYNC) # Then switch to processing events only. self._process_events_only() except WriteFailed: _log.exception("Write to Felix failed; shutting down.") self.stop() except WatcherDied: _log.warning("Watcher died; resyncing.") self._stop_watcher() # Clean up the event except (urllib3.exceptions.HTTPError, httplib.HTTPException, socket.error) as e: _log.error("Request to etcd failed: %r; resyncing.", e) self._stop_watcher() if monotonic_time() - loop_start < 1: _log.warning("May be tight looping, sleeping...") time.sleep(1) except ResyncRequested: _log.info("Resync requested, looping to start a new resync. " "Leaving watcher running if possible.") except ResyncRequired: _log.warn("Detected inconsistency requiring a full resync, " "stopping watcher") self._stop_watcher() except DriverShutdown: _log.info("Driver shut down.") return except: _log.exception("Unexpected exception; shutting down.") self.stop() raise finally: self._first_resync = False self._resync_requested = False _log.info("Stop event set, exiting resync loop.")
def watch_etcd(self, next_index, event_queue, stop_event): """ Thread: etcd watcher thread. Watches etcd for changes and sends them over the queue to the resync thread, which owns the socket to Felix. Dies if it receives an error from etcd. Note: it is important that we pass the index, queue and event as parameters to ensure that each watcher thread only touches the versions of those values that were created for it as opposed to a later-created watcher thread. :param next_index: The etcd index to start watching from. :param event_queue: Queue of updates back to the resync thread. :param stop_event: Event used to stop this thread when it is no longer needed. """ _log.info("Watcher thread started") http = None try: while not self._stop_event.is_set() and not stop_event.is_set(): if not http: _log.info("No HTTP pool, creating one...") http = self.get_etcd_connection() req_start_time = monotonic_time() try: _log.debug("Waiting on etcd index %s", next_index) resp = self._etcd_request(http, VERSION_DIR, recursive=True, wait_index=next_index, timeout=90) if resp.status != 200: _log.warning("etcd watch returned bad HTTP status: %s", resp.status) self._check_cluster_id(resp) resp_body = resp.data # Force read inside try block. except urllib3.exceptions.ReadTimeoutError: # 100% expected when there are no events. _log.debug("Watch read timed out, restarting watch at " "index %s", next_index) # Workaround urllib3 bug #718. After a ReadTimeout, the # connection is incorrectly recycled. http = None continue except (urllib3.exceptions.HTTPError, httplib.HTTPException, socket.error) as e: # Not so expected but still possible to recover: etcd # being restarted, for example. req_end_time = monotonic_time() req_time = req_end_time - req_start_time if req_time < REQ_TIGHT_LOOP_THRESH: # Failed fast, make sure we don't tight loop. delay = REQ_TIGHT_LOOP_THRESH - req_time _log.warning("Connection to etcd failed with %r, " "restarting watch at index %s after " "delay %.3f", e, next_index, delay) time.sleep(delay) else: _log.warning("Connection to etcd failed with %r, " "restarting watch at index %s " "immediately", e, next_index) # Workaround urllib3 bug #718. After a ReadTimeout, the # connection is incorrectly recycled. http = None continue try: etcd_resp = json.loads(resp_body) if "errorCode" in etcd_resp: _log.error("Error from etcd: %s; triggering a resync.", etcd_resp) break node = etcd_resp["node"] key = node["key"] action = ACTION_MAPPING[etcd_resp["action"]] is_dir = node.get("dir", False) value = node.get("value") dir_creation = False if is_dir: if action == "delete": if key.rstrip("/") in (VERSION_DIR, ROOT_DIR): # Special case: if the whole keyspace is # deleted, that implies the ready flag is gone # too. Break out of the loop to trigger a # resync. This avoids queuing up a bunch of # events that would be discarded by the # resync thread. _log.warning("Whole %s deleted, resyncing", VERSION_DIR) break else: # Just ignore sets to directories, we only track # leaves. _log.debug("Skipping non-delete to dir %s", key) dir_creation = True modified_index = node["modifiedIndex"] except (KeyError, TypeError, ValueError): _log.exception("Unexpected format for etcd response: %r;" "triggering a resync.", resp_body) break else: if not dir_creation: # The resync thread doesn't need to know about # directory creations so we skip them. (It does need # to know about deletions in order to clean up # sub-keys.) event_queue.put((modified_index, key, value)) next_index = modified_index + 1 except DriverShutdown: _log.warning("Watcher thread stopping due to driver shutdown.") except: _log.exception("Exception finishing watcher thread.") raise finally: # Signal to the resync thread that we've exited. _log.info("Watcher thread finished. Signalling to resync thread.") event_queue.put(None)
def watch_etcd(self, next_index, event_queue, stop_event): """ Thread: etcd watcher thread. Watches etcd for changes and sends them over the queue to the resync thread, which owns the socket to Felix. Dies if it receives an error from etcd. Note: it is important that we pass the index, queue and event as parameters to ensure that each watcher thread only touches the versions of those values that were created for it as opposed to a later-created watcher thread. :param int next_index: The etcd index to start watching from. :param Queue event_queue: Queue of updates back to the resync thread. :param Event stop_event: Event used to stop this thread when it is no longer needed. """ _log.info("Watcher thread started with next index %s", next_index) last_log_time = monotonic_time() req_end_time = None non_req_time_stat = AggregateStat("processing time", "ms") etcd_response_time = None etcd_response_time_stat = AggregateStat("etcd response time", "ms") stats = [etcd_response_time_stat, non_req_time_stat] http = None try: while not self._stop_event.is_set() and not stop_event.is_set(): if not http: _log.info("No HTTP pool, creating one...") http = self.get_etcd_connection() req_start_time = monotonic_time() if req_end_time is not None: # Calculate the time since the end of the previous request, # i.e. the time we spent processing the response. Note: # start and end are flipped because we just read the start # time but we have the end time from the last loop. non_req_time = req_start_time - req_end_time non_req_time_stat.store_reading(non_req_time * 1000) _log.debug("Waiting on etcd index %s", next_index) try: try: resp = self._etcd_request(http, VERSION_DIR, recursive=True, wait_index=next_index, timeout=90) finally: # Make sure the time is available to both exception and # mainline code paths. req_end_time = monotonic_time() etcd_response_time = req_end_time - req_start_time if resp.status != 200: _log.warning("etcd watch returned bad HTTP status to" "poll on index %s: %s", next_index, resp.status) self._check_cluster_id(resp) resp_body = resp.data # Force read inside try block. except urllib3.exceptions.ReadTimeoutError: # 100% expected when there are no events. _log.debug("Watch read timed out, restarting watch at " "index %s", next_index) # Workaround urllib3 bug #718. After a ReadTimeout, the # connection is incorrectly recycled. http = None continue except (urllib3.exceptions.HTTPError, httplib.HTTPException, socket.error) as e: # Not so expected but still possible to recover: etcd # being restarted, for example. assert etcd_response_time is not None if etcd_response_time < REQ_TIGHT_LOOP_THRESH: # Failed fast, make sure we don't tight loop. delay = REQ_TIGHT_LOOP_THRESH - etcd_response_time _log.warning("Connection to etcd failed with %r, " "restarting watch at index %s after " "delay %.3f", e, next_index, delay) time.sleep(delay) else: _log.warning("Connection to etcd failed with %r, " "restarting watch at index %s " "immediately", e, next_index) # If available, connect to a different etcd URL in case # only the previous one has failed. self._rotate_etcd_url() # Workaround urllib3 bug #718. After a ReadTimeout, the # connection is incorrectly recycled. http = None continue # If we get to this point, we've got an etcd response to # process; try to parse it. try: etcd_resp = json.loads(resp_body) if "errorCode" in etcd_resp: _log.error("Error from etcd for index %s: %s; " "triggering a resync.", next_index, etcd_resp) break node = etcd_resp["node"] key = node["key"] action = ACTION_MAPPING[etcd_resp["action"]] is_dir = node.get("dir", False) value = node.get("value") dir_creation = False if is_dir: if action == "delete": if key.rstrip("/") in (VERSION_DIR, ROOT_DIR): # Special case: if the whole keyspace is # deleted, that implies the ready flag is gone # too. Break out of the loop to trigger a # resync. This avoids queuing up a bunch of # events that would be discarded by the # resync thread. _log.warning("Whole %s deleted, resyncing", VERSION_DIR) break else: # Just ignore sets to directories, we only track # leaves. _log.debug("Skipping non-delete to dir %s", key) dir_creation = True modified_index = node["modifiedIndex"] except (KeyError, TypeError, ValueError): _log.exception("Unexpected format for etcd response to" "index %s: %r; triggering a resync.", next_index, resp_body) break else: # We successfully parsed the response, hand it off to the # resync thread. Now we know that we got a response, # we record that in the stat. etcd_response_time_stat.store_reading(etcd_response_time * 1000) if not dir_creation: # The resync thread doesn't need to know about # directory creations so we skip them. (It does need # to know about deletions in order to clean up # sub-keys.) event_queue.put((modified_index, key, value)) next_index = modified_index + 1 # Opportunistically log stats. now = monotonic_time() if now - last_log_time > STATS_LOG_INTERVAL: for stat in stats: _log.info("STAT: Watcher %s", stat) stat.reset() _log.info("STAT: Watcher queue length: %s", event_queue.qsize()) last_log_time = now except DriverShutdown: _log.warning("Watcher thread stopping due to driver shutdown.") except: _log.exception("Exception finishing watcher thread.") raise finally: # Signal to the resync thread that we've exited. event_queue.put(None) # Make sure we get some stats output from the watcher. for stat in stats: _log.info("STAT: Final watcher %s", stat) _log.info("Watcher thread finished. Signalled to resync thread. " "Was at index %s. Queue length is %s.", next_index, event_queue.qsize())
def reset(self): self.start_time = monotonic_time() self.count = 0
def _reset_resync_thread_stats(self): for stat in self._resync_stats: stat.reset() self._last_resync_stat_log_time = monotonic_time()
def _resync_and_merge(self): """ Thread: Resync-and-merge thread. Loads the etcd snapshot, merges it with the events going on concurrently and sends the event stream to Felix. """ _log.info("Resync thread started, waiting for config to be loaded...") self._init_received.wait() _log.info("Config loaded; continuing.") while not self._stop_event.is_set(): _log.info("Stop event not set, starting new resync...") self._reset_resync_thread_stats() loop_start = monotonic_time() try: # Start with a fresh HTTP pool just in case it got into a bad # state. self._resync_http_pool = self.get_etcd_connection() # Before we get to the snapshot, Felix needs the configuration. self._send_status(STATUS_WAIT_FOR_READY) self._wait_for_ready() self._preload_config() # Wait for config if we have not already received it. self._wait_for_config() # Kick off the snapshot request as far as the headers. self._send_status(STATUS_RESYNC) resp, snapshot_index = self._start_snapshot_request() # Before reading from the snapshot, start the watcher thread. self._ensure_watcher_running(snapshot_index) # Incrementally process the snapshot, merging in events from # the queue. self._process_snapshot_and_events(resp, snapshot_index) # We're now in-sync. Tell Felix. self._send_status(STATUS_IN_SYNC) # Then switch to processing events only. self._process_events_only() except WriteFailed: _log.exception("Write to Felix failed; shutting down.") self.stop() except WatcherDied: _log.warning("Watcher died; resyncing.") self._stop_watcher() # Clean up the event except (urllib3.exceptions.HTTPError, httplib.HTTPException, socket.error) as e: _log.error("Request to etcd failed: %r; resyncing.", e) self._stop_watcher() self._rotate_etcd_url() # Try a different etcd URL if possible if monotonic_time() - loop_start < 1: _log.warning("May be tight looping, sleeping...") time.sleep(1) except ResyncRequested: _log.info("Resync requested, looping to start a new resync. " "Leaving watcher running if possible.") except ResyncRequired: _log.warn("Detected inconsistency requiring a full resync, " "stopping watcher") self._stop_watcher() except DriverShutdown: _log.info("Driver shut down.") return except: _log.exception("Unexpected exception; shutting down.") self.stop() raise finally: self._first_resync = False self._resync_requested = False _log.info("Stop event set, exiting resync loop.")
def time_since_start(self): now = monotonic_time() time_since_start = now - self.start_time return time_since_start
def reset(self): self.start_time = monotonic_time() self.count = 0