class Replayer(ControlFlow): ''' Replay events from a trace To set the event scheduling parameters, pass them as keyword args to the constructor of this class, which will pass them on to the EventScheduler object it creates. ''' # Interpolated time parameter. *not* the event scheduling epsilon: time_epsilon_microseconds = 500 def __init__(self, simulation_cfg, superlog_path_or_dag, create_event_scheduler=None, print_buffers=True, wait_on_deterministic_values=False, default_dp_permit=False, fail_to_interactive=False, fail_to_interactive_on_persistent_violations=False, end_in_interactive=False, input_logger=None, allow_unexpected_messages=False, pass_through_whitelisted_messages=True, delay_flow_mods=False, **kwargs): ControlFlow.__init__(self, simulation_cfg) if wait_on_deterministic_values: self.sync_callback = ReplaySyncCallback() else: self.sync_callback = ReplaySyncCallback(self.get_interpolated_time) if type(superlog_path_or_dag) == str: superlog_path = superlog_path_or_dag # The dag is codefied as a list, where each element has # a list of its dependents self.dag = EventDag(log_parser.parse_path(superlog_path)) else: self.dag = superlog_path_or_dag if len(self.dag.events) == 0: raise ValueError("No events to replay!") self.default_dp_permit = default_dp_permit # Set DataplanePermit and DataplaneDrop to passive if permit is set # to default for event in [ e for e in self.dag.events if type(e) in dp_events ]: event.passive = default_dp_permit self.dp_checker = None if default_dp_permit: self.dp_checker = DataplaneChecker(self.dag) self.print_buffers_flag = print_buffers # compute interpolate to time to be just before first event self.compute_interpolated_time(self.dag.events[0]) # String repesentations of unexpected state changes we've passed through, for # statistics purposes. self.unexpected_state_changes = [] self.early_state_changes = [] self.event_scheduler_stats = None self.end_in_interactive = end_in_interactive self.fail_to_interactive = fail_to_interactive self.fail_to_interactive_on_persistent_violations =\ fail_to_interactive_on_persistent_violations self._input_logger = input_logger self.allow_unexpected_messages = allow_unexpected_messages self.pass_through_whitelisted_messages = pass_through_whitelisted_messages # How many logical rounds to peek ahead when deciding if a message is # expected or not. self.expected_message_round_window = 3 # String repesentations of unexpected messages we've passed through, for # statistics purposes. self.passed_unexpected_messages = [] self.delay_flow_mods = delay_flow_mods if self.pass_through_whitelisted_messages: for event in self.dag.events: if hasattr(event, "ignore_whitelisted_packets"): event.ignore_whitelisted_packets = True if create_event_scheduler: self.create_event_scheduler = create_event_scheduler else: self.create_event_scheduler = \ lambda simulation: EventScheduler(simulation, **{ k: v for k,v in kwargs.items() if k in EventScheduler.kwargs }) def _log_input_event(self, event, **kws): if self._input_logger is not None: self._input_logger.log_input_event(event, **kws) def get_interpolated_time(self): ''' During divergence, the controller may ask for the current time more or less times than they did in the original run. We control the time, so we need to give them some answer. The answers we give them should be (i) monotonically increasing, and (ii) between the time of the last recorded ("landmark") event and the next landmark event, and (iii) as close to the recorded times as possible Our temporary solution is to always return the time right before the next landmark ''' # TODO(cs): implement Andi's improved time heuristic return self.interpolated_time def compute_interpolated_time(self, current_event): next_time = current_event.time just_before_micro = next_time.microSeconds - self.time_epsilon_microseconds just_before_micro = max(0, just_before_micro) self.interpolated_time = SyncTime(next_time.seconds, just_before_micro) def increment_round(self): msg.event(color.CYAN + ("Round %d" % self.logical_time) + color.WHITE) def init_results(self, results_dir): if self._input_logger: self._input_logger.open(results_dir) def simulate(self, post_bootstrap_hook=None): ''' Caller *must* call simulation.clean_up() ''' self.simulation = self.simulation_cfg.bootstrap(self.sync_callback) assert(isinstance(self.simulation.patch_panel, BufferedPatchPanel)) # TODO(aw): remove this hack self.simulation.fail_to_interactive = self.fail_to_interactive self.simulation.fail_to_interactive_on_persistent_violations =\ self.fail_to_interactive_on_persistent_violations self.simulation.openflow_buffer.pass_through_whitelisted_messages =\ self.pass_through_whitelisted_messages self.logical_time = 0 if self.delay_flow_mods: for switch in self.simulation.topology.switches: assert(isinstance(switch, FuzzSoftwareSwitch)) switch.use_delayed_commands() self.run_simulation_forward(self.dag, post_bootstrap_hook) if self.print_buffers_flag: self._print_buffers() return self.simulation def _print_buffers(self): log.debug("Pending Message Receives:") for p in self.simulation.openflow_buffer.pending_receives(): log.debug("- %s", p) log.debug("Pending State Changes:") for p in self.sync_callback.pending_state_changes(): log.debug("- %s", p) def run_simulation_forward(self, dag, post_bootstrap_hook=None): event_scheduler = self.create_event_scheduler(self.simulation) event_scheduler.set_input_logger(self._input_logger) self.event_scheduler_stats = event_scheduler.stats if post_bootstrap_hook is not None: post_bootstrap_hook() def interrupt(sgn, frame): msg.interactive("Interrupting replayer, dropping to console (press ^C again to terminate)") signal.signal(signal.SIGINT, self.old_interrupt) self.old_interrupt = None raise KeyboardInterrupt() self.old_interrupt = signal.signal(signal.SIGINT, interrupt) try: for i, event in enumerate(dag.events): try: self.compute_interpolated_time(event) if self.default_dp_permit: self.dp_checker.check_dataplane(i, self.simulation) if isinstance(event, InputEvent): self._check_early_state_changes(dag, i, event) self._check_new_state_changes(dag, i) self._check_unexpected_dp_messages(dag, i) # TODO(cs): quasi race-condition here. If unexpected state change # happens *while* we're waiting for event, we basically have a # deadlock (if controller logging is set to blocking) until the # timeout occurs # TODO(cs): we don't actually allow new internal message events # through.. we only let new state changes through. Should experiment # with whether we would get better fidelity if we let them through. event_scheduler.schedule(event) if self.logical_time != event.round: self.logical_time = event.round self.increment_round() except KeyboardInterrupt: interactive = Interactive(self.simulation_cfg, input_logger=self._input_logger) interactive.simulate(self.simulation, bound_objects=( ('replayer', self), )) self.old_interrupt = signal.signal(signal.SIGINT, interrupt) finally: if self.old_interrupt: signal.signal(signal.SIGINT, self.old_interrupt) msg.event(color.B_BLUE+"Event Stats: %s" % str(event_scheduler.stats)) if self.default_dp_permit: msg.event(color.B_BLUE+"DataplaneDrop Stats: %s" % str(self.dp_checker.stats)) if self.end_in_interactive: interactive = Interactive(self.simulation_cfg, input_logger=self._input_logger) interactive.simulate(self.simulation, bound_objects=( ('replayer', self), )) def _check_early_state_changes(self, dag, current_index, input): ''' Check whether any pending state change that were supposed to come *after* the current input have occured. If so, we have violated causality.''' pending_state_changes = self.sync_callback.pending_state_changes() if len(pending_state_changes) > 0: # TODO(cs): currently assumes a single controller (-> single pending state # change) state_change = pending_state_changes[0] next_expected = dag.next_state_change(current_index) original_input_index = dag.get_original_index_for_event(input) if (next_expected is not None and state_change == next_expected.pending_state_change and dag.get_original_index_for_event(next_expected) > original_input_index): # raise RuntimeError("State change happened before expected! Causality violated") log.warn("State change happened before expected! Causality violated") self.early_state_changes.append(repr(next_expected)) def _check_new_state_changes(self, dag, current_index): ''' If we are blocking controllers, it's bad news bears if an unexpected internal state change occurs. Check if there are any, and ACK them so that the execution can proceed.''' pending_state_changes = self.sync_callback.pending_state_changes() if len(pending_state_changes) > 0: # TODO(cs): currently assumes a single controller (-> single pending state # change) state_change = pending_state_changes[0] next_expected = dag.next_state_change(current_index) if (next_expected is None or state_change != next_expected.pending_state_change): log.info("Unexpected state change. Ack'ing") log_event = ControllerStateChange.from_pending_state_change(state_change) # Monkeypatch a "new internal event" marker to be logged to the JSON trace # (All fields picked up by event.to_json()) log_event.new_internal_event = True log_event.replay_time = SyncTime.now() self._log_input_event(log_event) self.unexpected_state_changes.append(repr(state_change)) self.sync_callback.ack_pending_state_change(state_change) def _check_unexpected_dp_messages(self, dag, current_index): ''' If throughout replay we observe new messages that weren't in the original trace, we (usually) want to let them through. This is especially true for messages that are related to timers, such as LLDP, since timings will often change during replay, and we don't want to unnecessarily change the controller's behavior. ''' if not self.allow_unexpected_messages: return # TODO(cs): evaluate whether _check_unexpected_dp_messages is more or less # effective than _check_whitelisted_dp_messages. Compare executions with # visualization tool. Currently it appears that this method is too # liberal, and ends up causing timouts as a result of letting messages # through. # First, build a set of expected ControlMessageSends/Receives fingerprints # within the next expected_message_round_window rounds. start_round = dag.events[current_index].round expected_receive_fingerprints = set() expected_send_fingerprints = set() for i in xrange(current_index, len(dag.events)): event = dag.events[i] if event.round - start_round > self.expected_message_round_window: break if type(event) == ControlMessageReceive: (_, of_fingerprint, dpid, cid) = event.fingerprint expected_receive_fingerprints.add((of_fingerprint, dpid, cid)) if type(event) == ControlMessageSend: (_, of_fingerprint, dpid, cid) = event.fingerprint expected_send_fingerprints.add((of_fingerprint, dpid, cid)) # Now check pending messages. for expected_fingerprints, messages in [ (expected_receive_fingerprints, self.simulation.openflow_buffer.pending_receives()), (expected_send_fingerprints, self.simulation.openflow_buffer.pending_sends())]: for pending_message in messages: fingerprint = (pending_message.fingerprint, pending_message.dpid, pending_message.controller_id) if fingerprint not in expected_fingerprints: message = self.simulation.openflow_buffer.schedule(pending_message) log.debug("Sending unexpected %s" % message) b64_packet = base64_encode(message) # Monkeypatch a "new internal event" marker to be logged to the JSON trace # (All fields picked up by event.to_json()) event_type = ControlMessageReceive if type(pending_message) == PendingReceive else ControlMessageSend log_event = event_type(pending_message.dpid, pending_message.controller_id, pending_message.fingerprint, b64_packet=b64_packet) log_event.new_internal_event = True log_event.replay_time = SyncTime.now() self.passed_unexpected_messages.append(repr(log_event)) self._log_input_event(log_event)
class Replayer(ControlFlow): ''' Replay events from a trace To set the event scheduling parameters, pass them as keyword args to the constructor of this class, which will pass them on to the EventScheduler object it creates. ''' # Interpolated time parameter. *not* the event scheduling epsilon: time_epsilon_microseconds = 500 kwargs = EventScheduler.kwargs | set([ 'create_event_scheduler', 'print_buffers', 'wait_on_deterministic_values', 'default_dp_permit', 'fail_to_interactive', 'fail_to_interactive_on_persistent_violations', 'end_in_interactive', 'input_logger', 'allow_unexpected_messages', 'expected_message_round_window', 'pass_through_whitelisted_messages', 'delay_flow_mods', 'invariant_check_name', 'bug_signature', 'end_wait_seconds', 'transform_dag', 'pass_through_sends', 'fail_fast', 'check_interval' ]) def __init__(self, simulation_cfg, superlog_path_or_dag, create_event_scheduler=None, print_buffers=True, wait_on_deterministic_values=False, default_dp_permit=False, fail_to_interactive=False, fail_to_interactive_on_persistent_violations=False, end_in_interactive=False, input_logger=None, allow_unexpected_messages=False, expected_message_round_window=3, pass_through_whitelisted_messages=False, delay_flow_mods=False, invariant_check_name="", bug_signature="", end_wait_seconds=0.5, transform_dag=None, pass_through_sends=False, fail_fast=False, check_interval=5, **kwargs): ''' - If invariant_check_name is not None, check it at the end for the execution - If bug_signature is not None, check whether this particular signature appears in the output of the invariant check at the end of the execution ''' ControlFlow.__init__(self, simulation_cfg) # Label uniquely identifying this replay, set in init_results() self.replay_id = "N/A" self.logical_time = 0 if wait_on_deterministic_values: self.sync_callback = ReplaySyncCallback() else: self.sync_callback = ReplaySyncCallback(self.get_interpolated_time) if type(superlog_path_or_dag) == str: superlog_path = superlog_path_or_dag # The dag is codefied as a list, where each element has # a list of its dependents self.dag = EventDag(log_parser.parse_path(superlog_path)) else: self.dag = superlog_path_or_dag if len(self.dag.events) == 0: raise ValueError("No events to replay!") self.default_dp_permit = default_dp_permit self.dp_checker = self._setup_dp_checker(default_dp_permit) if self.default_dp_permit: # Set DataplanePermit and DataplaneDrop to passive if permit is set # to default # TODO(cs): rather than setting these to passive (which still causes them to # be scheduled as regular events) should these just be removed from the # event dag altogether? for event in [e for e in self.dag.events if type(e) in dp_events]: event.passive = default_dp_permit self.print_buffers_flag = print_buffers self.fail_fast = fail_fast self.check_interval = check_interval # compute interpolate to time to be just before first event self.compute_interpolated_time(self.dag.events[0]) # String repesentations of unexpected state changes we've passed through, for # statistics purposes. self.unexpected_state_changes = [] self.early_state_changes = [] self.event_scheduler_stats = None self.end_in_interactive = end_in_interactive self.fail_to_interactive = fail_to_interactive self.fail_to_interactive_on_persistent_violations =\ fail_to_interactive_on_persistent_violations self._input_logger = input_logger self.allow_unexpected_messages = allow_unexpected_messages self.expected_message_round_window = expected_message_round_window self.pass_through_whitelisted_messages = pass_through_whitelisted_messages self.pass_through_sends = pass_through_sends # How many logical rounds to peek ahead when deciding if a message is # expected or not. # String repesentations of unexpected messages we've passed through, for # statistics purposes. self.passed_unexpected_messages = [] self.delay_flow_mods = delay_flow_mods self.end_wait_seconds = end_wait_seconds self.transform_dag = transform_dag self.bug_signature = bug_signature self.invariant_check_name = invariant_check_name self.invariant_check = None if self.invariant_check_name: if self.invariant_check_name not in name_to_invariant_check: raise ValueError( '''Unknown invariant check %s.\n''' '''Invariant check name must be defined in config.invariant_checks''', self.invariant_check_name) self.invariant_check = name_to_invariant_check[ self.invariant_check_name] if self.pass_through_whitelisted_messages: for event in self.dag.events: if hasattr(event, "ignore_whitelisted_packets"): event.ignore_whitelisted_packets = True if self.simulation_cfg.ignore_interposition: self._ignore_interposition() if create_event_scheduler: self.create_event_scheduler = create_event_scheduler else: if self.default_dp_permit: # Tell EventScheduler to use call into us whenever it wants to sleep or # select, so that we can keep forwarding dataplane packets. kwargs[ 'sleep_continuation'] = self._sleep_with_dataplane_passthrough kwargs[ 'select_continuation'] = self._select_with_dataplane_passthrough self.create_event_scheduler = \ lambda simulation: EventScheduler(simulation, **{ k: v for k,v in kwargs.items() if k in EventScheduler.kwargs }) unknown_kwargs = [ k for k in kwargs.keys() if k not in EventScheduler.kwargs ] if unknown_kwargs != []: raise ValueError("Unknown kwargs %s" % str(unknown_kwargs)) def _log_input_event(self, event, **kws): if self._input_logger is not None: self._input_logger.log_input_event(event, **kws) def _setup_dp_checker(self, default_dp_permit): no_dp_drops = [e for e in self.dag.events if type(e) == DataplaneDrop] == [] if not default_dp_permit and no_dp_drops: print >> sys.stderr, ( '''No DataplaneDrops to replay. We suggest you ''' '''set Replayer's default_dp_permit=True ''') if default_dp_permit: if no_dp_drops: return AlwaysAllowDataplane(self.dag) else: return DataplaneChecker(self.dag) return None def _ignore_interposition(self): ''' Configure all interposition points to immediately pass through all internal events (possibly useful for replays affected by non-determinism) ''' filtered_events = [ e for e in self.dag.events if type(e) not in all_internal_events ] self.dag = EventDag(filtered_events) self.default_dp_permit = True self.dp_checker = AlwaysAllowDataplane(self.dag) def _sleep_with_dataplane_passthrough(self, seconds): ''' Pre: simulate() has been called ''' start = time.time() while not self.simulation.io_master.closed: elapsed = time.time() - start remaining = seconds - elapsed if remaining < 0.01: break self._select_with_dataplane_passthrough(remaining) def _select_with_dataplane_passthrough(self, timeout_seconds): ''' Pre: simulate() has been called ''' if self.default_dp_permit: self.dp_checker.check_dataplane(-1, self.simulation) return self.simulation.io_master.select(timeout_seconds) def get_interpolated_time(self): ''' During divergence, the controller may ask for the current time more or less times than they did in the original run. We control the time, so we need to give them some answer. The answers we give them should be (i) monotonically increasing, and (ii) between the time of the last recorded ("landmark") event and the next landmark event, and (iii) as close to the recorded times as possible Our temporary solution is to always return the time right before the next landmark ''' # TODO(cs): implement Andi's improved time heuristic return self.interpolated_time def compute_interpolated_time(self, current_event): next_time = current_event.time just_before_micro = next_time.microSeconds - self.time_epsilon_microseconds just_before_micro = max(0, just_before_micro) self.interpolated_time = SyncTime(next_time.seconds, just_before_micro) def increment_round(self): msg.event(color.CYAN + ("Round %d" % self.logical_time) + color.WHITE) def init_results(self, results_dir): self.replay_id = results_dir if self._input_logger: self._input_logger.open(results_dir) def set_pass_through_sends(self, simulation): simulation.openflow_buffer.pass_through_sends_only() for e in self.dag.events: if type(e) == ControlMessageSend: e.pass_through_sends = True def simulate(self, post_bootstrap_hook=None): ''' Caller *must* call simulation.clean_up() ''' if self.transform_dag: log.info("Transforming dag") self.dag = self.transform_dag(self.dag) log.info("Proceeding with normal replay") self.simulation = self.simulation_cfg.bootstrap(self.sync_callback) assert (isinstance(self.simulation.patch_panel, BufferedPatchPanel)) # TODO(aw): remove this hack self.simulation.fail_to_interactive = self.fail_to_interactive self.simulation.fail_to_interactive_on_persistent_violations =\ self.fail_to_interactive_on_persistent_violations self.simulation.openflow_buffer.pass_through_whitelisted_packets =\ self.pass_through_whitelisted_messages if self.pass_through_sends: self.set_pass_through_sends(self.simulation) if self.delay_flow_mods: for switch in self.simulation.topology.switches: assert (isinstance(switch, FuzzSoftwareSwitch)) switch.use_delayed_commands() self.run_simulation_forward(post_bootstrap_hook) if self.print_buffers_flag: self._print_buffers() return self.simulation def _print_buffers(self): log.debug("Pending Message Receives:") for p in self.simulation.openflow_buffer.pending_receives: log.debug("- %s", p) log.debug("Pending State Changes:") for p in self.sync_callback.pending_state_changes(): log.debug("- %s", p) def run_simulation_forward(self, post_bootstrap_hook=None): event_scheduler = self.create_event_scheduler(self.simulation) event_scheduler.set_input_logger(self._input_logger) self.event_scheduler_stats = event_scheduler.stats if post_bootstrap_hook is not None: post_bootstrap_hook() def interrupt(sgn, frame): msg.interactive( "Interrupting replayer, dropping to console (press ^C again to terminate)" ) signal.signal(signal.SIGINT, self.old_interrupt) self.old_interrupt = None raise KeyboardInterrupt() self.old_interrupt = signal.signal(signal.SIGINT, interrupt) try: for i, event in enumerate(self.dag.events): try: self.compute_interpolated_time(event) if self.default_dp_permit: self.dp_checker.check_dataplane(i, self.simulation) if isinstance(event, InputEvent): self._check_early_state_changes(self.dag, i, event) self._check_new_state_changes(self.dag, i) self._check_unexpected_cp_messages(self.dag, i) # TODO(cs): quasi race-condition here. If unexpected state change # happens *while* we're waiting for event, we basically have a # deadlock (if controller logging is set to blocking) until the # timeout occurs # TODO(cs): we don't actually allow new internal message events # through.. we only let new state changes through. Should experiment # with whether we would get better fidelity if we let them through. if self.fail_fast and (i % self.check_interval ) == 0 and self._check_violation(): return event_scheduler.schedule(event) if self.logical_time != event.round: self.logical_time = event.round self.increment_round() except KeyboardInterrupt: interactive = Interactive(self.simulation_cfg, input_logger=self._input_logger) interactive.simulate(self.simulation, bound_objects=(('replayer', self), )) # If Interactive terminated due to ^D, return to our fuzzing loop, # prepared again to drop into Interactive on ^C. self.old_interrupt = signal.signal(signal.SIGINT, interrupt) except Exception: log.critical( "Exception raised while scheduling event %s, replay %s" % (str(event), self.replay_id)) raise if self.invariant_check: # Wait a bit in case the bug takes awhile to happen # TODO(cs): may be redundant with WaitTime events at the end of the # trace. log.debug("Sleeping %d seconds after run" % self.end_wait_seconds) if self.default_dp_permit: self._sleep_with_dataplane_passthrough( self.end_wait_seconds) else: time.sleep(self.end_wait_seconds) self._check_violation() finally: if self.old_interrupt: signal.signal(signal.SIGINT, self.old_interrupt) msg.event(color.B_BLUE + "Event Stats: %s" % str(event_scheduler.stats)) if self.default_dp_permit: msg.event(color.B_BLUE + "DataplaneDrop Stats: %s" % str(self.dp_checker.stats)) if self.end_in_interactive: interactive = Interactive(self.simulation_cfg, input_logger=self._input_logger) interactive.simulate(self.simulation, bound_objects=(('replayer', self), )) def _check_violation(self): # TODO(cs): this does not verify whether the violation is persistent # or transient. Perhaps it should? violations = self.invariant_check(self.simulation) self.simulation.violation_found = False if violations != []: self._log_input_event(InvariantViolation(violations)) msg.fail("Violations at end of trace: %s" % str(violations)) if self.bug_signature: if self.bug_signature in violations: self.simulation.violation_found = True msg.success("Violation found %s" % self.bug_signature) return True else: msg.fail("Violation does not match violation signature!") return False else: self.simulation.violation_found = True return True else: msg.success("No correctness violations!") return False def _check_early_state_changes(self, dag, current_index, input): ''' Check whether any pending state change that were supposed to come *after* the current input have occured. If so, we have violated causality.''' # Note that we never violate causality due to message sends/receipts, # since we buffer all messages. pending_state_changes = self.sync_callback.pending_state_changes() if len(pending_state_changes) > 0: # TODO(cs): currently assumes a single controller (-> single pending state # change) state_change = pending_state_changes[0] next_expected = dag.next_state_change(current_index) original_input_index = dag.get_original_index_for_event(input) if (next_expected is not None and state_change == next_expected.pending_state_change and dag.get_original_index_for_event(next_expected) > original_input_index): # raise RuntimeError("State change happened before expected! Causality violated") log.warn( "State change happened before expected! Causality violated" ) self.early_state_changes.append(repr(next_expected)) def _check_new_state_changes(self, dag, current_index): ''' If we are blocking controllers, it's bad news bears if an unexpected internal state change occurs. Check if there are any, and ACK them so that the execution can proceed.''' pending_state_changes = self.sync_callback.pending_state_changes() if len(pending_state_changes) > 0: # TODO(cs): currently assumes a single controller (-> single pending state # change) state_change = pending_state_changes[0] next_expected = dag.next_state_change(current_index) if (next_expected is None or state_change != next_expected.pending_state_change): log.info("Unexpected state change. Ack'ing %s" % str(state_change)) log_event = ControllerStateChange.from_pending_state_change( state_change) # Monkeypatch a "new internal event" marker to be logged to the JSON trace # (All fields picked up by event.to_json()) log_event.new_internal_event = True log_event.replay_time = SyncTime.now() self._log_input_event(log_event) self.unexpected_state_changes.append(repr(state_change)) self.sync_callback.ack_pending_state_change(state_change) def _check_unexpected_cp_messages(self, dag, current_index): ''' If throughout replay we observe new messages that weren't in the original trace, we (usually) want to let them through. This is especially true for messages that are related to timers, such as LLDP, since timings will often change during replay, and we don't want to unnecessarily change the controller's behavior. ''' if not self.allow_unexpected_messages: return # TODO(cs): evaluate whether _check_unexpected_cp_messages is more or less # effective than whitelisting cp message types (in OpenflowBuffer). # Compare executions with visualization tool. # Currently it appears that this method is too liberal, and ends up # causing timouts as a result of letting messages through. # First, build a set of expected ControlMessageSends/Receives fingerprints # within the next expected_message_round_window rounds. start_round = dag.events[current_index].round expected_receive_fingerprints = set() expected_send_fingerprints = set() for i in xrange(current_index, len(dag.events)): event = dag.events[i] if event.round - start_round > self.expected_message_round_window: break if type(event) == ControlMessageReceive: (_, of_fingerprint, dpid, cid) = event.fingerprint expected_receive_fingerprints.add((of_fingerprint, dpid, cid)) if type(event) == ControlMessageSend: (_, of_fingerprint, dpid, cid) = event.fingerprint expected_send_fingerprints.add((of_fingerprint, dpid, cid)) # Now check pending messages. for expected_fingerprints, messages in [ (expected_receive_fingerprints, self.simulation.openflow_buffer.pending_receives), (expected_send_fingerprints, self.simulation.openflow_buffer.pending_sends) ]: for pending_message in messages: fingerprint = (pending_message.fingerprint, pending_message.dpid, pending_message.controller_id) if fingerprint not in expected_fingerprints: message = self.simulation.openflow_buffer.schedule( pending_message) log.debug("Allowed unexpected message %s" % message) b64_packet = base64_encode(message) # Monkeypatch a "new internal event" marker to be logged to the JSON trace # (All fields picked up by event.to_json()) event_type = ControlMessageReceive if type( pending_message ) == PendingReceive else ControlMessageSend log_event = event_type(pending_message.dpid, pending_message.controller_id, pending_message.fingerprint, b64_packet=b64_packet) log_event.new_internal_event = True log_event.replay_time = SyncTime.now() self.passed_unexpected_messages.append(repr(log_event)) self._log_input_event(log_event)
class Replayer(ControlFlow): ''' Replay events from a trace To set the event scheduling parameters, pass them as keyword args to the constructor of this class, which will pass them on to the EventScheduler object it creates. ''' # Interpolated time parameter. *not* the event scheduling epsilon: time_epsilon_microseconds = 500 kwargs = EventScheduler.kwargs | set(['create_event_scheduler', 'print_buffers', 'wait_on_deterministic_values', 'default_dp_permit', 'fail_to_interactive', 'fail_to_interactive_on_persistent_violations', 'end_in_interactive', 'input_logger', 'allow_unexpected_messages', 'expected_message_round_window', 'pass_through_whitelisted_messages', 'delay_flow_mods', 'invariant_check_name', 'bug_signature', 'end_wait_seconds', 'transform_dag', 'pass_through_sends', 'fail_fast', 'check_interval']) def __init__(self, simulation_cfg, superlog_path_or_dag, create_event_scheduler=None, print_buffers=True, wait_on_deterministic_values=False, default_dp_permit=False, fail_to_interactive=False, fail_to_interactive_on_persistent_violations=False, end_in_interactive=False, input_logger=None, allow_unexpected_messages=False, expected_message_round_window=3, pass_through_whitelisted_messages=False, delay_flow_mods=False, invariant_check_name="", bug_signature="", end_wait_seconds=0.5, transform_dag=None, pass_through_sends=False, fail_fast=False, check_interval=5, **kwargs): ''' - If invariant_check_name is not None, check it at the end for the execution - If bug_signature is not None, check whether this particular signature appears in the output of the invariant check at the end of the execution ''' ControlFlow.__init__(self, simulation_cfg) # Label uniquely identifying this replay, set in init_results() self.replay_id = "N/A" self.logical_time = 0 if wait_on_deterministic_values: self.sync_callback = ReplaySyncCallback() else: self.sync_callback = ReplaySyncCallback(self.get_interpolated_time) if type(superlog_path_or_dag) == str: superlog_path = superlog_path_or_dag # The dag is codefied as a list, where each element has # a list of its dependents self.dag = EventDag(log_parser.parse_path(superlog_path)) else: self.dag = superlog_path_or_dag if len(self.dag.events) == 0: raise ValueError("No events to replay!") self.default_dp_permit = default_dp_permit self.dp_checker = self._setup_dp_checker(default_dp_permit) if self.default_dp_permit: # Set DataplanePermit and DataplaneDrop to passive if permit is set # to default # TODO(cs): rather than setting these to passive (which still causes them to # be scheduled as regular events) should these just be removed from the # event dag altogether? for event in [ e for e in self.dag.events if type(e) in dp_events ]: event.passive = default_dp_permit self.print_buffers_flag = print_buffers self.fail_fast = fail_fast self.check_interval = check_interval # compute interpolate to time to be just before first event self.compute_interpolated_time(self.dag.events[0]) # String repesentations of unexpected state changes we've passed through, for # statistics purposes. self.unexpected_state_changes = [] self.early_state_changes = [] self.event_scheduler_stats = None self.end_in_interactive = end_in_interactive self.fail_to_interactive = fail_to_interactive self.fail_to_interactive_on_persistent_violations =\ fail_to_interactive_on_persistent_violations self._input_logger = input_logger self.allow_unexpected_messages = allow_unexpected_messages self.expected_message_round_window = expected_message_round_window self.pass_through_whitelisted_messages = pass_through_whitelisted_messages self.pass_through_sends = pass_through_sends # How many logical rounds to peek ahead when deciding if a message is # expected or not. # String repesentations of unexpected messages we've passed through, for # statistics purposes. self.passed_unexpected_messages = [] self.delay_flow_mods = delay_flow_mods self.end_wait_seconds = end_wait_seconds self.transform_dag = transform_dag self.bug_signature = bug_signature self.invariant_check_name = invariant_check_name self.invariant_check = None if self.invariant_check_name: if self.invariant_check_name not in name_to_invariant_check: raise ValueError('''Unknown invariant check %s.\n''' '''Invariant check name must be defined in config.invariant_checks''', self.invariant_check_name) self.invariant_check = name_to_invariant_check[self.invariant_check_name] if self.pass_through_whitelisted_messages: for event in self.dag.events: if hasattr(event, "ignore_whitelisted_packets"): event.ignore_whitelisted_packets = True if self.simulation_cfg.ignore_interposition: self._ignore_interposition() if create_event_scheduler: self.create_event_scheduler = create_event_scheduler else: if self.default_dp_permit: # Tell EventScheduler to use call into us whenever it wants to sleep or # select, so that we can keep forwarding dataplane packets. kwargs['sleep_continuation'] = self._sleep_with_dataplane_passthrough kwargs['select_continuation'] = self._select_with_dataplane_passthrough self.create_event_scheduler = \ lambda simulation: EventScheduler(simulation, **{ k: v for k,v in kwargs.items() if k in EventScheduler.kwargs }) unknown_kwargs = [ k for k in kwargs.keys() if k not in EventScheduler.kwargs ] if unknown_kwargs != []: raise ValueError("Unknown kwargs %s" % str(unknown_kwargs)) def _log_input_event(self, event, **kws): if self._input_logger is not None: self._input_logger.log_input_event(event, **kws) def _setup_dp_checker(self, default_dp_permit): no_dp_drops = [ e for e in self.dag.events if type(e) == DataplaneDrop ] == [] if not default_dp_permit and no_dp_drops: print >> sys.stderr, ('''No DataplaneDrops to replay. We suggest you ''' '''set Replayer's default_dp_permit=True ''') if default_dp_permit: if no_dp_drops: return AlwaysAllowDataplane(self.dag) else: return DataplaneChecker(self.dag) return None def _ignore_interposition(self): ''' Configure all interposition points to immediately pass through all internal events (possibly useful for replays affected by non-determinism) ''' filtered_events = [e for e in self.dag.events if type(e) not in all_internal_events] self.dag = EventDag(filtered_events) self.default_dp_permit = True self.dp_checker = AlwaysAllowDataplane(self.dag) def _sleep_with_dataplane_passthrough(self, seconds): ''' Pre: simulate() has been called ''' start = time.time() while not self.simulation.io_master.closed: elapsed = time.time() - start remaining = seconds - elapsed if remaining < 0.01: break self._select_with_dataplane_passthrough(remaining) def _select_with_dataplane_passthrough(self, timeout_seconds): ''' Pre: simulate() has been called ''' if self.default_dp_permit: self.dp_checker.check_dataplane(-1, self.simulation) return self.simulation.io_master.select(timeout_seconds) def get_interpolated_time(self): ''' During divergence, the controller may ask for the current time more or less times than they did in the original run. We control the time, so we need to give them some answer. The answers we give them should be (i) monotonically increasing, and (ii) between the time of the last recorded ("landmark") event and the next landmark event, and (iii) as close to the recorded times as possible Our temporary solution is to always return the time right before the next landmark ''' # TODO(cs): implement Andi's improved time heuristic return self.interpolated_time def compute_interpolated_time(self, current_event): next_time = current_event.time just_before_micro = next_time.microSeconds - self.time_epsilon_microseconds just_before_micro = max(0, just_before_micro) self.interpolated_time = SyncTime(next_time.seconds, just_before_micro) def increment_round(self): msg.event(color.CYAN + ("Round %d" % self.logical_time) + color.WHITE) def init_results(self, results_dir): self.replay_id = results_dir if self._input_logger: self._input_logger.open(results_dir) def set_pass_through_sends(self, simulation): simulation.openflow_buffer.pass_through_sends_only() for e in self.dag.events: if type(e) == ControlMessageSend: e.pass_through_sends = True def simulate(self, post_bootstrap_hook=None): ''' Caller *must* call simulation.clean_up() ''' if self.transform_dag: log.info("Transforming dag") self.dag = self.transform_dag(self.dag) log.info("Proceeding with normal replay") self.simulation = self.simulation_cfg.bootstrap(self.sync_callback) assert(isinstance(self.simulation.patch_panel, BufferedPatchPanel)) # TODO(aw): remove this hack self.simulation.fail_to_interactive = self.fail_to_interactive self.simulation.fail_to_interactive_on_persistent_violations =\ self.fail_to_interactive_on_persistent_violations self.simulation.openflow_buffer.pass_through_whitelisted_packets =\ self.pass_through_whitelisted_messages if self.pass_through_sends: self.set_pass_through_sends(self.simulation) if self.delay_flow_mods: for switch in self.simulation.topology.switches: assert(isinstance(switch, FuzzSoftwareSwitch)) switch.use_delayed_commands() self.run_simulation_forward(post_bootstrap_hook) if self.print_buffers_flag: self._print_buffers() return self.simulation def _print_buffers(self): log.debug("Pending Message Receives:") for p in self.simulation.openflow_buffer.pending_receives: log.debug("- %s", p) log.debug("Pending State Changes:") for p in self.sync_callback.pending_state_changes(): log.debug("- %s", p) def run_simulation_forward(self, post_bootstrap_hook=None): event_scheduler = self.create_event_scheduler(self.simulation) event_scheduler.set_input_logger(self._input_logger) self.event_scheduler_stats = event_scheduler.stats if post_bootstrap_hook is not None: post_bootstrap_hook() def interrupt(sgn, frame): msg.interactive("Interrupting replayer, dropping to console (press ^C again to terminate)") signal.signal(signal.SIGINT, self.old_interrupt) self.old_interrupt = None raise KeyboardInterrupt() self.old_interrupt = signal.signal(signal.SIGINT, interrupt) try: for i, event in enumerate(self.dag.events): try: self.compute_interpolated_time(event) if self.default_dp_permit: self.dp_checker.check_dataplane(i, self.simulation) if isinstance(event, InputEvent): self._check_early_state_changes(self.dag, i, event) self._check_new_state_changes(self.dag, i) self._check_unexpected_cp_messages(self.dag, i) # TODO(cs): quasi race-condition here. If unexpected state change # happens *while* we're waiting for event, we basically have a # deadlock (if controller logging is set to blocking) until the # timeout occurs # TODO(cs): we don't actually allow new internal message events # through.. we only let new state changes through. Should experiment # with whether we would get better fidelity if we let them through. if self.fail_fast and (i % self.check_interval) == 0 and self._check_violation(): return event_scheduler.schedule(event) if self.logical_time != event.round: self.logical_time = event.round self.increment_round() except KeyboardInterrupt: interactive = Interactive(self.simulation_cfg, input_logger=self._input_logger) interactive.simulate(self.simulation, bound_objects=( ('replayer', self), )) # If Interactive terminated due to ^D, return to our fuzzing loop, # prepared again to drop into Interactive on ^C. self.old_interrupt = signal.signal(signal.SIGINT, interrupt) except Exception: log.critical("Exception raised while scheduling event %s, replay %s" % (str(event), self.replay_id)) raise if self.invariant_check: # Wait a bit in case the bug takes awhile to happen # TODO(cs): may be redundant with WaitTime events at the end of the # trace. log.debug("Sleeping %d seconds after run" % self.end_wait_seconds) if self.default_dp_permit: self._sleep_with_dataplane_passthrough(self.end_wait_seconds) else: time.sleep(self.end_wait_seconds) self._check_violation() finally: if self.old_interrupt: signal.signal(signal.SIGINT, self.old_interrupt) msg.event(color.B_BLUE+"Event Stats: %s" % str(event_scheduler.stats)) if self.default_dp_permit: msg.event(color.B_BLUE+"DataplaneDrop Stats: %s" % str(self.dp_checker.stats)) if self.end_in_interactive: interactive = Interactive(self.simulation_cfg, input_logger=self._input_logger) interactive.simulate(self.simulation, bound_objects=( ('replayer', self), )) def _check_violation(self): # TODO(cs): this does not verify whether the violation is persistent # or transient. Perhaps it should? violations = self.invariant_check(self.simulation) self.simulation.violation_found = False if violations != []: self._log_input_event(InvariantViolation(violations)) msg.fail("Violations at end of trace: %s" % str(violations)) if self.bug_signature: if self.bug_signature in violations: self.simulation.violation_found = True msg.success("Violation found %s" % self.bug_signature) return True else: msg.fail("Violation does not match violation signature!") return False else: self.simulation.violation_found = True return True else: msg.success("No correctness violations!") return False def _check_early_state_changes(self, dag, current_index, input): ''' Check whether any pending state change that were supposed to come *after* the current input have occured. If so, we have violated causality.''' # Note that we never violate causality due to message sends/receipts, # since we buffer all messages. pending_state_changes = self.sync_callback.pending_state_changes() if len(pending_state_changes) > 0: # TODO(cs): currently assumes a single controller (-> single pending state # change) state_change = pending_state_changes[0] next_expected = dag.next_state_change(current_index) original_input_index = dag.get_original_index_for_event(input) if (next_expected is not None and state_change == next_expected.pending_state_change and dag.get_original_index_for_event(next_expected) > original_input_index): # raise RuntimeError("State change happened before expected! Causality violated") log.warn("State change happened before expected! Causality violated") self.early_state_changes.append(repr(next_expected)) def _check_new_state_changes(self, dag, current_index): ''' If we are blocking controllers, it's bad news bears if an unexpected internal state change occurs. Check if there are any, and ACK them so that the execution can proceed.''' pending_state_changes = self.sync_callback.pending_state_changes() if len(pending_state_changes) > 0: # TODO(cs): currently assumes a single controller (-> single pending state # change) state_change = pending_state_changes[0] next_expected = dag.next_state_change(current_index) if (next_expected is None or state_change != next_expected.pending_state_change): log.info("Unexpected state change. Ack'ing %s" % str(state_change)) log_event = ControllerStateChange.from_pending_state_change(state_change) # Monkeypatch a "new internal event" marker to be logged to the JSON trace # (All fields picked up by event.to_json()) log_event.new_internal_event = True log_event.replay_time = SyncTime.now() self._log_input_event(log_event) self.unexpected_state_changes.append(repr(state_change)) self.sync_callback.ack_pending_state_change(state_change) def _check_unexpected_cp_messages(self, dag, current_index): ''' If throughout replay we observe new messages that weren't in the original trace, we (usually) want to let them through. This is especially true for messages that are related to timers, such as LLDP, since timings will often change during replay, and we don't want to unnecessarily change the controller's behavior. ''' if not self.allow_unexpected_messages: return # TODO(cs): evaluate whether _check_unexpected_cp_messages is more or less # effective than whitelisting cp message types (in OpenflowBuffer). # Compare executions with visualization tool. # Currently it appears that this method is too liberal, and ends up # causing timouts as a result of letting messages through. # First, build a set of expected ControlMessageSends/Receives fingerprints # within the next expected_message_round_window rounds. start_round = dag.events[current_index].round expected_receive_fingerprints = set() expected_send_fingerprints = set() for i in xrange(current_index, len(dag.events)): event = dag.events[i] if event.round - start_round > self.expected_message_round_window: break if type(event) == ControlMessageReceive: (_, of_fingerprint, dpid, cid) = event.fingerprint expected_receive_fingerprints.add((of_fingerprint, dpid, cid)) if type(event) == ControlMessageSend: (_, of_fingerprint, dpid, cid) = event.fingerprint expected_send_fingerprints.add((of_fingerprint, dpid, cid)) # Now check pending messages. for expected_fingerprints, messages in [ (expected_receive_fingerprints, self.simulation.openflow_buffer.pending_receives), (expected_send_fingerprints, self.simulation.openflow_buffer.pending_sends)]: for pending_message in messages: fingerprint = (pending_message.fingerprint, pending_message.dpid, pending_message.controller_id) if fingerprint not in expected_fingerprints: message = self.simulation.openflow_buffer.schedule(pending_message) log.debug("Allowed unexpected message %s" % message) b64_packet = base64_encode(message) # Monkeypatch a "new internal event" marker to be logged to the JSON trace # (All fields picked up by event.to_json()) event_type = ControlMessageReceive if type(pending_message) == PendingReceive else ControlMessageSend log_event = event_type(pending_message.dpid, pending_message.controller_id, pending_message.fingerprint, b64_packet=b64_packet) log_event.new_internal_event = True log_event.replay_time = SyncTime.now() self.passed_unexpected_messages.append(repr(log_event)) self._log_input_event(log_event)
class Replayer(ControlFlow): ''' Replay events from a `superlog` with causal dependencies, pruning as we go To set the event scheduling paramters, pass them as keyword args to the constructor of this class, which will pass them on to the EventScheduler object it creates. ''' # Runtime stats: total_replays = 0 total_inputs_replayed = 0 # Interpolated time parameter. *not* the event scheduling epsilon: time_epsilon_microseconds = 500 def __init__(self, simulation_cfg, superlog_path_or_dag, create_event_scheduler=None, print_buffers=True, wait_on_deterministic_values=False, fail_to_interactive=False, **kwargs): ControlFlow.__init__(self, simulation_cfg) if wait_on_deterministic_values: self.sync_callback = ReplaySyncCallback() else: self.sync_callback = ReplaySyncCallback(self.get_interpolated_time) if type(superlog_path_or_dag) == str: superlog_path = superlog_path_or_dag # The dag is codefied as a list, where each element has # a list of its dependents self.dag = EventDag(superlog_parser.parse_path(superlog_path)) else: self.dag = superlog_path_or_dag self.dp_checker = DataplaneChecker(self.dag) self.print_buffers = print_buffers # compute interpolate to time to be just before first event self.compute_interpolated_time(self.dag.events[0]) self.unexpected_state_changes = [] self.early_state_changes = [] self.event_scheduler_stats = None self.fail_to_interactive = fail_to_interactive if create_event_scheduler: self.create_event_scheduler = create_event_scheduler else: self.create_event_scheduler = \ lambda simulation: EventScheduler(simulation, **{ k: v for k,v in kwargs.items() if k in EventScheduler.kwargs }) def get_interpolated_time(self): ''' During divergence, the controller may ask for the current time more or less times than they did in the original run. We control the time, so we need to give them some answer. The answers we give them should be (i) monotonically increasing, and (ii) between the time of the last recorded ("landmark") event and the next landmark event, and (iii) as close to the recorded times as possible Our temporary solution is to always return the time right before the next landmark ''' # TODO(cs): implement Andi's improved time heuristic return self.interpolated_time def compute_interpolated_time(self, current_event): next_time = current_event.time just_before_micro = next_time.microSeconds - self.time_epsilon_microseconds just_before_micro = max(0, just_before_micro) self.interpolated_time = SyncTime(next_time.seconds, just_before_micro) def increment_round(self): msg.event(color.CYAN + ( "Round %d" % self.logical_time) + color.WHITE) def simulate(self, post_bootstrap_hook=None): ''' Caller *must* call simulation.clean_up() ''' Replayer.total_replays += 1 Replayer.total_inputs_replayed += len(self.dag.input_events) self.simulation = self.simulation_cfg.bootstrap(self.sync_callback) assert(isinstance(self.simulation.patch_panel, BufferedPatchPanel)) ### TODO aw remove this hack self.simulation.fail_to_interactive = self.fail_to_interactive self.logical_time = 0 self.run_simulation_forward(self.dag, post_bootstrap_hook) if self.print_buffers: self._print_buffers() return self.simulation def _print_buffers(self): log.debug("Pending Message Receives:") for p in self.simulation.god_scheduler.pending_receives(): log.debug("- %s", p) log.debug("Pending State Changes:") for p in self.sync_callback.pending_state_changes(): log.debug("- %s", p) def run_simulation_forward(self, dag, post_bootstrap_hook=None): event_scheduler = self.create_event_scheduler(self.simulation) self.event_scheduler_stats = event_scheduler.stats if post_bootstrap_hook is not None: post_bootstrap_hook() old_interrupt = None def interrupt(sgn, frame): msg.interactive("Interrupting replayer, dropping to console (press ^C again to terminate)") signal.signal(signal.SIGINT, self.old_interrupt) self.old_interrupt = None raise KeyboardInterrupt() self.old_interrupt = signal.signal(signal.SIGINT, interrupt) try: for i, event in enumerate(dag.events): try: self.compute_interpolated_time(event) self.dp_checker.check_dataplane(i, self.simulation) if isinstance(event, InputEvent): self._check_early_state_changes(dag, i, event) self._check_new_state_changes(dag, i) # TODO(cs): quasi race-condition here. If unexpected state change # happens *while* we're waiting for event, we basically have a # deadlock (if controller logging is set to blocking) until the # timeout occurs event_scheduler.schedule(event) if self.logical_time != event.round: self.logical_time = event.round self.increment_round() except KeyboardInterrupt as e: interactive = Interactive(self.simulation_cfg) interactive.simulate(self.simulation, bound_objects=( ('replayer', self), )) self.old_interrupt = signal.signal(signal.SIGINT, interrupt) finally: if self.old_interrupt: signal.signal(signal.SIGINT, self.old_interrupt) msg.event(color.B_BLUE+"Event Stats: %s" % str(event_scheduler.stats)) msg.event(color.B_BLUE+"DataplaneDrop Stats: %s" % str(self.dp_checker.stats)) def _check_early_state_changes(self, dag, current_index, input): ''' Check whether the any pending state change were supposed to come *after* the current input. If so, we have violated causality.''' pending_state_changes = self.sync_callback.pending_state_changes() if len(pending_state_changes) > 0: # TODO(cs): currently assumes a single controller (-> single pending state # change) state_change = pending_state_changes[0] next_expected = dag.next_state_change(current_index) original_input_index = dag.get_original_index_for_event(input) if (next_expected is not None and state_change == next_expected.pending_state_change and dag.get_original_index_for_event(next_expected) > original_input_index): raise RuntimeError("State change happened before expected! Causality violated") self.early_state_changes.append(repr(next_expected)) def _check_new_state_changes(self, dag, current_index): ''' If we are blocking controllers, it's bad news bears if an unexpected internal state change occurs. Check if there are any, and ACK them so that the execution can proceed.''' pending_state_changes = self.sync_callback.pending_state_changes() if len(pending_state_changes) > 0: # TODO(cs): currently assumes a single controller (-> single pending state # change) state_change = pending_state_changes[0] next_expected = dag.next_state_change(current_index) if (next_expected is None or state_change != next_expected.pending_state_change): log.info("Unexpected state change. Ack'ing") self.unexpected_state_changes.append(repr(state_change)) self.sync_callback.ack_pending_state_change(state_change)
class Replayer(ControlFlow): ''' Replay events from a trace To set the event scheduling parameters, pass them as keyword args to the constructor of this class, which will pass them on to the EventScheduler object it creates. ''' # Interpolated time parameter. *not* the event scheduling epsilon: time_epsilon_microseconds = 500 def __init__(self, simulation_cfg, superlog_path_or_dag, create_event_scheduler=None, print_buffers=True, wait_on_deterministic_values=False, default_dp_permit=False, fail_to_interactive=False, fail_to_interactive_on_persistent_violations=False, end_in_interactive=False, input_logger=None, allow_unexpected_messages=False, pass_through_whitelisted_messages=True, delay_flow_mods=False, **kwargs): ControlFlow.__init__(self, simulation_cfg) if wait_on_deterministic_values: self.sync_callback = ReplaySyncCallback() else: self.sync_callback = ReplaySyncCallback(self.get_interpolated_time) if type(superlog_path_or_dag) == str: superlog_path = superlog_path_or_dag # The dag is codefied as a list, where each element has # a list of its dependents self.dag = EventDag(log_parser.parse_path(superlog_path)) else: self.dag = superlog_path_or_dag if len(self.dag.events) == 0: raise ValueError("No events to replay!") self.default_dp_permit = default_dp_permit # Set DataplanePermit and DataplaneDrop to passive if permit is set # to default for event in [ e for e in self.dag.events if type(e) in dp_events ]: event.passive = default_dp_permit self.dp_checker = None if default_dp_permit: self.dp_checker = DataplaneChecker(self.dag) self.print_buffers_flag = print_buffers # compute interpolate to time to be just before first event self.compute_interpolated_time(self.dag.events[0]) # String repesentations of unexpected state changes we've passed through, for # statistics purposes. self.unexpected_state_changes = [] self.early_state_changes = [] self.event_scheduler_stats = None self.end_in_interactive = end_in_interactive self.fail_to_interactive = fail_to_interactive self.fail_to_interactive_on_persistent_violations =\ fail_to_interactive_on_persistent_violations self._input_logger = input_logger self.allow_unexpected_messages = allow_unexpected_messages self.pass_through_whitelisted_messages = pass_through_whitelisted_messages # How many logical rounds to peek ahead when deciding if a message is # expected or not. self.expected_message_round_window = 3 # String repesentations of unexpected messages we've passed through, for # statistics purposes. self.passed_unexpected_messages = [] self.delay_flow_mods = delay_flow_mods if self.pass_through_whitelisted_messages: for event in self.dag.events: if hasattr(event, "ignore_whitelisted_packets"): event.ignore_whitelisted_packets = True if create_event_scheduler: self.create_event_scheduler = create_event_scheduler else: self.create_event_scheduler = \ lambda simulation: EventScheduler(simulation, **{ k: v for k,v in kwargs.items() if k in EventScheduler.kwargs }) def _log_input_event(self, event, **kws): if self._input_logger is not None: self._input_logger.log_input_event(event, **kws) def get_interpolated_time(self): ''' During divergence, the controller may ask for the current time more or less times than they did in the original run. We control the time, so we need to give them some answer. The answers we give them should be (i) monotonically increasing, and (ii) between the time of the last recorded ("landmark") event and the next landmark event, and (iii) as close to the recorded times as possible Our temporary solution is to always return the time right before the next landmark ''' # TODO(cs): implement Andi's improved time heuristic return self.interpolated_time def compute_interpolated_time(self, current_event): next_time = current_event.time just_before_micro = next_time.microSeconds - self.time_epsilon_microseconds just_before_micro = max(0, just_before_micro) self.interpolated_time = SyncTime(next_time.seconds, just_before_micro) def increment_round(self): msg.event(color.CYAN + ("Round %d" % self.logical_time) + color.WHITE) def init_results(self, results_dir): if self._input_logger: self._input_logger.open(results_dir) def simulate(self, post_bootstrap_hook=None): ''' Caller *must* call simulation.clean_up() ''' self.simulation = self.simulation_cfg.bootstrap(self.sync_callback) assert(isinstance(self.simulation.patch_panel, BufferedPatchPanel)) # TODO(aw): remove this hack self.simulation.fail_to_interactive = self.fail_to_interactive self.simulation.fail_to_interactive_on_persistent_violations =\ self.fail_to_interactive_on_persistent_violations self.simulation.openflow_buffer.pass_through_whitelisted_messages =\ self.pass_through_whitelisted_messages self.logical_time = 0 if self.delay_flow_mods: for switch in self.simulation.topology.switches: assert(isinstance(switch, FuzzSoftwareSwitch)) switch.use_delayed_commands() self.run_simulation_forward(self.dag, post_bootstrap_hook) if self.print_buffers_flag: self._print_buffers() return self.simulation def _print_buffers(self): log.debug("Pending Message Receives:") for p in self.simulation.openflow_buffer.pending_receives(): log.debug("- %s", p) log.debug("Pending State Changes:") for p in self.sync_callback.pending_state_changes(): log.debug("- %s", p) def run_simulation_forward(self, dag, post_bootstrap_hook=None): event_scheduler = self.create_event_scheduler(self.simulation) event_scheduler.set_input_logger(self._input_logger) self.event_scheduler_stats = event_scheduler.stats if post_bootstrap_hook is not None: post_bootstrap_hook() def interrupt(sgn, frame): msg.interactive("Interrupting replayer, dropping to console (press ^C again to terminate)") signal.signal(signal.SIGINT, self.old_interrupt) self.old_interrupt = None raise KeyboardInterrupt() self.old_interrupt = signal.signal(signal.SIGINT, interrupt) try: for i, event in enumerate(dag.events): try: self.compute_interpolated_time(event) if self.default_dp_permit: self.dp_checker.check_dataplane(i, self.simulation) if isinstance(event, InputEvent): self._check_early_state_changes(dag, i, event) self._check_new_state_changes(dag, i) self._check_unexpected_cp_messages(dag, i) # TODO(cs): quasi race-condition here. If unexpected state change # happens *while* we're waiting for event, we basically have a # deadlock (if controller logging is set to blocking) until the # timeout occurs # TODO(cs): we don't actually allow new internal message events # through.. we only let new state changes through. Should experiment # with whether we would get better fidelity if we let them through. event_scheduler.schedule(event) if self.logical_time != event.round: self.logical_time = event.round self.increment_round() except KeyboardInterrupt: interactive = Interactive(self.simulation_cfg, input_logger=self._input_logger) interactive.simulate(self.simulation, bound_objects=( ('replayer', self), )) self.old_interrupt = signal.signal(signal.SIGINT, interrupt) finally: if self.old_interrupt: signal.signal(signal.SIGINT, self.old_interrupt) msg.event(color.B_BLUE+"Event Stats: %s" % str(event_scheduler.stats)) if self.default_dp_permit: msg.event(color.B_BLUE+"DataplaneDrop Stats: %s" % str(self.dp_checker.stats)) if self.end_in_interactive: interactive = Interactive(self.simulation_cfg, input_logger=self._input_logger) interactive.simulate(self.simulation, bound_objects=( ('replayer', self), )) def _check_early_state_changes(self, dag, current_index, input): ''' Check whether any pending state change that were supposed to come *after* the current input have occured. If so, we have violated causality.''' # Note that we never violate causality due to message sends/receipts, # since we buffer all messages. pending_state_changes = self.sync_callback.pending_state_changes() if len(pending_state_changes) > 0: # TODO(cs): currently assumes a single controller (-> single pending state # change) state_change = pending_state_changes[0] next_expected = dag.next_state_change(current_index) original_input_index = dag.get_original_index_for_event(input) if (next_expected is not None and state_change == next_expected.pending_state_change and dag.get_original_index_for_event(next_expected) > original_input_index): # raise RuntimeError("State change happened before expected! Causality violated") log.warn("State change happened before expected! Causality violated") self.early_state_changes.append(repr(next_expected)) def _check_new_state_changes(self, dag, current_index): ''' If we are blocking controllers, it's bad news bears if an unexpected internal state change occurs. Check if there are any, and ACK them so that the execution can proceed.''' pending_state_changes = self.sync_callback.pending_state_changes() if len(pending_state_changes) > 0: # TODO(cs): currently assumes a single controller (-> single pending state # change) state_change = pending_state_changes[0] next_expected = dag.next_state_change(current_index) if (next_expected is None or state_change != next_expected.pending_state_change): log.info("Unexpected state change. Ack'ing") log_event = ControllerStateChange.from_pending_state_change(state_change) # Monkeypatch a "new internal event" marker to be logged to the JSON trace # (All fields picked up by event.to_json()) log_event.new_internal_event = True log_event.replay_time = SyncTime.now() self._log_input_event(log_event) self.unexpected_state_changes.append(repr(state_change)) self.sync_callback.ack_pending_state_change(state_change) def _check_unexpected_cp_messages(self, dag, current_index): ''' If throughout replay we observe new messages that weren't in the original trace, we (usually) want to let them through. This is especially true for messages that are related to timers, such as LLDP, since timings will often change during replay, and we don't want to unnecessarily change the controller's behavior. ''' if not self.allow_unexpected_messages: return # TODO(cs): evaluate whether _check_unexpected_cp_messages is more or less # effective than whitelisting cp message types (in OpenflowBuffer). # Compare executions with visualization tool. # Currently it appears that this method is too liberal, and ends up # causing timouts as a result of letting messages through. # First, build a set of expected ControlMessageSends/Receives fingerprints # within the next expected_message_round_window rounds. start_round = dag.events[current_index].round expected_receive_fingerprints = set() expected_send_fingerprints = set() for i in xrange(current_index, len(dag.events)): event = dag.events[i] if event.round - start_round > self.expected_message_round_window: break if type(event) == ControlMessageReceive: (_, of_fingerprint, dpid, cid) = event.fingerprint expected_receive_fingerprints.add((of_fingerprint, dpid, cid)) if type(event) == ControlMessageSend: (_, of_fingerprint, dpid, cid) = event.fingerprint expected_send_fingerprints.add((of_fingerprint, dpid, cid)) # Now check pending messages. for expected_fingerprints, messages in [ (expected_receive_fingerprints, self.simulation.openflow_buffer.pending_receives()), (expected_send_fingerprints, self.simulation.openflow_buffer.pending_sends())]: for pending_message in messages: fingerprint = (pending_message.fingerprint, pending_message.dpid, pending_message.controller_id) if fingerprint not in expected_fingerprints: message = self.simulation.openflow_buffer.schedule(pending_message) log.debug("Sending unexpected %s" % message) b64_packet = base64_encode(message) # Monkeypatch a "new internal event" marker to be logged to the JSON trace # (All fields picked up by event.to_json()) event_type = ControlMessageReceive if type(pending_message) == PendingReceive else ControlMessageSend log_event = event_type(pending_message.dpid, pending_message.controller_id, pending_message.fingerprint, b64_packet=b64_packet) log_event.new_internal_event = True log_event.replay_time = SyncTime.now() self.passed_unexpected_messages.append(repr(log_event)) self._log_input_event(log_event)