def receiveUnrecognizedMessage(self, msg, sender):
    # at the moment, we implement all message handling blocking. This is not ideal but simple to get started with. Besides, the caller
    # needs to block anyway. The only reason we implement mechanic as an actor is to distribute them.
    # noinspection PyBroadException
    try:
        self.logger.debug("NodeMechanicActor#receiveMessage(msg = [%s] sender = [%s])", str(type(msg)), str(sender))
        if isinstance(msg, ResetRelativeTime) and self.mechanic:
            self.mechanic.reset_relative_time()
        elif isinstance(msg, thespian.actors.WakeupMessage) and self.mechanic:
            self.mechanic.flush_metrics()
            self.wakeupAfter(METRIC_FLUSH_INTERVAL_SECONDS)
        elif isinstance(msg, StopNodes):
            self.mechanic.stop_engine()
            self.send(sender, NodesStopped())
            self.mechanic = None
        elif isinstance(msg, thespian.actors.ActorExitRequest):
            if self.mechanic:
                self.mechanic.stop_engine()
                self.mechanic = None
    except BaseException as e:
        self.logger.exception("Cannot process message [%s]", msg)
        self.send(getattr(msg, "reply_to", sender), actor.BenchmarkFailure("Error on host %s" % str(self.host), e))
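# The branches above dispatch on plain message classes that must pickle cleanly across
# process boundaries. A minimal, hypothetical sketch of what such message types could
# look like; the real definitions live in Rally's mechanic module and may carry
# additional fields:
class ResetRelativeTime:
    def __init__(self, reset_in_seconds=0):
        self.reset_in_seconds = reset_in_seconds


class StopNodes:
    pass


class NodesStopped:
    # in other variants of this actor (see below), NodesStopped also carries the
    # externalized system metrics as a constructor argument
    pass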
def receiveMsg_ChildActorExited(self, msg, sender):
    if self.is_current_status_expected(["cluster_stopping", "cluster_stopped"]):
        self.logger.info("Child actor exited while engine is stopping or stopped: [%s]", msg)
        return
    failmsg = "Child actor exited with [%s] while in status [%s]." % (msg, self.status)
    self.logger.error(failmsg)
    self.send(self.race_control, actor.BenchmarkFailure(failmsg))
def receiveMsg_StartNodes(self, msg, sender):
    try:
        self.host = msg.ip
        if msg.external:
            self.logger.info("Connecting to externally provisioned nodes on [%s].", msg.ip)
        else:
            self.logger.info("Starting node(s) %s on [%s].", msg.node_ids, msg.ip)

        # Load node-specific configuration
        self.config = config.auto_load_local_config(msg.cfg, additional_sections=[
            # only copy the relevant bits
            "track", "mechanic", "client",
            # allow metrics store to extract race meta-data
            "race", "source"
        ])
        # set root path (normally done by the main entry point)
        self.config.add(config.Scope.application, "node", "rally.root", paths.rally_root())
        if not msg.external:
            self.config.add(config.Scope.benchmark, "provisioning", "node.ip", msg.ip)
            # we need to override the port with the value that the user has specified instead of using the default value (39200)
            self.config.add(config.Scope.benchmark, "provisioning", "node.http.port", msg.port)
            self.config.add(config.Scope.benchmark, "provisioning", "node.ids", msg.node_ids)

        cls = metrics.metrics_store_class(self.config)
        self.metrics_store = cls(self.config)
        self.metrics_store.open(ctx=msg.open_metrics_context)
        # avoid follow-up errors in case we receive an unexpected ActorExitRequest due to an early failure in a parent actor.
        self.metrics_store.lap = 0

        self.mechanic = create(self.config, self.metrics_store, msg.all_node_ips, msg.cluster_settings,
                               msg.sources, msg.build, msg.distribution, msg.external, msg.docker)
        nodes = self.mechanic.start_engine()
        self.running = True
        self.send(getattr(msg, "reply_to", sender),
                  NodesStarted([NodeMetaInfo(node) for node in nodes], self.metrics_store.meta_info))
    except Exception:
        self.logger.exception("Cannot process message [%s]", msg)
        # avoid "can't pickle traceback objects"
        import traceback
        ex_type, ex_value, ex_traceback = sys.exc_info()
        self.send(getattr(msg, "reply_to", sender), actor.BenchmarkFailure(ex_value, traceback.format_exc()))
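# A hedged usage sketch: how a coordinator could send StartNodes to this actor and block
# on the reply via Thespian's ask(). The StartNodes keyword arguments below are
# illustrative assumptions; the real message carries the fields the handler above reads
# (cfg, ip, port, node_ids, external, open_metrics_context, ...).
import thespian.actors


def start_nodes_and_wait(cfg, open_metrics_context):
    system = thespian.actors.ActorSystem()
    node_mechanic = system.createActor(NodeMechanicActor)
    msg = StartNodes(cfg=cfg, open_metrics_context=open_metrics_context, ip="127.0.0.1", port=39200,
                     node_ids=["rally-node-0"], all_node_ips=["127.0.0.1"], cluster_settings={},
                     sources=False, build=False, distribution=True, external=False, docker=False)
    # ask() blocks until the actor replies; the handler answers with either
    # NodesStarted or actor.BenchmarkFailure
    reply = system.ask(node_mechanic, msg, 600)
    if isinstance(reply, actor.BenchmarkFailure):
        raise exceptions.RallyError("Could not start nodes: %s" % reply.message)
    return reply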
def receiveMsg_PoisonMessage(self, msg, sender):
    self.logger.info("MechanicActor#receiveMessage poison(msg = [%s] sender = [%s])", str(msg.poisonMessage), str(sender))
    # something went wrong with a child actor (or another actor with which we have communicated)
    if isinstance(msg.poisonMessage, StartEngine):
        failmsg = "Could not start benchmark candidate. Are Rally daemons on all targeted machines running?"
    else:
        failmsg = msg.details
    self.logger.error(failmsg)
    self.send(self.race_control, actor.BenchmarkFailure(failmsg))
def receiveUnrecognizedMessage(self, msg, sender):
    # at the moment, we implement all message handling blocking. This is not ideal but simple to get started with. Besides, the caller
    # needs to block anyway. The only reason we implement mechanic as an actor is to distribute them.
    # noinspection PyBroadException
    try:
        self.logger.debug("NodeMechanicActor#receiveMessage(msg = [%s] sender = [%s])", str(type(msg)), str(sender))
        if isinstance(msg, ResetRelativeTime):
            self.logger.info("Resetting relative time of system metrics store on host [%s].", self.host)
            self.metrics_store.reset_relative_time()
        elif isinstance(msg, OnBenchmarkStart):
            self.metrics_store.lap = msg.lap
            self.mechanic.on_benchmark_start()
            self.wakeupAfter(METRIC_FLUSH_INTERVAL_SECONDS)
            self.send(sender, BenchmarkStarted())
        elif isinstance(msg, thespian.actors.WakeupMessage):
            if self.running:
                self.logger.debug("Flushing system metrics store on host [%s].", self.host)
                self.metrics_store.flush(refresh=False)
                self.wakeupAfter(METRIC_FLUSH_INTERVAL_SECONDS)
        elif isinstance(msg, OnBenchmarkStop):
            self.mechanic.on_benchmark_stop()
            self.metrics_store.flush(refresh=False)
            # clear metrics store data to not send duplicate system metrics data
            self.send(sender, BenchmarkStopped(self.metrics_store.to_externalizable(clear=True)))
        elif isinstance(msg, StopNodes):
            self.logger.info("Stopping nodes %s.", self.mechanic.nodes)
            self.mechanic.stop_engine()
            self.send(sender, NodesStopped(self.metrics_store.to_externalizable()))
            # clear all state as the mechanic might get reused later
            self.metrics_store.close()
            self.running = False
            self.config = None
            self.mechanic = None
            self.metrics_store = None
        elif isinstance(msg, thespian.actors.ActorExitRequest):
            if self.running:
                self.logger.info("Stopping nodes %s (due to ActorExitRequest)", self.mechanic.nodes)
                self.mechanic.stop_engine()
                self.running = False
    except BaseException as e:
        self.running = False
        self.logger.exception("Cannot process message [%s]", msg)
        self.send(getattr(msg, "reply_to", sender), actor.BenchmarkFailure("Error on host %s" % str(self.host), e))
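# The WakeupMessage branch above relies on Thespian's timer facility: wakeupAfter()
# schedules a WakeupMessage back to this actor after the given interval, and the handler
# re-arms the timer, yielding periodic metric flushes. A self-contained sketch of that
# pattern (the interval constant mirrors the one used above; 30 is an assumed value):
import thespian.actors

METRIC_FLUSH_INTERVAL_SECONDS = 30


class PeriodicFlusher(thespian.actors.Actor):
    def __init__(self):
        super().__init__()
        self.running = False

    def receiveMessage(self, msg, sender):
        if msg == "start":
            self.running = True
            self.wakeupAfter(METRIC_FLUSH_INTERVAL_SECONDS)
        elif isinstance(msg, thespian.actors.WakeupMessage):
            if self.running:
                # flush metrics here, then re-arm the timer to keep the cycle going
                self.wakeupAfter(METRIC_FLUSH_INTERVAL_SECONDS)
        elif msg == "stop":
            self.running = False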
def receiveMsg_StartNodes(self, msg, sender):
    try:
        self.host = msg.ip
        if msg.external:
            self.logger.info("Connecting to externally provisioned nodes on [%s].", msg.ip)
        else:
            self.logger.info("Starting node(s) %s on [%s].", msg.node_ids, msg.ip)

        # Load node-specific configuration
        cfg = config.auto_load_local_config(
            msg.cfg,
            additional_sections=[
                # only copy the relevant bits
                "track",
                "mechanic",
                "client",
                "telemetry",
                # allow metrics store to extract race meta-data
                "race",
                "source",
            ],
        )
        # set root path (normally done by the main entry point)
        cfg.add(config.Scope.application, "node", "rally.root", paths.rally_root())
        if not msg.external:
            cfg.add(config.Scope.benchmark, "provisioning", "node.ids", msg.node_ids)

        cls = metrics.metrics_store_class(cfg)
        metrics_store = cls(cfg)
        metrics_store.open(ctx=msg.open_metrics_context)
        # avoid follow-up errors in case we receive an unexpected ActorExitRequest due to an early failure in a parent actor.

        self.mechanic = create(
            cfg,
            metrics_store,
            msg.ip,
            msg.port,
            msg.all_node_ips,
            msg.all_node_ids,
            msg.sources,
            msg.distribution,
            msg.external,
            msg.docker,
        )
        self.mechanic.start_engine()
        self.wakeupAfter(METRIC_FLUSH_INTERVAL_SECONDS)
        self.send(getattr(msg, "reply_to", sender), NodesStarted())
    except Exception:
        self.logger.exception("Cannot process message [%s]", msg)
        # avoid "can't pickle traceback objects"
        _, ex_value, _ = sys.exc_info()
        self.send(getattr(msg, "reply_to", sender), actor.BenchmarkFailure(ex_value, traceback.format_exc()))
def on_all_nodes_started(self):
    self.cluster_launcher = launcher.ClusterLauncher(self.cfg, self.metrics_store)
    # Workaround because we could raise a LaunchError here and thespian will attempt to retry a failed message.
    # In that case, we will get a followup RallyAssertionError because on the second attempt, Rally will check
    # the status which is now "nodes_started" but we expected the status to be "nodes_starting" previously.
    try:
        self.cluster = self.cluster_launcher.start()
    except BaseException as e:
        self.send(self.race_control, actor.BenchmarkFailure("Could not launch cluster", e))
    else:
        # push down all meta data again
        self.send_to_children_and_transition(self.myAddress, ApplyMetricsMetaInfo(self.metrics_store.meta_info),
                                             "nodes_started", "apply_meta_info")
def receiveMsg_ActorSystemConventionUpdate(self, convmsg, sender):
    if not convmsg.remoteAdded:
        self.logger.warning("Remote Rally node [%s] exited during NodeMechanicActor startup process.",
                            convmsg.remoteAdminAddress)
        self.send(self.start_sender,
                  actor.BenchmarkFailure("Remote Rally node [%s] has been shut down prematurely." % convmsg.remoteAdminAddress))
    else:
        remote_ip = convmsg.remoteCapabilities.get("ip", None)
        self.logger.info("Remote Rally node [%s] has started.", remote_ip)
        for eachmsg in self.remotes[remote_ip]:
            self.pending.append((self.createActor(NodeMechanicActor, targetActorRequirements={"ip": remote_ip}), eachmsg))
        if remote_ip in self.remotes:
            del self.remotes[remote_ip]
        if not self.remotes:
            # Notifications are no longer needed
            self.notifyOnSystemRegistrationChanges(False)
        self.send_all_pending()
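# receiveMsg_ActorSystemConventionUpdate only fires if this actor subscribed to
# convention notifications via notifyOnSystemRegistrationChanges(True). The handler also
# depends on remote actor systems advertising an "ip" capability, so that
# targetActorRequirements={"ip": remote_ip} places the NodeMechanicActor on the right
# machine. A hedged sketch of the remote bootstrap side; the base name and capability
# keys follow Thespian's multiprocTCPBase conventions (1900 is Thespian's default admin
# port), but the exact values Rally uses are assumptions:
import thespian.actors


def bootstrap_remote_actor_system(local_ip, coordinator_ip):
    return thespian.actors.ActorSystem(
        "multiprocTCPBase",
        capabilities={
            "ip": local_ip,
            # join the coordinator's convention so it receives a registration update
            "Convention Address.IPv4": "%s:1900" % coordinator_ip,
        },
    )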
def receiveMsg_PoisonMessage(self, msg, sender):
    if sender != self.myAddress:
        self.send(sender, actor.BenchmarkFailure(msg.details))
def receiveMsg_PoisonMessage(self, msg, sender):
    self.send(self.start_sender, actor.BenchmarkFailure(msg.details))
def receiveMessage(self, msg, sender):
    # at the moment, we implement all message handling blocking. This is not ideal but simple to get started with. Besides, the caller
    # needs to block anyway. The only reason we implement mechanic as an actor is to distribute them.
    # noinspection PyBroadException
    try:
        logger.debug("NodeMechanicActor#receiveMessage(msg = [%s] sender = [%s])" % (str(type(msg)), str(sender)))
        if isinstance(msg, StartNodes):
            self.host = msg.ip
            if msg.external:
                logger.info("Connecting to externally provisioned nodes on [%s]." % msg.ip)
            else:
                logger.info("Starting node(s) %s on [%s]." % (msg.node_ids, msg.ip))

            # Load node-specific configuration
            self.config = config.auto_load_local_config(msg.cfg, additional_sections=[
                # only copy the relevant bits
                "track", "mechanic", "client",
                # allow metrics store to extract race meta-data
                "race", "source"
            ])
            # set root path (normally done by the main entry point)
            self.config.add(config.Scope.application, "node", "rally.root", paths.rally_root())
            if not msg.external:
                self.config.add(config.Scope.benchmark, "provisioning", "node.ip", msg.ip)
                # we need to override the port with the value that the user has specified instead of using the default value (39200)
                self.config.add(config.Scope.benchmark, "provisioning", "node.http.port", msg.port)
                self.config.add(config.Scope.benchmark, "provisioning", "node.ids", msg.node_ids)

            self.metrics_store = metrics.InMemoryMetricsStore(self.config)
            self.metrics_store.open(ctx=msg.open_metrics_context)
            # avoid follow-up errors in case we receive an unexpected ActorExitRequest due to an early failure in a parent actor.
            self.metrics_store.lap = 0

            self.mechanic = create(self.config, self.metrics_store, msg.all_node_ips, msg.cluster_settings,
                                   msg.sources, msg.build, msg.distribution, msg.external, msg.docker)
            nodes = self.mechanic.start_engine()
            self.running = True
            self.send(sender, NodesStarted([NodeMetaInfo(node) for node in nodes], self.metrics_store.meta_info))
        elif isinstance(msg, ApplyMetricsMetaInfo):
            self.metrics_store.merge_meta_info(msg.meta_info)
            self.send(sender, MetricsMetaInfoApplied())
        elif isinstance(msg, ResetRelativeTime):
            logger.info("Resetting relative time of system metrics store on host [%s]." % self.host)
            self.metrics_store.reset_relative_time()
        elif isinstance(msg, OnBenchmarkStart):
            self.metrics_store.lap = msg.lap
            self.mechanic.on_benchmark_start()
            self.send(sender, BenchmarkStarted())
        elif isinstance(msg, OnBenchmarkStop):
            self.mechanic.on_benchmark_stop()
            # clear metrics store data to not send duplicate system metrics data
            self.send(sender, BenchmarkStopped(self.metrics_store.to_externalizable(clear=True)))
        elif isinstance(msg, StopNodes):
            logger.info("Stopping nodes %s." % self.mechanic.nodes)
            self.mechanic.stop_engine()
            self.send(sender, NodesStopped(self.metrics_store.to_externalizable()))
            # clear all state as the mechanic might get reused later
            self.running = False
            self.config = None
            self.mechanic = None
            self.metrics_store = None
        elif isinstance(msg, thespian.actors.ActorExitRequest):
            if self.running:
                logger.info("Stopping nodes %s (due to ActorExitRequest)" % self.mechanic.nodes)
                self.mechanic.stop_engine()
                self.running = False
    except BaseException:
        self.running = False
        logger.exception("Cannot process message [%s]" % msg)
        # avoid "can't pickle traceback objects"
        import traceback
        ex_type, ex_value, ex_traceback = sys.exc_info()
        self.send(sender, actor.BenchmarkFailure(ex_value, traceback.format_exc()))
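# The handler above uses the older single-entry-point style: one receiveMessage() with a
# long isinstance() chain. The snippets earlier in this section use Thespian's
# ActorTypeDispatcher instead, which routes each message to a receiveMsg_<ClassName>()
# method and falls back to receiveUnrecognizedMessage(). A minimal sketch of the
# dispatcher style for comparison (SketchActor and its reply are illustrative):
import thespian.actors


class SketchActor(thespian.actors.ActorTypeDispatcher):
    def receiveMsg_StartNodes(self, msg, sender):
        # invoked only for StartNodes instances
        self.send(sender, NodesStarted())

    def receiveUnrecognizedMessage(self, msg, sender):
        # invoked for any message type without a dedicated receiveMsg_ handler
        pass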
def receiveMessage(self, msg, sender):
    try:
        logger.info("MechanicActor#receiveMessage(msg = [%s] sender = [%s])" % (str(type(msg)), str(sender)))
        if isinstance(msg, StartEngine):
            self.on_start_engine(msg, sender)
        elif isinstance(msg, NodesStarted):
            self.metrics_store.merge_meta_info(msg.system_meta_info)
            self.transition_when_all_children_responded(sender, msg, "starting", "nodes_started", self.on_all_nodes_started)
        elif isinstance(msg, MetricsMetaInfoApplied):
            self.transition_when_all_children_responded(sender, msg, "apply_meta_info", "cluster_started", self.on_cluster_started)
        elif isinstance(msg, OnBenchmarkStart):
            self.metrics_store.lap = msg.lap
            self.cluster.on_benchmark_start()
            # in the first lap, we are in state "cluster_started", after that in "benchmark_stopped"
            self.send_to_children_and_transition(sender, msg, ["cluster_started", "benchmark_stopped"], "benchmark_starting")
        elif isinstance(msg, BenchmarkStarted):
            self.transition_when_all_children_responded(sender, msg, "benchmark_starting", "benchmark_started",
                                                        self.on_benchmark_started)
        elif isinstance(msg, ResetRelativeTime):
            if msg.reset_in_seconds > 0:
                self.wakeupAfter(msg.reset_in_seconds)
            else:
                self.reset_relative_time()
        elif isinstance(msg, thespian.actors.WakeupMessage):
            self.reset_relative_time()
        elif isinstance(msg, actor.BenchmarkFailure):
            self.send(self.race_control, msg)
        elif isinstance(msg, OnBenchmarkStop):
            self.send_to_children_and_transition(sender, msg, "benchmark_started", "benchmark_stopping")
        elif isinstance(msg, BenchmarkStopped):
            self.metrics_store.bulk_add(msg.system_metrics)
            self.transition_when_all_children_responded(sender, msg, "benchmark_stopping", "benchmark_stopped",
                                                        self.on_benchmark_stopped)
        elif isinstance(msg, StopEngine):
            # detach from cluster and gather all system metrics
            self.cluster_launcher.stop(self.cluster)
            # we might have experienced a launch error or the user has cancelled the benchmark. Hence we need to allow to stop the
            # cluster from various states and we don't check here for a specific one.
            self.send_to_children_and_transition(sender, StopNodes(), [], "cluster_stopping")
        elif isinstance(msg, NodesStopped):
            self.metrics_store.bulk_add(msg.system_metrics)
            self.transition_when_all_children_responded(sender, msg, "cluster_stopping", "cluster_stopped",
                                                        self.on_all_nodes_stopped)
        elif isinstance(msg, thespian.actors.ActorExitRequest):
            # due to early termination by race control. If it's self-initiated we already took care of the rest.
            if sender != self.myAddress:
                self.send_to_children_and_transition(self.myAddress, msg, expected_status=None, new_status="cluster_stopping")
        elif isinstance(msg, thespian.actors.ChildActorExited):
            if self.is_current_status_expected("cluster_stopping"):
                logger.info("Child actor exited while engine is stopping: [%s]" % msg)
            else:
                raise exceptions.RallyError("Child actor exited with [%s] while in status [%s]." % (msg, self.status))
        elif isinstance(msg, thespian.actors.PoisonMessage):
            # something went wrong with a child actor
            if isinstance(msg.poisonMessage, StartEngine):
                raise exceptions.LaunchError("Could not start benchmark candidate. Are Rally daemons on all targeted machines running?")
            else:
                logger.error("[%s] sent to a child actor has resulted in PoisonMessage" % str(msg.poisonMessage))
                raise exceptions.RallyError("Could not communicate with benchmark candidate (unknown reason)")
    except BaseException:
        # usually, we'll notify the sender but in case a child sent something that caused an exception we'd rather
        # have it bubble up to race control. Otherwise, we could play ping-pong with our child actor.
        recipient = self.race_control if sender in self.children else sender
        logger.exception("Cannot process message [%s]. Notifying [%s]." % (msg, recipient))
        ex_type, ex_value, ex_traceback = sys.exc_info()
        # avoid "can't pickle traceback objects"
        import traceback
        self.send(recipient, actor.BenchmarkFailure("Could not execute command (%s)" % ex_value, traceback.format_exc()))
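# transition_when_all_children_responded() and send_to_children_and_transition() form a
# small state machine on top of the actor model: the coordinator broadcasts a message,
# then only transitions to the next status once every child has replied. The real
# helpers live in Rally's actor module; below is a hypothetical sketch of the barrier
# logic, assuming self.children holds the child addresses and self.received_responses
# collects replies:
def transition_when_all_children_responded(self, sender, msg, expected_status, new_status, transition):
    if not self.is_current_status_expected(expected_status):
        raise exceptions.RallyError("Received [%s] in status [%s] but expected [%s]." % (msg, self.status, expected_status))
    self.received_responses.append(sender)
    if len(self.received_responses) == len(self.children):
        # all children have acknowledged; advance the status and run the callback
        self.received_responses = []
        self.status = new_status
        transition()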
def receiveMessage(self, msg, sender):
    try:
        logger.info("MechanicActor#receiveMessage(msg = [%s] sender = [%s])" % (str(type(msg)), str(sender)))
        if isinstance(msg, StartEngine):
            self.on_start_engine(msg, sender)
        elif isinstance(msg, NodesStarted):
            self.metrics_store.merge_meta_info(msg.system_meta_info)
            self.transition_when_all_children_responded(sender, msg, "starting", "nodes_started", self.on_all_nodes_started)
        elif isinstance(msg, MetricsMetaInfoApplied):
            self.transition_when_all_children_responded(sender, msg, "apply_meta_info", "cluster_started", self.on_cluster_started)
        elif isinstance(msg, OnBenchmarkStart):
            self.metrics_store.lap = msg.lap
            # in the first lap, we are in state "cluster_started", after that in "benchmark_stopped"
            self.send_to_children_and_transition(sender, msg, ["cluster_started", "benchmark_stopped"], "benchmark_starting")
        elif isinstance(msg, BenchmarkStarted):
            self.transition_when_all_children_responded(sender, msg, "benchmark_starting", "benchmark_started",
                                                        self.on_benchmark_started)
        elif isinstance(msg, ResetRelativeTime):
            if msg.reset_in_seconds > 0:
                self.wakeupAfter(msg.reset_in_seconds)
            else:
                self.reset_relative_time()
        elif isinstance(msg, thespian.actors.WakeupMessage):
            self.reset_relative_time()
        elif isinstance(msg, actor.BenchmarkFailure):
            self.send(self.race_control, msg)
        elif isinstance(msg, OnBenchmarkStop):
            self.send_to_children_and_transition(sender, msg, "benchmark_started", "benchmark_stopping")
        elif isinstance(msg, BenchmarkStopped):
            self.metrics_store.bulk_add(msg.system_metrics)
            self.transition_when_all_children_responded(sender, msg, "benchmark_stopping", "benchmark_stopped",
                                                        self.on_benchmark_stopped)
        elif isinstance(msg, StopEngine):
            # detach from cluster and gather all system metrics
            self.cluster_launcher.stop(self.cluster)
            # we might have experienced a launch error or the user has cancelled the benchmark. Hence we need to allow to stop the
            # cluster from various states and we don't check here for a specific one.
            self.send_to_children_and_transition(sender, StopNodes(), [], "cluster_stopping")
        elif isinstance(msg, NodesStopped):
            self.metrics_store.bulk_add(msg.system_metrics)
            self.transition_when_all_children_responded(sender, msg, "cluster_stopping", "cluster_stopped",
                                                        self.on_all_nodes_stopped)
        elif isinstance(msg, thespian.actors.ActorExitRequest):
            # due to early termination by race control. If it's self-initiated we already took care of the rest.
            if sender != self.myAddress:
                self.send_to_children_and_transition(self.myAddress, msg, expected_status=None, new_status="cluster_stopping")
        elif isinstance(msg, thespian.actors.ChildActorExited):
            if self.is_current_status_expected(["cluster_stopping", "cluster_stopped"]):
                logger.info("Child actor exited while engine is stopping or stopped: [%s]" % msg)
            else:
                raise exceptions.RallyError("Child actor exited with [%s] while in status [%s]." % (msg, self.status))
        elif isinstance(msg, thespian.actors.PoisonMessage):
            # something went wrong with a child actor
            if isinstance(msg.poisonMessage, StartEngine):
                raise exceptions.LaunchError("Could not start benchmark candidate. Are Rally daemons on all targeted machines running?")
            else:
                logger.error("[%s] sent to a child actor has resulted in PoisonMessage" % str(msg.poisonMessage))
                raise exceptions.RallyError("Could not communicate with benchmark candidate (unknown reason)")
        else:
            logger.info("MechanicActor received unknown message [%s] (ignoring)." % (str(msg)))
    except BaseException as e:
        logger.exception("Cannot process message")
        logger.error("Failed message details: [%s]. Notifying [%s]." % (msg, self.race_control))
        self.send(self.race_control, actor.BenchmarkFailure("Error in Elasticsearch cluster coordinator", e))
def receiveMessage(self, msg, sender):
    try:
        logger.debug("BenchmarkActor#receiveMessage(msg = [%s] sender = [%s])" % (str(type(msg)), str(sender)))
        if isinstance(msg, Setup):
            self.start_sender = sender
            self.setup(msg)
        elif isinstance(msg, mechanic.EngineStarted):
            logger.info("Mechanic has started engine successfully.")
            self.metrics_store.meta_info = msg.system_meta_info
            cluster = msg.cluster_meta_info
            self.race.cluster = cluster
            console.info("Racing on track [%s], challenge [%s] and car %s\n"
                         % (self.race.track_name, self.race.challenge_name, self.race.car))
            # start running - we assume that each race has at least one lap
            self.run()
        elif isinstance(msg, driver.TaskFinished):
            logger.info("Task has finished.")
            logger.info("Bulk adding request metrics to metrics store.")
            self.metrics_store.bulk_add(msg.metrics)
            # We choose *NOT* to reset our own metrics store's timer as this one is only used to collect complete metrics records from
            # other stores (used by driver and mechanic). Hence there is no need to reset the timer in our own metrics store.
            self.send(self.mechanic, mechanic.ResetRelativeTime(msg.next_task_scheduled_in))
        elif isinstance(msg, actor.BenchmarkCancelled):
            self.cancelled = True
            # even notify the start sender if it is the originator. The reason is that we call #ask() which waits for a reply.
            # We also need to ask in order to avoid races between this notification and the following ActorExitRequest.
            self.send(self.start_sender, msg)
        elif isinstance(msg, actor.BenchmarkFailure):
            logger.info("Received a benchmark failure from [%s] and will forward it now." % sender)
            self.error = True
            self.send(self.start_sender, msg)
        elif isinstance(msg, driver.BenchmarkComplete):
            logger.info("Benchmark is complete.")
            logger.info("Bulk adding request metrics to metrics store.")
            self.metrics_store.bulk_add(msg.metrics)
            self.send(self.main_driver, thespian.actors.ActorExitRequest())
            self.main_driver = None
            self.send(self.mechanic, mechanic.OnBenchmarkStop())
        elif isinstance(msg, mechanic.BenchmarkStopped):
            logger.info("Bulk adding system metrics to metrics store.")
            self.metrics_store.bulk_add(msg.system_metrics)
            logger.info("Flushing metrics data...")
            self.metrics_store.flush()
            logger.info("Flushing done")
            self.lap_counter.after_lap()
            if self.lap_counter.has_more_laps():
                self.run()
            else:
                self.teardown()
        elif isinstance(msg, mechanic.EngineStopped):
            logger.info("Mechanic has stopped engine successfully.")
            logger.info("Bulk adding system metrics to metrics store.")
            self.metrics_store.bulk_add(msg.system_metrics)
            self.metrics_store.flush()
            if not self.cancelled and not self.error:
                final_results = reporter.calculate_results(self.metrics_store, self.race)
                self.race.add_final_results(final_results)
                reporter.summarize(self.race, self.cfg)
                self.race_store.store_race(self.race)
            else:
                logger.info("Suppressing output of summary report. Cancelled = [%r], Error = [%r]." % (self.cancelled, self.error))
            self.metrics_store.close()
            self.send(self.start_sender, Success())
        elif isinstance(msg, thespian.actors.ActorExitRequest):
            if self.mechanic:
                self.send(self.mechanic, msg)
                self.mechanic = None
            if self.main_driver:
                self.send(self.main_driver, msg)
                self.main_driver = None
        else:
            logger.info("BenchmarkActor received unknown message [%s] (ignoring)." % (str(msg)))
    except BaseException as e:
        self.error = True
        logger.exception("BenchmarkActor encountered a fatal exception. Shutting down.")
        self.send(self.start_sender, actor.BenchmarkFailure("Could not execute benchmark", e))
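# The recurring "avoid can't pickle traceback objects" workaround in these handlers
# reflects a constraint of distributed actors: messages are pickled across process
# boundaries, and traceback objects cannot be pickled. A minimal sketch of the resulting
# pattern, capturing the traceback as a plain string before it leaves the except block
# (to_picklable_failure is an illustrative helper, not part of Rally):
import sys
import traceback


def to_picklable_failure(context):
    # must be called from inside an active except block
    _, ex_value, _ = sys.exc_info()
    # both the exception summary and the formatted traceback are plain strings,
    # so the resulting message pickles cleanly
    return actor.BenchmarkFailure("%s (%s)" % (context, ex_value), traceback.format_exc())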