def stop(raise_errors=True):
    """Shut down a locally running actor system and wait for its termination.

    :param raise_errors: If ``True`` (the default), re-raise any shutdown error
        and exit with an error code when no actor system is running at all.
    """
    if not actor.actor_system_already_running():
        if raise_errors:
            console.error("Could not shut down actor system: Actor system is not running.")
            sys.exit(1)
        return
    # noinspection PyBroadException
    try:
        # TheSpian writes the following warning upon start (at least) on Mac OS X:
        #
        # WARNING:root:Unable to get address info for address 103.1.168.192.in-addr.arpa (AddressFamily.AF_INET,\
        # SocketKind.SOCK_DGRAM, 17, 0): <class 'socket.gaierror'> [Errno 8] nodename nor servname provided, or not known
        #
        # Therefore, we will not show warnings but only errors.
        logging.basicConfig(level=logging.ERROR)
        system_to_stop = actor.bootstrap_actor_system(try_join=True)
        system_to_stop.shutdown()
        # poll until the (TCP-based) actor system reports that it is gone
        console.info("Shutting down actor system.", end="", flush=True)
        while actor.actor_system_already_running():
            console.println(".", end="", flush=True)
            time.sleep(1)
        console.println(" [OK]")
    except BaseException:
        console.error("Could not shut down actor system.")
        if raise_errors:
            # raise again so user can see the error
            raise
def stop(raise_errors=True):
    """Shut down the locally reachable actor system, if any.

    :param raise_errors: Propagate shutdown failures to the caller and treat a
        missing actor system as a fatal error (default: ``True``).
    """
    if actor.actor_system_already_running():
        try:
            # TheSpian writes the following warning upon start (at least) on Mac OS X:
            #
            # WARNING:root:Unable to get address info for address 103.1.168.192.in-addr.arpa (AddressFamily.AF_INET,\
            # SocketKind.SOCK_DGRAM, 17, 0): <class 'socket.gaierror'> [Errno 8] nodename nor servname provided, or not known
            #
            # Therefore, we will not show warnings but only errors.
            logging.basicConfig(level=logging.ERROR)
            actor.bootstrap_actor_system(try_join=True).shutdown()
            # Busy-wait (printing progress dots) until the system has actually terminated.
            console.info("Shutting down actor system.", end="", flush=True)
            while actor.actor_system_already_running():
                console.println(".", end="", flush=True)
                time.sleep(1)
            console.println(" [OK]")
        except BaseException:
            console.error("Could not shut down actor system.")
            if raise_errors:
                # surface the root cause to the user
                raise
    elif raise_errors:
        console.error("Could not shut down actor system: Actor system is not running.")
        sys.exit(1)
def on_start_engine(self, msg, sender):
    """Create one mechanic actor per target host and forward the start message to each.

    :param msg: A start-engine message providing ``cfg``, ``open_metrics_context``,
        ``external`` and a ``for_nodes`` factory for per-node start messages.
    :param sender: The race control actor; remembered so responses can be routed back.
    :raises exceptions.LaunchError: If no target hosts are configured.
    :raises exceptions.SystemSetupError: If a remote host is targeted but remote
        benchmarking is not supported (no external Rally daemon).
    """
    logger.info("Received signal from race control to start engine.")
    self.race_control = sender
    self.cfg = msg.cfg
    # the concrete metrics store implementation is chosen based on configuration
    cls = metrics.metrics_store_class(self.cfg)
    self.metrics_store = cls(self.cfg)
    self.metrics_store.open(ctx=msg.open_metrics_context)
    # In our startup procedure we first create all mechanics. Only if this succeeds we'll continue.
    mechanics_and_start_message = []
    hosts = self.cfg.opts("client", "hosts")
    if len(hosts) == 0:
        raise exceptions.LaunchError("No target hosts are configured.")
    if msg.external:
        logger.info("Cluster will not be provisioned by Rally.")
        # just create one actor for this special case and run it on the coordinator node (i.e. here)
        m = self.createActor(NodeMechanicActor,
                             #globalName="/rally/mechanic/worker/external",
                             targetActorRequirements={"coordinator": True})
        self.children.append(m)
        mechanics_and_start_message.append((m, msg.for_nodes(ip=hosts)))
    else:
        logger.info("Cluster consisting of %s will be provisioned by Rally." % hosts)
        all_ips_and_ports = to_ip_port(hosts)
        all_node_ips = extract_all_node_ips(all_ips_and_ports)
        # one mechanic actor per physical host; nodes_by_host groups nodes accordingly
        for ip_port, nodes in nodes_by_host(all_ips_and_ports).items():
            ip, port = ip_port
            if ip == "127.0.0.1":
                # local host: run the mechanic on the coordinator node itself
                m = self.createActor(NodeMechanicActor,
                                     #globalName="/rally/mechanic/worker/localhost",
                                     targetActorRequirements={"coordinator": True})
                self.children.append(m)
                mechanics_and_start_message.append((m, msg.for_nodes(all_node_ips, ip, port, nodes)))
            else:
                if self.cfg.opts("system", "remote.benchmarking.supported"):
                    logger.info("Benchmarking against %s with external Rally daemon." % hosts)
                else:
                    logger.error("User tried to benchmark against %s but no external Rally daemon has been started." % hosts)
                    raise exceptions.SystemSetupError("To benchmark remote hosts (e.g. %s) you need to start the Rally daemon "
                                                      "on each machine including this one." % ip)
                already_running = actor.actor_system_already_running(ip=ip)
                logger.info("Actor system on [%s] already running? [%s]" % (ip, str(already_running)))
                # poll (with progress dots) until the remote Rally daemon is reachable
                if not already_running:
                    console.println("Waiting for Rally daemon on [%s] " % ip, end="", flush=True)
                while not actor.actor_system_already_running(ip=ip):
                    console.println(".", end="", flush=True)
                    time.sleep(3)
                if not already_running:
                    console.println(" [OK]")
                # remote host: require the actor to be placed on that machine
                m = self.createActor(NodeMechanicActor,
                                     #globalName="/rally/mechanic/worker/%s" % ip,
                                     targetActorRequirements={"ip": ip})
                mechanics_and_start_message.append((m, msg.for_nodes(all_node_ips, ip, port, nodes)))
                self.children.append(m)
    self.status = "starting"
    self.received_responses = []
    # only now that all mechanics exist do we actually kick them off
    for mechanic_actor, start_message in mechanics_and_start_message:
        self.send(mechanic_actor, start_message)
def on_start_engine(self, msg, sender):
    """Create one mechanic actor per target host and forward the start message to each.

    :param msg: A start-engine message providing ``cfg``, ``open_metrics_context``,
        ``external`` and a ``for_nodes`` factory for per-node start messages.
    :param sender: The race control actor; remembered so responses can be routed back.
    :raises exceptions.LaunchError: If no target hosts are configured.
    :raises exceptions.SystemSetupError: If a remote host is targeted but remote
        benchmarking is not supported (no external Rally daemon).
    """
    logger.info("Received signal from race control to start engine.")
    self.race_control = sender
    self.cfg = msg.cfg
    # metrics are buffered in memory here (presumably merged/persisted elsewhere — TODO confirm)
    self.metrics_store = metrics.InMemoryMetricsStore(self.cfg)
    self.metrics_store.open(ctx=msg.open_metrics_context)
    # In our startup procedure we first create all mechanics. Only if this succeeds we'll continue.
    mechanics_and_start_message = []
    hosts = self.cfg.opts("client", "hosts")
    if len(hosts) == 0:
        raise exceptions.LaunchError("No target hosts are configured.")
    if msg.external:
        logger.info("Cluster will not be provisioned by Rally.")
        # just create one actor for this special case and run it on the coordinator node (i.e. here)
        m = self.createActor(NodeMechanicActor,
                             #globalName="/rally/mechanic/worker/external",
                             targetActorRequirements={"coordinator": True})
        self.children.append(m)
        mechanics_and_start_message.append((m, msg.for_nodes(ip=hosts)))
    else:
        logger.info("Cluster consisting of %s will be provisioned by Rally." % hosts)
        all_ips_and_ports = to_ip_port(hosts)
        all_node_ips = extract_all_node_ips(all_ips_and_ports)
        # one mechanic actor per physical host; nodes_by_host groups nodes accordingly
        for ip_port, nodes in nodes_by_host(all_ips_and_ports).items():
            ip, port = ip_port
            if ip == "127.0.0.1":
                # local host: run the mechanic on the coordinator node itself
                m = self.createActor(NodeMechanicActor,
                                     #globalName="/rally/mechanic/worker/localhost",
                                     targetActorRequirements={"coordinator": True})
                self.children.append(m)
                mechanics_and_start_message.append((m, msg.for_nodes(all_node_ips, ip, port, nodes)))
            else:
                if self.cfg.opts("system", "remote.benchmarking.supported"):
                    logger.info("Benchmarking against %s with external Rally daemon." % hosts)
                else:
                    logger.error("User tried to benchmark against %s but no external Rally daemon has been started." % hosts)
                    raise exceptions.SystemSetupError("To benchmark remote hosts (e.g. %s) you need to start the Rally daemon "
                                                      "on each machine including this one." % ip)
                already_running = actor.actor_system_already_running(ip=ip)
                logger.info("Actor system on [%s] already running? [%s]" % (ip, str(already_running)))
                # poll (with progress dots) until the remote Rally daemon is reachable
                if not already_running:
                    console.println("Waiting for Rally daemon on [%s] " % ip, end="", flush=True)
                while not actor.actor_system_already_running(ip=ip):
                    console.println(".", end="", flush=True)
                    time.sleep(3)
                if not already_running:
                    console.println(" [OK]")
                # remote host: require the actor to be placed on that machine
                m = self.createActor(NodeMechanicActor,
                                     #globalName="/rally/mechanic/worker/%s" % ip,
                                     targetActorRequirements={"ip": ip})
                mechanics_and_start_message.append((m, msg.for_nodes(all_node_ips, ip, port, nodes)))
                self.children.append(m)
    self.status = "starting"
    self.received_responses = []
    # only now that all mechanics exist do we actually kick them off
    for mechanic_actor, start_message in mechanics_and_start_message:
        self.send(mechanic_actor, start_message)
def with_actor_system(runnable, cfg):
    """Run ``runnable(cfg)`` inside a bootstrapped actor system.

    If the actor system had to be started by this function, it is also shut down
    afterwards, tolerating up to two user interrupts before giving up.

    :param runnable: A callable accepting the configuration object.
    :param cfg: The Rally configuration; ``system.remote.benchmarking.supported``
        is recorded here based on whether a dedicated daemon was already running.
    """
    already_running = actor.actor_system_already_running()
    logger.info("Actor system already running locally? [%s]" % str(already_running))
    try:
        actors = actor.bootstrap_actor_system(try_join=already_running, prefer_local_only=not already_running)
        # We can only support remote benchmarks if we have a dedicated daemon that is not only bound to 127.0.0.1
        cfg.add(config.Scope.application, "system", "remote.benchmarking.supported", already_running)
    except RuntimeError as e:
        logger.exception("Could not bootstrap actor system.")
        # message-based detection of the "no network" case — fall back to a degraded, offline actor system
        if str(e) == "Unable to determine valid external socket address.":
            console.warn("Could not determine a socket address. Are you running without any network? Switching to degraded mode.",
                         logger=logger)
            actor.use_offline_actor_system()
            actors = actor.bootstrap_actor_system(try_join=True)
        else:
            raise
    try:
        runnable(cfg)
    finally:
        # We only shutdown the actor system if it was not already running before
        if not already_running:
            shutdown_complete = False
            times_interrupted = 0
            # retry shutdown after a first Ctrl-C; give up after the second
            while not shutdown_complete and times_interrupted < 2:
                try:
                    logger.info("Attempting to shutdown internal actor system.")
                    actors.shutdown()
                    # note that this check will only evaluate to True for a TCP-based actor system.
                    timeout = 15
                    while actor.actor_system_already_running() and timeout > 0:
                        logger.info("Actor system is still running. Waiting...")
                        time.sleep(1)
                        timeout -= 1
                    if timeout > 0:
                        shutdown_complete = True
                        logger.info("Shutdown completed.")
                    else:
                        logger.warning("Shutdown timed out. Actor system is still running.")
                        break
                except KeyboardInterrupt:
                    times_interrupted += 1
                    logger.warning("User interrupted shutdown of internal actor system.")
                    console.info("Please wait a moment for Rally's internal components to shutdown.")
            if not shutdown_complete and times_interrupted > 0:
                # user interrupted twice: warn loudly about potentially orphaned child processes
                logger.warning("Terminating after user has interrupted actor system shutdown explicitly for [%d] times."
                               % times_interrupted)
                console.println("")
                console.warn("Terminating now at the risk of leaving child processes behind.")
                console.println("")
                console.warn("The next race may fail due to an unclean shutdown.")
                console.println("")
                console.println(SKULL)
                console.println("")
            elif not shutdown_complete:
                console.warn("Could not terminate all internal processes within timeout. Please check and force-terminate all Rally processes.")
def start(args):
    """Bootstrap a new actor system on this node.

    :param args: Parsed command line arguments providing ``node_ip`` and ``coordinator_ip``.
    :raises exceptions.RallyError: If an actor system is already running on this node.
    """
    if actor.actor_system_already_running():
        raise exceptions.RallyError("An actor system appears to be already running.")
    node_ip = args.node_ip
    coordinator_ip = args.coordinator_ip
    actor.bootstrap_actor_system(local_ip=node_ip, coordinator_ip=coordinator_ip)
    console.info("Successfully started actor system on node [%s] with coordinator node IP [%s]." % (node_ip, coordinator_ip))
def start(args):
    """Bootstrap a new actor system on this node, silencing noisy startup warnings.

    :param args: Parsed command line arguments providing ``node_ip`` and ``coordinator_ip``.
    :raises exceptions.RallyError: If an actor system is already running on this node.
    """
    if actor.actor_system_already_running():
        raise exceptions.RallyError("An actor system appears to be already running.")
    # TheSpian writes the following warning upon start (at least) on Mac OS X:
    #
    # WARNING:root:Unable to get address info for address 103.1.168.192.in-addr.arpa (AddressFamily.AF_INET,\
    # SocketKind.SOCK_DGRAM, 17, 0): <class 'socket.gaierror'> [Errno 8] nodename nor servname provided, or not known
    #
    # Therefore, we will not show warnings but only errors.
    logging.basicConfig(level=logging.ERROR)
    actor.bootstrap_actor_system(local_ip=args.node_ip,
                                 coordinator_ip=args.coordinator_ip)
    console.info("Successfully started actor system on node [%s] with coordinator node IP [%s]."
                 % (args.node_ip, args.coordinator_ip))
def status():
    """Print whether an actor system is currently running on this node."""
    console.println("Running" if actor.actor_system_already_running() else "Stopped")
def receiveMessage(self, msg, sender):
    """Dispatch all messages for the mechanic coordinator actor.

    Handles engine start/stop, benchmark start/stop, result forwarding to race
    control, child-actor lifecycle and poison messages. Any processing error is
    converted into a ``Failure`` message so the actor system does not crash.

    :param msg: The incoming message instance; behavior depends on its type.
    :param sender: The actor address that sent ``msg``.
    """
    try:
        logger.debug("MechanicActor#receiveMessage(msg = [%s] sender = [%s])" % (str(type(msg)), str(sender)))
        if isinstance(msg, StartEngine):
            logger.info("Received signal from race control to start engine.")
            self.race_control = sender
            # In our startup procedure we first create all mechanics. Only if this succeeds
            mechanics_and_start_message = []
            if msg.external:
                logger.info("Target node(s) will not be provisioned by Rally.")
                # just create one actor for this special case and run it on the coordinator node (i.e. here)
                m = self.createActor(LocalNodeMechanicActor,
                                     globalName="/rally/mechanic/worker/external",
                                     targetActorRequirements={"coordinator": True})
                self.mechanics.append(m)
                # we can use the original message in this case
                mechanics_and_start_message.append((m, msg))
            else:
                hosts = msg.cfg.opts("client", "hosts")
                logger.info("Target node(s) %s will be provisioned by Rally." % hosts)
                if len(hosts) == 0:
                    raise exceptions.LaunchError("No target hosts are configured.")
                for host in hosts:
                    ip = host["host"]
                    port = int(host["port"])
                    # user may specify "localhost" on the command line but the problem is that we auto-register the actor system
                    # with "ip": "127.0.0.1" so we convert this special case automatically. In all other cases the user needs to
                    # start the actor system on the other host and is aware that the parameter for the actor system and the
                    # --target-hosts parameter need to match.
                    if ip == "localhost" or ip == "127.0.0.1":
                        m = self.createActor(LocalNodeMechanicActor,
                                             globalName="/rally/mechanic/worker/localhost",
                                             targetActorRequirements={"coordinator": True})
                        self.mechanics.append(m)
                        mechanics_and_start_message.append((m, msg.with_port(port)))
                    else:
                        if msg.cfg.opts("system", "remote.benchmarking.supported"):
                            logger.info("Benchmarking against %s with external Rally daemon." % hosts)
                        else:
                            logger.error("User tried to benchmark against %s but no external Rally daemon has been started." % hosts)
                            raise exceptions.SystemSetupError("To benchmark remote hosts (e.g. %s) you need to start the Rally daemon "
                                                              "on each machine including this one." % ip)
                        already_running = actor.actor_system_already_running(ip=ip)
                        logger.info("Actor system on [%s] already running? [%s]" % (ip, str(already_running)))
                        # poll (with progress dots) until the remote Rally daemon is reachable
                        if not already_running:
                            console.println("Waiting for Rally daemon on [%s] " % ip, end="", flush=True)
                        while not actor.actor_system_already_running(ip=ip):
                            console.println(".", end="", flush=True)
                            time.sleep(3)
                        if not already_running:
                            console.println(" [OK]")
                        m = self.createActor(RemoteNodeMechanicActor,
                                             globalName="/rally/mechanic/worker/%s" % ip,
                                             targetActorRequirements={"ip": ip})
                        mechanics_and_start_message.append((m, msg.with_port(port)))
                        self.mechanics.append(m)
            # only now that all mechanics exist do we actually kick them off
            for mechanic_actor, start_message in mechanics_and_start_message:
                self.send(mechanic_actor, start_message)
        elif isinstance(msg, EngineStarted):
            self.send(self.race_control, msg)
        elif isinstance(msg, OnBenchmarkStart):
            # fan out to all node mechanics
            for m in self.mechanics:
                self.send(m, msg)
        elif isinstance(msg, Success):
            self.send(self.race_control, msg)
        elif isinstance(msg, Failure):
            self.send(self.race_control, msg)
        elif isinstance(msg, OnBenchmarkStop):
            for m in self.mechanics:
                self.send(m, msg)
        elif isinstance(msg, BenchmarkStopped):
            # TODO dm: Actually we need to wait for all BenchmarkStopped messages from all our mechanic actors
            # TODO dm: We will actually duplicate cluster level metrics if each of our mechanic actors gathers these...
            self.send(self.race_control, msg)
        elif isinstance(msg, StopEngine):
            for m in self.mechanics:
                self.send(m, msg)
        elif isinstance(msg, EngineStopped):
            self.send(self.race_control, msg)
            # clear all state as the mechanic might get reused later
            for m in self.mechanics:
                self.send(m, thespian.actors.ActorExitRequest())
            self.mechanics = []
            # self terminate + slave nodes
            self.send(self.myAddress, thespian.actors.ActorExitRequest())
        elif isinstance(msg, thespian.actors.ChildActorExited):
            # TODO dm: Depending on our state model this can be fine (e.g. when it exited due to our ActorExitRequest message
            # or it could be problematic and mean that an exception has occured.
            pass
        elif isinstance(msg, thespian.actors.PoisonMessage):
            # something went wrong with a child actor
            if isinstance(msg.poisonMessage, StartEngine):
                raise exceptions.LaunchError("Could not start benchmark candidate. Are Rally daemons on all targeted machines running?")
            else:
                logger.error("[%s] sent to a child actor has resulted in PoisonMessage" % str(msg.poisonMessage))
                raise exceptions.RallyError("Could not communicate with benchmark candidate (unknown reason)")
    except BaseException:
        logger.exception("Cannot process message [%s]" % msg)
        # usually, we'll notify the sender but in case a child sent something that caused an exception we'd rather
        # have it bubble up to race control. Otherwise, we could play ping-pong with our child actor.
        recipient = self.race_control if sender in self.mechanics else sender
        ex_type, ex_value, ex_traceback = sys.exc_info()
        # avoid "can't pickle traceback objects"
        import traceback
        self.send(recipient, Failure("Could not execute command (%s)" % ex_value, traceback.format_exc()))
def with_actor_system(runnable, cfg):
    """Run ``runnable(cfg)`` inside a bootstrapped actor system.

    If the actor system had to be started by this function, it is also shut down
    afterwards, tolerating up to two user interrupts before aborting with
    ``UserInterrupted``. Falls back to an offline actor system if bootstrapping
    fails for network-related reasons.

    :param runnable: A callable accepting the configuration object.
    :param cfg: The Rally configuration; ``system.remote.benchmarking.supported``
        is recorded here based on whether a dedicated daemon was already running.
    :raises exceptions.UserInterrupted: If the user cancels during bootstrap or
        interrupts the shutdown twice.
    """
    logger = logging.getLogger(__name__)
    already_running = actor.actor_system_already_running()
    logger.info("Actor system already running locally? [%s]", str(already_running))
    try:
        actors = actor.bootstrap_actor_system(try_join=already_running, prefer_local_only=not already_running)
        # We can only support remote benchmarks if we have a dedicated daemon that is not only bound to 127.0.0.1
        cfg.add(config.Scope.application, "system", "remote.benchmarking.supported", already_running)
    # This happens when the admin process could not be started, e.g. because it could not open a socket.
    except thespian.actors.InvalidActorAddress:
        logger.info("Falling back to offline actor system.")
        actor.use_offline_actor_system()
        actors = actor.bootstrap_actor_system(try_join=True)
    except KeyboardInterrupt:
        raise exceptions.UserInterrupted("User has cancelled the benchmark (detected whilst bootstrapping actor system).") from None
    except Exception as e:
        logger.exception("Could not bootstrap actor system.")
        # message-based detection of the "no network" case — fall back to a degraded, offline actor system
        if str(e) == "Unable to determine valid external socket address.":
            console.warn("Could not determine a socket address. Are you running without any network? Switching to degraded mode.",
                         logger=logger)
            logger.info("Falling back to offline actor system.")
            actor.use_offline_actor_system()
            actors = actor.bootstrap_actor_system(try_join=True)
        else:
            raise
    try:
        runnable(cfg)
    finally:
        # We only shutdown the actor system if it was not already running before
        if not already_running:
            shutdown_complete = False
            times_interrupted = 0
            # retry shutdown after a first Ctrl-C; give up after the second
            while not shutdown_complete and times_interrupted < 2:
                try:
                    # give some time for any outstanding messages to be delivered to the actor system
                    time.sleep(3)
                    logger.info("Attempting to shutdown internal actor system.")
                    actors.shutdown()
                    # note that this check will only evaluate to True for a TCP-based actor system.
                    timeout = 15
                    while actor.actor_system_already_running() and timeout > 0:
                        logger.info("Actor system is still running. Waiting...")
                        time.sleep(1)
                        timeout -= 1
                    if timeout > 0:
                        shutdown_complete = True
                        logger.info("Shutdown completed.")
                    else:
                        logger.warning("Shutdown timed out. Actor system is still running.")
                        break
                except KeyboardInterrupt:
                    times_interrupted += 1
                    logger.warning("User interrupted shutdown of internal actor system.")
                    console.info("Please wait a moment for Rally's internal components to shutdown.")
            if not shutdown_complete and times_interrupted > 0:
                # user interrupted twice: warn loudly about potentially orphaned child processes, then abort
                logger.warning("Terminating after user has interrupted actor system shutdown explicitly for [%d] times.",
                               times_interrupted)
                console.println("")
                console.warn("Terminating now at the risk of leaving child processes behind.")
                console.println("")
                console.warn("The next race may fail due to an unclean shutdown.")
                console.println("")
                console.println(SKULL)
                console.println("")
                raise exceptions.UserInterrupted(f"User has cancelled the benchmark (shutdown not complete as user interrupted "
                                                 f"{times_interrupted} times).") from None
            elif not shutdown_complete:
                console.warn("Could not terminate all internal processes within timeout. Please check and force-terminate all Rally processes.")