def _read_output(self, node_name, server, startup_event):
    """
    Reads the output from the ES (node) subprocess.
    """
    while True:
        l = server.stdout.readline().decode("utf-8")
        if len(l) == 0:
            # no more output -> the process has terminated. We can give up now
            startup_event.set()
            break
        l = l.rstrip()
        # once the node has started, don't log each output line as it is contained in the node's log files anyway
        # and we just risk spamming our own log.
        if not startup_event.isSet():
            logger.info("%s: %s" % (node_name, l.replace("\n", "\n%s (stdout): " % node_name)))
        if l.find("Initialization Failed") != -1 or l.find("A fatal exception has occurred") != -1:
            logger.error("[%s] encountered initialization errors." % node_name)
            # wait a moment to ensure the process has terminated before we signal that we detected a (failed) startup.
            wait = 5
            while server.returncode is None and wait > 0:
                time.sleep(0.1)
                server.poll()
                wait -= 1
            startup_event.set()
        if l.endswith("started") and not startup_event.isSet():
            startup_event.set()
            logger.info("[%s] has successfully started." % node_name)

def await_termination(self, server, timeout=5):
    # wait a moment to ensure the process has terminated
    wait = timeout
    while server.returncode is None and wait > 0:
        time.sleep(0.1)
        server.poll()
        wait -= 1

def _do_wait(self, expected_cluster_status):
    reached_cluster_status = None
    for attempt in range(10):
        try:
            result = self.client.cluster.health(wait_for_status=expected_cluster_status, wait_for_relocating_shards=0, timeout="3s")
        except (socket.timeout, elasticsearch.exceptions.ConnectionError, elasticsearch.exceptions.TransportError):
            pass
        else:
            reached_cluster_status = result["status"]
            relocating_shards = result["relocating_shards"]
            logger.info("GOT: %s" % str(result))
            logger.info("ALLOC:\n%s" % self.client.cat.allocation(v=True))
            logger.info("RECOVERY:\n%s" % self.client.cat.recovery(v=True))
            logger.info("SHARDS:\n%s" % self.client.cat.shards(v=True))
            if reached_cluster_status == expected_cluster_status and relocating_shards == 0:
                return reached_cluster_status, relocating_shards
            else:
                time.sleep(0.5)
    msg = "Cluster did not reach status [%s]. Last reached status: [%s]" % (expected_cluster_status, reached_cluster_status)
    logger.error(msg)
    raise exceptions.LaunchError(msg)

def instrument_env(self, car, candidate_id):
    io.ensure_dir(self.log_root)
    log_file = "%s/%s-%s.jfr" % (self.log_root, car.safe_name, candidate_id)

    console.println("\n***************************************************************************\n")
    console.println("[WARNING] Java flight recorder is a commercial feature of the Oracle JDK.\n")
    console.println("You are using Java flight recorder which requires that you comply with\nthe licensing terms stated in:\n")
    console.println(console.format.link("http://www.oracle.com/technetwork/java/javase/terms/license/index.html"))
    console.println("\nBy using this feature you confirm that you comply with these license terms.\n")
    console.println("Otherwise, please abort and rerun Rally without the \"jfr\" telemetry device.")
    console.println("\n***************************************************************************\n")

    time.sleep(3)

    console.info("%s: Writing flight recording to [%s]" % (self.human_name, log_file), logger=logger)
    # this is more robust in case we want to use custom settings
    # see http://stackoverflow.com/questions/34882035/how-to-record-allocations-with-jfr-on-command-line
    #
    # in that case change to: -XX:StartFlightRecording=defaultrecording=true,settings=es-memory-profiling
    if self.java_major_version < 9:
        return {"ES_JAVA_OPTS": "-XX:+UnlockDiagnosticVMOptions -XX:+UnlockCommercialFeatures -XX:+DebugNonSafepoints "
                                "-XX:+FlightRecorder "
                                "-XX:FlightRecorderOptions=disk=true,maxage=0s,maxsize=0,dumponexit=true,dumponexitpath=%s "
                                "-XX:StartFlightRecording=defaultrecording=true" % log_file}
    else:
        return {"ES_JAVA_OPTS": "-XX:+UnlockDiagnosticVMOptions -XX:+UnlockCommercialFeatures -XX:+DebugNonSafepoints "
                                "-XX:StartFlightRecording=maxsize=0,maxage=0s,disk=true,dumponexit=true,filename=%s" % log_file}

def on_start_engine(self, msg, sender):
    logger.info("Received signal from race control to start engine.")
    self.race_control = sender
    self.cfg = msg.cfg
    cls = metrics.metrics_store_class(self.cfg)
    self.metrics_store = cls(self.cfg)
    self.metrics_store.open(ctx=msg.open_metrics_context)

    # In our startup procedure we first create all mechanics. Only if this succeeds we'll continue.
    mechanics_and_start_message = []

    hosts = self.cfg.opts("client", "hosts")
    if len(hosts) == 0:
        raise exceptions.LaunchError("No target hosts are configured.")

    if msg.external:
        logger.info("Cluster will not be provisioned by Rally.")
        # just create one actor for this special case and run it on the coordinator node (i.e. here)
        m = self.createActor(NodeMechanicActor,
                             #globalName="/rally/mechanic/worker/external",
                             targetActorRequirements={"coordinator": True})
        self.children.append(m)
        mechanics_and_start_message.append((m, msg.for_nodes(ip=hosts)))
    else:
        logger.info("Cluster consisting of %s will be provisioned by Rally." % hosts)
        all_ips_and_ports = to_ip_port(hosts)
        all_node_ips = extract_all_node_ips(all_ips_and_ports)
        for ip_port, nodes in nodes_by_host(all_ips_and_ports).items():
            ip, port = ip_port
            if ip == "127.0.0.1":
                m = self.createActor(NodeMechanicActor,
                                     #globalName="/rally/mechanic/worker/localhost",
                                     targetActorRequirements={"coordinator": True})
                self.children.append(m)
                mechanics_and_start_message.append((m, msg.for_nodes(all_node_ips, ip, port, nodes)))
            else:
                if self.cfg.opts("system", "remote.benchmarking.supported"):
                    logger.info("Benchmarking against %s with external Rally daemon." % hosts)
                else:
                    logger.error("User tried to benchmark against %s but no external Rally daemon has been started." % hosts)
                    raise exceptions.SystemSetupError("To benchmark remote hosts (e.g. %s) you need to start the Rally daemon "
                                                      "on each machine including this one." % ip)
                already_running = actor.actor_system_already_running(ip=ip)
                logger.info("Actor system on [%s] already running? [%s]" % (ip, str(already_running)))
                if not already_running:
                    console.println("Waiting for Rally daemon on [%s] " % ip, end="", flush=True)
                while not actor.actor_system_already_running(ip=ip):
                    console.println(".", end="", flush=True)
                    time.sleep(3)
                if not already_running:
                    console.println(" [OK]")
                m = self.createActor(NodeMechanicActor,
                                     #globalName="/rally/mechanic/worker/%s" % ip,
                                     targetActorRequirements={"ip": ip})
                mechanics_and_start_message.append((m, msg.for_nodes(all_node_ips, ip, port, nodes)))
                self.children.append(m)
    self.status = "starting"
    self.received_responses = []
    for mechanic_actor, start_message in mechanics_and_start_message:
        self.send(mechanic_actor, start_message)

def run(self):
    # noinspection PyBroadException
    try:
        while not self.stop:
            self.recorder.record()
            time.sleep(self.recorder.sample_interval)
    except BaseException as e:
        logger.exception("Could not determine {}".format(self.recorder))

def on_start_engine(self, msg, sender):
    logger.info("Received signal from race control to start engine.")
    self.race_control = sender
    self.cfg = msg.cfg
    self.metrics_store = metrics.InMemoryMetricsStore(self.cfg)
    self.metrics_store.open(ctx=msg.open_metrics_context)

    # In our startup procedure we first create all mechanics. Only if this succeeds we'll continue.
    mechanics_and_start_message = []

    hosts = self.cfg.opts("client", "hosts")
    if len(hosts) == 0:
        raise exceptions.LaunchError("No target hosts are configured.")

    if msg.external:
        logger.info("Cluster will not be provisioned by Rally.")
        # just create one actor for this special case and run it on the coordinator node (i.e. here)
        m = self.createActor(NodeMechanicActor,
                             #globalName="/rally/mechanic/worker/external",
                             targetActorRequirements={"coordinator": True})
        self.children.append(m)
        mechanics_and_start_message.append((m, msg.for_nodes(ip=hosts)))
    else:
        logger.info("Cluster consisting of %s will be provisioned by Rally." % hosts)
        all_ips_and_ports = to_ip_port(hosts)
        all_node_ips = extract_all_node_ips(all_ips_and_ports)
        for ip_port, nodes in nodes_by_host(all_ips_and_ports).items():
            ip, port = ip_port
            if ip == "127.0.0.1":
                m = self.createActor(NodeMechanicActor,
                                     #globalName="/rally/mechanic/worker/localhost",
                                     targetActorRequirements={"coordinator": True})
                self.children.append(m)
                mechanics_and_start_message.append((m, msg.for_nodes(all_node_ips, ip, port, nodes)))
            else:
                if self.cfg.opts("system", "remote.benchmarking.supported"):
                    logger.info("Benchmarking against %s with external Rally daemon." % hosts)
                else:
                    logger.error("User tried to benchmark against %s but no external Rally daemon has been started." % hosts)
                    raise exceptions.SystemSetupError("To benchmark remote hosts (e.g. %s) you need to start the Rally daemon "
                                                      "on each machine including this one." % ip)
                already_running = actor.actor_system_already_running(ip=ip)
                logger.info("Actor system on [%s] already running? [%s]" % (ip, str(already_running)))
                if not already_running:
                    console.println("Waiting for Rally daemon on [%s] " % ip, end="", flush=True)
                while not actor.actor_system_already_running(ip=ip):
                    console.println(".", end="", flush=True)
                    time.sleep(3)
                if not already_running:
                    console.println(" [OK]")
                m = self.createActor(NodeMechanicActor,
                                     #globalName="/rally/mechanic/worker/%s" % ip,
                                     targetActorRequirements={"ip": ip})
                mechanics_and_start_message.append((m, msg.for_nodes(all_node_ips, ip, port, nodes)))
                self.children.append(m)
    self.status = "starting"
    self.received_responses = []
    for mechanic_actor, start_message in mechanics_and_start_message:
        self.send(mechanic_actor, start_message)

def start(self, car):
    # hardcoded for the moment, should actually be identical to internal launcher
    # Only needed on Mac:
    # hosts = [{"host": process.run_subprocess_with_output("docker-machine ip default")[0].strip(), "port": 9200}]
    hosts = [{"host": "localhost", "port": 9200}]
    client_options = self.cfg.opts("launcher", "client.options")
    # unified client config
    self.cfg.add(config.Scope.benchmark, "client", "hosts", hosts)
    self.cfg.add(config.Scope.benchmark, "client", "options", client_options)

    es = self.client_factory(hosts, client_options).create()

    t = telemetry.Telemetry(self.cfg, devices=[
        # Be aware that some of the meta-data are taken from the host system, not the container (e.g. number of CPU cores) so if the
        # Docker container constrains these, the metrics are actually wrong.
        telemetry.EnvironmentInfo(self.cfg, es, self.metrics_store),
        telemetry.NodeStats(self.cfg, es, self.metrics_store),
        telemetry.IndexStats(self.cfg, es, self.metrics_store),
        telemetry.DiskIo(self.cfg, self.metrics_store),
        telemetry.CpuUsage(self.cfg, self.metrics_store)
    ])

    distribution_version = self.cfg.opts("source", "distribution.version", mandatory=False)

    install_dir = self._install_dir()
    io.ensure_dir(install_dir)

    java_opts = ""
    if car.heap:
        java_opts += "-Xms%s -Xmx%s " % (car.heap, car.heap)
    if car.java_opts:
        java_opts += car.java_opts

    vars = {
        "es_java_opts": java_opts,
        "container_memory_gb": "%dg" % (convert.bytes_to_gb(psutil.virtual_memory().total) // 2),
        "es_data_dir": "%s/data" % install_dir,
        "es_version": distribution_version
    }

    docker_cfg = self._render_template_from_file(vars)
    logger.info("Starting Docker container with configuration:\n%s" % docker_cfg)
    docker_cfg_path = self._docker_cfg_path()
    with open(docker_cfg_path, "wt") as f:
        f.write(docker_cfg)

    c = cluster.Cluster([], t)

    self._start_process(cmd="docker-compose -f %s up" % docker_cfg_path, node_name="rally0")
    # Wait for a little while: Plugins may still be initializing although the node has already started.
    time.sleep(10)

    t.attach_to_cluster(c)
    logger.info("Successfully started Docker container")
    return c

def wait_for_pidfile(pidfilename, timeout=60):
    endtime = _time() + timeout
    while _time() < endtime:
        try:
            with open(pidfilename, "rb") as f:
                return int(f.read())
        except FileNotFoundError:
            time.sleep(0.5)
    msg = "pid file not available after {} seconds!".format(timeout)
    logging.error(msg)
    raise exceptions.LaunchError(msg)

def _wait_for_healthy_running_container(self, container_id, timeout):
    cmd = 'docker ps -a --filter "id={}" --filter "status=running" --filter "health=healthy" -q'.format(container_id)
    stop_watch = self.clock.stop_watch()
    stop_watch.start()
    while stop_watch.split_time() < timeout:
        containers = process.run_subprocess_with_output(cmd)
        if len(containers) > 0:
            return
        time.sleep(0.5)
    msg = "No healthy running container after {} seconds!".format(timeout)
    logging.error(msg)
    raise exceptions.LaunchError(msg)

def _wait_for_healthy_running_container(container_id, timeout=60):
    cmd = 'docker ps -a --filter "id={}" --filter "status=running" --filter "health=healthy" -q'.format(container_id)
    endtime = _time() + timeout
    while _time() < endtime:
        output = subprocess.check_output(shlex.split(cmd))
        containers = output.decode("utf-8").rstrip()
        if len(containers) > 0:
            return
        time.sleep(0.5)
    msg = "No healthy running container after {} seconds!".format(timeout)
    logging.error(msg)
    raise exceptions.LaunchError(msg)

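# Hedged usage sketch (illustrative only, not part of the original sources; the compose file path is an
# assumption): the container id would typically be looked up right after bringing the container up in
# detached mode, e.g.
#
#     subprocess.check_call(shlex.split("docker-compose -f docker-compose.yml up -d"))
#     container_id = subprocess.check_output(shlex.split("docker-compose -f docker-compose.yml ps -q")).decode("utf-8").strip()
#     _wait_for_healthy_running_container(container_id, timeout=60)
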
def wait_for_pidfile(pidfilename, timeout=60, clock=time.Clock):
    stop_watch = clock.stop_watch()
    stop_watch.start()
    while stop_watch.split_time() < timeout:
        try:
            with open(pidfilename, "rb") as f:
                return int(f.read())
        except FileNotFoundError:
            time.sleep(0.5)
    msg = "pid file not available after {} seconds!".format(timeout)
    logging.error(msg)
    raise exceptions.LaunchError(msg)

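# Hedged usage sketch (illustrative only, not part of the original sources; the pid file path is an
# assumption): a launcher that has just daemonized Elasticsearch could block until the pid file appears
# and keep the pid around for a later shutdown.
#
#     es_pid = wait_for_pidfile("./es.pid", timeout=60)
#     logging.info("Elasticsearch daemon is running with PID [%d].", es_pid)
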
def wait_for_rest_layer(es, max_attempts=20):
    import elasticsearch
    for attempt in range(max_attempts):
        try:
            es.info()
            return True
        except elasticsearch.TransportError as e:
            if e.status_code == 503 or isinstance(e, elasticsearch.ConnectionError):
                time.sleep(1)
            elif e.status_code == 401:
                time.sleep(1)
            else:
                raise e
    return False

def wait_for_rest_layer(es, max_attempts=10):
    import elasticsearch
    for attempt in range(max_attempts):
        try:
            es.info()
            return True
        except elasticsearch.TransportError as e:
            if e.status_code == 503 or isinstance(e, elasticsearch.ConnectionError):
                logger.debug("Elasticsearch REST API is not available yet (probably cluster block).")
                time.sleep(2)
            elif e.status_code == 401:
                logger.debug("Could not authenticate yet (probably x-pack initializing).")
                time.sleep(2)
            else:
                raise e
    return False

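# Hedged usage sketch (illustrative only, not part of the original sources; client construction and error
# handling are assumptions): a launcher would typically call wait_for_rest_layer() right after starting the
# node processes and abort the benchmark if the REST API never becomes reachable.
#
#     import elasticsearch
#
#     es = elasticsearch.Elasticsearch(hosts=[{"host": "127.0.0.1", "port": 9200}])
#     if not wait_for_rest_layer(es, max_attempts=10):
#         raise exceptions.LaunchError("Elasticsearch REST API layer is not available.")
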
def _do_wait(self, expected_cluster_status):
    reached_cluster_status = None
    for attempt in range(10):
        try:
            result = self.client.cluster.health(wait_for_status=expected_cluster_status, wait_for_relocating_shards=0, timeout="3s")
        except (socket.timeout, elasticsearch.exceptions.ConnectionError, elasticsearch.exceptions.TransportError):
            pass
        else:
            reached_cluster_status = result["status"]
            relocating_shards = result["relocating_shards"]
            logger.info("GOT: %s" % str(result))
            logger.info("ALLOC:\n%s" % self.client.cat.allocation(v=True))
            logger.info("RECOVERY:\n%s" % self.client.cat.recovery(v=True))
            logger.info("SHARDS:\n%s" % self.client.cat.shards(v=True))
            if reached_cluster_status == expected_cluster_status and relocating_shards == 0:
                return reached_cluster_status, relocating_shards
            else:
                time.sleep(0.5)
    msg = "Cluster did not reach status [%s]. Last reached status: [%s]" % (expected_cluster_status, reached_cluster_status)
    logger.error(msg)
    raise exceptions.LaunchError(msg)

def instrument_env(self, car, candidate_id):
    io.ensure_dir(self.log_root)
    log_file = "%s/%s-%s.jfr" % (self.log_root, car.safe_name, candidate_id)

    console.println("\n***************************************************************************\n")
    console.println("[WARNING] Java flight recorder is a commercial feature of the Oracle JDK.\n")
    console.println("You are using Java flight recorder which requires that you comply with\nthe licensing terms stated in:\n")
    console.println(console.format.link("http://www.oracle.com/technetwork/java/javase/terms/license/index.html"))
    console.println("\nBy using this feature you confirm that you comply with these license terms.\n")
    console.println("Otherwise, please abort and rerun Rally without the \"jfr\" telemetry device.")
    console.println("\n***************************************************************************\n")

    time.sleep(3)

    console.info("%s: Writing flight recording to [%s]" % (self.human_name, log_file), logger=logger)

    java_opts = self.java_opts(log_file)
    logger.info("jfr: Adding JVM arguments: [%s].", java_opts)
    return {"ES_JAVA_OPTS": java_opts}

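# Hedged usage sketch (illustrative only, not part of the original sources; the variable names and the exact
# launch call are assumptions): the dict returned by instrument_env() is meant to be merged into the candidate
# node's process environment before Elasticsearch is started, e.g.
#
#     env = dict(os.environ)
#     env.update(jfr_device.instrument_env(car, candidate_id="rally-node-0"))
#     subprocess.Popen(es_start_cmd, env=env)
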
def record(self):
    current_sample = self.sample()
    for node_stats in current_sample:
        node_name = node_stats["name"]
        if self.include_indices:
            self.record_indices_stats(node_name, node_stats, include=["docs", "store", "indexing", "search", "merges", "query_cache",
                                                                      "fielddata", "segments", "translog", "request_cache"])
        if self.include_thread_pools:
            self.record_thread_pool_stats(node_name, node_stats)
        if self.include_breakers:
            self.record_circuit_breaker_stats(node_name, node_stats)
        if self.include_buffer_pools:
            self.record_jvm_buffer_pool_stats(node_name, node_stats)
        if self.include_network:
            self.record_network_stats(node_name, node_stats)

    time.sleep(self.sample_interval)

def wait_for_rest_layer(es, max_attempts=20):
    import elasticsearch
    for attempt in range(max_attempts):
        try:
            es.info()
            return True
        except elasticsearch.ConnectionError as e:
            if "SSL: UNKNOWN_PROTOCOL" in str(e):
                raise exceptions.LaunchError("Could not connect to cluster via https. Is this a https endpoint?", e)
            else:
                time.sleep(1)
        except elasticsearch.TransportError as e:
            if e.status_code == 503:
                time.sleep(1)
            elif e.status_code == 401:
                time.sleep(1)
            else:
                raise e
    return False

def migrate(config_file, current_version, target_version, out=print, i=input):
    prompter = Prompter(i=i, o=out, assume_defaults=False)
    logger.info("Upgrading configuration from version [%s] to [%s]." % (current_version, target_version))
    # Something is really fishy. We don't want to downgrade the configuration.
    if current_version >= target_version:
        raise ConfigError("The existing config file is available in a later version already. Expected version <= [%s] but found [%s]" %
                          (target_version, current_version))
    # but first a backup...
    config_file.backup()
    config = config_file.load(interpolation=None)

    if current_version == 0 and target_version > current_version:
        logger.info("Migrating config from version [0] to [1]")
        current_version = 1
        config["meta"] = {}
        config["meta"]["config.version"] = str(current_version)
        # in version 1 we changed some directories from being absolute to being relative
        config["system"]["log.root.dir"] = "logs"
        config["provisioning"]["local.install.dir"] = "install"
        config["reporting"]["report.base.dir"] = "reports"
    if current_version == 1 and target_version > current_version:
        logger.info("Migrating config from version [1] to [2]")
        current_version = 2
        config["meta"]["config.version"] = str(current_version)
        # no need to ask the user now if we are about to upgrade to version 4
        config["reporting"]["datastore.type"] = "in-memory"
        config["reporting"]["datastore.host"] = ""
        config["reporting"]["datastore.port"] = ""
        config["reporting"]["datastore.secure"] = ""
        config["reporting"]["datastore.user"] = ""
        config["reporting"]["datastore.password"] = ""
        config["system"]["env.name"] = "local"
    if current_version == 2 and target_version > current_version:
        logger.info("Migrating config from version [2] to [3]")
        current_version = 3
        config["meta"]["config.version"] = str(current_version)
        # Remove obsolete settings
        config["reporting"].pop("report.base.dir")
        config["reporting"].pop("output.html.report.filename")
    if current_version == 3 and target_version > current_version:
        root_dir = config["system"]["root.dir"]
        out("""
*****************************************************************************************

You have an old configuration of Rally. Rally has now a much simpler setup
routine which will autodetect lots of settings for you and it also does not
require you to setup a metrics store anymore.

Rally will now migrate your configuration but if you don't need advanced features
like a metrics store, then you should delete the configuration directory:

  rm -rf {0}

and then rerun Rally's configuration routine:

  {1} configure

Please also note you have {2:.1f} GB of data in your current benchmark directory at

  {3}

You might want to clean up this directory also.

For more details please see {4}

*****************************************************************************************

Pausing for 10 seconds to let you consider this message.
""".format(config_file.config_dir, PROGRAM_NAME, convert.bytes_to_gb(io.get_size(root_dir)), root_dir,
           console.format.link("https://github.com/elastic/rally/blob/master/CHANGELOG.md#030")))
        time.sleep(10)
        logger.info("Migrating config from version [3] to [4]")
        current_version = 4
        config["meta"]["config.version"] = str(current_version)
        if len(config["reporting"]["datastore.host"]) > 0:
            config["reporting"]["datastore.type"] = "elasticsearch"
        else:
            config["reporting"]["datastore.type"] = "in-memory"
        # Remove obsolete settings
        config["build"].pop("maven.bin")
        config["benchmarks"].pop("metrics.stats.disk.device")
    if current_version == 4 and target_version > current_version:
        config["tracks"] = {}
        config["tracks"]["default.url"] = "https://github.com/elastic/rally-tracks"
        current_version = 5
        config["meta"]["config.version"] = str(current_version)
    if current_version == 5 and target_version > current_version:
        config["defaults"] = {}
        config["defaults"]["preserve_benchmark_candidate"] = str(False)
        current_version = 6
        config["meta"]["config.version"] = str(current_version)
    if current_version == 6 and target_version > current_version:
        # Remove obsolete settings
        config.pop("provisioning")
        config["system"].pop("log.root.dir")
        current_version = 7
        config["meta"]["config.version"] = str(current_version)
    if current_version == 7 and target_version > current_version:
        # move [system][root.dir] to [node][root.dir]
        if "node" not in config:
            config["node"] = {}
        config["node"]["root.dir"] = config["system"].pop("root.dir")
        # also move all references!
        for section in config:
            for k, v in config[section].items():
                config[section][k] = v.replace("${system:root.dir}", "${node:root.dir}")
        current_version = 8
        config["meta"]["config.version"] = str(current_version)
    if current_version == 8 and target_version > current_version:
        config["teams"] = {}
        config["teams"]["default.url"] = "https://github.com/elastic/rally-teams"
        current_version = 9
        config["meta"]["config.version"] = str(current_version)
    if current_version == 9 and target_version > current_version:
        config["distributions"] = {}
        config["distributions"]["release.1.url"] = "https://download.elasticsearch.org/elasticsearch/elasticsearch/elasticsearch-" \
                                                   "{{VERSION}}.tar.gz"
        config["distributions"]["release.2.url"] = "https://download.elasticsearch.org/elasticsearch/release/org/elasticsearch/" \
                                                   "distribution/tar/elasticsearch/{{VERSION}}/elasticsearch-{{VERSION}}.tar.gz"
        config["distributions"]["release.url"] = "https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-{{VERSION}}.tar.gz"
        config["distributions"]["release.cache"] = "true"
        current_version = 10
        config["meta"]["config.version"] = str(current_version)
    if current_version == 10 and target_version > current_version:
        config["runtime"]["java.home"] = config["runtime"].pop("java8.home")
        current_version = 11
        config["meta"]["config.version"] = str(current_version)
    if current_version == 11 and target_version > current_version:
        # As this is a rather complex migration, we log more than usual to understand potential migration problems better.
        if "source" in config:
            if "local.src.dir" in config["source"]:
                previous_root = config["source"].pop("local.src.dir")
                logger.info("Set [source][local.src.dir] to [%s]." % previous_root)
                # if this directory was Rally's default location, then move it on the file system to allow for checkouts of plugins
                # in the sibling directory.
                if previous_root == os.path.join(config["node"]["root.dir"], "src"):
                    new_root_dir_all_sources = previous_root
                    new_es_sub_dir = "elasticsearch"
                    new_root = os.path.join(new_root_dir_all_sources, new_es_sub_dir)
                    # only attempt to move if the directory exists. It may be possible that users never ran a source benchmark
                    # although they have configured it. In that case the source directory will not yet exist.
                    if io.exists(previous_root):
                        logger.info("Previous source directory was at Rally's default location [%s]. Moving to [%s]." %
                                    (previous_root, new_root))
                        try:
                            # we need to do this in two steps as we need to move the sources to a subdirectory
                            tmp_path = io.normalize_path(os.path.join(new_root_dir_all_sources, os.pardir, "tmp_src_mig"))
                            os.rename(previous_root, tmp_path)
                            io.ensure_dir(new_root)
                            os.rename(tmp_path, new_root)
                        except OSError:
                            logger.exception("Could not move source directory from [%s] to [%s]." % (previous_root, new_root))
                            # A warning is sufficient as Rally should just do a fresh checkout if moving did not work.
                            console.warn("Elasticsearch source directory could not be moved from [%s] to [%s]. Please check the logs." %
                                         (previous_root, new_root))
                    else:
                        logger.info("Source directory is configured at Rally's default location [%s] but does not exist yet." %
                                    previous_root)
                else:
                    logger.info("Previous source directory was the custom directory [%s]." % previous_root)
                    new_root_dir_all_sources = io.normalize_path(os.path.join(previous_root, os.path.pardir))
                    # name of the elasticsearch project directory.
                    new_es_sub_dir = io.basename(previous_root)

                logger.info("Setting [node][src.root.dir] to [%s]." % new_root_dir_all_sources)
                config["node"]["src.root.dir"] = new_root_dir_all_sources
                logger.info("Setting [source][elasticsearch.src.subdir] to [%s]" % new_es_sub_dir)
                config["source"]["elasticsearch.src.subdir"] = new_es_sub_dir
            else:
                logger.info("Key [local.src.dir] not found. Advancing without changes.")
        else:
            logger.info("No section named [source] found in config. Advancing without changes.")
        current_version = 12
        config["meta"]["config.version"] = str(current_version)
    if current_version == 12 and target_version > current_version:
        # the current configuration allows to benchmark from sources
        if "build" in config and "gradle.bin" in config["build"]:
            java_9_home = io.guess_java_home(major_version=9)
            from esrally.utils import jvm
            if java_9_home and not jvm.is_early_access_release(java_9_home):
                logger.debug("Autodetected a JDK 9 installation at [%s]" % java_9_home)
                if "runtime" not in config:
                    config["runtime"] = {}
                config["runtime"]["java9.home"] = java_9_home
            else:
                logger.debug("Could not autodetect a JDK 9 installation. Checking [java.home] already points to a JDK 9.")
                detected = False
                if "runtime" in config:
                    java_home = config["runtime"]["java.home"]
                    if jvm.major_version(java_home) == 9 and not jvm.is_early_access_release(java_home):
                        config["runtime"]["java9.home"] = java_home
                        detected = True
                if not detected:
                    logger.debug("Could not autodetect a JDK 9 installation. Asking user.")
                    raw_java_9_home = prompter.ask_property("Enter the JDK 9 root directory", check_path_exists=True, mandatory=False)
                    if raw_java_9_home and jvm.major_version(raw_java_9_home) == 9 and not jvm.is_early_access_release(raw_java_9_home):
                        java_9_home = io.normalize_path(raw_java_9_home) if raw_java_9_home else None
                        config["runtime"]["java9.home"] = java_9_home
                    else:
                        out("********************************************************************************")
                        out("You don't have a valid JDK 9 installation and cannot benchmark source builds.")
                        out("")
                        out("You can still benchmark binary distributions with e.g.:")
                        out("")
                        out("  %s --distribution-version=6.0.0" % PROGRAM_NAME)
                        out("********************************************************************************")
                        out("")
        current_version = 13
        config["meta"]["config.version"] = str(current_version)

    # all migrations done
    config_file.store(config)
    logger.info("Successfully self-upgraded configuration to version [%s]" % target_version)

def migrate(config_file, current_version, target_version, out=print):
    logger.info("Upgrading configuration from version [%s] to [%s]." % (current_version, target_version))
    # Something is really fishy. We don't want to downgrade the configuration.
    if current_version >= target_version:
        raise ConfigError("The existing config file is available in a later version already. Expected version <= [%s] but found [%s]" %
                          (target_version, current_version))
    # but first a backup...
    config_file.backup()
    config = config_file.load(interpolation=None)

    if current_version == 0 and target_version > current_version:
        logger.info("Migrating config from version [0] to [1]")
        current_version = 1
        config["meta"] = {}
        config["meta"]["config.version"] = str(current_version)
        # in version 1 we changed some directories from being absolute to being relative
        config["system"]["log.root.dir"] = "logs"
        config["provisioning"]["local.install.dir"] = "install"
        config["reporting"]["report.base.dir"] = "reports"
    if current_version == 1 and target_version > current_version:
        logger.info("Migrating config from version [1] to [2]")
        current_version = 2
        config["meta"]["config.version"] = str(current_version)
        # no need to ask the user now if we are about to upgrade to version 4
        config["reporting"]["datastore.type"] = "in-memory"
        config["reporting"]["datastore.host"] = ""
        config["reporting"]["datastore.port"] = ""
        config["reporting"]["datastore.secure"] = ""
        config["reporting"]["datastore.user"] = ""
        config["reporting"]["datastore.password"] = ""
        config["system"]["env.name"] = "local"
    if current_version == 2 and target_version > current_version:
        logger.info("Migrating config from version [2] to [3]")
        current_version = 3
        config["meta"]["config.version"] = str(current_version)
        # Remove obsolete settings
        config["reporting"].pop("report.base.dir")
        config["reporting"].pop("output.html.report.filename")
    if current_version == 3 and target_version > current_version:
        root_dir = config["system"]["root.dir"]
        out("*****************************************************************************************")
        out("")
        out("You have an old configuration of Rally. Rally has now a much simpler setup")
        out("routine which will autodetect lots of settings for you and it also does not")
        out("require you to setup a metrics store anymore.")
        out("")
        out("Rally will now migrate your configuration but if you don't need advanced features")
        out("like a metrics store, then you should delete the configuration directory:")
        out("")
        out("  rm -rf %s" % config_file.config_dir)
        out("")
        out("and then rerun Rally's configuration routine:")
        out("")
        out("  %s configure" % PROGRAM_NAME)
        out("")
        out("Please also note you have %.1f GB of data in your current benchmark directory at" % convert.bytes_to_gb(io.get_size(root_dir)))
        out()
        out("  %s" % root_dir)
        out("")
        out("You might want to clean up this directory also.")
        out()
        out("For more details please see %s" % console.format.link("https://github.com/elastic/rally/blob/master/CHANGELOG.md#030"))
        out("")
        out("*****************************************************************************************")
        out("")
        out("Pausing for 10 seconds to let you consider this message.")
        time.sleep(10)
        logger.info("Migrating config from version [3] to [4]")
        current_version = 4
        config["meta"]["config.version"] = str(current_version)
        if len(config["reporting"]["datastore.host"]) > 0:
            config["reporting"]["datastore.type"] = "elasticsearch"
        else:
            config["reporting"]["datastore.type"] = "in-memory"
        # Remove obsolete settings
        config["build"].pop("maven.bin")
        config["benchmarks"].pop("metrics.stats.disk.device")
    if current_version == 4 and target_version > current_version:
        config["tracks"] = {}
        config["tracks"]["default.url"] = "https://github.com/elastic/rally-tracks"
        current_version = 5
        config["meta"]["config.version"] = str(current_version)
    if current_version == 5 and target_version > current_version:
        config["defaults"] = {}
        config["defaults"]["preserve_benchmark_candidate"] = str(False)
        current_version = 6
        config["meta"]["config.version"] = str(current_version)
    if current_version == 6 and target_version > current_version:
        # Remove obsolete settings
        config.pop("provisioning")
        config["system"].pop("log.root.dir")
        current_version = 7
        config["meta"]["config.version"] = str(current_version)
    if current_version == 7 and target_version > current_version:
        # move [system][root.dir] to [node][root.dir]
        if "node" not in config:
            config["node"] = {}
        config["node"]["root.dir"] = config["system"].pop("root.dir")
        # also move all references!
        for section in config:
            for k, v in config[section].items():
                config[section][k] = v.replace("${system:root.dir}", "${node:root.dir}")
        current_version = 8
        config["meta"]["config.version"] = str(current_version)
    if current_version == 8 and target_version > current_version:
        config["teams"] = {}
        config["teams"]["default.url"] = "https://github.com/elastic/rally-teams"
        current_version = 9
        config["meta"]["config.version"] = str(current_version)
    if current_version == 9 and target_version > current_version:
        config["distributions"] = {}
        config["distributions"]["release.1.url"] = "https://download.elasticsearch.org/elasticsearch/elasticsearch/elasticsearch-" \
                                                   "{{VERSION}}.tar.gz"
        config["distributions"]["release.2.url"] = "https://download.elasticsearch.org/elasticsearch/release/org/elasticsearch/" \
                                                   "distribution/tar/elasticsearch/{{VERSION}}/elasticsearch-{{VERSION}}.tar.gz"
        config["distributions"]["release.url"] = "https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-{{VERSION}}.tar.gz"
        config["distributions"]["release.cache"] = "true"
        current_version = 10
        config["meta"]["config.version"] = str(current_version)
    if current_version == 10 and target_version > current_version:
        config["runtime"]["java.home"] = config["runtime"].pop("java8.home")
        current_version = 11
        config["meta"]["config.version"] = str(current_version)

    # all migrations done
    config_file.store(config)
    logger.info("Successfully self-upgraded configuration to version [%s]" % target_version)

def migrate(config_file, current_version, target_version, out=print):
    logger.info("Upgrading configuration from version [%s] to [%s]." % (current_version, target_version))
    # Something is really fishy. We don't want to downgrade the configuration.
    if current_version >= target_version:
        raise ConfigError("The existing config file is available in a later version already. Expected version <= [%s] but found [%s]" %
                          (target_version, current_version))
    # but first a backup...
    config_file.backup()
    config = config_file.load(interpolation=None)

    if current_version == 0 and target_version > current_version:
        logger.info("Migrating config from version [0] to [1]")
        current_version = 1
        config["meta"] = {}
        config["meta"]["config.version"] = str(current_version)
        # in version 1 we changed some directories from being absolute to being relative
        config["system"]["log.root.dir"] = "logs"
        config["provisioning"]["local.install.dir"] = "install"
        config["reporting"]["report.base.dir"] = "reports"
    if current_version == 1 and target_version > current_version:
        logger.info("Migrating config from version [1] to [2]")
        current_version = 2
        config["meta"]["config.version"] = str(current_version)
        # no need to ask the user now if we are about to upgrade to version 4
        config["reporting"]["datastore.type"] = "in-memory"
        config["reporting"]["datastore.host"] = ""
        config["reporting"]["datastore.port"] = ""
        config["reporting"]["datastore.secure"] = ""
        config["reporting"]["datastore.user"] = ""
        config["reporting"]["datastore.password"] = ""
        config["system"]["env.name"] = "local"
    if current_version == 2 and target_version > current_version:
        logger.info("Migrating config from version [2] to [3]")
        current_version = 3
        config["meta"]["config.version"] = str(current_version)
        # Remove obsolete settings
        config["reporting"].pop("report.base.dir")
        config["reporting"].pop("output.html.report.filename")
    if current_version == 3 and target_version > current_version:
        root_dir = config["system"]["root.dir"]
        out("*****************************************************************************************")
        out("")
        out("You have an old configuration of Rally. Rally has now a much simpler setup")
        out("routine which will autodetect lots of settings for you and it also does not")
        out("require you to setup a metrics store anymore.")
        out("")
        out("Rally will now migrate your configuration but if you don't need advanced features")
        out("like a metrics store, then you should delete the configuration directory:")
        out("")
        out("  rm -rf %s" % config_file.config_dir)
        out("")
        out("and then rerun Rally's configuration routine:")
        out("")
        out("  %s configure" % PROGRAM_NAME)
        out("")
        out("Please also note you have %.1f GB of data in your current benchmark directory at" % convert.bytes_to_gb(io.get_size(root_dir)))
        out()
        out("  %s" % root_dir)
        out("")
        out("You might want to clean up this directory also.")
        out()
        out("For more details please see %s" % console.format.link("https://github.com/elastic/rally/blob/master/CHANGELOG.md#030"))
        out("")
        out("*****************************************************************************************")
        out("")
        out("Pausing for 10 seconds to let you consider this message.")
        time.sleep(10)
        logger.info("Migrating config from version [3] to [4]")
        current_version = 4
        config["meta"]["config.version"] = str(current_version)
        if len(config["reporting"]["datastore.host"]) > 0:
            config["reporting"]["datastore.type"] = "elasticsearch"
        else:
            config["reporting"]["datastore.type"] = "in-memory"
        # Remove obsolete settings
        config["build"].pop("maven.bin")
        config["benchmarks"].pop("metrics.stats.disk.device")
    if current_version == 4 and target_version > current_version:
        config["tracks"] = {}
        config["tracks"]["default.url"] = "https://github.com/elastic/rally-tracks"
        current_version = 5
        config["meta"]["config.version"] = str(current_version)
    if current_version == 5 and target_version > current_version:
        config["defaults"] = {}
        config["defaults"]["preserve_benchmark_candidate"] = str(False)
        current_version = 6
        config["meta"]["config.version"] = str(current_version)

    # all migrations done
    config_file.store(config)
    logger.info("Successfully self-upgraded configuration to version [%s]" % target_version)

def start(self, car):
    # hardcoded for the moment, should actually be identical to internal launcher
    # Only needed on Mac:
    # hosts = [{"host": process.run_subprocess_with_output("docker-machine ip default")[0].strip(), "port": 9200}]
    hosts = [{"host": "localhost", "port": 9200}]
    client_options = self.cfg.opts("launcher", "client.options")
    # unified client config
    self.cfg.add(config.Scope.benchmark, "client", "hosts", hosts)
    self.cfg.add(config.Scope.benchmark, "client", "options", client_options)

    es = self.client_factory(hosts, client_options).create()

    t = telemetry.Telemetry(self.cfg, devices=[
        # Be aware that some of the meta-data are taken from the host system, not the container (e.g. number of CPU cores) so if the
        # Docker container constrains these, the metrics are actually wrong.
        telemetry.EnvironmentInfo(self.cfg, es, self.metrics_store),
        telemetry.NodeStats(self.cfg, es, self.metrics_store),
        telemetry.IndexStats(self.cfg, es, self.metrics_store),
        telemetry.DiskIo(self.cfg, self.metrics_store),
        telemetry.CpuUsage(self.cfg, self.metrics_store)
    ])

    distribution_version = self.cfg.opts("source", "distribution.version", mandatory=False)

    install_dir = self._install_dir()
    io.ensure_dir(install_dir)

    java_opts = ""
    if car.heap:
        java_opts += "-Xms%s -Xmx%s " % (car.heap, car.heap)
    if car.java_opts:
        java_opts += car.java_opts

    vars = {
        "es_java_opts": java_opts,
        "container_memory_gb": "%dg" % (convert.bytes_to_gb(psutil.virtual_memory().total) // 2),
        "es_data_dir": "%s/data" % install_dir,
        "es_version": distribution_version
    }

    docker_cfg = self._render_template_from_file(vars)
    logger.info("Starting Docker container with configuration:\n%s" % docker_cfg)
    docker_cfg_path = self._docker_cfg_path()
    with open(docker_cfg_path, "wt") as f:
        f.write(docker_cfg)

    c = cluster.Cluster([], t)

    self._start_process(cmd="docker-compose -f %s up" % docker_cfg_path, node_name="rally0")
    # Wait for a little while: Plugins may still be initializing although the node has already started.
    time.sleep(10)

    t.attach_to_cluster(c)
    logger.info("Successfully started Docker container")
    return c

def receiveMessage(self, msg, sender):
    try:
        logger.debug("MechanicActor#receiveMessage(msg = [%s] sender = [%s])" % (str(type(msg)), str(sender)))
        if isinstance(msg, StartEngine):
            logger.info("Received signal from race control to start engine.")
            self.race_control = sender
            # In our startup procedure we first create all mechanics. Only if this succeeds we'll continue.
            mechanics_and_start_message = []
            if msg.external:
                logger.info("Target node(s) will not be provisioned by Rally.")
                # just create one actor for this special case and run it on the coordinator node (i.e. here)
                m = self.createActor(LocalNodeMechanicActor,
                                     globalName="/rally/mechanic/worker/external",
                                     targetActorRequirements={"coordinator": True})
                self.mechanics.append(m)
                # we can use the original message in this case
                mechanics_and_start_message.append((m, msg))
            else:
                hosts = msg.cfg.opts("client", "hosts")
                logger.info("Target node(s) %s will be provisioned by Rally." % hosts)
                if len(hosts) == 0:
                    raise exceptions.LaunchError("No target hosts are configured.")
                for host in hosts:
                    ip = host["host"]
                    port = int(host["port"])
                    # user may specify "localhost" on the command line but the problem is that we auto-register the actor system
                    # with "ip": "127.0.0.1" so we convert this special case automatically. In all other cases the user needs to
                    # start the actor system on the other host and is aware that the parameter for the actor system and the
                    # --target-hosts parameter need to match.
                    if ip == "localhost" or ip == "127.0.0.1":
                        m = self.createActor(LocalNodeMechanicActor,
                                             globalName="/rally/mechanic/worker/localhost",
                                             targetActorRequirements={"coordinator": True})
                        self.mechanics.append(m)
                        mechanics_and_start_message.append((m, msg.with_port(port)))
                    else:
                        if msg.cfg.opts("system", "remote.benchmarking.supported"):
                            logger.info("Benchmarking against %s with external Rally daemon." % hosts)
                        else:
                            logger.error("User tried to benchmark against %s but no external Rally daemon has been started." % hosts)
                            raise exceptions.SystemSetupError("To benchmark remote hosts (e.g. %s) you need to start the Rally daemon "
                                                              "on each machine including this one." % ip)
                        already_running = actor.actor_system_already_running(ip=ip)
                        logger.info("Actor system on [%s] already running? [%s]" % (ip, str(already_running)))
                        if not already_running:
                            console.println("Waiting for Rally daemon on [%s] " % ip, end="", flush=True)
                        while not actor.actor_system_already_running(ip=ip):
                            console.println(".", end="", flush=True)
                            time.sleep(3)
                        if not already_running:
                            console.println(" [OK]")
                        m = self.createActor(RemoteNodeMechanicActor,
                                             globalName="/rally/mechanic/worker/%s" % ip,
                                             targetActorRequirements={"ip": ip})
                        mechanics_and_start_message.append((m, msg.with_port(port)))
                        self.mechanics.append(m)
            for mechanic_actor, start_message in mechanics_and_start_message:
                self.send(mechanic_actor, start_message)
        elif isinstance(msg, EngineStarted):
            self.send(self.race_control, msg)
        elif isinstance(msg, OnBenchmarkStart):
            for m in self.mechanics:
                self.send(m, msg)
        elif isinstance(msg, Success):
            self.send(self.race_control, msg)
        elif isinstance(msg, Failure):
            self.send(self.race_control, msg)
        elif isinstance(msg, OnBenchmarkStop):
            for m in self.mechanics:
                self.send(m, msg)
        elif isinstance(msg, BenchmarkStopped):
            # TODO dm: Actually we need to wait for all BenchmarkStopped messages from all our mechanic actors
            # TODO dm: We will actually duplicate cluster level metrics if each of our mechanic actors gathers these...
            self.send(self.race_control, msg)
        elif isinstance(msg, StopEngine):
            for m in self.mechanics:
                self.send(m, msg)
        elif isinstance(msg, EngineStopped):
            self.send(self.race_control, msg)
            # clear all state as the mechanic might get reused later
            for m in self.mechanics:
                self.send(m, thespian.actors.ActorExitRequest())
            self.mechanics = []
            # self terminate + slave nodes
            self.send(self.myAddress, thespian.actors.ActorExitRequest())
        elif isinstance(msg, thespian.actors.ChildActorExited):
            # TODO dm: Depending on our state model this can be fine (e.g. when it exited due to our ActorExitRequest message)
            # or it could be problematic and mean that an exception has occurred.
            pass
        elif isinstance(msg, thespian.actors.PoisonMessage):
            # something went wrong with a child actor
            if isinstance(msg.poisonMessage, StartEngine):
                raise exceptions.LaunchError("Could not start benchmark candidate. Are Rally daemons on all targeted machines running?")
            else:
                logger.error("[%s] sent to a child actor has resulted in PoisonMessage" % str(msg.poisonMessage))
                raise exceptions.RallyError("Could not communicate with benchmark candidate (unknown reason)")
    except BaseException:
        logger.exception("Cannot process message [%s]" % msg)
        # usually, we'll notify the sender but in case a child sent something that caused an exception we'd rather
        # have it bubble up to race control. Otherwise, we could play ping-pong with our child actor.
        recipient = self.race_control if sender in self.mechanics else sender
        ex_type, ex_value, ex_traceback = sys.exc_info()
        # avoid "can't pickle traceback objects"
        import traceback
        self.send(recipient, Failure("Could not execute command (%s)" % ex_value, traceback.format_exc()))