def start(self, car):
    port = self.cfg.opts("provisioning", "node.http.port")
    hosts = [{"host": "localhost", "port": port}]
    client_options = self.cfg.opts("launcher", "client.options")
    # unified client config
    self.cfg.add(config.Scope.benchmark, "client", "hosts", hosts)
    self.cfg.add(config.Scope.benchmark, "client", "options", client_options)

    es = client.EsClientFactory(hosts, client_options).create()

    # We're very specific about which nodes we kill, as there is potentially also an Elasticsearch-based
    # metrics store running on this machine.
    node_prefix = self.cfg.opts("provisioning", "node.name.prefix")
    process.kill_running_es_instances(node_prefix)

    logger.info("Starting a cluster based on car [%s] with [%d] nodes." % (car, car.nodes))

    cluster_telemetry = [
        # TODO dm: Once we do distributed launching, this needs to be done per node, not per cluster
        telemetry.MergeParts(self.cfg, self.metrics_store),
        telemetry.EnvironmentInfo(self.cfg, es, self.metrics_store),
        telemetry.NodeStats(self.cfg, es, self.metrics_store),
        telemetry.IndexStats(self.cfg, es, self.metrics_store),
        # TODO dm: Once we do distributed launching, this needs to be done per node, not per cluster
        telemetry.IndexSize(self.cfg, self.metrics_store)
    ]
    t = telemetry.Telemetry(self.cfg, devices=cluster_telemetry)
    c = cluster.Cluster([self._start_node(node, car, es) for node in range(car.nodes)], t)
    t.attach_to_cluster(c)
    return c
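# The helper process.kill_running_es_instances is defined elsewhere. A minimal sketch of what it could
# look like, assuming psutil-based process matching (the matching heuristics are an assumption, not
# Rally's verbatim implementation):
import psutil

def kill_running_es_instances(node_prefix):
    # Only kill Java processes whose command line contains the Rally-managed node name prefix, so that
    # e.g. an unrelated Elasticsearch-based metrics store on the same machine survives.
    for p in psutil.process_iter():
        try:
            if p.name() == "java" and any(node_prefix in arg for arg in p.cmdline()):
                p.kill()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass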
def test_stores_cluster_level_metrics_on_attach(self, metrics_store_add_meta_info):
    nodes_info = {"nodes": collections.OrderedDict()}
    nodes_info["nodes"]["FCFjozkeTiOpN-SI88YEcg"] = {
        "name": "rally0",
        "host": "127.0.0.1",
        "attributes": {
            "group": "cold_nodes"
        },
        "os": {
            "name": "Mac OS X",
            "version": "10.11.4",
            "available_processors": 8
        },
        "jvm": {
            "version": "1.8.0_74",
            "vm_vendor": "Oracle Corporation"
        }
    }
    nodes_info["nodes"]["EEEjozkeTiOpN-SI88YEcg"] = {
        "name": "rally1",
        "host": "127.0.0.1",
        "attributes": {
            "group": "hot_nodes"
        },
        "os": {
            "name": "Mac OS X",
            "version": "10.11.5",
            "available_processors": 8
        },
        "jvm": {
            "version": "1.8.0_102",
            "vm_vendor": "Oracle Corporation"
        }
    }
    cluster_info = {
        "version": {
            "build_hash": "abc123",
            "number": "6.0.0-alpha1"
        }
    }
    client = Client(nodes=SubClient(info=nodes_info), info=cluster_info)
    metrics_store = metrics.EsMetricsStore(self.cfg)
    env_device = telemetry.EnvironmentInfo(client, metrics_store)
    t = telemetry.Telemetry(self.cfg, devices=[env_device])
    t.attach_to_cluster(cluster.Cluster([], [], t))

    calls = [
        mock.call(metrics.MetaInfoScope.cluster, None, "source_revision", "abc123"),
        mock.call(metrics.MetaInfoScope.cluster, None, "distribution_version", "6.0.0-alpha1"),
        mock.call(metrics.MetaInfoScope.node, "rally0", "jvm_vendor", "Oracle Corporation"),
        mock.call(metrics.MetaInfoScope.node, "rally0", "jvm_version", "1.8.0_74"),
        mock.call(metrics.MetaInfoScope.node, "rally1", "jvm_vendor", "Oracle Corporation"),
        mock.call(metrics.MetaInfoScope.node, "rally1", "jvm_version", "1.8.0_102"),
        mock.call(metrics.MetaInfoScope.node, "rally0", "attribute_group", "cold_nodes"),
        mock.call(metrics.MetaInfoScope.node, "rally1", "attribute_group", "hot_nodes")
    ]
    metrics_store_add_meta_info.assert_has_calls(calls)
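# Client and SubClient are hand-rolled test doubles rather than mocks. A minimal sketch of how they
# could be defined for this test; the attribute names are assumptions inferred from the call sites above
# (EnvironmentInfo presumably calls client.info() and client.nodes.info()):
class SubClient:
    def __init__(self, info=None):
        self._info = info

    def info(self, *args, **kwargs):
        # mimics es.nodes.info()
        return self._info


class Client:
    def __init__(self, nodes=None, info=None):
        self.nodes = nodes
        self._info = info

    def info(self):
        # mimics es.info()
        return self._info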
def test_stores_node_level_metrics_on_attach(self, cpu_model, physical_cpu_cores, logical_cpu_cores,
                                             os_version, os_name, metrics_store_add_meta_info):
    cpu_model.return_value = "Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz"
    physical_cpu_cores.return_value = 4
    logical_cpu_cores.return_value = 8
    os_version.return_value = "4.2.0-18-generic"
    os_name.return_value = "Linux"

    metrics_store = metrics.EsMetricsStore(self.cfg)
    node = cluster.Node(None, "io", "rally0", None)
    env_device = telemetry.EnvironmentInfo(self.cfg, None, metrics_store)
    env_device.attach_to_node(node)

    calls = [
        mock.call(metrics.MetaInfoScope.node, "rally0", "os_name", "Linux"),
        mock.call(metrics.MetaInfoScope.node, "rally0", "os_version", "4.2.0-18-generic"),
        mock.call(metrics.MetaInfoScope.node, "rally0", "cpu_logical_cores", 8),
        mock.call(metrics.MetaInfoScope.node, "rally0", "cpu_physical_cores", 4),
        mock.call(metrics.MetaInfoScope.node, "rally0", "cpu_model", "Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz"),
        mock.call(metrics.MetaInfoScope.node, "rally0", "node_name", "rally0"),
        mock.call(metrics.MetaInfoScope.node, "rally0", "host_name", "io"),
    ]
    metrics_store_add_meta_info.assert_has_calls(calls)
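# For reference, the mock arguments above would be produced by a decorator stack along these lines
# (the bottom-most decorator binds to the first parameter after self). The patch targets are
# assumptions based on a typical Rally module layout, not verbatim from the source:
from unittest import mock

@mock.patch("esrally.metrics.EsMetricsStore.add_meta_info")
@mock.patch("esrally.utils.sysstats.os_name")
@mock.patch("esrally.utils.sysstats.os_version")
@mock.patch("esrally.utils.sysstats.logical_cpu_cores")
@mock.patch("esrally.utils.sysstats.physical_cpu_cores")
@mock.patch("esrally.utils.sysstats.cpu_model")
def test_stores_node_level_metrics_on_attach(self, cpu_model, physical_cpu_cores, logical_cpu_cores,
                                             os_version, os_name, metrics_store_add_meta_info):
    ...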
def start(self, car, binary, data_paths):
    self.binary_path = binary

    hosts = self.cfg.opts("client", "hosts")
    client_options = self.cfg.opts("client", "options")
    es = self.client_factory(hosts, client_options).create()

    # Cannot enable custom telemetry devices here.
    t = telemetry.Telemetry(devices=[
        # Be aware that some of the meta-data are taken from the host system, not the container
        # (e.g. number of CPU cores), so if the Docker container constrains these, the metrics are
        # actually wrong.
        telemetry.EnvironmentInfo(es, self.metrics_store),
        telemetry.NodeStats(es, self.metrics_store),
        telemetry.IndexStats(es, self.metrics_store),
        telemetry.DiskIo(self.metrics_store),
        telemetry.CpuUsage(self.metrics_store)
    ])

    c = cluster.Cluster(hosts, [], t)
    self._start_process(cmd="docker-compose -f %s up" % self.binary_path, node_name="rally0")
    logger.info("Docker container has successfully started. Checking if REST API is available.")
    if wait_for_rest_layer(es):
        logger.info("REST API is available. Attaching telemetry devices to cluster.")
        t.attach_to_cluster(c)
        logger.info("Telemetry devices are now attached to the cluster.")
    else:
        logger.error("REST API layer is not yet available. Forcefully terminating cluster.")
        self.stop(c)
        raise exceptions.LaunchError("Elasticsearch REST API layer is not available. Forcefully terminated cluster.")
    return c
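# wait_for_rest_layer is defined elsewhere in the launcher module. A minimal sketch of the polling it
# performs, assuming the elasticsearch-py client; the retry count and sleep interval are assumptions:
import time
import elasticsearch

def wait_for_rest_layer(es, max_attempts=10):
    for attempt in range(max_attempts):
        try:
            # any successful response means the REST layer is up
            es.info()
            return True
        except elasticsearch.TransportError:
            time.sleep(1)
    return False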
def _start_node(self, node, car, es, binary_path):
    node_name = self._node_name(node)
    host_name = socket.gethostname()

    enabled_devices = self.cfg.opts("mechanic", "telemetry.devices")
    node_telemetry = [
        telemetry.FlightRecorder(self.node_telemetry_dir),
        telemetry.JitCompiler(self.node_telemetry_dir),
        telemetry.Gc(self.node_telemetry_dir),
        telemetry.PerfStat(self.node_telemetry_dir),
        telemetry.DiskIo(self.metrics_store),
        telemetry.CpuUsage(self.metrics_store),
        telemetry.EnvironmentInfo(es, self.metrics_store),
    ]
    t = telemetry.Telemetry(enabled_devices, devices=node_telemetry)

    env = self._prepare_env(car, node_name, t)
    cmd = self.prepare_cmd(car, node_name)
    process = self._start_process(cmd, env, node_name, binary_path)
    node = cluster.Node(process, host_name, node_name, t)
    logger.info("Cluster node [%s] has successfully started. Attaching telemetry devices to node." % node_name)
    t.attach_to_node(node)
    logger.info("Telemetry devices are now attached to node [%s]." % node_name)
    return node
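# Note that this launcher passes enabled_devices as the first Telemetry argument while the Docker
# launcher above does not. A minimal sketch of how Telemetry could use it to filter devices on attach;
# the internal/command attributes are assumptions about the device interface:
class Telemetry:
    def __init__(self, enabled_devices=None, devices=None):
        self.enabled_devices = enabled_devices or []
        self.devices = devices or []

    def attach_to_node(self, node):
        for device in self.devices:
            if self._enabled(device):
                device.attach_to_node(node)

    def _enabled(self, device):
        # Internal devices (e.g. EnvironmentInfo) are always active; opt-in devices
        # (e.g. FlightRecorder) only run when the user has explicitly enabled them.
        return device.internal or device.command in self.enabled_devices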
def start(self, car, binary, data_paths):
    hosts = self.cfg.opts("client", "hosts")
    client_options = self.cfg.opts("client", "options")
    es = client.EsClientFactory(hosts, client_options).create()

    # We're very specific about which nodes we kill, as there is potentially also an Elasticsearch-based
    # metrics store running on this machine.
    node_prefix = self.cfg.opts("provisioning", "node.name.prefix")
    process.kill_running_es_instances(node_prefix)

    logger.info("Starting a cluster based on car [%s] with [%d] nodes." % (car, car.nodes))

    # TODO dm: Get rid of these...
    enabled_devices = self.cfg.opts("mechanic", "telemetry.devices")

    cluster_telemetry = [
        # TODO dm: Once we do distributed launching, this needs to be done per node, not per cluster
        telemetry.MergeParts(self.metrics_store, self.node_log_dir),
        telemetry.EnvironmentInfo(es, self.metrics_store),
        telemetry.NodeStats(es, self.metrics_store),
        telemetry.IndexStats(es, self.metrics_store),
        # TODO dm: Once we do distributed launching, this needs to be done per node, not per cluster
        telemetry.IndexSize(data_paths, self.metrics_store)
    ]
    t = telemetry.Telemetry(enabled_devices, devices=cluster_telemetry)
    c = cluster.Cluster(hosts, [self._start_node(node, car, es, binary) for node in range(car.nodes)], t)
    logger.info("All cluster nodes have successfully started. Checking if REST API is available.")
    if wait_for_rest_layer(es):
        logger.info("REST API is available. Attaching telemetry devices to cluster.")
        t.attach_to_cluster(c)
        logger.info("Telemetry devices are now attached to the cluster.")
    else:
        logger.error("REST API layer is not yet available. Forcefully terminating cluster.")
        self.stop(c)
        raise exceptions.LaunchError("Elasticsearch REST API layer is not available. Forcefully terminated cluster.")
    return c
def _start_node(self, host, node, es):
    node_name = self._node_name(node)
    p = self._start_process(cmd="docker-compose -f %s up" % self.binary_path, node_name=node_name)
    # We only support a subset of telemetry for Docker hosts (specifically, we do not allow users to
    # enable any devices).
    node_telemetry = [
        telemetry.DiskIo(self.metrics_store),
        telemetry.CpuUsage(self.metrics_store),
        telemetry.EnvironmentInfo(es, self.metrics_store)
    ]
    t = telemetry.Telemetry(devices=node_telemetry)
    return cluster.Node(p, host["host"], node_name, t)
def _start_node(self, node, car, es):
    node_name = self._node_name(node)
    host_name = socket.gethostname()

    node_telemetry = [
        telemetry.FlightRecorder(self.cfg, self.metrics_store),
        telemetry.JitCompiler(self.cfg, self.metrics_store),
        telemetry.Gc(self.cfg, self.metrics_store),
        telemetry.PerfStat(self.cfg, self.metrics_store),
        telemetry.DiskIo(self.cfg, self.metrics_store),
        telemetry.CpuUsage(self.cfg, self.metrics_store),
        telemetry.EnvironmentInfo(self.cfg, es, self.metrics_store),
    ]
    t = telemetry.Telemetry(self.cfg, devices=node_telemetry)

    env = self._prepare_env(car, node_name, t)
    cmd = self.prepare_cmd(car, node_name)
    process = self._start_process(cmd, env, node_name)
    node = cluster.Node(process, host_name, node_name, t)
    t.attach_to_node(node)
    return node
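# _node_name is shared by all launcher variants above. A minimal sketch, assuming the configured node
# name prefix is simply suffixed with the zero-based node index (yielding e.g. "rally0", "rally1"):
def _node_name(self, node):
    prefix = self.cfg.opts("provisioning", "node.name.prefix")
    return "%s%d" % (prefix, node)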
def start(self, car):
    # hardcoded for the moment, should actually be identical to internal launcher
    # Only needed on Mac:
    # hosts = [{"host": process.run_subprocess_with_output("docker-machine ip default")[0].strip(), "port": 9200}]
    hosts = [{"host": "localhost", "port": 9200}]
    client_options = self.cfg.opts("launcher", "client.options")
    # unified client config
    self.cfg.add(config.Scope.benchmark, "client", "hosts", hosts)
    self.cfg.add(config.Scope.benchmark, "client", "options", client_options)

    es = self.client_factory(hosts, client_options).create()

    t = telemetry.Telemetry(self.cfg, devices=[
        # Be aware that some of the meta-data are taken from the host system, not the container
        # (e.g. number of CPU cores), so if the Docker container constrains these, the metrics are
        # actually wrong.
        telemetry.EnvironmentInfo(self.cfg, es, self.metrics_store),
        telemetry.NodeStats(self.cfg, es, self.metrics_store),
        telemetry.IndexStats(self.cfg, es, self.metrics_store),
        telemetry.DiskIo(self.cfg, self.metrics_store),
        telemetry.CpuUsage(self.cfg, self.metrics_store)
    ])

    distribution_version = self.cfg.opts("source", "distribution.version", mandatory=False)

    install_dir = self._install_dir()
    io.ensure_dir(install_dir)

    java_opts = ""
    if car.heap:
        java_opts += "-Xms%s -Xmx%s " % (car.heap, car.heap)
    if car.java_opts:
        java_opts += car.java_opts

    vars = {
        "es_java_opts": java_opts,
        "container_memory_gb": "%dg" % (convert.bytes_to_gb(psutil.virtual_memory().total) // 2),
        "es_data_dir": "%s/data" % install_dir,
        "es_version": distribution_version
    }

    docker_cfg = self._render_template_from_file(vars)
    logger.info("Starting Docker container with configuration:\n%s" % docker_cfg)
    docker_cfg_path = self._docker_cfg_path()
    with open(docker_cfg_path, "wt") as f:
        f.write(docker_cfg)

    c = cluster.Cluster([], t)
    self._start_process(cmd="docker-compose -f %s up" % docker_cfg_path, node_name="rally0")
    # Wait for a little while: plugins may still be initializing although the node has already started.
    time.sleep(10)

    t.attach_to_cluster(c)
    logger.info("Successfully started Docker container")
    return c
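# _render_template_from_file fills the docker-compose template with the vars dict above. A minimal
# sketch using jinja2; the template path helper _docker_compose_template_path is hypothetical:
import jinja2

def _render_template_from_file(self, variables):
    # read the docker-compose template and substitute es_java_opts, container_memory_gb,
    # es_data_dir and es_version
    with open(self._docker_compose_template_path()) as f:
        template = jinja2.Template(f.read())
    return template.render(variables)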