def _start_node(self, node_configuration, node_count_on_host):
    """
    Starts one node process and attaches a node-level telemetry bundle to it.

    :param node_configuration: Source of ``ip``, ``node_name``, ``car``, ``binary_path``, ``data_paths``,
                               ``node_root_path`` and ``log_path`` for the node that is started.
    :param node_count_on_host: Passed through unchanged to the ``DiskIo`` telemetry device.
    :return: The ``cluster.Node`` that wraps the started process.
    """
    ip = node_configuration.ip
    name = node_configuration.node_name
    node_car = node_configuration.car
    es_binary = node_configuration.binary_path
    paths = node_configuration.data_paths
    telemetry_root = "%s/telemetry" % node_configuration.node_root_path
    # Only the Java home is consumed below; the major version that is returned alongside it is not.
    _, java_home = java_resolver.java_home(node_car, self.cfg)

    self.logger.info("Starting node [%s] based on car [%s].", name, node_car)

    devices_to_enable = self.cfg.opts("mechanic", "telemetry.devices")
    # NOTE(review): this value is not used by any device in this variant; the config read is
    # retained so that behavior stays identical to before.
    unused_params = self.cfg.opts("mechanic", "telemetry.params")
    store = self.metrics_store
    t = telemetry.Telemetry(
        devices_to_enable,
        devices=[
            telemetry.DiskIo(store, node_count_on_host, telemetry_root, name),
            telemetry.NodeEnvironmentInfo(store),
            telemetry.IndexSize(paths, store),
            telemetry.MergeParts(store, node_configuration.log_path),
            telemetry.StartupTime(store),
        ],
    )

    # The environment is derived with the telemetry bundle so it has to exist before the process is spawned.
    process_env = self._prepare_env(node_car, name, java_home, t)
    t.on_pre_node_start(name)
    pid = self._start_process(es_binary, process_env)
    started_node = cluster.Node(pid, ip, name, t)

    self.logger.info("Attaching telemetry devices to node [%s].", name)
    t.attach_to_node(started_node)
    return started_node
def _start_node(self, node_configuration, node_count_on_host, java_major_version):
    """
    Starts one node process and attaches a node-level telemetry bundle to it.

    :param node_configuration: Source of ``ip``, ``node_name``, ``car``, ``binary_path``, ``data_paths``,
                               ``node_root_path`` and ``log_path`` for the node that is started.
    :param node_count_on_host: Passed through unchanged to the ``DiskIo`` telemetry device.
    :param java_major_version: Passed through to the ``FlightRecorder`` and ``Gc`` telemetry devices.
    :return: The ``cluster.Node`` that wraps the started process.
    """
    ip = node_configuration.ip
    name = node_configuration.node_name
    node_car = node_configuration.car
    es_binary = node_configuration.binary_path
    paths = node_configuration.data_paths
    telemetry_root = "%s/telemetry" % node_configuration.node_root_path

    self.logger.info("Starting node [%s] based on car [%s].", name, node_car)

    devices_to_enable = self.cfg.opts("mechanic", "telemetry.devices")
    device_params = self.cfg.opts("mechanic", "telemetry.params")
    store = self.metrics_store
    t = telemetry.Telemetry(
        devices_to_enable,
        devices=[
            telemetry.FlightRecorder(device_params, telemetry_root, java_major_version),
            telemetry.JitCompiler(telemetry_root),
            telemetry.Gc(telemetry_root, java_major_version),
            telemetry.PerfStat(telemetry_root),
            telemetry.DiskIo(store, node_count_on_host),
            telemetry.CpuUsage(store),
            telemetry.NodeEnvironmentInfo(store),
            telemetry.IndexSize(paths, store),
            telemetry.MergeParts(store, node_configuration.log_path),
            telemetry.StartupTime(store),
        ],
    )

    # The environment is derived with the telemetry bundle so it has to exist before the process is spawned.
    process_env = self._prepare_env(node_car, name, t)
    t.on_pre_node_start(name)
    started_process = self._start_process(process_env, name, es_binary)
    started_node = cluster.Node(started_process, ip, name, t)

    self.logger.info("Node [%s] has successfully started. Attaching telemetry devices.", name)
    t.attach_to_node(started_node)
    self.logger.info("Telemetry devices are now attached to node [%s].", name)
    return started_node
def start(self, car):
    """
    Starts ``car.nodes`` nodes on localhost and returns them as a cluster with telemetry attached.

    The client hosts and options are also written to the config (benchmark scope, section "client").

    :param car: The car to start; ``car.nodes`` determines how many nodes are launched.
    :return: The ``cluster.Cluster`` containing all started nodes.
    """
    http_port = self.cfg.opts("provisioning", "node.http.port")
    target_hosts = [{"host": "localhost", "port": http_port}]
    options = self.cfg.opts("launcher", "client.options")
    # unified client config
    for key, value in (("hosts", target_hosts), ("options", options)):
        self.cfg.add(config.Scope.benchmark, "client", key, value)

    es = client.EsClientFactory(target_hosts, options).create()

    # we're very specific which nodes we kill as there is potentially also an Elasticsearch based metrics store
    # running on this machine
    process.kill_running_es_instances(self.cfg.opts("provisioning", "node.name.prefix"))

    logger.info("Starting a cluster based on car [%s] with [%d] nodes." % (car, car.nodes))

    store = self.metrics_store
    cluster_devices = [
        # TODO dm: Once we do distributed launching, this needs to be done per node not per cluster
        telemetry.MergeParts(self.cfg, store),
        telemetry.EnvironmentInfo(self.cfg, es, store),
        telemetry.NodeStats(self.cfg, es, store),
        telemetry.IndexStats(self.cfg, es, store),
        # TODO dm: Once we do distributed launching, this needs to be done per node not per cluster
        telemetry.IndexSize(self.cfg, store),
    ]
    t = telemetry.Telemetry(self.cfg, devices=cluster_devices)

    started_nodes = [self._start_node(node, car, es) for node in range(car.nodes)]
    started_cluster = cluster.Cluster(started_nodes, t)
    t.attach_to_cluster(started_cluster)
    return started_cluster
def test_stores_index_size_for_data_paths(self, run_subprocess, metrics_store_node_count, get_size):
    """
    Runs an ``IndexSize`` device with two data paths through a full node telemetry lifecycle.

    ``get_size`` is stubbed to return 2048 and then 16384. The test expects one node-level
    ``final_index_size_bytes`` metric of 18432 bytes (2048 + 16384) and one ``find ... -ls`` call per data path.
    """
    get_size.side_effect = [2048, 16384]

    store = metrics.EsMetricsStore(create_config())
    index_size = telemetry.IndexSize(["/var/elasticsearch/data/1", "/var/elasticsearch/data/2"], store)
    t = telemetry.Telemetry(enabled_devices=[], devices=[index_size])
    node = cluster.Node(process=None, host_name="localhost", node_name="rally-node-0", telemetry=t)

    # full lifecycle: attach, benchmark start / stop, then detach once while running and once after shutdown
    t.attach_to_node(node)
    t.on_benchmark_start()
    t.on_benchmark_stop()
    for still_running in (True, False):
        t.detach_from_node(node, running=still_running)

    expected_metric_calls = [
        mock.call("rally-node-0", "final_index_size_bytes", 18432, "byte"),
    ]
    expected_listing_calls = [
        mock.call("find /var/elasticsearch/data/1 -ls", header="index files:"),
        mock.call("find /var/elasticsearch/data/2 -ls", header="index files:"),
    ]
    metrics_store_node_count.assert_has_calls(expected_metric_calls)
    run_subprocess.assert_has_calls(expected_listing_calls)
def start(self, car, binary, data_paths):
    """
    Starts ``car.nodes`` nodes and returns them as a cluster once ``wait_for_rest_layer`` succeeds.

    :param car: The car to start; ``car.nodes`` determines how many nodes are launched.
    :param binary: Passed through to ``_start_node`` for every node.
    :param data_paths: Passed to the ``IndexSize`` telemetry device.
    :return: The ``cluster.Cluster`` with cluster-level telemetry attached.
    :raises exceptions.LaunchError: If ``wait_for_rest_layer`` reports failure; the cluster is stopped
                                    via ``self.stop`` before the error is raised.
    """
    target_hosts = self.cfg.opts("client", "hosts")
    options = self.cfg.opts("client", "options")
    es = client.EsClientFactory(target_hosts, options).create()

    # we're very specific which nodes we kill as there is potentially also an Elasticsearch based metrics store
    # running on this machine
    process.kill_running_es_instances(self.cfg.opts("provisioning", "node.name.prefix"))

    logger.info("Starting a cluster based on car [%s] with [%d] nodes." % (car, car.nodes))

    # TODO dm: Get rid of these...
    devices_to_enable = self.cfg.opts("mechanic", "telemetry.devices")

    store = self.metrics_store
    cluster_devices = [
        # TODO dm: Once we do distributed launching, this needs to be done per node not per cluster
        telemetry.MergeParts(store, self.node_log_dir),
        telemetry.EnvironmentInfo(es, store),
        telemetry.NodeStats(es, store),
        telemetry.IndexStats(es, store),
        # TODO dm: Once we do distributed launching, this needs to be done per node not per cluster
        telemetry.IndexSize(data_paths, store),
    ]
    t = telemetry.Telemetry(devices_to_enable, devices=cluster_devices)

    started_nodes = [self._start_node(node, car, es, binary) for node in range(car.nodes)]
    started_cluster = cluster.Cluster(target_hosts, started_nodes, t)
    logger.info("All cluster nodes have successfully started. Checking if REST API is available.")

    # Bail out early (after stopping what we have started) if the REST layer check fails.
    if not wait_for_rest_layer(es):
        logger.error("REST API layer is not yet available. Forcefully terminating cluster.")
        self.stop(started_cluster)
        raise exceptions.LaunchError("Elasticsearch REST API layer is not available. Forcefully terminated cluster.")

    logger.info("REST API is available. Attaching telemetry devices to cluster.")
    t.attach_to_cluster(started_cluster)
    logger.info("Telemetry devices are now attached to the cluster.")
    return started_cluster
def test_stores_nothing_if_no_data_path(self, run_subprocess, metrics_store_cluster_count, get_size):
    """
    Runs an ``IndexSize`` device without any data paths through a cluster telemetry lifecycle.

    The test expects that none of ``run_subprocess``, ``metrics_store_cluster_count`` and ``get_size``
    has been called afterwards.
    """
    get_size.return_value = 2048

    store = metrics.EsMetricsStore(create_config())
    index_size = telemetry.IndexSize(data_paths=[], metrics_store=store)
    t = telemetry.Telemetry(devices=[index_size])

    # no real cluster is needed here, hence ``None`` is passed
    t.attach_to_cluster(None)
    t.on_benchmark_start()
    t.on_benchmark_stop()
    t.detach_from_cluster(None)

    for untouched in (run_subprocess, metrics_store_cluster_count, get_size):
        untouched.assert_not_called()
def test_stores_nothing_if_no_data_path(self, run_subprocess, metrics_store_cluster_count, get_size):
    """
    Runs an ``IndexSize`` device without any data paths through a node telemetry lifecycle.

    The test expects that none of ``run_subprocess``, ``metrics_store_cluster_count`` and ``get_size``
    has been called afterwards.
    """
    get_size.return_value = 2048

    store = metrics.EsMetricsStore(create_config())
    index_size = telemetry.IndexSize(data_paths=[], metrics_store=store)
    t = telemetry.Telemetry(devices=[index_size])
    node = cluster.Node(process=None, host_name="localhost", node_name="rally-node-0", telemetry=t)

    # full lifecycle: attach, benchmark start / stop, then detach once while running and once after shutdown
    t.attach_to_node(node)
    t.on_benchmark_start()
    t.on_benchmark_stop()
    for still_running in (True, False):
        t.detach_from_node(node, running=still_running)

    for untouched in (run_subprocess, metrics_store_cluster_count, get_size):
        untouched.assert_not_called()
def test_stores_index_size_for_data_path(self, run_subprocess, metrics_store_cluster_count, get_size):
    """
    Runs an ``IndexSize`` device with a single data path through a cluster telemetry lifecycle.

    ``get_size`` is stubbed to return 2048. The test expects one cluster-level ``final_index_size_bytes``
    metric of 2048 bytes and one ``find ... -ls`` call for the data path.
    """
    get_size.return_value = 2048

    store = metrics.EsMetricsStore(create_config())
    index_size = telemetry.IndexSize(["/var/elasticsearch/data"], store)
    t = telemetry.Telemetry(enabled_devices=[], devices=[index_size])

    # no real cluster is needed here, hence ``None`` is passed
    t.attach_to_cluster(None)
    t.on_benchmark_start()
    t.on_benchmark_stop()
    t.detach_from_cluster(None)

    expected_metric_calls = [
        mock.call("final_index_size_bytes", 2048, "byte"),
    ]
    expected_listing_calls = [
        mock.call("find /var/elasticsearch/data -ls", header="index files:"),
    ]
    metrics_store_cluster_count.assert_has_calls(expected_metric_calls)
    run_subprocess.assert_has_calls(expected_listing_calls)
def _start_node(self, node_configuration, node_count_on_host):
    """
    Starts one node process and attaches a node-level telemetry bundle to it.

    Before the node is started, ``telemetry.add_metadata_for_node`` is invoked with the metrics store,
    the node name and the host name.

    :param node_configuration: Source of ``ip``, ``node_name``, ``car``, ``binary_path``, ``data_paths``
                               and ``node_root_path`` for the node that is started.
    :param node_count_on_host: Passed through unchanged to the ``DiskIo`` telemetry device.
    :return: The ``cluster.Node`` that wraps the started process.
    """
    ip = node_configuration.ip
    name = node_configuration.node_name
    node_car = node_configuration.car
    es_binary = node_configuration.binary_path
    paths = node_configuration.data_paths
    telemetry_root = os.path.join(node_configuration.node_root_path, "telemetry")
    java_version, java_home = java_resolver.java_home(node_car, self.cfg)

    telemetry.add_metadata_for_node(self.metrics_store, name, ip)

    self.logger.info("Starting node [%s] based on car [%s].", name, node_car)

    devices_to_enable = self.cfg.opts("mechanic", "telemetry.devices")
    device_params = self.cfg.opts("mechanic", "telemetry.params")
    store = self.metrics_store
    t = telemetry.Telemetry(
        devices_to_enable,
        devices=[
            telemetry.FlightRecorder(device_params, telemetry_root, java_version),
            telemetry.JitCompiler(telemetry_root),
            telemetry.Gc(telemetry_root, java_version),
            telemetry.DiskIo(store, node_count_on_host, telemetry_root, name),
            telemetry.IndexSize(paths, store),
            telemetry.StartupTime(store),
        ],
    )

    # The environment is derived with the telemetry bundle so it has to exist before the process is spawned.
    process_env = self._prepare_env(node_car, name, java_home, t)
    t.on_pre_node_start(name)
    pid = self._start_process(es_binary, process_env)
    started_node = cluster.Node(pid, ip, name, t)

    self.logger.info("Attaching telemetry devices to node [%s].", name)
    t.attach_to_node(started_node)
    return started_node