def _start_node(self, node_configuration, node_count_on_host, java_major_version):
    """Launch a single Elasticsearch node and attach its telemetry devices.

    :param node_configuration: Provisioned settings for this node (ip, node_name,
        car, binary_path, data_paths, log_path, node_root_path).
    :param node_count_on_host: How many nodes share this host (used by disk I/O telemetry).
    :param java_major_version: Major version of the JVM that will run the node.
    :return: A ``cluster.Node`` handle with telemetry attached.
    """
    node_name = node_configuration.node_name
    host_name = node_configuration.ip
    car = node_configuration.car
    binary_path = node_configuration.binary_path
    data_paths = node_configuration.data_paths
    node_telemetry_dir = "%s/telemetry" % node_configuration.node_root_path

    self.logger.info("Starting node [%s] based on car [%s].", node_name, car)

    enabled_devices = self.cfg.opts("mechanic", "telemetry.devices")
    telemetry_params = self.cfg.opts("mechanic", "telemetry.params")
    per_node_devices = [
        telemetry.FlightRecorder(telemetry_params, node_telemetry_dir, java_major_version),
        telemetry.JitCompiler(node_telemetry_dir),
        telemetry.Gc(node_telemetry_dir, java_major_version),
        telemetry.PerfStat(node_telemetry_dir),
        telemetry.DiskIo(self.metrics_store, node_count_on_host),
        telemetry.CpuUsage(self.metrics_store),
        telemetry.NodeEnvironmentInfo(self.metrics_store),
        telemetry.IndexSize(data_paths, self.metrics_store),
        telemetry.MergeParts(self.metrics_store, node_configuration.log_path),
        telemetry.StartupTime(self.metrics_store),
    ]
    tele = telemetry.Telemetry(enabled_devices, devices=per_node_devices)

    # Telemetry must be informed before the process starts so devices that
    # inject JVM options (e.g. flight recorder) can contribute to the env.
    env = self._prepare_env(car, node_name, tele)
    tele.on_pre_node_start(node_name)
    node_process = self._start_process(env, node_name, binary_path)
    node = cluster.Node(node_process, host_name, node_name, tele)

    self.logger.info("Node [%s] has successfully started. Attaching telemetry devices.", node_name)
    tele.attach_to_node(node)
    self.logger.info("Telemetry devices are now attached to node [%s].", node_name)
    return node
def _start_node(self, node_configuration, node_count_on_host):
    """Launch a single Elasticsearch node and attach its telemetry devices.

    Resolves the JVM to use via ``java_resolver`` based on the node's car.

    :param node_configuration: Provisioned settings for this node (ip, node_name,
        car, binary_path, data_paths, log_path, node_root_path).
    :param node_count_on_host: How many nodes share this host (used by disk I/O telemetry).
    :return: A ``cluster.Node`` handle with telemetry attached.
    """
    node_name = node_configuration.node_name
    host_name = node_configuration.ip
    car = node_configuration.car
    binary_path = node_configuration.binary_path
    data_paths = node_configuration.data_paths
    node_telemetry_dir = "%s/telemetry" % node_configuration.node_root_path

    java_major_version, java_home = java_resolver.java_home(car, self.cfg)

    self.logger.info("Starting node [%s] based on car [%s].", node_name, car)

    enabled_devices = self.cfg.opts("mechanic", "telemetry.devices")
    telemetry_params = self.cfg.opts("mechanic", "telemetry.params")
    per_node_devices = [
        telemetry.DiskIo(self.metrics_store, node_count_on_host, node_telemetry_dir, node_name),
        telemetry.NodeEnvironmentInfo(self.metrics_store),
        telemetry.IndexSize(data_paths, self.metrics_store),
        telemetry.MergeParts(self.metrics_store, node_configuration.log_path),
        telemetry.StartupTime(self.metrics_store),
    ]
    tele = telemetry.Telemetry(enabled_devices, devices=per_node_devices)

    # Telemetry is notified before launch so devices can influence the environment.
    env = self._prepare_env(car, node_name, java_home, tele)
    tele.on_pre_node_start(node_name)
    node_pid = self._start_process(binary_path, env)
    node = cluster.Node(node_pid, host_name, node_name, tele)

    self.logger.info("Attaching telemetry devices to node [%s].", node_name)
    tele.attach_to_node(node)
    return node
def test_store_calculated_metrics(self, listdir_mock, open_mock, metrics_store_put_value, metrics_store_put_count):
    """MergeParts sums merge times and doc counts per merge-part category from the node log."""
    # Two "merge doc values" lines: 100ms/500 docs and 250ms/1350 docs -> totals 350ms / 1850 docs.
    log_file = '''
INFO: System starting up
INFO: 100 msec to merge doc values [500 docs]
INFO: Something unrelated
INFO: 250 msec to merge doc values [1350 docs]
INFO: System shutting down
'''
    listdir_mock.return_value = [open_mock]
    open_mock.side_effect = [mock.mock_open(read_data=log_file).return_value]

    metrics_store = metrics.EsMetricsStore(self.cfg)
    node = cluster.Node(None, "io", "rally0", None)

    device = telemetry.MergeParts(metrics_store, node_log_dir="/var/log")
    device.attach_to_node(node)
    device.on_benchmark_stop()

    metrics_store_put_value.assert_called_with(
        "rally0", "merge_parts_total_time_doc_values", 350, "ms")
    metrics_store_put_count.assert_called_with(
        "rally0", "merge_parts_total_docs_doc_values", 1850)
def start(self, car):
    """Start a local cluster for the given car and attach cluster-level telemetry.

    Kills any leftover Elasticsearch instances matching this benchmark's node
    prefix, starts ``car.nodes`` nodes, and attaches telemetry to the cluster.

    :param car: The car (node configuration blueprint); ``car.nodes`` is the node count.
    :return: The started ``cluster.Cluster``.
    """
    port = self.cfg.opts("provisioning", "node.http.port")
    hosts = [{"host": "localhost", "port": port}]
    client_options = self.cfg.opts("launcher", "client.options")
    # unified client config
    self.cfg.add(config.Scope.benchmark, "client", "hosts", hosts)
    self.cfg.add(config.Scope.benchmark, "client", "options", client_options)

    es = client.EsClientFactory(hosts, client_options).create()

    # we're very specific which nodes we kill as there is potentially also an Elasticsearch based metrics store running on this machine
    node_prefix = self.cfg.opts("provisioning", "node.name.prefix")
    process.kill_running_es_instances(node_prefix)

    # Fix: use logging's lazy %-args instead of eager "%" interpolation so the
    # message is only formatted when the INFO level is actually enabled.
    logger.info("Starting a cluster based on car [%s] with [%d] nodes.", car, car.nodes)

    cluster_telemetry = [
        # TODO dm: Once we do distributed launching, this needs to be done per node not per cluster
        telemetry.MergeParts(self.cfg, self.metrics_store),
        telemetry.EnvironmentInfo(self.cfg, es, self.metrics_store),
        telemetry.NodeStats(self.cfg, es, self.metrics_store),
        telemetry.IndexStats(self.cfg, es, self.metrics_store),
        # TODO dm: Once we do distributed launching, this needs to be done per node not per cluster
        telemetry.IndexSize(self.cfg, self.metrics_store)
    ]
    t = telemetry.Telemetry(self.cfg, devices=cluster_telemetry)
    c = cluster.Cluster(
        [self._start_node(node, car, es) for node in range(car.nodes)], t)
    t.attach_to_cluster(c)
    return c
def start(self, car, binary, data_paths):
    """Start a cluster, wait for its REST API, and attach cluster telemetry.

    Kills leftover Elasticsearch instances matching this benchmark's node prefix,
    starts ``car.nodes`` nodes, then blocks until the REST layer responds. If the
    REST API never becomes available the cluster is forcefully stopped.

    :param car: The car (node configuration blueprint); ``car.nodes`` is the node count.
    :param binary: Path to the Elasticsearch binary used to launch each node.
    :param data_paths: Data directories, used by index-size telemetry.
    :return: The started ``cluster.Cluster``.
    :raises exceptions.LaunchError: If the REST API layer does not come up.
    """
    hosts = self.cfg.opts("client", "hosts")
    client_options = self.cfg.opts("client", "options")
    es = client.EsClientFactory(hosts, client_options).create()

    # we're very specific which nodes we kill as there is potentially also an Elasticsearch based metrics store running on this machine
    node_prefix = self.cfg.opts("provisioning", "node.name.prefix")
    process.kill_running_es_instances(node_prefix)

    # Fix: use logging's lazy %-args instead of eager "%" interpolation so the
    # message is only formatted when the INFO level is actually enabled.
    logger.info("Starting a cluster based on car [%s] with [%d] nodes.", car, car.nodes)

    # TODO dm: Get rid of these...
    enabled_devices = self.cfg.opts("mechanic", "telemetry.devices")
    cluster_telemetry = [
        # TODO dm: Once we do distributed launching, this needs to be done per node not per cluster
        telemetry.MergeParts(self.metrics_store, self.node_log_dir),
        telemetry.EnvironmentInfo(es, self.metrics_store),
        telemetry.NodeStats(es, self.metrics_store),
        telemetry.IndexStats(es, self.metrics_store),
        # TODO dm: Once we do distributed launching, this needs to be done per node not per cluster
        telemetry.IndexSize(data_paths, self.metrics_store)
    ]
    t = telemetry.Telemetry(enabled_devices, devices=cluster_telemetry)
    c = cluster.Cluster(
        hosts, [self._start_node(node, car, es, binary) for node in range(car.nodes)], t)

    logger.info("All cluster nodes have successfully started. Checking if REST API is available.")
    if wait_for_rest_layer(es):
        logger.info("REST API is available. Attaching telemetry devices to cluster.")
        t.attach_to_cluster(c)
        logger.info("Telemetry devices are now attached to the cluster.")
    else:
        # The cluster is in an unknown state; tear it down rather than benchmark against it.
        logger.error("REST API layer is not yet available. Forcefully terminating cluster.")
        self.stop(c)
        raise exceptions.LaunchError(
            "Elasticsearch REST API layer is not available. Forcefully terminated cluster.")
    return c
def test_store_nothing_if_no_metrics_present(self, listdir_mock, open_mock, metrics_store_put_value, metrics_store_put_count):
    """MergeParts emits no metrics when the log contains no merge-part lines."""
    listdir_mock.return_value = [open_mock]
    open_mock.side_effect = [mock.mock_open(read_data="no data to parse").return_value]

    metrics_store = metrics.EsMetricsStore(self.cfg)
    device = telemetry.MergeParts(self.cfg, metrics_store)
    device.on_benchmark_stop()

    metrics_store_put_value.assert_not_called()
    metrics_store_put_count.assert_not_called()