Example #1
def execute_schedule(schedule, es, sampler):
    """
    Executes tasks according to the schedule for a given operation.

    :param schedule: The schedule for this operation.
    :param es: Elasticsearch client that will be used to execute the operation.
    :param sampler: A container to store raw samples.
    """
    total_start = time.perf_counter()
    curr_total_it = 1
    # noinspection PyBroadException
    try:
        for expected_scheduled_time, sample_type_calculator, curr_iteration, total_it_for_task, runner, params in schedule:
            sample_type = sample_type_calculator(total_start)
            absolute_expected_schedule_time = total_start + expected_scheduled_time
            throughput_throttled = expected_scheduled_time > 0
            if throughput_throttled:
                rest = absolute_expected_schedule_time - time.perf_counter()
                if rest > 0:
                    time.sleep(rest)
            start = time.perf_counter()
            with runner:
                total_ops, total_ops_unit = runner(es, params)
            stop = time.perf_counter()

            service_time = stop - start
            # Do not calculate latency separately when we don't throttle throughput. This metric is just confusing then.
            latency = stop - absolute_expected_schedule_time if throughput_throttled else service_time
            sampler.add(sample_type, convert.seconds_to_ms(latency), convert.seconds_to_ms(service_time), total_ops, total_ops_unit,
                        (stop - total_start), curr_total_it, total_it_for_task)
            curr_total_it += 1
    except BaseException:
        logger.exception("Could not execute schedule")
        raise
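Every example on this page routes timings through convert.seconds_to_ms before handing them to a sampler or metrics store. For orientation, here is a minimal sketch of pieces that could drive the loop above in isolation; seconds_to_ms, NoopRunner, ListSampler and fixed_schedule are hypothetical stand-ins written for illustration, not Rally's real implementations.

import time


def seconds_to_ms(seconds):
    # Presumably what convert.seconds_to_ms does: a plain unit conversion.
    return seconds * 1000


class NoopRunner:
    """Hypothetical runner: a context manager that fakes one operation."""
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        return False

    def __call__(self, es, params):
        time.sleep(0.01)   # pretend to talk to Elasticsearch
        return 1, "ops"    # (total_ops, total_ops_unit)


class ListSampler:
    """Hypothetical sampler that just keeps raw samples in a list."""
    def __init__(self):
        self.samples = []

    def add(self, *sample):
        self.samples.append(sample)


def fixed_schedule(iterations, interval):
    """Yields tuples shaped like the schedule consumed in the example above."""
    runner = NoopRunner()
    for i in range(iterations):
        # (expected_scheduled_time, sample_type_calculator, curr_iteration,
        #  total_it_for_task, runner, params)
        yield i * interval, lambda start: "normal", i, iterations, runner, {}

# With Rally's convert and logger available in the module, the loop above
# could then be driven with e.g.:
# execute_schedule(fixed_schedule(5, 0.1), es=None, sampler=ListSampler())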
Example #2
def execute_schedule(schedule, es, sampler):
    """
    Executes tasks according to the schedule for a given operation.

    :param schedule: The schedule for this operation.
    :param es: Elasticsearch client that will be used to execute the operation.
    :param sampler: A container to store raw samples.
    """
    total_start = time.perf_counter()
    # noinspection PyBroadException
    try:
        for expected_scheduled_time, sample_type, percent_completed, runner, params in schedule:
            absolute_expected_schedule_time = total_start + expected_scheduled_time
            throughput_throttled = expected_scheduled_time > 0
            if throughput_throttled:
                rest = absolute_expected_schedule_time - time.perf_counter()
                if rest > 0:
                    time.sleep(rest)
            start = time.perf_counter()
            total_ops, total_ops_unit, request_meta_data = execute_single(runner, es, params)
            stop = time.perf_counter()

            service_time = stop - start
            # Do not calculate latency separately when we don't throttle throughput. This metric is just confusing then.
            latency = stop - absolute_expected_schedule_time if throughput_throttled else service_time
            sampler.add(sample_type, request_meta_data, convert.seconds_to_ms(latency), convert.seconds_to_ms(service_time), total_ops,
                        total_ops_unit, (stop - total_start), percent_completed)
    except BaseException:
        logger.exception("Could not execute schedule")
        raise
Example #3
    def _index_stats(self, expected_doc_count):
        logger.info("Gathering indices stats")
        # warmup
        self.repeat(self.cluster.client.indices.stats,
                    metric="_all",
                    level="shards")
        durations, stats = self.repeat(self.cluster.client.indices.stats,
                                       metric="_all",
                                       level="shards")
        for duration in durations:
            self.metrics_store.put_value_cluster_level(
                "indices_stats_latency", convert.seconds_to_ms(duration), "ms")
        primaries = stats["_all"]["primaries"]
        self.metrics_store.put_count_cluster_level(
            "segments_count", primaries["segments"]["count"])
        self.metrics_store.put_count_cluster_level(
            "segments_memory_in_bytes",
            primaries["segments"]["memory_in_bytes"], "byte")
        self.metrics_store.put_count_cluster_level(
            "segments_doc_values_memory_in_bytes",
            primaries["segments"]["doc_values_memory_in_bytes"], "byte")
        self.metrics_store.put_count_cluster_level(
            "segments_stored_fields_memory_in_bytes",
            primaries["segments"]["stored_fields_memory_in_bytes"], "byte")
        self.metrics_store.put_count_cluster_level(
            "segments_terms_memory_in_bytes",
            primaries["segments"]["terms_memory_in_bytes"], "byte")
        self.metrics_store.put_count_cluster_level(
            "segments_norms_memory_in_bytes",
            primaries["segments"]["norms_memory_in_bytes"], "byte")
        if "points_memory_in_bytes" in primaries["segments"]:
            self.metrics_store.put_count_cluster_level(
                "segments_points_memory_in_bytes",
                primaries["segments"]["points_memory_in_bytes"], "byte")
        self.metrics_store.put_value_cluster_level(
            "merges_total_time", primaries["merges"]["total_time_in_millis"],
            "ms")
        self.metrics_store.put_value_cluster_level(
            "merges_total_throttled_time",
            primaries["merges"]["total_throttled_time_in_millis"], "ms")
        self.metrics_store.put_value_cluster_level(
            "indexing_total_time",
            primaries["indexing"]["index_time_in_millis"], "ms")
        self.metrics_store.put_value_cluster_level(
            "refresh_total_time", primaries["refresh"]["total_time_in_millis"],
            "ms")
        self.metrics_store.put_value_cluster_level(
            "flush_total_time", primaries["flush"]["total_time_in_millis"],
            "ms")

        actual_doc_count = primaries["docs"]["count"]
        if expected_doc_count is not None and expected_doc_count != actual_doc_count:
            msg = "wrong number of documents indexed: expected %s but got %s" % (
                expected_doc_count, actual_doc_count)
            logger.error(msg)
            raise RuntimeError(msg)
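Outside of Rally's harness, the same primaries-level figures can be read directly from the indices stats API with the official Python client. A minimal sketch, assuming a local cluster (the URL is a placeholder) and the same response fields used above:

from elasticsearch import Elasticsearch

# Standalone sketch of the stats extraction above, without Rally's repeat()
# helper or metrics store. localhost:9200 is a placeholder address.
es = Elasticsearch(["http://localhost:9200"])

stats = es.indices.stats(metric="_all", level="shards")
primaries = stats["_all"]["primaries"]

print("segments count:         ", primaries["segments"]["count"])
print("segments memory (bytes):", primaries["segments"]["memory_in_bytes"])
print("indexing time (ms):     ", primaries["indexing"]["index_time_in_millis"])
print("merge time (ms):        ", primaries["merges"]["total_time_in_millis"])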
Example #4
    def run(self):
        logger.info("Running warmup iterations")
        self._run_benchmark("  Benchmarking %s (warmup iteration %d/%d)")
        logger.info("Running measurement iterations")
        times = self._run_benchmark("  Benchmarking %s (iteration %d/%d)")

        for q in self.queries:
            latencies = [t[q.name] for t in times]
            for latency in latencies:
                self.metrics_store.put_value_cluster_level(
                    "query_latency_%s" % q.name,
                    convert.seconds_to_ms(latency), "ms")
Example #5
def execute_schedule(schedule, es, sampler, operation):
    """
    Executes tasks according to the schedule for a given operation.

    :param schedule: The schedule for this operation.
    :param es: Elasticsearch client that will be used to execute the operation.
    :param sampler: A container to store raw samples.
    :param operation: The operation that is executed.
    """
    relative = None
    previous_sample_type = None
    total_start = time.perf_counter()
    curr_total_it = 1
    for expected_scheduled_time, sample_type_calculator, curr_iteration, total_it_for_sample_type, total_it_for_task, runner, params in schedule:
        sample_type = sample_type_calculator(total_start)
        # restart the relative time when the sample type changes. This way all warmup samples and measurement samples will start at
        # the relative time zero which simplifies throughput calculation.
        #
        # Assumption: We always get the same operation here, otherwise simply resetting one timer will not work!
        if sample_type != previous_sample_type:
            relative = time.perf_counter()
            previous_sample_type = sample_type
        # expected_scheduled_time is relative to the start of the first iteration
        absolute_expected_schedule_time = relative + expected_scheduled_time
        throughput_throttled = expected_scheduled_time > 0
        if throughput_throttled:
            rest = absolute_expected_schedule_time - time.perf_counter()
            if rest > 0:
                time.sleep(rest)
        start = time.perf_counter()
        with runner:
            weight = runner(es, params)
        stop = time.perf_counter()

        service_time = stop - start
        # Do not calculate latency separately when we don't throttle throughput. This metric is just confusing then.
        latency = stop - absolute_expected_schedule_time if throughput_throttled else service_time
        sampler.add(sample_type, convert.seconds_to_ms(latency), convert.seconds_to_ms(service_time), weight, (stop - relative),
                    curr_total_it, total_it_for_task)
        curr_total_it += 1
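Resetting relative at a sample-type boundary means warmup and measurement samples each start their clock at zero, so a phase's throughput reduces to accumulated weight divided by the last relative timestamp of that phase. A small sketch of that calculation, assuming each sample is a (relative_time, weight) pair:

def phase_throughput(samples):
    # samples: iterable of (relative_time_in_seconds, weight) for one sample type.
    # Because the relative clock restarts at the phase boundary, the last
    # relative timestamp is also the phase duration.
    total_weight = sum(weight for _, weight in samples)
    duration = max((t for t, _ in samples), default=0.0)
    return total_weight / duration if duration > 0 else 0.0

# e.g. phase_throughput([(0.1, 1), (0.2, 1), (0.3, 1)]) == 10.0 ops/s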
Example #6
    def run(self):
        es = self.cluster.client
        logger.info("Running search benchmark (warmup)")
        self._run_benchmark(es,
                            "  Benchmarking search (warmup iteration %d/%d)",
                            repetitions=1000)
        logger.info("Running search benchmark")
        times = self._run_benchmark(es,
                                    "  Benchmarking search (iteration %d/%d)",
                                    repetitions=1000)
        logger.info("Search benchmark has finished")

        for q in self.track.queries:
            latencies = [t[q.name] for t in times]
            for latency in latencies:
                self.metrics_store.put_value_cluster_level(
                    "query_latency_%s" % q.name,
                    convert.seconds_to_ms(latency), "ms")
Example #7
    def _node_stats(self):
        logger.info("Gathering nodes stats")
        # warmup
        self.repeat(self.cluster.client.nodes.stats,
                    metric="_all",
                    level="shards")
        durations, stats = self.repeat(self.cluster.client.nodes.stats,
                                       metric="_all",
                                       level="shards")
        for duration in durations:
            self.metrics_store.put_value_cluster_level(
                "node_stats_latency", convert.seconds_to_ms(duration), "ms")
        total_old_gen_collection_time = 0
        total_young_gen_collection_time = 0
        nodes = stats["nodes"]
        for node in nodes.values():
            node_name = node["name"]
            gc = node["jvm"]["gc"]["collectors"]
            old_gen_collection_time = gc["old"]["collection_time_in_millis"]
            young_gen_collection_time = gc["young"][
                "collection_time_in_millis"]
            self.metrics_store.put_value_node_level(node_name,
                                                    "node_old_gen_gc_time",
                                                    old_gen_collection_time,
                                                    "ms")
            self.metrics_store.put_value_node_level(node_name,
                                                    "node_young_gen_gc_time",
                                                    young_gen_collection_time,
                                                    "ms")
            total_old_gen_collection_time += old_gen_collection_time
            total_young_gen_collection_time += young_gen_collection_time

        self.metrics_store.put_value_cluster_level(
            "node_total_old_gen_gc_time", total_old_gen_collection_time, "ms")
        self.metrics_store.put_value_cluster_level(
            "node_total_young_gen_gc_time", total_young_gen_collection_time,
            "ms")
Example #8
    def run(self):
        logger.info("Running warmup iterations")
        self._run_benchmark("  Benchmarking %s (warmup iteration %d/%d)", self.warmup_repetitions)
        logger.info("Running measurement iterations")
        times = self._run_benchmark("  Benchmarking %s (iteration %d/%d)", self.repetitions)

        for q in self.queries:
            latencies = [t[q.name] for t in times]
            for latency in latencies:
                self.metrics_store.put_value_cluster_level("query_latency_%s" % q.name, convert.seconds_to_ms(latency), "ms")
Example #9
def execute_schedule(cancel,
                     client_id,
                     op,
                     schedule,
                     es,
                     sampler,
                     enable_profiling=False):
    """
    Executes tasks according to the schedule for a given operation.

    :param cancel: A shared boolean that indicates we need to cancel execution.
    :param client_id: The id of the client that executes the operation.
    :param op: The operation that is executed.
    :param schedule: The schedule for this operation.
    :param es: Elasticsearch client that will be used to execute the operation.
    :param sampler: A container to store raw samples.
    :param enable_profiling: Enables a Python profiler for this execution (default: False).
    """
    if enable_profiling:
        logger.debug("Enabling Python profiler for [%s]" % str(op))
        import cProfile, pstats
        import io as python_io
        profiler = cProfile.Profile()
        profiler.enable()
    else:
        logger.debug("Python profiler for [%s] is disabled." % str(op))

    total_start = time.perf_counter()
    # noinspection PyBroadException
    try:
        for expected_scheduled_time, sample_type, percent_completed, runner, params in schedule:
            if cancel.is_set():
                logger.info("User cancelled execution.")
                break
            absolute_expected_schedule_time = total_start + expected_scheduled_time
            throughput_throttled = expected_scheduled_time > 0
            if throughput_throttled:
                rest = absolute_expected_schedule_time - time.perf_counter()
                if rest > 0:
                    time.sleep(rest)
            start = time.perf_counter()
            total_ops, total_ops_unit, request_meta_data = execute_single(
                runner, es, params)
            stop = time.perf_counter()

            service_time = stop - start
            # Do not calculate latency separately when we don't throttle throughput. This metric is just confusing then.
            latency = stop - absolute_expected_schedule_time if throughput_throttled else service_time
            sampler.add(sample_type, request_meta_data,
                        convert.seconds_to_ms(latency),
                        convert.seconds_to_ms(service_time), total_ops,
                        total_ops_unit, (stop - total_start),
                        percent_completed)
    except BaseException:
        logger.exception("Could not execute schedule")
        raise
    finally:
        if enable_profiling:
            profiler.disable()
            s = python_io.StringIO()
            sortby = 'cumulative'
            ps = pstats.Stats(profiler, stream=s).sort_stats(sortby)
            ps.print_stats()

            profile = "\n=== Profile START for client [%s] and operation [%s] ===\n" % (
                str(client_id), str(op))
            profile += s.getvalue()
            profile += "=== Profile END for client [%s] and operation [%s] ===" % (
                str(client_id), str(op))
            profile_logger.info(profile)
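The cProfile/pstats handling in this last example can be lifted into a small reusable helper. A sketch using only the standard library; profiled_call is a hypothetical name, not part of Rally:

import cProfile
import io
import pstats


def profiled_call(fn, *args, **kwargs):
    # Mirrors the pattern above: enable the profiler around the call and dump
    # cumulative-time statistics once the call has finished (or raised).
    profiler = cProfile.Profile()
    profiler.enable()
    try:
        return fn(*args, **kwargs)
    finally:
        profiler.disable()
        out = io.StringIO()
        pstats.Stats(profiler, stream=out).sort_stats("cumulative").print_stats()
        print(out.getvalue())

# e.g. profiled_call(sorted, range(1000), reverse=True)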