Example #1

import datetime as dt
import logging
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd
import requests

# Project-local imports (paths assumed from context): zookeeper,
# validation_helper, heron_helper, compare, HeronMetricsClient,
# QTTopologyModel

LOG = logging.getLogger(__name__)

def run(config: Dict[str, Any], metrics_client: HeronMetricsClient,
        total_hours: int, period_length_secs: int,
        topology_model: QTTopologyModel, topology_id: str, cluster: str,
        environ: str, metric_bucket_length: int, **kwargs: Any):

    start: dt.datetime = (dt.datetime.utcnow().replace(tzinfo=dt.timezone.utc)
                          - dt.timedelta(hours=total_hours))

    zk_config = config["heron.topology.models.config"]
    last_updated: dt.datetime = zookeeper.last_topo_update_ts(
        zk_config["heron.statemgr.connection.string"],
        zk_config["heron.statemgr.root.path"], topology_id,
        zk_config["zk.time.offset"]).astimezone(dt.timezone.utc)

    if start < last_updated:
        update_err: str = (f"The provided total hours ({total_hours}) will "
                           f"result in a start time ({start.isoformat()}) "
                           f"which is before the last update to "
                           f"{topology_id}'s physical plan "
                           f"({last_updated.isoformat()})")
        LOG.error(update_err)
        raise RuntimeError(update_err)

    periods: List[Tuple[dt.datetime, dt.datetime]] = \
        validation_helper.create_start_end_list(total_hours,
                                                period_length_secs)

    output: Optional[pd.DataFrame] = None

    for j, (traffic_start, traffic_end) in enumerate(periods):

        LOG.info("Using metrics sourced from %s to %s",
                 traffic_start.isoformat(), traffic_end.isoformat())

        LOG.info("\n\nComparing prediction, using metrics from period "
                 "%d and traffic from period %d, to actual performance"
                 " during period %d\n", j, j, j)
        try:

            spout_state = heron_helper.get_spout_state(
                metrics_client, topology_id, cluster, environ,
                config["heron.tracker.url"], traffic_start, traffic_end,
                60, "mean")

            # Get the actual arrival rates at all instances
            actual_arrs: pd.DataFrame = \
                metrics_client.get_tuple_arrivals_at_stmgr(
                    topology_id, cluster, environ, traffic_start,
                    traffic_end, **kwargs)

            actual_arrs = (actual_arrs
                           .groupby(["task", "component", "timestamp"])
                           .sum().reset_index())

            # The counts arrive in 60 second buckets, so convert them into
            # tuples per second
            actual_arrs["arrival_rate_tps"] = actual_arrs["num-tuples"] / 60

            actual_instance_arrs: pd.DataFrame = \
                (actual_arrs.groupby(["component", "task"])
                 ["arrival_rate_tps"].mean().reset_index()
                 .rename(index=str, columns={"arrival_rate_tps":
                                             "actual_arrival_rates_tps"}))

            results: pd.DataFrame = compare(
                metrics_client, spout_state, actual_instance_arrs,
                topology_model, topology_id,
                cluster, environ, traffic_start, traffic_end,
                metric_bucket_length, **kwargs)

        except ConnectionRefusedError as cr_err:
            LOG.error("Connection was refused with message: %s",
                      str(cr_err))
        except ConnectionResetError as cre_err:
            LOG.error("Connection was reset with message: %s",
                      str(cre_err))
        except requests.exceptions.ConnectionError as req_err:
            LOG.error("Connection error with message: %s", str(req_err))
        except Exception as err:
            LOG.error("Error (%s) with message: %s", str(type(err)),
                      str(err))
            raise
        else:
            results["traffic_start"] = traffic_start
            results["traffic_end"] = traffic_end

            if output is not None:
                output = pd.concat([output, results], ignore_index=True)
            else:
                output = results

    return output
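
The validation_helper.create_start_end_list call above is the only place that
helper appears, so its behaviour has to be inferred from the call site: it
appears to split the last total_hours hours into consecutive (start, end)
windows of period_length_secs seconds each. A minimal sketch of such a helper
under that assumption (hypothetical reconstruction, not the project's actual
implementation):

import datetime as dt
from typing import List, Tuple

def create_start_end_list(total_hours: int, period_length_secs: int
                          ) -> List[Tuple[dt.datetime, dt.datetime]]:
    """ Splits the last total_hours into consecutive windows of
    period_length_secs seconds each (assumed behaviour). """
    end: dt.datetime = dt.datetime.utcnow().replace(tzinfo=dt.timezone.utc)
    period_start: dt.datetime = end - dt.timedelta(hours=total_hours)
    period: dt.timedelta = dt.timedelta(seconds=period_length_secs)

    periods: List[Tuple[dt.datetime, dt.datetime]] = []
    while period_start + period <= end:
        periods.append((period_start, period_start + period))
        period_start = period_start + period

    return periods
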
Example #2

import datetime as dt
import logging
from typing import Dict, List, Tuple, Union

import numpy as np
import pandas as pd

# Project-local imports (paths assumed from context): GremlinClient,
# HeronMetricsClient, get_in_out_components

LOG = logging.getLogger(__name__)

def lstsq_io_ratios(metrics_client: HeronMetricsClient,
                    graph_client: GremlinClient, topology_id: str,
                    cluster: str, environ: str,
                    start: dt.datetime, end: dt.datetime, bucket_length: int,
                    **kwargs: Union[str, int, float]) -> pd.DataFrame:
    """ This method will calculate the input/output ratio for each instance in
    the supplied topology using data aggregated from the defined period. The
    method uses least squares regression to calculate a coefficient for each
    input stream into a instance such that the total output amount for a given
    output stream is sum of all input stream arrival amounts times their
    coefficient.

    *NOTE*: This method assumes that there is an (approximately) linear
    relationship between the inputs and outputs of a given component.

    Arguments:
        metrics_client (HeronMetricsClient):    The client instance for the
                                                metrics database.
        graph_client (GremlinClient):   The client instance for the graph
                                        database.
        topology_id (str):  The topology identification string.
        start (dt.datetime):    The UTC datetime object for the start of the
                                metric gathering period.
        end (dt.datetime):  The UTC datetime object for the end of the metric
                            gathering period.
        bucket_length (int):    The length in seconds that the metrics should
                                be aggregated into. *NOTE*: For the least
                                squares regression to work, the number of
                                buckets must exceed the largest number of
                                input streams into any component of the
                                topology.
        **kwargs:   Additional keyword arguments that will be passed to the
                    metrics client object. Consult the documentation for the
                    specific metrics client being used.
    Returns:
        pandas.DataFrame:   A DataFrame with the following columns:

        * task: Task ID integer.
        * output_stream: The output stream name.
        * input_stream: The input stream name.
        * source_component: The name of the source component for the input
          stream.
        * coefficient: The value of the input amount coefficient for this
          output stream, input stream, source component combination.
    """

    LOG.info("Calculating instance input/output ratios using least squares "
             "regression for topology %s over a %d second window between %s "
             "and %s", topology_id, (end-start).total_seconds(),
             start.isoformat(), end.isoformat())

    emit_counts: pd.DataFrame = metrics_client.get_emit_counts(
        topology_id, cluster, environ, start, end, **kwargs)

    arrived_tuples: pd.DataFrame = metrics_client.get_tuple_arrivals_at_stmgr(
        topology_id, cluster, environ, start, end, **kwargs)

    execute_counts: pd.DataFrame = metrics_client.get_execute_counts(
        topology_id, cluster, environ, start, end, **kwargs)

    arrived_tuples = arrived_tuples.merge(
        execute_counts, on=["task", "component", "container", "timestamp"])

    arrived_tuples.drop("execute_count", axis=1, inplace=True)
    # Limit the count DataFrames to only those components with both incoming
    # and outgoing streams
    in_out_comps: List[str] = get_in_out_components(graph_client, topology_id)

    emit_counts = emit_counts[emit_counts["component"].isin(in_out_comps)]
    emit_counts.rename(index=str, columns={"stream": "outgoing_stream"},
                       inplace=True)

    arrived_tuples = arrived_tuples[arrived_tuples["component"]
                                    .isin(in_out_comps)]
    arrived_tuples.rename(index=str, columns={"stream": "incoming_stream"},
                          inplace=True)
    # Re-sample the counts into equal length time buckets and group by task id,
    # time bucket and stream. This aligns the two DataFrames with timestamps of
    # equal length and start point so they can be merged later
    emit_counts_ts: pd.DataFrame = \
        (emit_counts.set_index(["task", "timestamp"])
         .groupby([pd.Grouper(level="task"),
                   pd.Grouper(freq=f"{bucket_length}S", level='timestamp'),
                   "component", "outgoing_stream"])
         ["emit_count"]
         .sum().reset_index())

    arrived_tuples_ts: pd.DataFrame = \
        (arrived_tuples.set_index(["task", "timestamp"])
         .groupby([pd.Grouper(level="task"),
                   pd.Grouper(freq=f"{bucket_length}S", level='timestamp'),
                   "component", "incoming_stream", "source_component"])
         ["num-tuples"]
         .sum().reset_index())

    rows: List[Dict[str, Union[str, float]]] = []

    # Now we loop through each component and munge the data until we have an
    # output total for each output stream for each task on the same row (one
    # row per time bucket) as the input total for each input stream
    component: str
    in_data: pd.DataFrame
    for component, in_data in arrived_tuples_ts.groupby("component"):
        in_stream_counts: pd.DataFrame = \
            (in_data.set_index(["task", "timestamp", "incoming_stream",
                                "source_component"])
             ["num-tuples"].unstack(level=["incoming_stream",
                                           "source_component"])
             .reset_index())

        out_stream_counts: pd.DataFrame = \
            emit_counts_ts[emit_counts_ts.component == component]

        merged: pd.DataFrame = out_stream_counts.merge(in_stream_counts,
                                                       on=["task",
                                                           "timestamp"])
        task: int
        out_stream: str
        data: pd.DataFrame
        for (task, out_stream), data in merged.groupby(["task",
                                                        "outgoing_stream"]):

            LOG.debug("Processing instance %d output stream %s", task,
                      out_stream)

            # Get a series of the output counts for this output stream; these
            # are the dependent variables (b) of the least squares regression
            # a x = b
            output_counts: pd.Series = data.emit_count

            # If this instance's component has an output stream registered
            # that nothing else subscribes to, then the emit count will be
            # zero and we can skip this output stream
            if output_counts.sum() <= 0.0:
                LOG.debug("No emissions from instance %d on stream %s, "
                          "skipping this stream...", task, out_stream)
                continue

            # Get just the input stream counts for each time bucket (the
            # first five columns of the merged frame are task, timestamp,
            # component, outgoing_stream and emit_count). This is the
            # coefficient matrix (a) of the least squares regression a x = b
            cols: List[Tuple[str, str]] = data.columns[5:]
            input_counts: pd.DataFrame = data[cols]

            coeffs: np.ndarray
            coeffs, _, _, _ = np.linalg.lstsq(input_counts, output_counts,
                                              rcond=None)
            i: int
            in_stream: str
            source: str
            for i, (in_stream, source) in enumerate(cols):
                row: Dict[str, Union[str, float]] = {
                    "task": task,
                    "output_stream": out_stream,
                    "input_stream": in_stream,
                    "source_component": source,
                    "coefficient": coeffs[i]}
                rows.append(row)
    result = pd.DataFrame(rows)

    if result.empty:
        raise RuntimeError("lstsq_io_ratios produced an empty DataFrame")

    return result
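
As a sanity check on the idea in the docstring: if each output stream total
really is a linear combination of the per-bucket input stream totals, then
np.linalg.lstsq should recover the per-stream coefficients exactly from
noise-free data. A self-contained toy example (synthetic counts and
hypothetical stream names, not the project's metrics):

import numpy as np
import pandas as pd

# Six time buckets of input counts for two (input_stream, source_component)
# pairs feeding one component. Six buckets > two input streams, which
# satisfies the docstring's *NOTE* on the number of buckets.
input_counts = pd.DataFrame({
    ("words", "spout_a"): [100.0, 120.0, 90.0, 110.0, 105.0, 95.0],
    ("counts", "spout_b"): [40.0, 35.0, 50.0, 45.0, 38.0, 42.0]})

# Build the output totals as 2.0 * words + 0.5 * counts, so the true
# coefficients are known in advance
output_counts = (2.0 * input_counts[("words", "spout_a")]
                 + 0.5 * input_counts[("counts", "spout_b")])

# The same least squares call used by lstsq_io_ratios: solve a x = b for x
coeffs, _, _, _ = np.linalg.lstsq(input_counts, output_counts, rcond=None)

ratios = {col: round(float(c), 6)
          for col, c in zip(input_counts.columns, coeffs)}
print(ratios)
# {('words', 'spout_a'): 2.0, ('counts', 'spout_b'): 0.5}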