Example #1
def plot_qps(dirs, log_prefix, output_prefix, plot_features, fig_name, condition=""):
    for log_dir in dirs:
        if condition in log_dir:
            print(log_dir)
            stdout_file, LOG_file, report_csv = get_log_and_std_files(log_dir)
            report_df = read_report_csv_with_change_points(report_csv)
            print(len(report_df))
            plt.subplot(211)
            plt.plot(report_df["secs_elapsed"], report_df["interval_qps"], color="r")
            plt.ylim(0, 600000)
            plt.subplot(212)
            plt.plot(report_df["secs_elapsed"], report_df["change_points"], color="g")
            plt.ylim(0, 16)
            # report_df[plot_features].plot(subplots=True)
            output_path = output_prefix + "/%s/" % log_dir.replace(log_prefix, "").replace("/", "_")
            mkdir_p(output_path)
            plt.savefig("{}/{}.pdf".format(output_path, fig_name), bbox_inches="tight")
            plt.savefig("{}/{}.png".format(output_path, fig_name), bbox_inches="tight")
            plt.clf()
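A minimal usage sketch for plot_qps. The get_log_dirs helper, the "log_files" prefix, and the "image" output directory are assumptions carried over from the other examples on this page; the figure name is hypothetical.

# hypothetical driver for plot_qps (helper names and paths assumed, see note above)
from traversal import get_log_dirs  # module name taken from the commented-out import in Example #3

log_prefix = "log_files"
dirs = get_log_dirs(log_prefix)
plot_qps(dirs, log_prefix, "image", ["interval_qps", "change_points"], "qps_with_change_points")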
Example #2
def plot_stat(dirs, log_prefix, output_prefix, fig_name, condition=""):
    for log_dir in dirs:
        if condition in log_dir:
            print(log_dir)
            stdout_file, LOG_file, report_csv, stat_csv = get_log_and_std_files(log_dir, with_stat_csv=True)

            report_df = read_report_csv_with_change_points(report_csv)
            stat_df = read_stat_csv(stat_csv)
            plt.subplot(411)
            plt.plot(report_df["secs_elapsed"], report_df["interval_qps"], color="r")
            plt.ylabel("qps")
            plt.ylim(0, 600000)

            plt.subplot(412)
            plt.plot(stat_df["secs_elapsed"], stat_df["cpu_utils"], color="b")
            plt.ylabel("cpu_utils")
            plt.ylim(0, 1200)

            plt.subplot(413)
            plt.plot(stat_df["secs_elapsed"], stat_df["disk_usage"], color="c")
            # plt.plot(stat_df["secs_elapsed"], [2e7 for x in stat_df["secs_elapsed"]], color="r")
            plt.ylabel("disk_utils")
            plt.hlines(1e7, 0, stat_df["secs_elapsed"].tolist()[-1], colors="r", linestyles="dashed")
            plt.hlines(2e7, 0, stat_df["secs_elapsed"].tolist()[-1], colors="g", linestyles="dashed")
            plt.hlines(3e7, 0, stat_df["secs_elapsed"].tolist()[-1], colors="b", linestyles="dashed")

            plt.subplot(414)
            plt.plot(report_df["secs_elapsed"], report_df["change_points"], color="g")
            plt.ylabel(r"SST Size")
            plt.ylim(0, 16)

            plt.tight_layout()
            # report_df[plot_features].plot(subplots=True)
            output_path = output_prefix + "/%s/" % log_dir.replace(log_prefix, "").replace("/", "_")
            mkdir_p(output_path)
            plt.savefig("{}/{}.pdf".format(output_path, fig_name), bbox_inches="tight")
            plt.savefig("{}/{}.png".format(output_path, fig_name), bbox_inches="tight")
            plt.clf()
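A similarly hedged driver for plot_stat. It assumes the same traversal helper and directory layout as above, and that get_log_and_std_files returns the extra stat CSV when called with with_stat_csv=True.

# hypothetical driver for plot_stat, mirroring the plot_qps sketch above
dirs = get_log_dirs("log_files")
plot_stat(dirs, "log_files", "image", "qps_cpu_disk_states")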
Example #3
def plot_compaction_by_level(log_dir_prefix):  # hypothetical wrapper name; the original snippet starts mid-function
    dirs = get_log_dirs(log_dir_prefix)
    for log_dir in dirs:
        print(log_dir)
        stdout_file, LOG_file, report_csv = get_log_and_std_files(log_dir)
        data_set = load_log_and_qps(LOG_file, report_csv)
        bucket_df = vectorize_by_compaction_output_level(data_set, 7)
        bucket_df = combine_vector_with_qps(bucket_df, data_set.qps_df)
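        # bucket_df is assumed to hold one column per compaction output level
        # (level0 .. level6) plus, after the merge, the interval_qps column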

        # bucket_df = data_cleaning_by_max_MBPS(bucket_df)
        #
        plot_columns = ["level" + str(x) for x in range(7)]
        plot_columns.append("interval_qps")
        fig = bucket_df[plot_columns].plot(subplots=True)
        output_path = "compaction_style/universal/%s/" % log_dir.replace(
            log_dir_prefix, "").replace("/", "_")
        mkdir_p(output_path)
        plt.savefig(
            "{}/compaction_distribution_by_level.pdf".format(output_path),
            bbox_inches="tight")
        plt.savefig(
            "{}/compaction_distribution_by_level.png".format(output_path),
            bbox_inches="tight")
        plt.close()
    # start_time = datetime.now()
    # from feature_selection import vectorize_by_compaction_output_level
    # from traversal import get_log_dirs, get_log_and_std_files
    #
    # log_prefix_dir = "log_files"
    # dirs = get_log_dirs(log_prefix_dir)
    #
    #
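A hedged usage sketch for the (hypothetically named) wrapper above; the "log_files" prefix comes from the commented-out lines at the end of the snippet.

# hypothetical usage of the compaction-distribution plot
plot_compaction_by_level("log_files")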
Example #4
def HMM_on_one_file(log_dir):
    stdout_file, LOG_file, report_csv = get_log_and_std_files(log_dir)

    data_set = load_log_and_qps(LOG_file, report_csv)
    bucket_df = vectorize_by_compaction_output_level(data_set)
    bucket_df["qps"] = data_set.qps_df["interval_qps"]

    _ = bucket_df.plot(subplots=True)
    num_states = 5  # hidden workload phases, e.g. memtable filling, flush only, L0 compaction (CPU busy), crowded compaction (disk busy)

    # start with all probability mass in state 0 (the flush-limited phase);
    # these are probabilities, not logits, so name them accordingly
    initial_state_probs = np.zeros([num_states], dtype=np.float32)
    initial_state_probs[0] = 1.0

    initial_distribution = tfd.Categorical(probs=initial_state_probs)

    daily_change_prob = 0.05
    transition_probs = daily_change_prob / (num_states - 1) * np.ones(
        [num_states, num_states], dtype=np.float32)
    np.fill_diagonal(transition_probs, 1 - daily_change_prob)
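    # with num_states = 5 and daily_change_prob = 0.05 this gives 0.95 on the
    # diagonal and 0.05 / 4 = 0.0125 off-diagonal: a sticky transition prior
    # that discourages frequent switching between workload phases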

    observed_counts = bucket_df["qps"].fillna(0).tolist()
    observed_counts = np.array(observed_counts).astype(np.float32)

    transition_distribution = tfd.Categorical(probs=transition_probs)
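    # initialize each state's log-rate near log(mean observed count), plus a
    # little Gaussian noise so the states can separate during optimization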
    trainable_log_rates = tf.Variable(np.log(np.mean(observed_counts)) +
                                      tf.random.normal([num_states]),
                                      name='log_rates')

    hmm = tfd.HiddenMarkovModel(
        initial_distribution=initial_distribution,
        transition_distribution=transition_distribution,
        observation_distribution=tfd.Poisson(log_rate=trainable_log_rates),
        num_steps=len(observed_counts))

    rate_prior = tfd.LogNormal(5, 5)

    # joint log-density: LogNormal prior on the Poisson rates plus the HMM likelihood of the observed counts
    def log_prob():
        return (tf.reduce_sum(
            rate_prior.log_prob(tf.math.exp(trainable_log_rates))) +
                hmm.log_prob(observed_counts))

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)

    @tf.function(autograph=False)
    def train_op():
        with tf.GradientTape() as tape:
            neg_log_prob = -log_prob()
        grads = tape.gradient(neg_log_prob, [trainable_log_rates])[0]
        optimizer.apply_gradients([(grads, trainable_log_rates)])
        return neg_log_prob, tf.math.exp(trainable_log_rates)

    # fit the per-state rates with a short Adam optimization loop
    for step in range(201):
        loss, rates = [t.numpy() for t in train_op()]
        if step % 20 == 0:
            print("step {}: log prob {} rates {}".format(step, -loss, rates))

    posterior_dists = hmm.posterior_marginals(observed_counts)
    posterior_probs = posterior_dists.probs_parameter().numpy()
    most_probable_states = np.argmax(posterior_probs, axis=1)
    most_probable_rates = rates[most_probable_states]

    fig = plt.figure(figsize=(10, 4))
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(most_probable_rates, c='green', lw=3, label='inferred rate')
    ax.plot(observed_counts, c='black', alpha=0.3, label='observed counts')
    ax.set_ylabel("latent rate")
    ax.set_xlabel("time")
    ax.set_title("Inferred latent rate over time")
    ax.legend(loc=4)
    output_path = "image/" + log_dir.replace("log_files/", "").replace(
        "/", "_")
    mkdir_p(output_path)
    plt.savefig("{}/state_guessing.pdf".format(output_path),
                bbox_inches="tight")
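Finally, a hedged driver sketch for running the HMM fit over every log directory. get_log_dirs and the "log_files" prefix are taken from the earlier examples; the __main__ guard is an assumption.

# hypothetical driver, assuming the traversal helper used in the earlier examples
from traversal import get_log_dirs

if __name__ == "__main__":
    for log_dir in get_log_dirs("log_files"):
        HMM_on_one_file(log_dir)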