def plot_qps(dirs, log_prefix, output_prefix, plot_features, fig_name, condition=""):
    """Plot interval QPS and change points for every matching log directory.

    For each entry of ``dirs`` that contains ``condition`` as a substring, the
    report CSV is loaded and drawn as a two-row figure (``interval_qps`` on
    top, ``change_points`` below). The figure is saved as both PDF and PNG and
    then cleared so the next directory starts from an empty figure.

    Args:
        dirs: Iterable of log directory paths.
        log_prefix: Prefix removed from each log directory path before the
            remainder is flattened into the output sub-directory name.
        output_prefix: Root directory under which the figures are written.
        plot_features: Unused; kept so existing callers keep working.
        fig_name: File name (without extension) of the saved figures.
        condition: Substring a directory path must contain to be plotted.
            The default empty string matches every directory.
    """
    for log_dir in dirs:
        if condition not in log_dir:
            continue
        print(log_dir)

        # Only the report CSV is needed here; the other two paths are ignored.
        _, _, report_csv = get_log_and_std_files(log_dir)
        report_df = read_report_csv_with_change_points(report_csv)
        print(len(report_df))

        plt.subplot(211)
        plt.plot(report_df["secs_elapsed"], report_df["interval_qps"], color="r")
        plt.ylim(0, 600000)

        plt.subplot(212)
        plt.plot(report_df["secs_elapsed"], report_df["change_points"], color="g")
        plt.ylim(0, 16)

        # Strip the caller-supplied prefix (not a module-level global) so the
        # function honours its own argument, the same way plot_stat does.
        output_path = output_prefix + "/%s/" % log_dir.replace(log_prefix, "").replace("/", "_")
        mkdir_p(output_path)
        plt.savefig("{}/{}.pdf".format(output_path, fig_name), bbox_inches="tight")
        plt.savefig("{}/{}.png".format(output_path, fig_name), bbox_inches="tight")
        plt.clf()
def plot_stat(dirs, log_prefix, output_prefix, fig_name, condition=""):
    """Draw a four-row QPS / CPU / disk / change-point figure per log directory.

    Every entry of ``dirs`` that contains ``condition`` as a substring is
    plotted from its report CSV and stat CSV, saved as PDF and PNG below
    ``output_prefix``, and the current figure is cleared afterwards.

    Args:
        dirs: Iterable of log directory paths.
        log_prefix: Prefix removed from each path before it is flattened
            (``/`` replaced by ``_``) into the output sub-directory name.
        output_prefix: Root directory for the generated figures.
        fig_name: File name (without extension) of the saved figures.
        condition: Substring filter on the directory path; the default empty
            string matches everything.
    """
    # Dashed horizontal reference lines drawn on the disk panel: (y, colour).
    disk_reference_lines = ((1e7, "r"), (2e7, "g"), (3e7, "b"))

    for log_dir in dirs:
        if condition not in log_dir:
            continue
        print(log_dir)

        paths = get_log_and_std_files(log_dir, with_stat_csv=True)
        _stdout_path, _log_path, report_csv, stat_csv = paths
        report_df = read_report_csv_with_change_points(report_csv)
        stat_df = read_stat_csv(stat_csv)

        # Row 1: throughput.
        plt.subplot(411)
        plt.plot(report_df["secs_elapsed"], report_df["interval_qps"], color="r")
        plt.ylabel("qps")
        plt.ylim(0, 600000)

        # Row 2: CPU utilisation.
        plt.subplot(412)
        plt.plot(stat_df["secs_elapsed"], stat_df["cpu_utils"], color="b")
        plt.ylabel("cpu_utils")
        plt.plot()
        plt.ylim(0, 1200)

        # Row 3: disk usage with the dashed reference lines spanning the run.
        plt.subplot(413)
        plt.plot(stat_df["secs_elapsed"], stat_df["disk_usage"], color="c")
        plt.ylabel("disk_utils")
        for y_value, line_colour in disk_reference_lines:
            last_sec = stat_df["secs_elapsed"].tolist()[-1]
            plt.hlines(y_value, 0, last_sec, colors=line_colour, linestyles="dashed")
        plt.plot()

        # Row 4: change points from the report CSV.
        plt.subplot(414)
        plt.plot(report_df["secs_elapsed"], report_df["change_points"], color="g")
        plt.ylabel(r"SST Size")
        plt.ylim(0, 16)

        plt.tight_layout()

        flattened = log_dir.replace(log_prefix, "").replace("/", "_")
        output_path = output_prefix + "/%s/" % flattened
        mkdir_p(output_path)
        plt.savefig("{}/{}.pdf".format(output_path, fig_name), bbox_inches="tight")
        plt.savefig("{}/{}.png".format(output_path, fig_name), bbox_inches="tight")
        plt.clf()
# For every log directory under log_dir_prefix, plot the per-level columns
# ("level0" .. "level6") together with "interval_qps" as stacked subplots and
# save the figure as PDF and PNG under compaction_style/universal/.
dirs = get_log_dirs(log_dir_prefix)
for log_dir in dirs:
    print(log_dir)
    stdout_file, LOG_file, report_csv = get_log_and_std_files(log_dir)
    data_set = load_log_and_qps(LOG_file, report_csv)

    # Vectorize by compaction output level, then attach the QPS series.
    bucket_df = vectorize_by_compaction_output_level(data_set, 7)
    bucket_df = combine_vector_with_qps(bucket_df, data_set.qps_df)
    # bucket_df = data_cleaning_by_max_MBPS(bucket_df)

    plot_columns = ["level" + str(x) for x in range(7)] + ["interval_qps"]
    fig = bucket_df[plot_columns].plot(subplots=True)

    # One output folder per run: the path relative to log_dir_prefix with
    # "/" replaced by "_".
    flattened_name = log_dir.replace(log_dir_prefix, "").replace("/", "_")
    output_path = "compaction_style/universal/%s/" % flattened_name
    mkdir_p(output_path)
    plt.savefig(
        "{}/compaction_distribution_by_level.pdf".format(output_path),
        bbox_inches="tight")
    plt.savefig(
        "{}/compaction_distribution_by_level.png".format(output_path),
        bbox_inches="tight")
    plt.close()

# start_time = datetime.now()
# from feature_selection import vectorize_by_compaction_output_level
# from traversal import get_log_dirs, get_log_and_std_files
#
# log_prefix_dir = "log_files"
# dirs = get_log_dirs(log_prefix_dir)
#
#
def HMM_on_one_file(log_dir):
    """Fit a Poisson hidden Markov model to one run's QPS series and plot it.

    The interval QPS of the run stored in ``log_dir`` is treated as a sequence
    of counts emitted by a 5-state HMM with Poisson observations. Only the
    per-state log rates are trained (Adam, learning rate 0.1, 201 steps); the
    initial and transition distributions are fixed. After training, the most
    probable state per time step is taken from the posterior marginals and
    the corresponding rate is plotted against the observed counts. The result
    is saved as ``state_guessing.pdf`` under ``image/<flattened log_dir>``.

    Both figures created here (the per-column overview and the inferred-rate
    plot) are left open; this function does not close them.

    Args:
        log_dir: Path of the log directory to analyse. A ``"log_files/"``
            substring is removed and ``/`` is replaced by ``_`` to form the
            output directory name.
    """
    _, LOG_file, report_csv = get_log_and_std_files(log_dir)
    data_set = load_log_and_qps(LOG_file, report_csv)
    bucket_df = vectorize_by_compaction_output_level(data_set)
    bucket_df["qps"] = data_set.qps_df["interval_qps"]
    _ = bucket_df.plot(subplots=True)

    # Candidate regimes noted by the original author (four are listed for the
    # five states): memtable filling, flush only, L0 compaction (CPU busy),
    # crowded compaction (disk busy).
    num_states = 5

    # The chain starts in state 0 with probability 1. These values are passed
    # as probabilities, not logits, hence the name.
    initial_state_probs = np.zeros([num_states], dtype=np.float32)
    initial_state_probs[0] = 1.0
    initial_distribution = tfd.Categorical(probs=initial_state_probs)

    # Each step keeps the current state with probability 0.95 and otherwise
    # moves to one of the other states with equal probability.
    daily_change_prob = 0.05
    transition_probs = daily_change_prob / (num_states - 1) * np.ones(
        [num_states, num_states], dtype=np.float32)
    np.fill_diagonal(transition_probs, 1 - daily_change_prob)
    transition_distribution = tfd.Categorical(probs=transition_probs)

    # Missing QPS samples are treated as zero.
    observed_counts = bucket_df["qps"].fillna(0).tolist()
    observed_counts = np.array(observed_counts).astype(np.float32)

    # Initialise every state's log rate around the log of the mean count,
    # perturbed with standard normal noise.
    trainable_log_rates = tf.Variable(
        np.log(np.mean(observed_counts)) + tf.random.normal([num_states]),
        name='log_rates')

    hmm = tfd.HiddenMarkovModel(
        initial_distribution=initial_distribution,
        transition_distribution=transition_distribution,
        observation_distribution=tfd.Poisson(log_rate=trainable_log_rates),
        num_steps=len(observed_counts))

    rate_prior = tfd.LogNormal(5, 5)

    def log_prob():
        # Log prior of the rates plus the HMM log-likelihood of the data.
        return (tf.reduce_sum(
            rate_prior.log_prob(tf.math.exp(trainable_log_rates))) +
                hmm.log_prob(observed_counts))

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)

    @tf.function(autograph=False)
    def train_op():
        with tf.GradientTape() as tape:
            neg_log_prob = -log_prob()
        grads = tape.gradient(neg_log_prob, [trainable_log_rates])[0]
        optimizer.apply_gradients([(grads, trainable_log_rates)])
        return neg_log_prob, tf.math.exp(trainable_log_rates)

    for step in range(201):
        loss, rates = [t.numpy() for t in train_op()]
        if step % 20 == 0:
            # The line break is written as an escape; a raw newline inside a
            # single-quoted string literal is a syntax error.
            print("step {}: log prob {} \nrates {}".format(step, -loss, rates))

    # Pick the most probable state for each time step and map it to its rate.
    posterior_dists = hmm.posterior_marginals(observed_counts)
    posterior_probs = posterior_dists.probs_parameter().numpy()
    most_probable_states = np.argmax(posterior_probs, axis=1)
    most_probable_rates = rates[most_probable_states]

    fig = plt.figure(figsize=(10, 4))
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(most_probable_rates, c='green', lw=3, label='inferred rate')
    ax.plot(observed_counts, c='black', alpha=0.3, label='observed counts')
    ax.set_ylabel("latent rate")
    ax.set_xlabel("time")
    ax.set_title("Inferred latent rate over time")
    ax.legend(loc=4)

    output_path = "image/" + log_dir.replace("log_files/", "").replace(
        "/", "_")
    mkdir_p(output_path)
    plt.savefig("{}/state_guessing.pdf".format(output_path), bbox_inches="tight")