def plot_probability_vs_class_rates(self, range_metrics, extra_title="", perc_ranges=None):
    """
    Plots the probability assigned to a class (x-axis) versus the empirical rate
    at which samples with a probability in that range actually belong to the class
    (# of class-A samples / all samples assigned a class-A probability in that range).
    Each bar is annotated with the average number of samples assigned a probability
    in that range (averaged over runs) and colored accordingly; with cross-fold
    validation, the annotation is the total count.
    :param range_metrics: Map of classes to [TP_range_sums, total_range_sums]
    from compute_probability_range_metrics
    :param extra_title: Extra string to add to title.
    :param perc_ranges: Labels of the probability ranges shown on the x-axis.
    """
    if perc_ranges is None:
        perc_ranges = ["10%", "30%", "50%", "70%", "90%"]

    # Set +/- range based on number of x-ticks (half the bin width).
    if len(perc_ranges) == 10:
        pm = 5
    elif len(perc_ranges) == 5:
        pm = 10
    else:
        # Fall back to half the bin width for other tick counts.
        pm = round(100 / (2 * len(perc_ranges)))

    x_indices = np.arange(len(perc_ranges))
    # total_pos_pr: total # of positive samples per probability range.
    total_pos_pr = np.zeros(len(perc_ranges))
    for class_name in self.class_labels:
        # Collect data for this class
        true_positives, totals = range_metrics[class_name]
        pos_class_counts_per_range = np.array(self.class_positives[class_name])
        total_pos_pr += pos_class_counts_per_range
        prob_rates = self.class_prob_rates[class_name]

        if self.num_runs is not None:
            totals = [math.ceil(t / self.num_runs) for t in totals]

        # Plotting
        f, ax = plt.subplots(figsize=(FIG_WIDTH, FIG_HEIGHT), dpi=DPI)
        norm = plt.Normalize(0, max(totals))
        colors = mpl.cm.Blues(norm(totals))
        ax.bar(x_indices, prob_rates, color=colors, edgecolor='black')

        thex_utils.annotate_plot(ax=ax, x=x_indices, y=prob_rates,
                                 annotations=totals)
        plt.xticks(x_indices, perc_ranges, fontsize=TICK_S)
        y_indices, y_ticks = get_perc_ticks()
        plt.yticks(y_indices, y_ticks, fontsize=TICK_S)
        pretty_class_name = clean_class_name(class_name)
        plt.xlabel('Assigned Probability' + r' $\pm$' + str(pm) + '%',
                   fontsize=LAB_S)
        plt.ylabel('Empirical Probability', fontsize=LAB_S)
        ax.set_title(pretty_class_name + extra_title, fontsize=TITLE_S)

        m = mpl.cm.ScalarMappable(norm=norm, cmap=mpl.cm.Blues)
        cbar = plt.colorbar(mappable=m)
        cbar.ax.tick_params(labelsize=LAB_S)

        print("\nProbability vs Class Rates for: " + str(pretty_class_name))
        print(prob_rates)
        thex_utils.display_and_save_plot(
            self.dir,
            "Probability vs Positive Rate: " + pretty_class_name + extra_title)

    # Accumulated per-range positives, used by plot_agg_prob_vs_class_rates.
    return total_pos_pr
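
# A minimal, standalone sketch of the color-mapped calibration bars used in
# plot_probability_vs_class_rates, with synthetic data; the figure size, dpi,
# and bin values below are illustrative, not the module's constants.
def _demo_calibration_bars():
    import numpy as np
    import matplotlib as mpl
    import matplotlib.pyplot as plt

    perc_ranges = ["10%", "30%", "50%", "70%", "90%"]
    prob_rates = np.array([0.12, 0.28, 0.55, 0.71, 0.93])  # empirical rate per bin
    totals = np.array([400, 250, 120, 60, 30])             # sample count per bin

    fig, ax = plt.subplots(figsize=(6, 4), dpi=100)
    x = np.arange(len(perc_ranges))
    # Map each bar's sample count onto the Blues colormap: darker = more samples.
    norm = plt.Normalize(0, totals.max())
    ax.bar(x, prob_rates, color=mpl.cm.Blues(norm(totals)), edgecolor='black')
    ax.set_xticks(x)
    ax.set_xticklabels(perc_ranges)
    ax.set_xlabel(r'Assigned Probability $\pm$10%')
    ax.set_ylabel('Empirical Probability')
    # The colorbar needs a ScalarMappable built from the same norm and cmap.
    sm = mpl.cm.ScalarMappable(norm=norm, cmap=mpl.cm.Blues)
    sm.set_array([])
    fig.colorbar(sm, ax=ax)
    plt.show()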
def visualize_completeness(model_dir, X, class_labels, data_completeness):
    """
    Plot dataset completeness as a heatmap.
    :param model_dir: directory of model in which to save figure
    :param X: DataFrame of features
    :param class_labels: list of class names
    :param data_completeness: list, in order of class names, containing the
    completeness per feature
    """
    features = get_ordered_features(list(X))
    for index, f in enumerate(features):
        if '_mag' in f:
            features[index] = f.replace("_mag", "")

    df = pd.DataFrame(data_completeness, index=class_labels, columns=features)

    f, ax = plt.subplots(figsize=(FIG_WIDTH, FIG_HEIGHT), dpi=DPI)
    a = plt.pcolor(df, vmin=0, vmax=1, cmap='gist_heat')
    plt.yticks(ticks=np.arange(len(df.index)) + 0.6,
               labels=df.index, fontsize=TICK_S - 2)
    plt.xticks(ticks=np.arange(len(df.columns)) + 0.5,
               labels=df.columns, fontsize=TICK_S + 2)
    f.colorbar(a)
    util.display_and_save_plot(model_dir, "Completeness", None, f)
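
# Standalone sketch of the pcolor completeness heatmap with a tiny synthetic
# matrix; class and feature names are placeholders.
def _demo_completeness_heatmap():
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    class_labels = ["Ia", "II", "Ib/c"]
    features = ["g", "r", "i", "z"]
    completeness = np.random.rand(len(class_labels), len(features))

    df = pd.DataFrame(completeness, index=class_labels, columns=features)
    fig, ax = plt.subplots(figsize=(6, 4), dpi=100)
    mesh = ax.pcolor(df, vmin=0, vmax=1, cmap='gist_heat')
    # pcolor draws cell (i, j) between integer grid lines, so ticks are
    # offset by ~0.5 to land at cell centers.
    ax.set_yticks(np.arange(len(df.index)) + 0.5)
    ax.set_yticklabels(df.index)
    ax.set_xticks(np.arange(len(df.columns)) + 0.5)
    ax.set_xticklabels(df.columns)
    fig.colorbar(mesh)
    plt.show()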
def plot_class_hist(model_dir, class_names, counts):
    """
    Plots histogram of class sizes.
    :param model_dir: directory of model in which to save figure
    :param class_names: list of class names
    :param counts: list of class counts, in the same order as class_names
    """
    num_classes = len(class_names)
    f, ax = plt.subplots(figsize=(6, 6), dpi=DPI)

    # Plot data horizontally
    bar_width = 0.3
    if num_classes <= 5:
        tick_size = TICK_S + 1
        label_size = LAB_S + 2
        max_y = (bar_width * num_classes) - (bar_width / 2)
    else:
        tick_size = TICK_S + 3
        label_size = LAB_S + 2
        max_y = bar_width * num_classes
    class_indices = np.linspace(0, max_y, num_classes)

    ax.barh(y=class_indices, width=counts, height=bar_width, edgecolor='black')
    plt.gcf().subplots_adjust(left=0.1)

    ax.set_xscale('log')
    ax.set_xticks([50, 100, 500, 1000, 5000])
    ax.get_xaxis().set_major_formatter(ScalarFormatter())

    plt.yticks(class_indices, class_names, fontsize=tick_size)
    ax.tick_params(axis='x', which='both', labelsize=tick_size, rotation=-90)
    plt.xlabel('Host galaxies count (log scale)', fontsize=label_size)
    plt.tight_layout()
    util.display_and_save_plot(
        model_dir, "Distribution of Transient Types in Data Sample")
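
# Standalone sketch of the log-scale horizontal bar chart; class names,
# counts, and tick values are synthetic.
def _demo_class_hist():
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.ticker import ScalarFormatter

    class_names = ["Ia", "II", "Ib/c", "SLSN"]
    counts = [4200, 1800, 390, 60]

    fig, ax = plt.subplots(figsize=(6, 6), dpi=100)
    y = np.linspace(0, 0.3 * len(class_names), len(class_names))
    ax.barh(y=y, width=counts, height=0.3, edgecolor='black')
    # Log x-scale, but with plain (non-scientific) labels at fixed counts.
    ax.set_xscale('log')
    ax.set_xticks([50, 100, 500, 1000, 5000])
    ax.get_xaxis().set_major_formatter(ScalarFormatter())
    ax.set_yticks(y)
    ax.set_yticklabels(class_names)
    ax.set_xlabel('Host galaxies count (log scale)')
    plt.tight_layout()
    plt.show()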
def plot_feature_distribution(model_dir, df, feature, class_labels):
    """
    Plots the fitted normal distribution of each transient type in df over 'feature'.
    :param model_dir: directory of model in which to save figure
    :param df: DataFrame with both the feature column and TARGET_LABEL column
    :param feature: Name of feature to plot distribution over
    :param class_labels: class labels
    """
    df = relabel_df(df, class_labels)
    f, ax = plt.subplots(figsize=(FIG_WIDTH, FIG_HEIGHT), dpi=DPI)
    min_value = df[feature].min()
    max_value = df[feature].max()
    colors = plt.get_cmap('tab20').colors
    for index, class_name in enumerate(class_labels):
        class_values = df[df[TARGET_LABEL] == class_name][feature].values
        # Fit a normal distribution to this class's feature values and plot
        # its PDF over the full feature range.
        mean, std = norm.fit(class_values)
        x = np.linspace(min_value, max_value, 100)
        y = norm.pdf(x, mean, std)
        plt.plot(x, y, color=colors[index], label=class_name)
    plt.xlabel(feature.capitalize(), fontsize=LAB_S)
    plt.ylabel("Normalized density", fontsize=LAB_S)
    plt.legend()
    util.display_and_save_plot(model_dir, "feature_" + str(feature))
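
# Standalone sketch of the per-class normal fit; the 'redshift' feature,
# 'label' column, and sample values are synthetic stand-ins for the repo's
# TARGET_LABEL convention.
def _demo_feature_distribution():
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from scipy.stats import norm

    rng = np.random.default_rng(0)
    df = pd.DataFrame({
        "redshift": np.concatenate([rng.normal(0.2, 0.05, 200),
                                    rng.normal(0.5, 0.10, 200)]),
        "label": ["Ia"] * 200 + ["II"] * 200,
    })

    fig, ax = plt.subplots(figsize=(6, 4), dpi=100)
    x = np.linspace(df["redshift"].min(), df["redshift"].max(), 100)
    for class_name, color in zip(["Ia", "II"], plt.get_cmap('tab20').colors):
        values = df.loc[df["label"] == class_name, "redshift"].values
        mean, std = norm.fit(values)  # maximum-likelihood fit of a normal
        ax.plot(x, norm.pdf(x, mean, std), color=color, label=class_name)
    ax.set_xlabel("Redshift")
    ax.set_ylabel("Normalized density")
    ax.legend()
    plt.show()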
def plot_agg_prob_vs_class_rates(self, total_pos_pr, weighted, perc_ranges):
    """
    Aggregates probability versus class rates across all classes.
    :param total_pos_pr: Numpy array of length 10 with the total # of positive
    class samples per range; the last index is the total number of TP samples
    with probability in the 90-100% range
    :param weighted: Boolean to weigh by class frequency
    :param perc_ranges: Labels of the probability ranges shown on the x-axis.
    """
    aggregated_rates = get_agg_prob_vs_class_rates(
        total_pos_pr,
        self.class_labels,
        self.class_positives,
        self.class_prob_rates,
        weighted)

    if weighted:
        p_title = 'Aggregated (Weighted) Probability vs Class Rates'
    else:
        p_title = 'Aggregated (Balanced) Probability vs Class Rates'

    print(p_title + ": \n" + str(aggregated_rates))

    f, ax = plt.subplots(figsize=(FIG_WIDTH, FIG_HEIGHT), dpi=DPI)
    x_indices = np.arange(len(perc_ranges))

    # Get average count per bin over runs
    if self.num_runs is not None:
        totals = [math.ceil(t / self.num_runs) for t in total_pos_pr]
    else:
        totals = [int(t) for t in total_pos_pr]

    norm = plt.Normalize(0, max(totals))
    colors = mpl.cm.Blues(norm(totals))

    # Plot aggregated rates
    ax.bar(x_indices, aggregated_rates, color=colors, edgecolor='black')
    plt.xticks(x_indices, perc_ranges, fontsize=TICK_S)
    y_indices, y_ticks = get_perc_ticks()
    plt.yticks(y_indices, y_ticks, fontsize=TICK_S)
    plt.xlabel('Probability ' + r'$\pm$' + '10%', fontsize=LAB_S)
    plt.ylabel('Empirical Probability', fontsize=LAB_S)
    ax.set_title(p_title, fontsize=TITLE_S, pad=20)

    m = mpl.cm.ScalarMappable(norm=norm, cmap=mpl.cm.Blues)
    cbar = plt.colorbar(mappable=m)
    cbar.ax.tick_params(labelsize=LAB_S)

    thex_utils.display_and_save_plot(self.dir, p_title)
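
# get_agg_prob_vs_class_rates is a repo helper defined elsewhere; this is a
# hypothetical sketch of the weighted-vs-balanced distinction the docstring
# implies: balanced takes a plain mean of per-class rates per bin, while
# weighted scales each class's rate by its share of positives in that bin.
def _demo_agg_rates(class_prob_rates, class_positives, weighted):
    import numpy as np
    classes = list(class_prob_rates)
    rates = np.array([class_prob_rates[c] for c in classes], dtype=float)
    if not weighted:
        return rates.mean(axis=0)  # balanced: every class counts equally
    pos = np.array([class_positives[c] for c in classes], dtype=float)
    weights = pos / pos.sum(axis=0)  # per-bin class-frequency weights
    return (rates * weights).sum(axis=0)

# Example call (synthetic 3-bin data):
# _demo_agg_rates({"Ia": np.array([.1, .5, .9]), "II": np.array([.2, .4, .8])},
#                 {"Ia": np.array([30, 20, 10]), "II": np.array([10, 20, 30])},
#                 weighted=True)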
def plot_feature_hist(model_dir, df, feature, class_labels):
    """
    Plots the histogram of each transient type in df over 'feature'.
    :param model_dir: directory of model in which to save figure
    :param df: DataFrame with both the feature column and TARGET_LABEL column
    :param feature: Name of feature to plot distribution over
    :param class_labels: class labels
    """
    df = relabel_df(df, class_labels)
    f, ax = plt.subplots(figsize=(FIG_WIDTH, FIG_HEIGHT), dpi=DPI)
    max_value = df[feature].max()
    bins = np.linspace(0, max_value, 50)

    counts = []
    edges = []
    bars = []
    colors = plt.get_cmap('tab20').colors
    for index, class_name in enumerate(class_labels):
        class_rs = df[df[TARGET_LABEL] == class_name][feature].values
        c, e, b = ax.hist(x=class_rs, bins=bins, density=True,
                          color=colors[index], label=class_name)
        counts.append(c)
        edges.append(e)
        bars.append(b)

    # Iterate over each bin and re-order the overlapping patches so the
    # tallest bar is drawn first (lowest zorder) and smaller bars stay visible.
    it_bins = bars[0]
    for bin_index, value in enumerate(it_bins):
        bin_counts = []
        # Count per class for this bin
        for class_count in counts:
            bin_counts.append(class_count[bin_index])
        # Class indices sorted biggest to smallest
        sorted_indices = np.flip(np.argsort(bin_counts))
        zorder = 0
        for sorted_index in sorted_indices:
            bars[sorted_index][bin_index].set_zorder(zorder)
            zorder += 1

    plt.xlabel(feature.capitalize(), fontsize=LAB_S)
    plt.ylabel("Normalized density", fontsize=LAB_S)
    plt.legend()
    util.display_and_save_plot(model_dir, "Feature distribution")
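
# Standalone sketch of the per-bin z-order trick above: in each bin the
# tallest class's patch gets the lowest zorder (drawn first, behind), so the
# smaller bars of other classes remain visible. Data is synthetic.
def _demo_hist_zorder():
    import numpy as np
    import matplotlib.pyplot as plt

    rng = np.random.default_rng(1)
    samples = [rng.normal(0.0, 1.0, 500), rng.normal(0.5, 1.0, 500)]
    bins = np.linspace(-4, 4, 30)

    fig, ax = plt.subplots()
    counts, bars = [], []
    for data, color in zip(samples, ['tab:blue', 'tab:orange']):
        c, _, b = ax.hist(data, bins=bins, density=True, color=color)
        counts.append(c)
        bars.append(b)

    for bin_index in range(len(bins) - 1):
        bin_counts = [c[bin_index] for c in counts]
        # Largest count first -> zorder 0 (back); smaller bars stack on top.
        for zorder, class_index in enumerate(np.flip(np.argsort(bin_counts))):
            bars[class_index][bin_index].set_zorder(zorder)
    plt.show()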
def plot_confusion_matrix(self, results):
    """
    Plot confusion matrix.
    :param results: List of 2D Numpy arrays in which each row corresponds to a
    sample and each column to the probability of a class (in the order of
    self.class_labels), with the last column containing the full, true label
    """
    cm = compute_confusion_matrix(results, self.class_labels)
    fig, ax = plt.subplots(figsize=(FIG_WIDTH, FIG_HEIGHT), dpi=DPI)
    hm = ax.imshow(cm, cmap='Blues', interpolation='nearest')
    indices = list(range(len(self.class_labels)))
    ax.set_ylabel("Actual", fontsize=LAB_S)
    ax.set_xlabel("Prediction", fontsize=LAB_S)
    pretty_class_names = clean_class_names(self.class_labels)
    plt.yticks(indices, pretty_class_names, fontsize=TICK_S)
    plt.xticks(indices, pretty_class_names, rotation=-90, fontsize=TICK_S)
    plt.colorbar(hm)
    print("\nConfusion Matrix")
    print(cm)
    thex_utils.display_and_save_plot(self.dir, "Confusion Matrix", fig=fig)
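
# Minimal sketch of the confusion-matrix display. compute_confusion_matrix is
# a repo helper, so a simple argmax stand-in is used here, and the last column
# holds an integer true-class index rather than the repo's full label.
def _demo_confusion_matrix():
    import numpy as np
    import matplotlib.pyplot as plt

    class_labels = ["Ia", "II", "Ib/c"]
    # Per-class probabilities followed by the true class index.
    results = np.array([[0.7, 0.2, 0.1, 0],
                        [0.1, 0.8, 0.1, 1],
                        [0.3, 0.3, 0.4, 1]])

    cm = np.zeros((len(class_labels), len(class_labels)))
    for row in results:
        cm[int(row[-1]), int(np.argmax(row[:-1]))] += 1

    fig, ax = plt.subplots()
    hm = ax.imshow(cm, cmap='Blues', interpolation='nearest')
    ax.set_ylabel("Actual")
    ax.set_xlabel("Prediction")
    ax.set_yticks(range(len(class_labels)))
    ax.set_yticklabels(class_labels)
    ax.set_xticks(range(len(class_labels)))
    ax.set_xticklabels(class_labels, rotation=-90)
    fig.colorbar(hm)
    plt.show()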
def plot_all_metrics(self, purities, comps, all_pc, y):
    """
    Plot performance metrics for model.
    :param purities: Average purity across folds/trials, per class (dict)
    :param comps: Average completeness across folds/trials, per class (dict)
    :param all_pc: Purity & completeness per trial/fold, per class
    :param y: all labels (y) in the dataset
    """
    c_baselines, p_baselines = compute_baselines(self.class_counts,
                                                 self.class_labels,
                                                 self.get_num_classes(),
                                                 self.balanced_purity,
                                                 self.class_priors)
    p_intvls, c_intvls = compute_confintvls(
        all_pc, self.class_labels, self.balanced_purity)

    f, ax = plt.subplots(nrows=1, ncols=2, sharex=True, sharey=True,
                         figsize=(8, 4), dpi=DPI)
    self.plot_metrics_ax(ax[0], purities, "Purity", p_baselines, p_intvls)
    y_indices, class_names = self.plot_metrics_ax(
        ax[1], comps, "Completeness", c_baselines, c_intvls)

    plt.subplots_adjust(wspace=0, hspace=0)
    ax[0].set_yticks(y_indices)
    ax[0].set_yticklabels(clean_class_names(class_names),
                          fontsize=18, horizontalalignment='right')
    thex_utils.display_and_save_plot(self.dir, self.name + "_metrics.pdf",
                                     bbox_inches='tight', fig=f)
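
# Minimal sketch of the zero-gap, shared-axis panel layout used for the
# purity/completeness figure; metric values and class names are synthetic,
# and the plain bars stand in for plot_metrics_ax.
def _demo_metrics_panels():
    import numpy as np
    import matplotlib.pyplot as plt

    classes = ["Ia", "II", "Ib/c"]
    purity = [0.9, 0.7, 0.5]
    completeness = [0.8, 0.6, 0.4]

    fig, ax = plt.subplots(nrows=1, ncols=2, sharex=True, sharey=True,
                           figsize=(8, 4))
    y = np.arange(len(classes))
    ax[0].barh(y, purity)
    ax[0].set_title("Purity")
    ax[1].barh(y, completeness)
    ax[1].set_title("Completeness")
    # Panels touch; sharey means y tick labels appear only on the left panel.
    plt.subplots_adjust(wspace=0, hspace=0)
    ax[0].set_yticks(y)
    ax[0].set_yticklabels(classes, horizontalalignment='right')
    plt.show()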
def plot_example_output(self, row, i=None, priors=None):
    """
    Plots example output for a set of probabilities for a particular host galaxy.
    :param row: Numpy array of probabilities in order of self.class_labels,
    followed by TARGET_LABEL
    :param i: Index of sample
    :param priors: Boolean if using priors, for saving
    """
    labels = row[-1]
    true_class_index = None
    for class_index, class_name in enumerate(self.class_labels):
        if self.is_class(class_name, labels):
            true_class_index = class_index

    f, ax = plt.subplots(figsize=(5, 3), dpi=220)
    ACC = "#b3e0ff"  # actual class color, light blue
    DCC = "#005c99"  # default class color, dark blue
    colors = [DCC] * len(self.class_labels)
    colors[true_class_index] = ACC
    probabilities = row[:-1]
    x_indices = np.linspace(0, len(self.class_labels) * 0.4,
                            len(self.class_labels))
    ax.bar(x=x_indices, height=probabilities, width=0.4, color=colors)
    ax.set_ylim([0, 1])
    plt.ylabel('Probability Assigned', fontsize=LAB_S)
    plt.xlabel('Class', fontsize=LAB_S)
    pretty_class_names = clean_class_names(self.class_labels)
    plt.xticks(x_indices, pretty_class_names, fontsize=TICK_S)

    title = "example_output"
    if i is not None:
        title += "_" + str(i)
    if priors is not None:
        title += "_" + str(priors)

    if not os.path.exists(self.dir + '/examples'):
        os.mkdir(self.dir + '/examples')
    thex_utils.display_and_save_plot(self.dir + '/examples', title)
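
# Standalone sketch of the single-sample output bars, highlighting the true
# class in a lighter blue; labels, probabilities, and the true class index
# are synthetic.
def _demo_example_output():
    import numpy as np
    import matplotlib.pyplot as plt

    class_labels = ["Ia", "II", "Ib/c"]
    probabilities = [0.6, 0.3, 0.1]
    true_class_index = 0

    ACC, DCC = "#b3e0ff", "#005c99"  # highlight vs default bar colors
    colors = [DCC] * len(class_labels)
    colors[true_class_index] = ACC

    x = np.linspace(0, 0.4 * len(class_labels), len(class_labels))
    fig, ax = plt.subplots(figsize=(5, 3))
    ax.bar(x=x, height=probabilities, width=0.4, color=colors)
    ax.set_ylim([0, 1])
    ax.set_xticks(x)
    ax.set_xticklabels(class_labels)
    ax.set_ylabel('Probability Assigned')
    ax.set_xlabel('Class')
    plt.show()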
def plot_prob_pc_curves(self, range_metrics):
    """
    Plot purity & completeness curves as a function of the minimum probability
    assigned to an event (>= threshold).
    :param range_metrics: Map of classes to [TP_range_sums, total_range_sums],
    where total_range_sums is the number of samples with probability in range
    for this class and TP_range_sums is the true positives per range
    """
    for class_name in range_metrics.keys():
        true_positives, totals = range_metrics[class_name]
        fig, ax1 = plt.subplots(figsize=(FIG_WIDTH, FIG_HEIGHT), dpi=DPI)
        class_total = self.class_counts[class_name]
        if self.num_runs is not None:
            # Assumes each run evaluates on a ~33% test split of the class.
            class_total = self.num_runs * class_total * .33

        purities = []  # Accuracy per range (true positive / total)
        comps = []
        TP_count = 0
        total_count = 0
        # Accumulate from the highest-probability range downward so each
        # point covers all samples with probability >= that threshold.
        for index in reversed(range(len(true_positives))):
            cur_p = 0  # Current purity
            cur_c = 0  # Current completeness
            TP_count += true_positives[index]
            total_count += totals[index]
            if total_count != 0:
                # Positive class samples / total samples with prob in range
                cur_p = TP_count / total_count
            if class_total != 0:
                cur_c = TP_count / class_total
            purities.append(cur_p)
            comps.append(cur_c)
        purities.reverse()
        comps.reverse()

        def plot_axis(ax, data, label, color):
            """
            Plot data on axis in certain color
            """
            x_indices = np.linspace(0, 1, len(data))
            ax.set_ylabel(label, color=color, fontsize=LAB_S)
            ax.scatter(x_indices, data, color=color, s=4)
            ax.plot(x_indices, data, color=color, linewidth=4)
            ax.set_ylim([0, 1])
            y_indices, y_ticks = get_perc_ticks()
            plt.yticks(ticks=y_indices, labels=y_ticks,
                       color=color, fontsize=TICK_S)

        ax1.set_xlabel(r'Probability $\geq$', fontsize=LAB_S)
        plot_axis(ax1, purities, "Purity", 'tab:red')
        # Instantiate a second y-axis that shares the same x-axis.
        ax2 = ax1.twinx()
        plot_axis(ax2, comps, "Completeness", 'tab:blue')

        pretty_class_name = clean_class_name(class_name)
        x_indices, x_ticks = get_perc_ticks()
        plt.xticks(x_indices, x_ticks, fontsize=TICK_S)
        plt.title(pretty_class_name, fontsize=TITLE_S)
        thex_utils.display_and_save_plot(
            model_dir=self.dir,
            file_name=self.name + " Purity and Completeness vs. Probability: "
            + pretty_class_name,
            bbox_inches=None,
            fig=fig)
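
# Standalone sketch of the cumulative purity/completeness curves: sums run
# from the highest-probability bin downward, so each point covers all samples
# with probability >= that threshold. Bin counts here are synthetic.
def _demo_prob_pc_curves():
    import numpy as np
    import matplotlib.pyplot as plt

    true_positives = np.array([5, 10, 20, 40, 80])  # TP per probability bin
    totals = np.array([50, 40, 35, 50, 90])         # samples per bin
    class_total = 200                               # all samples of the class

    purities, comps = [], []
    tp_cum = total_cum = 0
    for i in reversed(range(len(true_positives))):
        tp_cum += true_positives[i]
        total_cum += totals[i]
        purities.append(tp_cum / total_cum)
        comps.append(tp_cum / class_total)
    purities.reverse()
    comps.reverse()

    x = np.linspace(0, 1, len(purities))
    fig, ax1 = plt.subplots()
    ax1.plot(x, purities, color='tab:red')
    ax1.set_ylabel("Purity", color='tab:red')
    ax1.set_xlabel(r'Probability $\geq$')
    ax2 = ax1.twinx()  # second y-axis sharing the same x-axis
    ax2.plot(x, comps, color='tab:blue')
    ax2.set_ylabel("Completeness", color='tab:blue')
    plt.show()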
def plot_density_performance(self, unnorm_results):
    """
    Plots purity, completeness, and accuracy versus the top X% of
    unnormalized probabilities (densities) evaluated.
    """
    if self.num_runs is not None:
        raise ValueError("Can only aggregate this over folds, not runs.")

    p, c, a, class_purities = self.get_avg_performances(unnorm_results)

    # Plot aggregated performance metrics
    fig, ax = plt.subplots(figsize=(FIG_WIDTH, FIG_HEIGHT),
                           dpi=DPI, tight_layout=True)
    clean_plot(p, ax, "Purity", 'red')
    clean_plot(c, ax, "Completeness", 'blue')
    clean_plot(a, ax, "Accuracy", 'green')
    ax.set_xlabel("% Top Densities", fontsize=LAB_S)
    ax.set_ylabel("Performance", fontsize=LAB_S)
    ax.set_ylim([0, 1.01])
    y_indices, y_ticks = get_perc_ticks()
    plt.xticks(y_indices * 100, y_ticks, fontsize=TICK_S)
    plt.yticks(y_indices, y_ticks, fontsize=TICK_S)
    ax.legend()
    thex_utils.display_and_save_plot(self.dir,
                                     "Average Performance vs Density",
                                     fig=fig)

    # Plot purity per class
    fig, ax = plt.subplots(figsize=(FIG_WIDTH, FIG_HEIGHT),
                           dpi=DPI, tight_layout=True)
    colors = plt.get_cmap('tab20').colors
    for class_index, class_name in enumerate(self.class_labels):
        total = self.class_counts[class_name]
        purities = class_purities[class_name]
        x = []
        y = []
        star = None  # Marks the point where ~50% of the class is covered
        for vals in purities:
            if vals is not None:
                TP, den, i, class_count = vals
                x.append(i)
                y.append(TP / den)
                if round(class_count / total, 1) == 0.5:
                    star = [i, TP / den]
        pretty_class_name = clean_class_name(class_name)
        print("\n" + pretty_class_name + " purities " + str(list(zip(x, y))))
        ax.scatter(x, y, color=colors[class_index], s=2)
        ax.plot(x, y, color=colors[class_index], label=pretty_class_name)
        if star is not None:
            ax.plot(star[0], star[1], marker='*', color=colors[class_index])

    ax.set_ylabel("Purity", fontsize=LAB_S)
    ax.set_xlabel("% Top Densities", fontsize=LAB_S)
    y_indices, y_ticks = get_perc_ticks()
    plt.xticks(y_indices, y_ticks, fontsize=TICK_S)
    plt.yticks(y_indices, y_ticks, fontsize=TICK_S)
    ax.legend(loc='upper center', bbox_to_anchor=(1.3, 1),
              ncol=1, prop={'size': LAB_S - 2})
    thex_utils.display_and_save_plot(self.dir, "Prob Density % vs Purities")