Example #1
    def plot_probability_vs_class_rates(self,
                                        range_metrics,
                                        extra_title="",
                                        perc_ranges=None):
        """
        Plots probability assigned to class (x-axis) vs the percentage of assignments that were that class (# of class A / all samples given probability of class in the range A). At top of each bar is AVERAGE  # assigned probability in that range (over runs), and bars are colored accordingly. If using cross-fold, count is just the total.
        :param range_metrics: Map of classes to [TP_range_sums, total_range_sums] from compute_probability_range_metrics
        :param extra_title: Extra string to add to title.
        """
        if perc_ranges is None:
            perc_ranges = ["10%", "30%", "50%", "70%", "90%"]

        # Set the +/- range label based on the number of x-ticks.
        if len(perc_ranges) == 10:
            pm = 5
        elif len(perc_ranges) == 5:
            pm = 10
        else:
            # Fallback, assuming evenly spaced ranges spanning 0-100%.
            pm = round(50 / len(perc_ranges))

        x_indices = np.arange(len(perc_ranges))
        # total_pos_pr - total positive # of samples per range.
        total_pos_pr = np.zeros(len(perc_ranges))
        for class_name in self.class_labels:
            # Collect data for this class
            true_positives, totals = range_metrics[class_name]
            pos_class_counts_per_range = np.array(
                self.class_positives[class_name])
            total_pos_pr += pos_class_counts_per_range
            prob_rates = self.class_prob_rates[class_name]

            if self.num_runs is not None:
                totals = [math.ceil(t / self.num_runs) for t in totals]

            # Plotting
            f, ax = plt.subplots(figsize=(FIG_WIDTH, FIG_HEIGHT), dpi=DPI)
            norm = plt.Normalize(0, max(totals))
            colors = mpl.cm.Blues(norm(totals))
            a = ax.bar(x_indices, prob_rates, color=colors, edgecolor='black')

            thex_utils.annotate_plot(ax=ax,
                                     x=x_indices,
                                     y=prob_rates,
                                     annotations=totals)
            plt.xticks(x_indices, perc_ranges, fontsize=TICK_S)
            y_indices, y_ticks = get_perc_ticks()
            plt.yticks(y_indices, y_ticks, fontsize=TICK_S)
            pretty_class_name = clean_class_name(class_name)
            plt.xlabel('Assigned Probability' + r' $\pm$' + str(pm) + '%',
                       fontsize=LAB_S)
            plt.ylabel('Empirical Probability', fontsize=LAB_S)
            ax.set_title(pretty_class_name + extra_title, fontsize=TITLE_S)
            m = mpl.cm.ScalarMappable(norm=norm, cmap=mpl.cm.Blues)
            cbar = plt.colorbar(mappable=m)
            cbar.ax.tick_params(labelsize=LAB_S)

            print("\nProbability vs Class Rates for: " +
                  str(pretty_class_name))
            print(prob_rates)
            thex_utils.display_and_save_plot(
                self.dir, "Probability vs Positive Rate: " +
                pretty_class_name + extra_title)
Example #2
def visualize_completeness(model_dir, X, class_labels, data_completeness):
    """
    Plot completeness of dataset as heatmap. 
    :param model_dir: directory of model to save figure
    :param X: DataFrame of features
    :param class_labels: list of class names
    :param data_completeness: list in order of class names, which contains completeness per feature
    """
    features = get_ordered_features(list(X))
    for index, f in enumerate(features):
        if '_mag' in f:
            features[index] = f.replace("_mag", "")

    df = pd.DataFrame(data_completeness, index=class_labels, columns=features)

    f, ax = plt.subplots(figsize=(FIG_WIDTH, FIG_HEIGHT), dpi=DPI)

    a = plt.pcolor(df, vmin=0, vmax=1, cmap='gist_heat')
    plt.yticks(ticks=np.arange(len(df.index)) + 0.6,
               labels=df.index,
               fontsize=TICK_S - 2)
    plt.xticks(ticks=np.arange(len(df.columns)) + 0.5,
               labels=df.columns,
               fontsize=TICK_S + 2)
    f.colorbar(a)

    util.display_and_save_plot(model_dir, "Completeness", None, f)
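
# Hypothetical usage sketch for visualize_completeness (class/feature names and
# values are illustrative only; assumes the X columns pass through
# get_ordered_features unchanged):
#   X = pd.DataFrame({"g_mag": [20.1, 21.3], "r_mag": [19.8, 19.2]})
#   data_completeness = [[0.9, 1.0],   # completeness per feature for class "Ia"
#                        [0.5, 0.8]]   # completeness per feature for class "II"
#   visualize_completeness("output/model", X, ["Ia", "II"], data_completeness)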
Example #3
def plot_class_hist(model_dir, class_names, counts):
    """
    Plots histogram of class sizes
    :param model_dir: directory of model to save figure
    :param class_counts: Map from class name to counts
    """
    num_classes = len(class_names)
    f, ax = plt.subplots(figsize=(6, 6), dpi=DPI)
    # Plot data horizontally
    bar_width = 0.3
    if num_classes <= 5:
        tick_size = TICK_S + 1
        label_size = LAB_S + 2
        max_y = (bar_width * num_classes) - (bar_width / 2)
    else:
        tick_size = TICK_S + 3
        label_size = LAB_S + 2
        max_y = bar_width * (num_classes)
    class_indices = np.linspace(0, max_y, num_classes)
    ax.barh(y=class_indices, width=counts, height=bar_width, edgecolor='black')

    plt.gcf().subplots_adjust(left=0.1)

    ax.set_xscale('log')
    ax.set_xticks([50, 100, 500, 1000, 5000])
    ax.get_xaxis().set_major_formatter(ScalarFormatter())

    plt.yticks(class_indices, class_names, fontsize=tick_size)
    ax.tick_params(axis='x', which='both', labelsize=tick_size, rotation=-90)

    plt.xlabel('Host galaxies count (log scale)', fontsize=label_size)
    plt.tight_layout()
    util.display_and_save_plot(
        model_dir, "Distribution of Transient Types in Data Sample")
Example #4
def plot_feature_distribution(model_dir, df, feature, class_labels):
    """
    Plots the normal distribution of each transient type in df over 'feature'
    :param model_dir: directory of model to save figure
    :param df: DataFrame with both feature column and TARGET_LABEL column
    :param feature: Name of feature to plot distribution over
    :param class_labels: class labels
    """
    df = relabel_df(df, class_labels)

    f, ax = plt.subplots(figsize=(FIG_WIDTH, FIG_HEIGHT), dpi=DPI)
    min_value = df[feature].min()
    max_value = df[feature].max()
    bins = np.linspace(0, max_value, 50)
    colors = plt.get_cmap('tab20').colors
    for index, class_name in enumerate(class_labels):
        class_values = df[df[TARGET_LABEL] == class_name][feature].values
        mean, std = norm.fit(class_values)
        x = np.linspace(min_value, max_value, 100)
        y = norm.pdf(x, mean, std)
        plt.plot(x, y, color=colors[index], label=class_name)
    plt.xlabel(feature.capitalize(), fontsize=LAB_S)
    plt.ylabel("Normalized density", fontsize=LAB_S)
    plt.legend()
    util.display_and_save_plot(model_dir, "feature_" + str(feature))
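
# Hypothetical usage sketch for plot_feature_distribution (assumes df includes
# the module's TARGET_LABEL column; values are illustrative only):
#   df = pd.DataFrame({"redshift": [0.01, 0.30, 0.05, 0.20],
#                      TARGET_LABEL: ["Ia", "II", "Ia", "II"]})
#   plot_feature_distribution("output/model", df, "redshift", ["Ia", "II"])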
Example #5
    def plot_agg_prob_vs_class_rates(self, total_pos_pr, weighted,
                                     perc_ranges):
        """
        Aggregates probability versus class rates across all classes.
        # of positive class samples per range. So, last index is total number of
        # TP samples with probability in range 90-100%
        :param total_pos_pr: Numpy array of length 10, with total
        :param weighted: Boolean to weigh by class frequency
        """
        aggregated_rates = get_agg_prob_vs_class_rates(total_pos_pr,
                                                       self.class_labels,
                                                       self.class_positives,
                                                       self.class_prob_rates,
                                                       weighted)

        if weighted:
            p_title = 'Aggregated (Weighted) Probability vs Class Rates'
        else:
            p_title = 'Aggregated (Balanced) Probability vs Class Rates'

        print(p_title + ": \n" + str(aggregated_rates))

        f, ax = plt.subplots(figsize=(FIG_WIDTH, FIG_HEIGHT), dpi=DPI)
        x_indices = np.arange(len(perc_ranges))

        # Get average count per bin over runs
        if self.num_runs is not None:
            totals = [math.ceil(t / self.num_runs) for t in total_pos_pr]
        else:
            totals = [int(t) for t in total_pos_pr]

        norm = plt.Normalize(0, max(totals))
        colors = mpl.cm.Blues(norm(totals))

        # Plot aggregated rates
        ax.bar(x_indices, aggregated_rates, color=colors, edgecolor='black')
        plt.xticks(x_indices, perc_ranges, fontsize=TICK_S)

        y_indices, y_ticks = get_perc_ticks()
        plt.yticks(y_indices, y_ticks, fontsize=TICK_S)
        plt.xlabel('Probability ' + r'$\pm$' + '10%', fontsize=LAB_S)
        plt.ylabel('Empirical Probability', fontsize=LAB_S)
        ax.set_title(p_title, fontsize=TITLE_S, pad=20)
        m = mpl.cm.ScalarMappable(norm=norm, cmap=mpl.cm.Blues)
        cbar = plt.colorbar(mappable=m)
        cbar.ax.tick_params(labelsize=LAB_S)

        thex_utils.display_and_save_plot(self.dir, p_title)
Example #6
def plot_feature_hist(model_dir, df, feature, class_labels):
    """
    Plots the histogram of each transient type in df over 'feature'
    :param model_dir: directory of model to save figure
    :param df: DataFrame with both feature column and TARGET_LABEL column
    :param feature: Name of feature to plot distribution over
    :param class_labels: class labels
    """

    df = relabel_df(df, class_labels)

    f, ax = plt.subplots(figsize=(FIG_WIDTH, FIG_HEIGHT), dpi=DPI)

    max_value = df[feature].max()
    bins = np.linspace(0, max_value, 50)
    counts = []
    edges = []
    bars = []
    colors = plt.get_cmap('tab20').colors
    for index, class_name in enumerate(class_labels):
        class_rs = df[df[TARGET_LABEL] == class_name][feature].values
        c, e, b = ax.hist(x=class_rs,
                          bins=bins,
                          density=True,
                          color=colors[index],
                          label=class_name)
        counts.append(c)
        edges.append(e)
        bars.append(b)

    # Iterate over each bin
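    # and re-stack the overlapping bars: the largest count in each bin gets the
    # lowest z-order (drawn at the back) so smaller bars stay visible in front.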
    it_bins = bars[0]
    for bin_index, value in enumerate(it_bins):
        bin_counts = []  # Count per class for this bin
        for class_count in counts:
            bin_counts.append(class_count[bin_index])
        # Sorted biggest to smallest, indices
        sorted_indices = np.flip(np.argsort(bin_counts))
        zorder = 0
        for sorted_index in sorted_indices:
            bars[sorted_index][bin_index].set_zorder(zorder)
            zorder += 1
    plt.xlabel(feature.capitalize(), fontsize=LAB_S)
    plt.ylabel("Normalized density", fontsize=LAB_S)
    plt.legend()
    util.display_and_save_plot(model_dir, "Feature distribution")
Example #7
    def plot_confusion_matrix(self, results):
        """
        Plot confusion matrix
        :param results: List of 2D Numpy arrays, with each row corresponding to
        a sample and each column the probability of that class, in order of
        self.class_labels, and the last column containing the full, true label
        """
        cm = compute_confusion_matrix(results, self.class_labels)
        fig, ax = plt.subplots(figsize=(FIG_WIDTH, FIG_HEIGHT), dpi=DPI)
        hm = ax.imshow(cm, cmap='Blues', interpolation='nearest')
        indices = list(range(len(self.class_labels)))
        ax.set_ylabel("Actual", fontsize=LAB_S)
        ax.set_xlabel("Prediction", fontsize=LAB_S)
        pretty_class_names = clean_class_names(self.class_labels)
        plt.yticks(indices, pretty_class_names, fontsize=TICK_S)
        plt.xticks(indices, pretty_class_names, rotation=-90, fontsize=TICK_S)
        plt.colorbar(hm)
        print("\nConfusion Matrix")
        print(cm)
        thex_utils.display_and_save_plot(self.dir, "Confusion Matrix", fig=fig)
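
        # For reference, a hypothetical `results` shape with
        # self.class_labels = ["Ia", "II"] (values are illustrative only;
        # one 2D array per fold/run is an assumption):
        #   [np.array([[0.8, 0.2, "Ia"],
        #              [0.3, 0.7, "II"]], dtype=object), ...]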
Example #8
    def plot_all_metrics(self, purities, comps, all_pc, y):
        """
        Plot performance metrics for model
        :param purities: Average purity across folds/trials, per class (dict)
        :param comps: Average completeness across folds/trials, per class (dict)
        :param all_pc: Purity & completeness per trial/fold, per class
        :param y: labels (y) for the full dataset
        """
        num_classes = self.get_num_classes()
        c_baselines, p_baselines = compute_baselines(self.class_counts,
                                                     self.class_labels,
                                                     num_classes,
                                                     self.balanced_purity,
                                                     self.class_priors)
        p_intvls, c_intvls = compute_confintvls(all_pc, self.class_labels,
                                                self.balanced_purity)

        f, ax = plt.subplots(nrows=1,
                             ncols=2,
                             sharex=True,
                             sharey=True,
                             figsize=(8, 4),
                             dpi=DPI)

        self.plot_metrics_ax(ax[0], purities, "Purity", p_baselines, p_intvls)
        y_indices, class_names = self.plot_metrics_ax(ax[1], comps,
                                                      "Completeness",
                                                      c_baselines, c_intvls)

        plt.subplots_adjust(wspace=0, hspace=0)

        ax[0].set_yticks(y_indices)
        ax[0].set_yticklabels(clean_class_names(class_names),
                              fontsize=18,
                              horizontalalignment='right')

        thex_utils.display_and_save_plot(self.dir,
                                         self.name + "_metrics.pdf",
                                         bbox_inches='tight',
                                         fig=f)
Example #9
    def plot_example_output(self, row, i=None, priors=None):
        """
        Plots example output for a set of probabilities for a particular host-galaxy
        :param row: Numpy array of probabilities in order of self.class_labels and then TARGET_LABEL
        :param i: Index of sample
        :param priors: Boolean indicating whether priors were used; appended to
        the saved file name
        """
        labels = row[len(row) - 1]
        true_class_index = None
        for class_index, class_name in enumerate(self.class_labels):
            if self.is_class(class_name, labels):
                true_class_index = class_index

        f, ax = plt.subplots(figsize=(5, 3), dpi=220)

        ACC = "#b3e0ff"  # actual class color, light blue
        DCC = "#005c99"  # default class color, dark blue

        colors = [DCC] * len(self.class_labels)
        colors[true_class_index] = ACC
        probabilities = row[0:len(row) - 1]
        x_indices = np.linspace(0,
                                len(self.class_labels) * 0.4,
                                len(self.class_labels))
        ax.bar(x=x_indices, height=probabilities, width=0.4, color=colors)
        ax.set_ylim([0, 1])
        plt.ylabel('Probability Assigned', fontsize=LAB_S)
        plt.xlabel('Class', fontsize=LAB_S)
        pretty_class_names = clean_class_names(self.class_labels)
        plt.xticks(x_indices, pretty_class_names, fontsize=TICK_S)
        title = "example_output"
        if i is not None:
            title += "_" + str(i)
        if priors is not None:
            title += "_" + str(priors)

        if not os.path.exists(self.dir + '/examples'):
            os.mkdir(self.dir + '/examples')
        thex_utils.display_and_save_plot(self.dir + '/examples', title)
Example #10
    def plot_prob_pc_curves(self, range_metrics):
        """
        Plot purity & completeness curves relative to >= probability assigned to event
        :param range_metrics: Map of classes to [TP_range_sums, total_range_sums] where total_range_sums is the number of samples with probability in range for this class and TP_range_sums is the true positives per range
        :param class_counts: Map from class name to counts
        """

        for class_name in range_metrics.keys():
            true_positives, totals = range_metrics[class_name]
            fig, ax1 = plt.subplots(figsize=(FIG_WIDTH, FIG_HEIGHT), dpi=DPI)
            class_total = self.class_counts[class_name]
            if self.num_runs is not None:
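                # Each run appears to evaluate ~33% of the class (test-split
                # assumption), so scale the completeness denominator accordingly.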
                class_total = self.num_runs * class_total * .33

            purities = []  # Purity per range (true positives / total in range)
            comps = []  # Completeness per range (true positives / class total)
            TP_count = 0
            total_count = 0
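            # Accumulate from the highest-probability range downward, so each
            # point reflects samples assigned a probability >= that range.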
            for index in reversed(range(len(true_positives))):
                cur_p = 0  # Current purity
                cur_c = 0  # Current completeness
                TP_count += true_positives[index]
                total_count += totals[index]
                if total_count != 0:
                    # positive class samples / totals # with prob in range
                    cur_p = TP_count / total_count
                if class_total != 0:
                    cur_c = TP_count / class_total

                purities.append(cur_p)
                comps.append(cur_c)
            purities.reverse()
            comps.reverse()

            def plot_axis(ax, data, label, color):
                """
                Plot data on axis in certain color
                """
                x_indices = np.linspace(0, 1, len(data))
                ax.set_ylabel(label, color=color, fontsize=LAB_S)
                ax.scatter(x_indices, data, color=color, s=4)
                ax.plot(x_indices, data, color=color, linewidth=4)
                ax.set_ylim([0, 1])
                y_indices, y_ticks = get_perc_ticks()
                plt.yticks(ticks=y_indices,
                           labels=y_ticks,
                           color=color,
                           fontsize=TICK_S)

            ax1.set_xlabel(r'Probability $\geq$', fontsize=LAB_S)
            plot_axis(ax1, purities, "Purity", 'tab:red')
            ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
            plot_axis(ax2, comps, "Completeness", 'tab:blue')

            pretty_class_name = clean_class_name(class_name)
            x_indices, x_ticks = get_perc_ticks()
            plt.xticks(x_indices, x_ticks, fontsize=TICK_S)
            plt.title(pretty_class_name, fontsize=TITLE_S)

            thex_utils.display_and_save_plot(
                model_dir=self.dir,
                file_name=self.name +
                " Purity and Completeness vs. Probability: " +
                pretty_class_name,
                bbox_inches=None,
                fig=fig)
Example #11
    def plot_density_performance(self, unnorm_results):
        """
        Plots purity, completeness, and accuracy vs. the X% of top unnormalized
        probabilities (densities) evaluated, plus purity per class.
        """
        p, c, a, class_purities = self.get_avg_performances(unnorm_results)

        # Plot aggregated performance metrics
        fig, ax = plt.subplots(figsize=(FIG_WIDTH, FIG_HEIGHT),
                               dpi=DPI,
                               tight_layout=True)
        clean_plot(p, ax, "Purity", 'red')
        clean_plot(c, ax, "Completeness", 'blue')
        clean_plot(a, ax, "Accuracy", 'green')
        ax.set_xlabel("% Top Densities", fontsize=LAB_S)
        ax.set_ylabel("Performance", fontsize=LAB_S)
        ax.set_ylim([0, 1.01])
        y_indices, y_ticks = get_perc_ticks()
        plt.xticks(y_indices * 100, y_ticks, fontsize=TICK_S)
        plt.yticks(y_indices, y_ticks, fontsize=TICK_S)
        ax.legend()

        thex_utils.display_and_save_plot(self.dir,
                                         "Average Performance vs Density",
                                         fig=fig)

        # Plot purity per class
        fig, ax = plt.subplots(figsize=(FIG_WIDTH, FIG_HEIGHT),
                               dpi=DPI,
                               tight_layout=True)

        colors = plt.get_cmap('tab20').colors
        for class_index, class_name in enumerate(self.class_labels):
            if self.num_runs is not None:
                raise ValueError(
                    "Can only aggregate this over folds, not runs.")
            total = self.class_counts[class_name]
            purities = class_purities[class_name]
            x = []
            y = []
            star = None
            for vals in purities:
                if vals is not None:
                    TP, den, i, class_count = vals
                    x.append(i)
                    y.append(TP / den)
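                    # Star the point where roughly 50% of the class's samples
                    # have been evaluated.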
                    if round(class_count / total, 1) == 0.5:
                        star = [i, TP / den]
            pretty_class_name = clean_class_name(class_name)
            print("\n" + pretty_class_name + " purities " +
                  str([x for x in zip(x, y)]))
            ax.scatter(x, y, color=colors[class_index], s=2)
            ax.plot(x, y, color=colors[class_index], label=pretty_class_name)
            if star is not None:
                ax.plot(star[0],
                        star[1],
                        marker='*',
                        color=colors[class_index])

        ax.set_ylabel("Purity", fontsize=LAB_S)
        ax.set_xlabel("% Top Densities", fontsize=LAB_S)
        y_indices, y_ticks = get_perc_ticks()
        plt.xticks(y_indices, y_ticks, fontsize=TICK_S)
        plt.yticks(y_indices, y_ticks, fontsize=TICK_S)
        ax.legend(loc='upper center',
                  bbox_to_anchor=(1.3, 1),
                  ncol=1,
                  prop={'size': LAB_S - 2})

        thex_utils.display_and_save_plot(self.dir,
                                         "Prob Density % vs Purities")