Пример #1
0
def raise_if_mean_differs_from(accuracy_balanced,
                               class_sizes,
                               reference_level=None,
                               eps_chance_acc=None,
                               method_descr=''):
    """Raise a ValueError when CV-estimated mean balanced accuracy strays
    too far from a reference level.

    Works for multi-class problems as well as binary ones.

    Parameters
    ----------
    accuracy_balanced : array-like
        Balanced-accuracy estimates, shape [num_repetitions, num_datasets];
        averaged over axis 0 before comparison.
    class_sizes : sized iterable
        Class sizes; used to derive default tolerance and reference level.
    reference_level : float, optional
        Expected accuracy in (0, 1]. Defaults to chance accuracy computed
        from ``class_sizes``.
    eps_chance_acc : float, optional
        Allowed absolute deviation. Defaults to max(0.02, 0.1 / num classes).
    method_descr : str, optional
        Label included in the printed report.

    Raises
    ------
    ValueError
        If ``reference_level`` is outside (0, 1], or any mean accuracy
        deviates from it by more than ``eps_chance_acc``.
    """

    if eps_chance_acc is None:
        num_classes = len(class_sizes)
        eps_chance_acc = max(0.02, 0.1 / num_classes)

    if reference_level is None:
        reference_level = chance_accuracy(class_sizes)
    elif not (0.0 < reference_level <= 1.0):
        raise ValueError('invalid reference_level: must be in (0, 1]')

    # chance calculation expects the "average", not the median
    mean_accuracies = np.mean(accuracy_balanced, axis=0)
    for mean_acc in mean_accuracies:
        report = ('for {},\n reference level accuracy expected: {} '
                  '-- Estimated via CV:  {}').format(method_descr,
                                                     reference_level,
                                                     mean_acc)
        print(report)
        deviation = abs(mean_acc - reference_level)
        if deviation > eps_chance_acc:
            raise ValueError('they substantially differ by {:.4f} that is '
                             'more than {:.4f}!'.format(
                                 deviation, eps_chance_acc))
Пример #2
0
def raise_if_mean_differs_from_chance(accuracy_balanced, class_sizes,
                                      eps_chance_acc=None):
    """Raise a ValueError when CV-estimated mean balanced accuracy differs
    from chance accuracy by more than ``eps_chance_acc``.

    Generic: works for multi-class problems too.
    """

    if eps_chance_acc is None:
        eps_chance_acc = max(0.02, 0.1 / len(class_sizes))

    expected_chance = chance_accuracy(class_sizes)
    # chance calculation expects the "average", not the median
    for estimate in np.mean(accuracy_balanced, axis=0):
        print('Chance accuracy expected: {} -- Estimated via CV:  {}'.format(expected_chance, estimate))
        if abs(estimate - expected_chance) > eps_chance_acc:
            raise ValueError('they substantially differ by more than {:.4f}!'.format(eps_chance_acc))
Пример #3
0
    def _summarize_expt(self):
        """Print a human-readable summary of the experiment configuration.

        Reports the CV setup, dimensionality-reduction and model choices,
        any covariates/deconfounding, and — for classification workflows —
        the estimated (balanced) chance accuracy. Writes to stdout only;
        returns nothing.
        """

        print('\nCURRENT EXPERIMENT:\n{line}'.format(line='-' * 50))
        # NOTE: '{:.2}' formats to 2 significant digits, not 2 decimals
        print('Training percentage      : {:.2}'.format(self.train_perc))
        print('Number of CV repetitions : {}'.format(self.num_rep_cv))
        print('Number of processors     : {}'.format(self.num_procs))
        print('Dim reduction method     : {}'.format(self.dim_red_method))
        print('Dim reduction size       : {}'.format(self.reduced_dim))
        print('Predictive model chosen  : {}'.format(self.pred_model))
        print('Grid search level        : {}\n'.format(self.grid_search_level))

        if len(self.covariates) > 0:
            # fixed typos in user-facing labels
            # ('Covarites' -> 'Covariates', 'Deconfoudning' -> 'Deconfounding')
            print('Covariates selected      : {}'.format(', '.join(
                self.covariates)))
            print('Deconfounding method     : {}\n'.format(self.deconfounder))

        if self._workflow_type == 'classify':
            # balanced chance accuracy accounts for unequal class sizes
            self._target_sizes = list(self.datasets.target_sizes.values())
            self._chance_accuracy = chance_accuracy(self._target_sizes,
                                                    'balanced')
            print('Estimated chance accuracy : {:.3f}\n'
                  ''.format(self._chance_accuracy))
Пример #4
0
def metric_distribution(metric,
                        labels,
                        output_path,
                        class_sizes,
                        num_classes=2,
                        metric_label='balanced accuracy'):
    """Save violin plots of a CV metric's distribution, one per dataset.

    Parameters
    ----------
    metric : ndarray
        Metric values of shape [num_repetitions, num_datasets].
    labels : sequence of str
        One label per dataset (column of ``metric``).
    output_path : str
        Path prefix; the figure is saved to ``output_path + '.pdf'``.
    class_sizes : sized iterable
        Class sizes used to compute the (balanced) chance accuracy marker.
    num_classes : int
        Number of classes; used only to pick the lower y-limit.
    metric_label : str
        y-axis label.

    Raises
    ------
    ValueError
        If fewer labels than datasets are supplied.
    """

    # (removed unused local num_repetitions = metric.shape[0])
    num_datasets = metric.shape[1]
    if len(labels) < num_datasets:
        raise ValueError(
            "Insufficient number of labels for {} features!".format(
                num_datasets))
    # one violin per dataset, centred on ticks 1..num_datasets
    method_ticks = 1.0 + np.arange(num_datasets)

    fig, ax = plt.subplots(figsize=cfg.COMMON_FIG_SIZE)
    line_coll = ax.violinplot(metric,
                              widths=cfg.violin_width,
                              bw_method=cfg.violin_bandwidth,
                              showmedians=True,
                              showextrema=False,
                              positions=method_ticks)

    # one distinct color per dataset
    cmap = cm.get_cmap(cfg.CMAP_DATASETS, num_datasets)
    for cc, ln in enumerate(line_coll['bodies']):
        ln.set_facecolor(cmap(cc))
        ln.set_label(labels[cc])

    ax.tick_params(axis='both', which='major', labelsize=15)
    ax.grid(axis='y', which='major', linewidth=cfg.LINE_WIDTH, zorder=0)

    # NOTE(review): the upper limit uses np.min, clipping the axis at
    # metric.max() (or 1.01) with no headroom above the data -- confirm
    # this is intended rather than np.max.
    lower_lim = np.round(np.min([np.float64(0.9 / num_classes),
                                 metric.min()]), cfg.PRECISION_METRICS)
    upper_lim = np.round(np.min([1.01, metric.max()]), cfg.PRECISION_METRICS)
    step_tick = 0.05
    ax.set_ylim(lower_lim, upper_lim)

    ax.set_xlim(np.min(method_ticks) - 1, np.max(method_ticks) + 1)
    ax.set_xticks(method_ticks)

    ytick_loc = np.arange(lower_lim, upper_lim, step_tick)
    # add a tick for chance accuracy; as the classifier is trained on a
    # stratified set, the balanced version must be used
    chance_acc = chance_accuracy(class_sizes, 'balanced')

    # rounding to ensure improved labels
    chance_acc = np.round(chance_acc, cfg.PRECISION_METRICS)
    ytick_loc = np.round(np.append(ytick_loc, chance_acc),
                         cfg.PRECISION_METRICS)

    ax.set_yticks(ytick_loc)
    ax.set_yticklabels(ytick_loc)
    plt.text(0.05, chance_acc, 'chance accuracy')
    plt.ylabel(metric_label, fontsize=cfg.FONT_SIZE)

    plt.tick_params(axis='both', which='major', labelsize=cfg.FONT_SIZE)

    # numbered labels so legend entries map back to violin positions
    numbered_labels = [
        '{} {}'.format(int(ix), lbl) for ix, lbl in zip(method_ticks, labels)
    ]

    # putting legend outside the plot, below the axes
    fig.subplots_adjust(bottom=0.2)
    leg = ax.legend(numbered_labels, ncol=2, loc=9, bbox_to_anchor=(0.5, -0.1))
    # setting colors manually as plot has been through arbitrary jumps
    # NOTE(review): Legend.legendHandles is deprecated (removed in
    # matplotlib 3.9) in favor of legend_handles -- update once the minimum
    # supported matplotlib version allows.
    for ix, lh in enumerate(leg.legendHandles):
        lh.set_color(cmap(ix))

    leg.set_frame_on(False)  # making legend background transparent

    fig.savefig(output_path + '.pdf',
                bbox_extra_artists=(leg, ),
                bbox_inches='tight')

    plt.close()

    return