Example #1
def run_test(data, test, *args, groups=None, **kwargs):
    """
    Runs test on each column of data, correcting for multiple comparisons.
    
    Parameters
    ----------
    data : pandas.DataFrame
        Shape [n_observations, n_features].
    test : function
        Called per column as `test(column, *args, **kwargs)`, and must return
        the test statistic and p-value (as per the `scipy.stats` API).
    groups : pd.Series, optional
        If this is present, each column is stratified by group, and the call
        becomes `test(column[g1], column[g2], ..., *args, **kwargs)`.
    
    Returns
    -------
    statistics : pandas.Series
        Shape [n_features,].
    pvalues : pandas.Series
        Shape [n_features,].
    """

    data, groups, _ = utils.sanitise_inputs(data, groups, None)

    statistics = np.full([data.shape[1]], np.nan)
    pvalues = np.full([data.shape[1]], np.nan)
    for c, col in enumerate(data.columns):
        if groups is None:
            statistics[c], pvalues[c] = test(data[col], *args, **kwargs)
        else:
            statistics[c], pvalues[c] = test(
                *[data.loc[groups == g, col] for g in groups.cat.categories],
                *args, **kwargs)
    _, pvalues, _, _ = mt.multipletests(pvalues)

    statistics = pd.Series(data=statistics, index=data.columns)
    pvalues = pd.Series(data=pvalues, index=data.columns)

    return statistics, pvalues
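
A minimal usage sketch of the calling conventions described above, assuming `run_test` (and its `utils`/`mt` dependencies) is importable from this module; the DataFrame `df` and the `groups` series are made-up illustration data:

import numpy as np
import pandas as pd
import scipy.stats

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(60, 3)), columns=['a', 'b', 'c'])
groups = pd.Series(['g1'] * 30 + ['g2'] * 30, index=df.index,
                   name='Group', dtype='category')

# Per-column one-sample test: test(column, *args, **kwargs)
stats, pvals = run_test(df, scipy.stats.ttest_1samp, popmean=0.0)

# Per-column group comparison: test(column[g1], column[g2], ...)
stats, pvals = run_test(df, scipy.stats.kruskal, groups=groups)
print(pvals[pvals < 0.05])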
Example #2
def run_tests(data,
              labels=None,
              *,
              confounds=None,
              demean_confounds=True,
              alpha=0.05):
    """
    Runs several simple statistical tests, primarily for group differences.
    
    Parameters
    ----------
    data : pandas.DataFrame
        Shape [n_observations, n_features].
    labels : pandas.Series
        Shape [???,]. Must be indexable by `data.index`.
    
    Other Parameters
    ----------------
    confounds : pandas.DataFrame, optional
        Shape [???, n_confounds]. Must be indexable by `data.index`.
    demean_confounds : bool, optional
        If `True`, confounds are normalised along the features axis.
    alpha : float, optional
        Only print results of tests where `p < alpha`.
    """

    # Dummy labels (i.e. just a single group)
    if labels is None:
        labels = ['Group'] * data.shape[0]
        labels = pd.Series(data=labels, index=data.index, dtype='category')

    data, labels, confounds = utils.sanitise_inputs(data, labels, confounds)
    if confounds is not None:
        data = utils.remove_confounds(data, confounds, demean_confounds)

    print("Running statistical tests...")
    print("Correction for multiple comparisons is across features, not tests.")
    print("No. of features: {:d}".format(data.shape[1]))
    print("No. of observations: {:d}".format(data.shape[0]))
    if confounds is not None:
        print("No. of confounds: {:d}".format(confounds.shape[1]))
    print("No. of classes: {:d}".format(len(labels.cat.categories)))
    print("Classes: {}".format(", ".join(map(str, labels.cat.categories))))
    print()

    # Test groups individually
    for cat in labels.cat.categories:
        print("{}: {}".format(labels.name, cat))
        c_data = data.loc[labels == cat, :]
        print("No. of observations: {:d}".format(c_data.shape[0]))

        # Are observations normally distributed?
        # https://doi.org/10.1080/00949655.2010.520163
        print("Shapiro-Wilk test for non-normality:")
        s, p = run_test(c_data, scipy.stats.shapiro)
        print_test_results(s, p, alpha=alpha)

        print()

    # And test for group differences etc...
    if len(labels.cat.categories) >= 2:
        #print("ANOVA for difference in group means:")
        #s, p = run_test(data, scipy.stats.f_oneway, groups=labels)
        #print_test_results(s, p, alpha=alpha)
        #print()

        print("Kruskal-Wallis H-test for difference in group medians:")
        s, p = run_test(data, scipy.stats.kruskal, groups=labels)
        print_test_results(s, p, alpha=alpha)
        print()

        print("Levene test for difference in group variances:")
        s, p = run_test(data, scipy.stats.levene, groups=labels)
        print_test_results(s, p, alpha=alpha)
        print()

    else:
        # Not clear from the docs if `scipy.stats.wilcoxon` is a suitable
        # alternative in the one-sample case
        print("t-test for non-zero means:")
        s, p = run_test(data, scipy.stats.ttest_1samp, popmean=0.0)
        print_test_results(s, p, alpha=alpha)
        print()

    return
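
A sketch of how `run_tests` might be called, again with made-up data; `age` stands in for a hypothetical confound that gets regressed out of the features before testing:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(80, 4)), columns=['f1', 'f2', 'f3', 'f4'])
labels = pd.Series(rng.choice(['patient', 'control'], size=80),
                   index=df.index, name='Diagnosis', dtype='category')
age = pd.DataFrame({'Age': rng.integers(20, 80, size=80)}, index=df.index)

# Shapiro-Wilk within each group, then Kruskal-Wallis and Levene tests
# across groups, reporting only results with p < alpha
run_tests(df, labels, confounds=age, alpha=0.05)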
Example #3
def explain_classifier(data,
                       labels,
                       classifier,
                       parameters={},
                       *,
                       confounds=None,
                       demean_confounds=True,
                       l1_reg='bic',
                       **kwargs):
    """
    Generates an interpretation of the classifier output.
    
    Parameters
    ----------
    data : pandas.DataFrame
        Shape [n_observations, n_features].
    labels : pandas.Series
        Shape [???,]. Must be indexable by `data.index`.
    classifier
        A classifier (or pipeline) conforming to the sklearn interface, which
        must also provide `predict_proba()`.
    parameters : dict, optional
        A set of hyperparameters to tune, conforming to the sklearn
        `GridSearchCV` interface.
    
    Returns
    -------
    explanation
        This contains the data, fitted classifier, and the explanation in the
        form of a shap `KernelExplainer` and a set of SHAP values.
    
    Other Parameters
    ----------------
    confounds : pandas.DataFrame, optional
        Shape [???, n_confounds]. Must be indexable by `data.index`.
    demean_confounds : bool, optional
        If `True`, confounds are normalised along the features axis.
    l1_reg : optional
        Passed to `shap.KernelExplainer.shap_values()`.
    **kwargs
        All other keyword args are passed to `optimise_classifier()` to
        modify the inner cross-validation loop for hyperparameter tuning.
    """

    print("Generating a classifier explanation...")

    data, labels, confounds = utils.sanitise_inputs(data, labels, confounds)

    # Preprocess data
    data = data.apply(sklearn.preprocessing.scale)
    if confounds is not None:
        data = utils.remove_confounds(data, confounds, demean_confounds)

    # Fit to data, and tune hyperparameters
    clf = optimise_classifier(data, labels.cat.codes, classifier, parameters,
                              **kwargs)
    print("Classifier parameters optimised.")
    print(clf.best_params_)
    clf = clf.best_estimator_

    explainer = shap.KernelExplainer(clf.predict_proba,
                                     data,
                                     link='logit',
                                     keep_index=True)
    shap_values = explainer.shap_values(data, l1_reg=l1_reg)
    print("SHAP values generated.")

    # And wrap all the results up
    explanation = types.SimpleNamespace()
    explanation.data = data
    explanation.labels = labels
    explanation.classifier = clf
    explanation.clf_categories = labels.cat.categories[clf.classes_]
    explanation.explainer = explainer
    explanation.shap_values = shap_values

    print()
    return explanation
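
A hedged usage sketch, reusing the hypothetical `df`/`labels` objects from the earlier sketches; `RandomForestClassifier` is just one example of a classifier that provides `predict_proba()`:

import sklearn.ensemble

clf = sklearn.ensemble.RandomForestClassifier()
params = {'n_estimators': [100, 300], 'max_depth': [None, 5]}

explanation = explain_classifier(df, labels, clf, params)

# The returned namespace can then be fed to shap's plotting helpers, e.g.
# shap.summary_plot(explanation.shap_values[0], explanation.data)
# (the exact structure of `shap_values` depends on the shap version)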
Example #4
def predict(data,
            labels,
            classifier,
            parameters={},
            *,
            confounds=None,
            demean_confounds=True,
            normalise=True,
            cv_iter=None,
            groups=None,
            inner_cv_kwargs={},
            return_probabilities=False,
            verbose=True):
    """
    Predicts `labels` from `data` using nested cross-validation.
    
    Parameters
    ----------
    data : pandas.DataFrame
        Shape [n_observations, n_features].
    labels : pandas.Series
        Shape [???,]. Must be indexable by `data.index`.
    classifier
        A classifier (or pipeline) conforming to the sklearn interface. See
        `get_default_classifier()`.
    parameters : dict, optional
        A set of hyperparameters to tune, conforming to the sklearn
        `GridSearchCV` interface.
    
    Returns
    -------
    predictions : pandas.Series
        Shape [[n_folds, fold_size],] (i.e. a MultiIndex over folds).
    probabilities : pandas.DataFrame, optional
        Shape [[n_folds, fold_size], n_classes]. Only returned if
        `return_probabilities` is set.
    
    Other Parameters
    ----------------
    confounds : pandas.DataFrame, optional
        Shape [???, n_confounds]. Must be indexable by `data.index`.
    demean_confounds : bool, optional
        If `True`, confounds are normalised along the features axis (respecting
        the train/test split). This is almost always needed to stop e.g.
        conflation of mean effects and the confound variance.
    normalise : bool, optional
        If `True`, data is normalised along the features axis (respecting the
        train/test split).
    cv_iter : optional
        A cross-validation generator from `sklearn.model_selection`.
        Default: `StratifiedKFold`.
    groups : pandas.Series, optional
        Shape [???,]. Must be indexable by `data.index`. Passed to `cv_iter` to
        allow for stratification based on group membership.
    inner_cv_kwargs : dict, optional
        Passed to `optimise_classifier()` to modify the inner
        cross-validation loop for hyperparameter tuning.
    return_probabilities : bool, optional
        Whether to calculate the class probabilities from the classifier (which
        must therefore implement `predict_proba()`).
    verbose : bool, optional
    """
    # samples = Panel [n_samples, n_observations, n_features]
    print = builtins.print if verbose else lambda *a, **k: None

    data, labels, confounds = utils.sanitise_inputs(data, labels, confounds)
    # Working with codes is safer (e.g. sklearn doesn't like `dtype=object`)
    codes = labels.cat.codes
    if groups is not None:
        groups = groups[data.index]
        groups = groups.astype('category').cat.codes

    print("Classifying data...")
    print("No. of features: {:d}".format(data.shape[1]))
    print("No. of observations: {:d}".format(data.shape[0]))
    if confounds is not None:
        print("No. of confounds: {:d}".format(confounds.shape[1]))
    print("No. of classes: {:d}".format(len(labels.cat.categories)))
    print("Classes: {}".format(", ".join(map(str, labels.cat.categories))))
    print()

    # Initialise storage for results
    predictions = []
    if return_probabilities:
        probabilities = []

    if cv_iter is None:
        # Keep n_splits small enough that every class can appear in each fold
        n_splits = min(labels.value_counts())
        n_splits = min(10, max(3, n_splits))
        cv_iter = sklearn.model_selection.StratifiedKFold(n_splits=n_splits,
                                                          shuffle=True)

    # Outer cross-validation loop
    n = 1
    n_folds = cv_iter.get_n_splits(data, codes, groups)
    for train_inds, test_inds in cv_iter.split(data, codes, groups):
        # Train / test split
        train_inds, test_inds = data.index[train_inds], data.index[test_inds]
        X_train, X_test = data.loc[train_inds, :], data.loc[test_inds, :]
        y_train, y_test = codes[train_inds], codes[test_inds]

        if confounds is not None:
            C_train = confounds.loc[train_inds, :]
            C_test = confounds.loc[test_inds, :]
            # Normalise (respecting decision to remove means)
            scaler = sklearn.preprocessing.StandardScaler(
                with_mean=demean_confounds)
            scaler.fit(C_train)
            # `with_mean`: If True, center the data before scaling.
            C_train = scaler.transform(C_train)
            C_test = scaler.transform(C_test)
            # Remove confounds (but don't remove the data means)
            regression = sklearn.linear_model.LinearRegression(
                fit_intercept=False)
            regression.fit(C_train, X_train)
            X_train = X_train - regression.predict(C_train)
            X_test = X_test - regression.predict(C_test)

        if normalise:
            scaler = sklearn.preprocessing.StandardScaler().fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

        # Fit to data!
        clf = optimise_classifier(X_train, y_train, classifier, parameters,
                                  **inner_cv_kwargs)
        #clf.fit(X_train, y_train)

        # Record performance
        pred = pd.Categorical.from_codes(clf.predict(X_test),
                                         labels.cat.categories)
        predictions.append(
            pd.Series(data=pred,
                      index=test_inds,
                      name='Predicted_' + labels.name))
        if return_probabilities:
            clf_categories = labels.cat.categories[
                clf.best_estimator_.classes_]
            probabilities.append(
                pd.DataFrame(data=clf.predict_proba(X_test),
                             index=test_inds,
                             columns=clf_categories))

        print("Finished iteration {} of {}".format(n, n_folds))
        if len(clf.best_params_) > 0:
            print(clf.best_params_)
        n += 1

    # Turn into multiindex over folds
    predictions = pd.concat(predictions,
                            axis='index',
                            keys=range(n_folds),
                            names=['Fold', data.index.name])
    if return_probabilities:
        probabilities = pd.concat(probabilities,
                                  axis='index',
                                  keys=range(n_folds),
                                  names=['Fold', data.index.name])
        # If not all classes in a given `y_train`, then the missing classes get
        # filled as NaN when concatenating. This is probably what we want: it's
        # a clear flag that the classifier is solving a different problem
        #probabilities = probabilities.fillna(0.0)

    print()
    if return_probabilities:
        return predictions, probabilities
    else:
        return predictions
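
A sketch of a typical call, again with the hypothetical `df`, `labels` and `age` objects from the earlier sketches; the MultiIndex over folds is dropped to score the out-of-fold predictions against the true labels:

import sklearn.linear_model

clf = sklearn.linear_model.LogisticRegression(max_iter=1000)
params = {'C': [0.1, 1.0, 10.0]}

preds = predict(df, labels, clf, params, confounds=age)

# Out-of-fold accuracy
oof = preds.droplevel('Fold')
print("Accuracy: {:.3f}".format((oof == labels.loc[oof.index]).mean()))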
Example #5
def visualise_data(data,
                   labels=None,
                   *,
                   confounds=None,
                   demean_confounds=True):
    """
    Plots several summaries of a data set.
    
    Parameters
    ----------
    data : pandas.DataFrame
        Shape [n_observations, n_features].
    labels : pandas.Series, optional
        Shape [???,]. Must be indexable by `data.index`.
    
    Other Parameters
    ----------------
    confounds : pandas.DataFrame, optional
        Shape [???, n_confounds]. Must be indexable by `data.index`.
    demean_confounds : bool, optional
        If `True`, confounds are normalised along the features axis.
    """
    # samples = Panel [n_samples, n_observations, n_features]

    data, labels, confounds = utils.sanitise_inputs(data, labels, confounds)
    n_o, n_f = data.shape
    if labels is not None:
        n_c = len(labels.cat.categories)
    else:
        n_c = 1  # All same class
    if confounds is not None:
        data = utils.remove_confounds(data, confounds, demean_confounds)

    #--------------------------------------------------------------------------

    # Plot data distributions
    plot_kws = {}
    if n_o > 100:
        plot_kws['s'] = 5
        plot_kws['edgecolor'] = None
    if n_f <= 10:
        # Full interactions for small data
        if labels is None:
            fig = sns.pairplot(data, plot_kws=plot_kws)
        else:
            fig = sns.pairplot(data.join(labels),
                               hue=labels.name,
                               vars=data.columns,
                               plot_kws=plot_kws)
            # Need to specify vars as pairplot checks values not
            # categorical-ness (e.g. labels=[1,1,2,3] is numeric...)
            # https://github.com/mwaskom/seaborn/issues/919#issuecomment-366872386
    elif n_f * n_c <= 75:
        # For intermediate data, just plot marginals
        norm_data = data.apply(sklearn.preprocessing.scale)
        if labels is None:
            fig = sns.catplot(data=norm_data, kind='violin', inner='quartile')
        else:
            plot_data = norm_data.join(labels).melt(labels.name,
                                                    var_name='Feature',
                                                    value_name='Value')
            fig = sns.catplot(data=plot_data,
                              x='Feature',
                              y='Value',
                              hue=labels.name,
                              order=data.columns,
                              kind='boxen',
                              legend_out=False)
        fig.set_xticklabels(rotation=45, horizontalalignment='right')
        fig.ax.set_title("Feature distributions")
    else:
        # Top PCA components for big data
        norm_data = data.apply(sklearn.preprocessing.scale)
        pca = sklearn.decomposition.PCA(
            n_components=5).fit_transform(norm_data)
        pca = pd.DataFrame(
            data=pca,
            index=data.index,
            columns=['PCA: #{:d}'.format(i) for i in range(pca.shape[1])])
        if labels is None:
            fig = sns.pairplot(pca, plot_kws=plot_kws)
        else:
            fig = sns.pairplot(pca.join(labels),
                               hue=labels.name,
                               vars=pca.columns,
                               plot_kws=plot_kws)

    try:
        fig.fig.tight_layout()
        # This makes sure we don't lose any axes labels etc, but also makes the
        # legend almost impossible to see for `sns.pairplot()`... However,
        # there is currently no way to customise the legend creation so it
        # stays for now.
    except NameError:
        pass

    #--------------------------------------------------------------------------

    # Correlations between features
    if n_f <= 200:
        fig, ax = plt.subplots()
        ax = sns.heatmap(data.corr(),
                         ax=ax,
                         square=True,
                         vmin=-1.0,
                         center=0.0,
                         vmax=1.0,
                         cmap='RdBu_r',
                         cbar_kws={'label': "Correlation coefficient"})
        #ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
        ax.set_title("Feature correlations")
        fig.tight_layout()

    #--------------------------------------------------------------------------

    return
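
A short usage sketch; with a handful of features this would produce the label-coloured pairplot plus the correlation heatmap (matplotlib assumed imported as `plt`, as in the function body):

import matplotlib.pyplot as plt

visualise_data(df, labels, confounds=age)
plt.show()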
Example #6
def visualise_manifold(data,
                       labels=None,
                       *,
                       confounds=None,
                       demean_confounds=True,
                       normalise=True,
                       manifold=sklearn.manifold.TSNE(init='pca')):
    """
    Plots the data set on a low-dimensional manifold.
    
    Parameters
    ----------
    data : pandas.DataFrame
        Shape [n_observations, n_features].
    labels : pandas.Series, optional
        Shape [???,]. Must be indexable by `data.index`.
    
    Other Parameters
    ----------------
    confounds : pandas.DataFrame, optional
        Shape [???, n_confounds]. Must be indexable by `data.index`.
    demean_confounds : bool, optional
        If `True`, confounds are normalised along the features axis.
    normalise : bool, optional
        If `True`, data is normalised along the features axis.
    manifold : optional
        Instance of a class from `sklearn.manifold` or `sklearn.decomposition`.
        Use this to pass in a different algorithm for dimensionality reduction.
    """
    # samples = Panel [n_samples, n_observations, n_features]

    data, labels, confounds = utils.sanitise_inputs(data, labels, confounds)
    n_o, n_f = data.shape
    if labels is not None:
        n_c = len(labels.cat.categories)
    else:
        n_c = 1  # All same class
    if confounds is not None:
        data = utils.remove_confounds(data, confounds, demean_confounds)

    # Preprocess data
    if normalise:
        data = data.apply(sklearn.preprocessing.scale)
    # Concatenate samples...
    if n_f > 50:
        # Reduce data before passing to manifold?
        pass

    # Find the data in the embedding space
    manifold.set_params(n_components=2)  # Sanity check...
    embedding = manifold.fit_transform(data)
    # Separate samples...

    # And plot
    fig, ax = plt.subplots()
    # Plot samples...
    if labels is None:
        ax.plot(embedding[:, 0], embedding[:, 1], 'o', markersize=10)
    else:
        for c in range(n_c):
            ax.plot(embedding[labels.cat.codes == c, 0],
                    embedding[labels.cat.codes == c, 1],
                    'o',
                    markersize=10,
                    markeredgecolor='k',
                    label=labels.cat.categories[c])
        ax.legend(title=labels.name)

    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title("Data manifold")
    fig.tight_layout()

    return
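
A usage sketch showing both the default t-SNE embedding and the docstring's suggestion of swapping in another two-component reducer, here PCA from `sklearn.decomposition`:

import sklearn.decomposition
import matplotlib.pyplot as plt

# Default: t-SNE initialised with PCA
visualise_manifold(df, labels)

# Any estimator exposing set_params(n_components=...) and fit_transform()
visualise_manifold(df, labels, manifold=sklearn.decomposition.PCA())
plt.show()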