def run_test(data, test, *args, groups=None, **kwargs):
    """
    Runs test on each column of data, correcting for multiple comparisons.

    Parameters
    ----------
    data : pandas.DataFrame
        Shape [n_observations, n_features].
    test : function
        Called once per feature as `test(data[col], *args, **kwargs)`, and
        must return the test statistic and p-value (as per the `scipy.stats`
        API).
    groups : pd.Series, optional
        If this is present, each column of `data` is stratified based on
        group, and the call is then
        `test(data[g1], data[g2], ..., *args, **kwargs)`.

    Returns
    -------
    statistics : pandas.Series
        Shape [n_features,].
    pvalues : pandas.Series
        Shape [n_features,]. These are the values returned by
        `mt.multipletests()` (called with its default settings), i.e. the
        correction is across features.
    """

    data, groups, _ = utils.sanitise_inputs(data, groups, None)

    n_features = data.shape[1]
    # Use `np.nan`: the `np.NaN` alias was removed in NumPy 2.0
    statistics = np.full([n_features], np.nan)
    pvalues = np.full([n_features], np.nan)

    for c, col in enumerate(data.columns):
        if groups is None:
            # One-sample case: the whole column is the only sample
            samples = [data[col]]
        else:
            # One sample per category, in category order
            samples = [data.loc[groups == g, col]
                       for g in groups.cat.categories]
        statistics[c], pvalues[c] = test(*samples, *args, **kwargs)

    # Keep only the corrected p-values from the result tuple
    _, pvalues, _, _ = mt.multipletests(pvalues)

    statistics = pd.Series(data=statistics, index=data.columns)
    pvalues = pd.Series(data=pvalues, index=data.columns)

    return statistics, pvalues
def run_tests(data, labels=None, *, confounds=None, demean_confounds=True,
              alpha=0.05):
    """
    Prints the results of a handful of simple statistical tests.

    Each group is first tested for non-normality on its own. Then, if there
    are two or more groups, differences between the groups are tested;
    otherwise the single group is tested for non-zero means.

    Parameters
    ----------
    data : pandas.DataFrame
        Shape [n_observations, n_features].
    labels : pandas.Series
        Shape [???,]. Must be indexable by `data.index`. If omitted, all
        observations are placed in a single dummy group.

    Other Parameters
    ----------------
    confounds : pandas.DataFrame, optional
        Shape [???, n_confounds]. Must be indexable by `data.index`.
    demean_confounds : bool, optional
        If `True`, confounds are normalised along the features axis.
    alpha : float, optional
        Only print results of tests where `p < alpha`.
    """

    if labels is None:
        # No labels given, so build a dummy set with just a single group
        labels = pd.Series(data=['Group'] * data.shape[0], index=data.index,
                           dtype='category')

    data, labels, confounds = utils.sanitise_inputs(data, labels, confounds)
    have_confounds = confounds is not None
    if have_confounds:
        data = utils.remove_confounds(data, confounds, demean_confounds)

    categories = labels.cat.categories
    n_observations, n_features = data.shape

    print("Running statistical tests...")
    print("Correction for multiple comparisons is across features, not tests.")
    print("No. of features: {:d}".format(n_features))
    print("No. of observations: {:d}".format(n_observations))
    if have_confounds:
        print("No. of confounds: {:d}".format(confounds.shape[1]))
    print("No. of classes: {:d}".format(len(categories)))
    print("Classes: {}".format(", ".join(map(str, categories))))
    print()

    # First pass: each group on its own
    for category in categories:
        print("{}: {}".format(labels.name, category))
        group_data = data.loc[labels == category, :]
        print("No. of observations: {:d}".format(group_data.shape[0]))
        # Are observations normally distributed?
        # https://doi.org/10.1080/00949655.2010.520163
        print("Shapiro-Wilk test for non-normality:")
        stats, pvals = run_test(group_data, scipy.stats.shapiro)
        print_test_results(stats, pvals, alpha=alpha)
        print()

    # Second pass: single group, so only the means can be tested
    if len(categories) < 2:
        # Not clear from the docs if `scipy.stats.wilcoxon` is a suitable
        # alternative in the one-sample case
        print("t-test for non-zero means:")
        stats, pvals = run_test(data, scipy.stats.ttest_1samp, popmean=0.0)
        print_test_results(stats, pvals, alpha=alpha)
        print()
        return

    # Second pass: several groups, so test for differences between them
    #print("ANOVA for difference in group means:")
    #s, p = run_test(data, scipy.stats.f_oneway, groups=labels)
    #print_test_results(s, p, alpha=alpha)
    #print()
    group_tests = (
        ("Kruskal-Wallis H-test for difference in group medians:",
         scipy.stats.kruskal),
        ("Levene test for difference in group variances:",
         scipy.stats.levene),
    )
    for title, test in group_tests:
        print(title)
        stats, pvals = run_test(data, test, groups=labels)
        print_test_results(stats, pvals, alpha=alpha)
        print()

    return
def explain_classifier(data, labels, classifier, parameters=None, *,
                       confounds=None, demean_confounds=True, l1_reg='bic',
                       **kwargs):
    """
    Generates an interpretation of the classifier output.

    Parameters
    ----------
    data : pandas.DataFrame
        Shape [n_observations, n_features].
    labels : pandas.Series
        Shape [???,]. Must be indexable by `data.index`.
    classifier
        A classifier (or pipeline) conforming to the sklearn interface, which
        must also provide `predict_proba()`.
    parameters : dict, optional
        A set of hyperparameters to tune, conforming to the sklearn
        `GridSearchCV` interface. Default: an empty dict.

    Returns
    -------
    explanation : types.SimpleNamespace
        This contains the data, fitted classifier, and the explanation in the
        form of a shap `KernelExplainer` and a set of SHAP values. Attributes:
        `data`, `labels`, `classifier`, `clf_categories`, `explainer` and
        `shap_values`.

    Other Parameters
    ----------------
    confounds : pandas.DataFrame, optional
        Shape [???, n_confounds]. Must be indexable by `data.index`.
    demean_confounds : bool, optional
        If `True`, confounds are normalised along the features axis.
    l1_reg : optional
        Passed to `shap.KernelExplainer.shap_values()`.
    **kwargs
        All other keyword args are passed to `optimise_classifier()` to modify
        the inner cross-validation loop for hyperparameter tuning.
    """

    # `None` sentinel rather than a mutable `{}` default shared between calls
    if parameters is None:
        parameters = {}

    print("Generating a classifier explanation...")

    data, labels, confounds = utils.sanitise_inputs(data, labels, confounds)

    # Preprocess data: scale each feature, then regress out any confounds
    data = data.apply(sklearn.preprocessing.scale)
    if confounds is not None:
        data = utils.remove_confounds(data, confounds, demean_confounds)

    # Fit to data, and tune hyperparameters. The classifier is trained on the
    # integer category codes rather than the labels themselves.
    search = optimise_classifier(data, labels.cat.codes, classifier,
                                 parameters, **kwargs)
    print("Classifier parameters optimised.")
    print(search.best_params_)
    clf = search.best_estimator_

    # The data being explained is also used as the background data set
    explainer = shap.KernelExplainer(clf.predict_proba, data, link='logit',
                                     keep_index=True)
    shap_values = explainer.shap_values(data, l1_reg=l1_reg)
    print("SHAP values generated.")

    # And wrap all the results up
    explanation = types.SimpleNamespace(
        data=data,
        labels=labels,
        classifier=clf,
        # `clf.classes_` holds category codes: map them back to categories
        clf_categories=labels.cat.categories[clf.classes_],
        explainer=explainer,
        shap_values=shap_values,
    )

    print()
    return explanation
def predict(data, labels, classifier, parameters=None, *, confounds=None,
            demean_confounds=True, normalise=True, cv_iter=None, groups=None,
            inner_cv_kwargs=None, return_probabilities=False, verbose=True):
    """
    Predicts `labels` from `data` using nested cross-validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Shape [n_observations, n_features].
    labels : pandas.Series
        Shape [???,]. Must be indexable by `data.index`.
    classifier
        A classifier (or pipeline) conforming to the sklearn interface. See
        `get_default_classifier()`.
    parameters : dict, optional
        A set of hyperparameters to tune, conforming to the sklearn
        `GridSearchCV` interface. Default: an empty dict.

    Returns
    -------
    predictions : pandas.Series
        Shape [[n_folds, fold_size],] (i.e. a MultiIndex over folds).
    probabilities : pandas.DataFrame, optional
        Shape [[n_folds, fold_size], n_classes]. Only returned if
        `return_probabilities` is set.

    Other Parameters
    ----------------
    confounds : pandas.DataFrame, optional
        Shape [???, n_confounds]. Must be indexable by `data.index`.
    demean_confounds : bool, optional
        If `True`, confounds are normalised along the features axis
        (respecting the train/test split). This is almost always needed to
        stop e.g. conflation of mean effects and the confound variance.
    normalise : bool, optional
        If `True`, data is normalised along the features axis (respecting the
        train/test split).
    cv_iter : optional
        A cross-validation generator from `sklearn.model_selection`. Default:
        a shuffled `StratifiedKFold` with between 3 and 10 splits, depending
        on the size of the smallest class.
    groups : pandas.Series, optional
        Shape [???,]. Must be indexable by `data.index`. Passed to `cv_iter`
        to allow for stratification based on group membership.
    inner_cv_kwargs : dict, optional
        Passed to `optimise_classifier()` to modify the inner cross-validation
        loop for hyperparameter tuning. Default: an empty dict.
    return_probabilities : bool, optional
        Whether to calculate the class probabilities from the classifier
        (which must therefore implement `predict_proba()`).
    verbose : bool, optional
        If `False`, nothing is printed.
    """

    # `None` sentinels rather than mutable `{}` defaults shared between calls
    if parameters is None:
        parameters = {}
    if inner_cv_kwargs is None:
        inner_cv_kwargs = {}

    # Progress reporting that can be switched off (without shadowing `print`)
    log = builtins.print if verbose else (lambda *a, **k: None)

    data, labels, confounds = utils.sanitise_inputs(data, labels, confounds)
    # Working with codes is safer (e.g. sklearn doesn't like `dtype=object`)
    codes = labels.cat.codes
    if groups is not None:
        groups = groups[data.index]
        groups = groups.astype('category').cat.codes

    log("Classifying data...")
    log("No. of features: {:d}".format(data.shape[1]))
    log("No. of observations: {:d}".format(data.shape[0]))
    if confounds is not None:
        log("No. of confounds: {:d}".format(confounds.shape[1]))
    log("No. of classes: {:d}".format(len(labels.cat.categories)))
    log("Classes: {}".format(", ".join(map(str, labels.cat.categories))))
    log()

    # Initialise storage for results
    predictions = []
    probabilities = []

    if cv_iter is None:
        # Try to keep at least one observation of each class in each fold
        n_splits = min(labels.value_counts())
        n_splits = min(10, max(3, n_splits))
        cv_iter = sklearn.model_selection.StratifiedKFold(
            n_splits=n_splits, shuffle=True)

    # Outer cross-validation loop
    n_folds = cv_iter.get_n_splits(data, codes, groups)
    folds = cv_iter.split(data, codes, groups)
    for n, (train_inds, test_inds) in enumerate(folds, start=1):
        # Train / test split (converting positions to index labels)
        train_inds, test_inds = data.index[train_inds], data.index[test_inds]
        X_train, X_test = data.loc[train_inds, :], data.loc[test_inds, :]
        y_train, y_test = codes[train_inds], codes[test_inds]

        if confounds is not None:
            C_train = confounds.loc[train_inds, :]
            C_test = confounds.loc[test_inds, :]
            # Normalise (respecting decision to remove means)
            # `with_mean`: If True, center the data before scaling.
            scaler = sklearn.preprocessing.StandardScaler(
                with_mean=demean_confounds)
            scaler.fit(C_train)
            C_train = scaler.transform(C_train)
            C_test = scaler.transform(C_test)
            # Remove confounds (but don't remove the data means)
            regression = sklearn.linear_model.LinearRegression(
                fit_intercept=False)
            regression.fit(C_train, X_train)
            X_train = X_train - regression.predict(C_train)
            X_test = X_test - regression.predict(C_test)

        if normalise:
            # Scaling parameters are estimated from the training data only
            scaler = sklearn.preprocessing.StandardScaler().fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

        # Fit to data!
        clf = optimise_classifier(X_train, y_train, classifier, parameters,
                                  **inner_cv_kwargs)

        # Record performance (mapping predicted codes back to categories)
        pred = pd.Categorical.from_codes(clf.predict(X_test),
                                         labels.cat.categories)
        predictions.append(
            pd.Series(data=pred, index=test_inds,
                      name='Predicted_' + labels.name))
        if return_probabilities:
            clf_categories = labels.cat.categories[
                clf.best_estimator_.classes_]
            probabilities.append(
                pd.DataFrame(data=clf.predict_proba(X_test),
                             index=test_inds, columns=clf_categories))

        log("Finished iteration {} of {}".format(n, n_folds))
        if clf.best_params_:
            log(clf.best_params_)

    # Turn into multiindex over folds
    fold_names = ['Fold', data.index.name]
    predictions = pd.concat(predictions, axis='index', keys=range(n_folds),
                            names=fold_names)
    if return_probabilities:
        # If not all classes in a given `y_train`, then the missing classes
        # get filled as NaN when concatenating. This is probably what we want:
        # it's a clear flag that the classifier is solving a different problem
        probabilities = pd.concat(probabilities, axis='index',
                                  keys=range(n_folds), names=fold_names)

    log()
    if return_probabilities:
        return predictions, probabilities
    return predictions
def visualise_data(data, labels=None, *, confounds=None,
                   demean_confounds=True):
    """
    Plots several summaries of a data set.

    The feature distributions are drawn in one of three ways depending on the
    size of the data (pairwise plots, per-feature marginals, or pairwise plots
    of the leading PCA components), followed by a heatmap of the correlations
    between features when there are at most 200 of them.

    Parameters
    ----------
    data : pandas.DataFrame
        Shape [n_observations, n_features].
    labels : pandas.Series, optional
        Shape [???,]. Must be indexable by `data.index`.

    Other Parameters
    ----------------
    confounds : pandas.DataFrame, optional
        Shape [???, n_confounds]. Must be indexable by `data.index`.
    demean_confounds : bool, optional
        If `True`, confounds are normalised along the features axis.
    """

    data, labels, confounds = utils.sanitise_inputs(data, labels, confounds)
    n_o, n_f = data.shape
    has_labels = labels is not None
    # Without labels, everything is treated as the same class
    n_c = len(labels.cat.categories) if has_labels else 1

    if confounds is not None:
        data = utils.remove_confounds(data, confounds, demean_confounds)

    #--------------------------------------------------------------------------
    # Plot data distributions

    # Shrink the scatter markers when there are lots of points
    plot_kws = {'s': 5, 'edgecolor': None} if n_o > 100 else {}

    if n_f <= 10:
        # Full interactions for small data
        if has_labels:
            # Need to specify vars as pairplot checks values not
            # categorical-ness (e.g. labels=[1,1,2,3] is numeric...)
            # https://github.com/mwaskom/seaborn/issues/919#issuecomment-366872386
            fig = sns.pairplot(data.join(labels), hue=labels.name,
                               vars=data.columns, plot_kws=plot_kws)
        else:
            fig = sns.pairplot(data, plot_kws=plot_kws)

    elif n_f * n_c <= 75:
        # For intermediate data, just plot marginals
        norm_data = data.apply(sklearn.preprocessing.scale)
        if has_labels:
            # Long format: one row per (observation, feature) pair
            labelled = norm_data.join(labels)
            plot_data = labelled.melt(labels.name, var_name='Feature',
                                      value_name='Value')
            fig = sns.catplot(data=plot_data, x='Feature', y='Value',
                              hue=labels.name, order=data.columns,
                              kind='boxen', legend_out=False)
        else:
            fig = sns.catplot(data=norm_data, kind='violin', inner='quartile')
        fig.set_xticklabels(rotation=45, horizontalalignment='right')
        fig.ax.set_title("Feature distributions")

    else:
        # Top PCA components for big data
        norm_data = data.apply(sklearn.preprocessing.scale)
        reducer = sklearn.decomposition.PCA(n_components=5)
        components = reducer.fit_transform(norm_data)
        component_names = ['PCA: #{:d}'.format(i)
                           for i in range(components.shape[1])]
        pca = pd.DataFrame(data=components, index=data.index,
                           columns=component_names)
        if has_labels:
            fig = sns.pairplot(pca.join(labels), hue=labels.name,
                               vars=pca.columns, plot_kws=plot_kws)
        else:
            fig = sns.pairplot(pca, plot_kws=plot_kws)

    # Tightening the layout makes sure we don't lose any axes labels etc, but
    # also makes the legend almost impossible to see for `sns.pairplot()`...
    # However, there is currently no way to customise the legend creation so
    # it stays for now.
    try:
        fig.fig.tight_layout()
    except NameError:
        pass

    #--------------------------------------------------------------------------
    # Correlations between features

    if n_f > 200:
        # Too many features for a heatmap
        return

    fig, ax = plt.subplots()
    ax = sns.heatmap(data.corr(), ax=ax, square=True,
                     vmin=-1.0, center=0.0, vmax=1.0, cmap='RdBu_r',
                     cbar_kws={'label': "Correlation coefficient"})
    #ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    ax.set_title("Feature correlations")
    fig.tight_layout()

    #--------------------------------------------------------------------------

    return
def visualise_manifold(data, labels=None, *, confounds=None,
                       demean_confounds=True, normalise=True, manifold=None):
    """
    Plots the data set on a low-dimensional manifold.

    Parameters
    ----------
    data : pandas.DataFrame
        Shape [n_observations, n_features].
    labels : pandas.Series, optional
        Shape [???,]. Must be indexable by `data.index`.

    Other Parameters
    ----------------
    confounds : pandas.DataFrame, optional
        Shape [???, n_confounds]. Must be indexable by `data.index`.
    demean_confounds : bool, optional
        If `True`, confounds are normalised along the features axis.
    normalise : bool, optional
        If `True`, data is normalised along the features axis.
    manifold : optional
        Instance of a class from `sklearn.manifold` or
        `sklearn.decomposition`. Use this to pass in a different algorithm for
        dimensionality reduction. Note that the instance is modified in place:
        `n_components` is set to 2 and `fit_transform()` is called on it.
        Default: a new `sklearn.manifold.TSNE(init='pca')` for each call.
    """

    # Build the default here, not in the signature: a default instance would
    # be created once at definition time and then shared (and mutated by
    # `set_params()` / `fit_transform()`) across every call.
    if manifold is None:
        manifold = sklearn.manifold.TSNE(init='pca')

    data, labels, confounds = utils.sanitise_inputs(data, labels, confounds)

    if confounds is not None:
        data = utils.remove_confounds(data, confounds, demean_confounds)

    # Preprocess data
    if normalise:
        data = data.apply(sklearn.preprocessing.scale)

    # Find the data in the embedding space
    manifold.set_params(n_components=2)  # Sanity check...
    embedding = manifold.fit_transform(data)

    # And plot
    fig, ax = plt.subplots()
    if labels is None:
        ax.plot(embedding[:, 0], embedding[:, 1], 'o', markersize=10)
    else:
        # One set of markers per category, so each gets its own legend entry
        codes = labels.cat.codes.values
        for c, category in enumerate(labels.cat.categories):
            members = (codes == c)
            ax.plot(embedding[members, 0], embedding[members, 1],
                    'o', markersize=10, markeredgecolor='k', label=category)
        ax.legend(title=labels.name)

    # The embedding axes have no meaningful units, so hide the ticks
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title("Data manifold")
    fig.tight_layout()

    return