def _run_univariate_tests(self, X, y, control='N2', n_jobs=-1):
    stats, pvals, _ = univariate_tests(X, y,
                                       control=control,
                                       test=self.test,
                                       comparison_type=self.comparison_type,
                                       multitest_correction=self.multitest_method,
                                       n_jobs=n_jobs)

    effects = get_effect_sizes(X, y,
                               control=control,
                               test=self.test,
                               comparison_type=self.comparison_type)

    test_res = pd.DataFrame(pvals.min(axis=1), columns=['p-value'])

    # In most cases, pvals and effects have the same shape: group-by-group
    # comparisons give group-by-group effect sizes, and multi-class comparisons
    # give a single effect size.
    # For the Kruskal-Wallis case, however, there is no single effect size for
    # the test, so we get group-by-group effect sizes instead and keep the
    # maximum. In that case pvals has only one column, but effects has more
    # than one column.
    if pvals.shape == effects.shape:
        test_res['effect_size'] = effects.values[pvals.isin(pvals.min(axis=1)).values]
    else:
        test_res['effect_size'] = effects.max(axis=1)

    self.test_results = test_res

    return
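# --- Illustration (not part of the pipeline) ---
# A minimal, self-contained sketch of the shape-matching logic above, using
# hand-made DataFrames in place of tierpsytools output; all values and group
# names below are invented for illustration.
def _demo_effect_size_selection():
    import pandas as pd

    pvals = pd.DataFrame({'g1': [0.01, 0.20], 'g2': [0.03, 0.04]},
                         index=['feat_a', 'feat_b'])
    effects = pd.DataFrame({'g1': [0.5, 0.1], 'g2': [0.3, 0.6]},
                           index=['feat_a', 'feat_b'])

    test_res = pd.DataFrame(pvals.min(axis=1), columns=['p-value'])
    if pvals.shape == effects.shape:
        # keep the effect size of the group with the smallest p-value per feature
        test_res['effect_size'] = effects.values[pvals.isin(pvals.min(axis=1)).values]
    else:
        # Kruskal-Wallis case: one p-value column, but per-group effect sizes
        test_res['effect_size'] = effects.max(axis=1)
    return test_res  # feat_a -> effect 0.5 (g1), feat_b -> effect 0.6 (g2)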
def single_feature_window_mutant_worm_stats(metadata,
                                            features,
                                            save_dir,
                                            window=2,
                                            feature='motion_mode_paused_fraction',
                                            pvalue_threshold=0.05,
                                            fdr_method='fdr_by'):
    """ T-tests comparing BW vs fepD for each mutant worm strain """

    # 7 worm strains: N2 vs 'cat-2', 'eat-4', 'osm-5', 'pdfr-1', 'tax-2', 'unc-25'
    # 2 bacteria strains: BW vs fepD
    # 1 feature: 'motion_mode_paused_fraction'
    # 1 window: 2 (corresponding to 30 minutes on food, just after first BL stimulus)

    # focus on just one window = 30 min just after blue light (window=2)
    window_metadata = metadata[metadata['window'] == window]

    # statistics: perform t-tests comparing fepD vs BW for each worm strain
    worm_strain_list = list(window_metadata['worm_strain'].unique())

    ttest_list = []
    for worm in worm_strain_list:
        worm_window_meta = window_metadata[window_metadata['worm_strain'] == worm]
        worm_window_feat = features[[feature]].reindex(worm_window_meta.index)

        stats, pvals, reject = univariate_tests(X=worm_window_feat,
                                                y=worm_window_meta['bacteria_strain'],
                                                control='BW',
                                                test='t-test',
                                                comparison_type='binary_each_group',
                                                multitest_correction=fdr_method,
                                                alpha=pvalue_threshold,
                                                n_permutation_test=None)

        # get effect sizes
        effect_sizes = get_effect_sizes(X=worm_window_feat,
                                        y=worm_window_meta['bacteria_strain'],
                                        control='BW',
                                        effect_type=None,
                                        linked_test='t-test')

        # compile t-test results
        stats.columns = ['stats_' + str(c) for c in stats.columns]
        pvals.columns = ['pvals_' + str(c) for c in pvals.columns]
        reject.columns = ['reject_' + str(c) for c in reject.columns]
        effect_sizes.columns = ['effect_size_' + str(c) for c in effect_sizes.columns]
        ttest_df = pd.concat([stats, effect_sizes, pvals, reject], axis=1)

        # record the worm strain as the index instead of the feature
        ttest_df = ttest_df.rename(index={feature: worm})
        ttest_list.append(ttest_df)

    ttest_path = Path(save_dir) / 'pairwise_ttests' /\
        'ttest_mutant_worm_fepD_vs_BW_window_{}_results.csv'.format(window)
    ttest_path.parent.mkdir(exist_ok=True, parents=True)

    ttest_results = pd.concat(ttest_list, axis=0)
    ttest_results.to_csv(ttest_path, header=True, index=True)

    return
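# --- Illustration (not part of the pipeline) ---
# Self-contained sketch of the rename-then-concat trick above: each per-strain
# t-test table has the feature name as its only index entry, so relabelling it
# with the strain name lets the stacked table be indexed by strain (the names
# and p-values below are invented):
def _demo_rename_index_concat():
    import pandas as pd

    feature = 'motion_mode_paused_fraction'
    ttest_list = []
    for worm, p in [('cat-2', 0.01), ('eat-4', 0.30)]:
        ttest_df = pd.DataFrame({'pvals_fepD': [p]}, index=[feature])
        ttest_list.append(ttest_df.rename(index={feature: worm}))
    return pd.concat(ttest_list, axis=0)  # index: ['cat-2', 'eat-4']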
def dead_keio_stats(features, metadata, args):
    """ Perform statistical analyses on dead Keio experiment results:
        - ANOVA across strains for significant feature differences
        - t-tests for each feature comparing dead vs live bacteria for each strain
        - t-tests for each feature comparing each strain vs control, for live and
          dead bacteria separately

        Inputs
        ------
        features, metadata : pd.DataFrame
            Clean feature summaries and accompanying metadata

        args : Object
            Python object with the following attributes:
            - drop_size_features : bool
            - norm_features_only : bool
            - percentile_to_use : str
            - remove_outliers : bool
            - control_dict : dict
            - n_top_feats : int
            - tierpsy_top_feats_dir (if n_top_feats) : str
            - test : str
            - f_test : bool
            - pval_threshold : float
            - fdr_method : str
            - n_sig_features : int
    """

    print("\nInvestigating variation in worm behaviour on dead vs alive hit Keio strains")

    # assert there will be no errors due to case-sensitivity
    assert len(metadata[STRAIN_COLNAME].unique()) == len(
        metadata[STRAIN_COLNAME].str.upper().unique())
    assert all(type(b) == np.bool_ for b in metadata[TREATMENT_COLNAME].unique())

    # Load Tierpsy feature set + subset (columns) for selected features only
    features = select_feat_set(features, 'tierpsy_{}'.format(args.n_top_feats),
                               append_bluelight=True)
    features = features[[f for f in features.columns if 'path_curvature' not in f]]
    assert not features.isna().any().any()
    #n_feats = features.shape[1]

    strain_list = list(metadata[STRAIN_COLNAME].unique())
    assert CONTROL_STRAIN in strain_list

    # print mean sample size
    sample_size = df_summary_stats(metadata, columns=[STRAIN_COLNAME, TREATMENT_COLNAME])
    print("Mean sample size of %s: %d" % (STRAIN_COLNAME,
                                          int(sample_size['n_samples'].mean())))

    # construct save paths (args.save_dir / topfeats? etc)
    save_dir = get_save_dir(args)
    stats_dir = save_dir / "Stats" / args.fdr_method

    # t-test to use
    t_test = 't-test' if args.test == 'ANOVA' else 'Mann-Whitney'  # aka. Wilcoxon rank-sum

    ##### ANOVA #####

    # make path to save ANOVA results
    test_path = stats_dir / 'ANOVA_results.csv'
    test_path.parent.mkdir(exist_ok=True, parents=True)

    # ANOVA across strains for significant feature differences
    if len(metadata[STRAIN_COLNAME].unique()) > 2:
        stats, pvals, reject = univariate_tests(X=features,
                                                y=metadata[STRAIN_COLNAME],
                                                test='ANOVA',
                                                control=CONTROL_STRAIN,
                                                comparison_type='multiclass',
                                                multitest_correction=None,  # uncorrected
                                                alpha=args.pval_threshold,
                                                n_permutation_test=None)  # 'all'

        # get effect sizes
        effect_sizes = get_effect_sizes(X=features,
                                        y=metadata[STRAIN_COLNAME],
                                        control=CONTROL_STRAIN,
                                        effect_type=None,
                                        linked_test='ANOVA')

        # correct for multiple comparisons
        reject_corrected, pvals_corrected = _multitest_correct(pvals,
                                                               multitest_method=args.fdr_method,
                                                               fdr=args.pval_threshold)

        # compile + save results (corrected)
        test_results = pd.concat([stats, effect_sizes, pvals_corrected, reject_corrected],
                                 axis=1)
        test_results.columns = ['stats', 'effect_size', 'pvals', 'reject']
        test_results['significance'] = sig_asterix(test_results['pvals'])
        test_results = test_results.sort_values(by=['pvals'], ascending=True)  # rank pvals
        test_results.to_csv(test_path, header=True, index=True)

        nsig = test_results['reject'].sum()
        print("%d features (%.1f%%) significantly different among '%s'" %
              (nsig, (nsig / len(test_results.index)) * 100, STRAIN_COLNAME))

    ##### t-tests #####

    for strain in strain_list:
        strain_meta = metadata[metadata[STRAIN_COLNAME] == strain]
        strain_feat = features.reindex(strain_meta.index)

        ### t-tests for each feature comparing live vs dead behaviour
        ttest_path_uncorrected = stats_dir / '{}_uncorrected.csv'.format(t_test + '_' + strain)
        ttest_path = stats_dir / '{}_results.csv'.format(t_test + '_' + strain)
        ttest_path.parent.mkdir(exist_ok=True, parents=True)

        # perform t-tests (without correction for multiple testing)
        stats_t, pvals_t, reject_t = univariate_tests(X=strain_feat,
                                                      y=strain_meta[TREATMENT_COLNAME],
                                                      control=CONTROL_TREATMENT,
                                                      test=t_test,
                                                      comparison_type='binary_each_group',
                                                      multitest_correction=None,
                                                      alpha=0.05)

        # get effect sizes for comparisons
        effect_sizes_t = get_effect_sizes(X=strain_feat,
                                          y=strain_meta[TREATMENT_COLNAME],
                                          control=CONTROL_TREATMENT,
                                          effect_type=None,
                                          linked_test=t_test)

        # compile + save t-test results (uncorrected)
        stats_t.columns = ['stats_' + str(c) for c in stats_t.columns]
        pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
        reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
        effect_sizes_t.columns = ['effect_size_' + str(c) for c in effect_sizes_t.columns]
        ttest_uncorrected = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
        ttest_uncorrected.to_csv(ttest_path_uncorrected, header=True, index=True)

        # correct for multiple comparisons
        pvals_t.columns = [c.split("_")[-1] for c in pvals_t.columns]
        reject_t, pvals_t = _multitest_correct(pvals_t,
                                               multitest_method=args.fdr_method,
                                               fdr=args.pval_threshold)

        # compile + save t-test results (corrected)
        pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
        reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
        ttest_corrected = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
        ttest_corrected.to_csv(ttest_path, header=True, index=True)

        # record t-test significant features (not ordered)
        fset_ttest = pvals_t[np.asmatrix(reject_t)].index.unique().to_list()
        #assert set(fset_ttest) == set(pvals_t.index[(pvals_t < args.pval_threshold).sum(axis=1) > 0])
        print("%d significant features for %s on any %s vs %s (%s, %s, P<%.2f)" %
              (len(fset_ttest), strain, TREATMENT_COLNAME, CONTROL_TREATMENT, t_test,
               args.fdr_method, args.pval_threshold))

        if len(fset_ttest) > 0:
            ttest_sigfeats_path = stats_dir / '{}_sigfeats.txt'.format(t_test + '_' + strain)
            write_list_to_file(fset_ttest, ttest_sigfeats_path)

    ##### compare each strain with control, for live and dead bacteria separately #####

    for label, is_dead in [('live', False), ('dead', True)]:
        subset_metadata = metadata[metadata['dead'] == is_dead]
        subset_features = features.reindex(subset_metadata.index)

        ttest_path_uncorrected = stats_dir / '{}_{}_uncorrected.csv'.format(t_test, label)
        ttest_path = stats_dir / '{}_{}_results.csv'.format(t_test, label)
        ttest_path.parent.mkdir(exist_ok=True, parents=True)

        # perform t-tests (without correction for multiple testing)
        stats_t, pvals_t, reject_t = univariate_tests(X=subset_features,
                                                      y=subset_metadata[STRAIN_COLNAME],
                                                      control=CONTROL_STRAIN,
                                                      test=t_test,
                                                      comparison_type='binary_each_group',
                                                      multitest_correction=None,
                                                      alpha=0.05)

        # get effect sizes for comparisons
        effect_sizes_t = get_effect_sizes(X=subset_features,
                                          y=subset_metadata[STRAIN_COLNAME],
                                          control=CONTROL_STRAIN,
                                          effect_type=None,
                                          linked_test=t_test)

        # compile + save t-test results (uncorrected)
        stats_t.columns = ['stats_' + str(c) for c in stats_t.columns]
        pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
        reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
        effect_sizes_t.columns = ['effect_size_' + str(c) for c in effect_sizes_t.columns]
        ttest_uncorrected = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
        ttest_uncorrected.to_csv(ttest_path_uncorrected, header=True, index=True)

        # correct for multiple comparisons
        pvals_t.columns = [c.split("_")[-1] for c in pvals_t.columns]
        reject_t, pvals_t = _multitest_correct(pvals_t,
                                               multitest_method=args.fdr_method,
                                               fdr=args.pval_threshold)

        # compile + save t-test results (corrected)
        pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
        reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
        ttest_corrected = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
        ttest_corrected.to_csv(ttest_path, header=True, index=True)

        # record t-test significant features (not ordered)
        fset_ttest = pvals_t[np.asmatrix(reject_t)].index.unique().to_list()
        #assert set(fset_ttest) == set(pvals_t.index[(pvals_t < args.pval_threshold).sum(axis=1) > 0])
        print("%s BACTERIA: %d significant features for any %s vs %s (%s, %s, P<%.2f)" %
              (label.upper(), len(fset_ttest), STRAIN_COLNAME, CONTROL_STRAIN, t_test,
               args.fdr_method, args.pval_threshold))

        if len(fset_ttest) > 0:
            ttest_sigfeats_path = stats_dir / '{}_{}_sigfeats.txt'.format(t_test, label)
            write_list_to_file(fset_ttest, ttest_sigfeats_path)
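# --- Illustration (not part of the pipeline) ---
# Hedged usage sketch for dead_keio_stats: a minimal 'args' object carrying the
# attributes listed in the docstring above. All values, the control_dict key and
# the save directory are assumptions for illustration; 'features'/'metadata' are
# clean feature summaries and matching metadata loaded elsewhere in the pipeline.
def _demo_dead_keio_stats(features, metadata):
    from types import SimpleNamespace

    args = SimpleNamespace(drop_size_features=False,
                           norm_features_only=False,
                           percentile_to_use=None,
                           remove_outliers=False,
                           control_dict={'gene_name': 'wild_type'},  # assumed key/value
                           n_top_feats=256,
                           tierpsy_top_feats_dir=None,
                           test='ANOVA',
                           f_test=False,
                           pval_threshold=0.05,
                           fdr_method='fdr_by',
                           n_sig_features=None,
                           save_dir='/path/to/save_dir')  # assumed attribute of get_save_dir
    dead_keio_stats(features, metadata, args)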
# Create table to store statistics results
grouped = feat_df.join(meta_df[GROUPING_VAR]).groupby(by=GROUPING_VAR)
stats_table = grouped.mean().T
mean_cols = ['mean ' + v for v in stats_table.columns.to_list()]
stats_table.columns = mean_cols

# store sample size for each group
for group in grouped.size().index:
    stats_table['sample size {}'.format(group)] = grouped.size().loc[group]

# ANOVA / Kruskal-Wallis tests
if (TEST_NAME == "ANOVA" or TEST_NAME == "Kruskal"):
    if len(run_strain_list) > 2:
        stats, pvals, reject = univariate_tests(X=feat_df,
                                                y=meta_df[GROUPING_VAR],
                                                control=CONTROL,
                                                test=TEST_NAME,
                                                comparison_type='multiclass',
                                                multitest_correction=args.fdr_method,
                                                alpha=0.05,
                                                n_jobs=-1)

        # Record name of statistical test used (kruskal/f_oneway)
        col = '{} p-value'.format(TEST_NAME)
        stats_table[col] = pvals.loc[stats_table.index, TEST_NAME]

        # Sort pvals + record significant features
        pvals = pvals.sort_values(by=[TEST_NAME], ascending=True)
        fset = list(pvals.index[np.where(pvals < args.pval_threshold)[0]])
        if len(fset) > 0:
            print("\n%d significant features found by %s for %s (run %d, P<%.2f, %s)" %
                  (len(fset), TEST_NAME, GROUPING_VAR, run, args.pval_threshold,
                   args.fdr_method))
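# --- Illustration (not part of the pipeline) ---
# Self-contained sketch of the mean/sample-size summary table built above, on a
# tiny synthetic dataset (all column names, group labels and values invented):
def _demo_stats_table():
    import pandas as pd

    feat_df = pd.DataFrame({'speed': [1.0, 1.2, 0.8, 0.9],
                            'curvature': [0.3, 0.2, 0.4, 0.5]})
    meta_df = pd.DataFrame({'gene_name': ['N2', 'N2', 'mut', 'mut']})

    grouped = feat_df.join(meta_df['gene_name']).groupby(by='gene_name')
    stats_table = grouped.mean().T  # features as rows, groups as columns
    stats_table.columns = ['mean ' + v for v in stats_table.columns.to_list()]
    for group in grouped.size().index:
        stats_table['sample size {}'.format(group)] = grouped.size().loc[group]
    return stats_table  # columns: 'mean N2', 'mean mut', 'sample size N2', ...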
# Drop feature columns with zero standard deviation
features_df = feat_filter_std(features_df, threshold=0.0)

# Fill in NaNs with global mean
features_df = features_df.fillna(features_df.mean(axis=0))

feature_list = features_df.columns.to_list()
strain_list = list(metadata_df[args.strain_colname].unique())

### statistics

# ANOVA to test for variation among strains
if len(metadata_df[args.strain_colname].unique()) > 2:
    stats, pvals, reject = univariate_tests(X=features_df,
                                            y=metadata_df[args.strain_colname],
                                            control=args.control,
                                            test='ANOVA',
                                            comparison_type='multiclass',
                                            multitest_correction='fdr_by',
                                            alpha=0.05)

    # get effect sizes
    effect_sizes = get_effect_sizes(X=features_df,
                                    y=metadata_df[args.strain_colname],
                                    control=args.control,
                                    effect_type=None,
                                    linked_test='ANOVA')

    # compile + save results
    test_results = pd.concat([stats, effect_sizes, pvals, reject], axis=1)
    test_results.columns = ['stats', 'effect_size', 'pvals', 'reject']
    test_results = test_results.sort_values(by=['pvals'], ascending=True)  # rank pvals

    anova_save_path = save_dir / 'stats' / 'ANOVA_results.csv'
# drop NaN entries
eggs = eggs.dropna(subset=['gene_name', 'number_eggs_1hr'])

strain_list = [CONTROL_STRAIN] + [s for s in eggs['gene_name'].unique()
                                  if s != CONTROL_STRAIN]

# 1. perform ANOVA (corrected for multiple comparisons) - is there variation in
#    egg count across strains, i.e. is the number of eggs laid significantly
#    different from control for any strain?
stats, pvals, reject = univariate_tests(X=eggs[['number_eggs_1hr']],
                                        y=eggs['gene_name'],
                                        test='ANOVA',
                                        control=CONTROL_STRAIN,
                                        comparison_type='multiclass',
                                        multitest_correction='fdr_by',
                                        alpha=0.05,
                                        n_permutation_test=None)  # 'all'

# get effect sizes
effect_sizes = get_effect_sizes(X=eggs[['number_eggs_1hr']],
                                y=eggs['gene_name'],
                                control=CONTROL_STRAIN,
                                effect_type=None,
                                linked_test='ANOVA')

# compile
test_results = pd.concat([stats, effect_sizes, pvals, reject], axis=1)
test_results.columns = ['stats', 'effect_size', 'pvals', 'reject']
test_results = test_results.sort_values(by=['pvals'], ascending=True)  # rank pvals
# drop NaN entries
eggs = eggs.dropna(subset=['gene_name', 'antioxidant', 'number_eggs_24hrs'])

strain_list = [CONTROL_STRAIN] + [s for s in eggs['gene_name'].unique()
                                  if s != CONTROL_STRAIN]
antioxidant_list = [CONTROL_ANTIOXIDANT] + [a for a in eggs['antioxidant'].unique()
                                            if a != CONTROL_ANTIOXIDANT]

# 1. perform ANOVA - is there variation in egg laying across antioxidant
#    treatments? (pooled strain data)
stats, pvals, reject = univariate_tests(X=eggs[['number_eggs_24hrs']],
                                        y=eggs['antioxidant'],
                                        test='ANOVA',
                                        control=CONTROL_ANTIOXIDANT,
                                        comparison_type='multiclass',
                                        multitest_correction='fdr_by',
                                        alpha=0.05,
                                        n_permutation_test=None)  # 'all'

# get effect sizes
effect_sizes = get_effect_sizes(X=eggs[['number_eggs_24hrs']],
                                y=eggs['antioxidant'],
                                control=CONTROL_ANTIOXIDANT,
                                effect_type=None,
                                linked_test='ANOVA')

# compile
test_results = pd.concat([stats, effect_sizes, pvals, reject], axis=1)
test_results.columns = ['stats', 'effect_size', 'pvals', 'reject']
test_results = test_results.sort_values(by=['pvals'], ascending=True)  # rank pvals
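# --- Illustration (not part of the pipeline) ---
# Hedged end-to-end sketch of the ANOVA -> effect-size -> compile pattern used
# above, on synthetic egg-count data (group labels and counts are invented;
# assumes tierpsytools is installed):
def _demo_egg_count_anova():
    import numpy as np
    import pandas as pd
    from tierpsytools.analysis.statistical_tests import univariate_tests, get_effect_sizes

    rng = np.random.default_rng(0)
    eggs = pd.DataFrame({
        'gene_name': ['wild_type'] * 10 + ['mutant_a'] * 10 + ['mutant_b'] * 10,
        'number_eggs_1hr': rng.poisson(lam=[5] * 10 + [8] * 10 + [5] * 10)})

    stats, pvals, reject = univariate_tests(X=eggs[['number_eggs_1hr']],
                                            y=eggs['gene_name'],
                                            test='ANOVA',
                                            control='wild_type',
                                            comparison_type='multiclass',
                                            multitest_correction='fdr_by',
                                            alpha=0.05)
    effect_sizes = get_effect_sizes(X=eggs[['number_eggs_1hr']],
                                    y=eggs['gene_name'],
                                    control='wild_type',
                                    effect_type=None,
                                    linked_test='ANOVA')

    test_results = pd.concat([stats, effect_sizes, pvals, reject], axis=1)
    test_results.columns = ['stats', 'effect_size', 'pvals', 'reject']
    return test_results.sort_values(by=['pvals'], ascending=True)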
def acute_rescue_stats(features,
                       metadata,
                       save_dir,
                       control_strain,
                       control_antioxidant,
                       control_window,
                       fdr_method='fdr_by',
                       pval_threshold=0.05):
    """ Pairwise t-tests for each window comparing worm 'motion mode paused fraction'
        on Keio mutants vs BW control

        # One could fit a multiple linear regression model to account for strain*antioxidant
        # in a single model: Y (motion_mode) = b0 + b1*X1 (strain) + b2*X2 (antiox) + e (error)
        # But this is a different type of question: we care about the difference in means
        # between fepD vs BW (albeit under different antioxidant treatments), and not about
        # modelling their relationship, so individual t-tests (multiple-test-corrected)
        # should suffice

        1. For each treatment condition, t-tests comparing fepD vs BW for motion_mode
        2. For fepD and BW separately, F-tests for equal variance among antioxidant
           treatment groups, then ANOVA tests for significant differences between
           antioxidants, then individual t-tests comparing each treatment to control

        Inputs
        ------
        features, metadata : pandas.DataFrame
        save_dir : str
            Directory to save statistics results
        control_strain, control_antioxidant, control_window
            Control groups for 'gene_name', 'antioxidant' and 'window', respectively
            (statistics are performed separately for each grouping variable, and
            p-values are adjusted for multiple test correction)
        fdr_method : str
            Multiple testing correction method to use
        pval_threshold : float
            P-value significance threshold
    """

    stats_dir = Path(save_dir) / "Stats" / fdr_method
    stats_dir.mkdir(parents=True, exist_ok=True)

    strain_list = [control_strain] + [s for s in set(metadata['gene_name'].unique())
                                      if s != control_strain]
    antiox_list = [control_antioxidant] + [a for a in set(metadata['antioxidant'].unique())
                                           if a != control_antioxidant]
    window_list = [control_window] + [w for w in set(metadata['window'].unique())
                                      if w != control_window]

    # categorical variables to investigate: 'gene_name', 'antioxidant' and 'window'
    print("\nInvestigating difference in fraction of worms paused between hit strain and " +
          "control (for each window), in the presence/absence of antioxidants:\n")

    # print mean sample size
    sample_size = df_summary_stats(metadata, columns=['gene_name', 'antioxidant', 'window'])
    print("Mean sample size of strain/antioxidant for each window: %d" %
          int(sample_size['n_samples'].mean()))

    # For each strain separately...
    for strain in strain_list:
        strain_meta = metadata[metadata['gene_name'] == strain]
        strain_feat = features.reindex(strain_meta.index)

        # 1. Is there any variation in fraction paused wrt antioxidant treatment?
        #    - ANOVA on pooled window data, then pairwise t-tests for each antioxidant
        print("Performing ANOVA on pooled window data for significant variation in " +
              "fraction of worms paused among different antioxidant treatments for %s..." % strain)

        # perform ANOVA (correct for multiple comparisons)
        stats, pvals, reject = univariate_tests(X=strain_feat[[FEATURE]],
                                                y=strain_meta['antioxidant'],
                                                test='ANOVA',
                                                control=control_antioxidant,
                                                comparison_type='multiclass',
                                                multitest_correction=fdr_method,
                                                alpha=pval_threshold,
                                                n_permutation_test=None)  # 'all'

        # get effect sizes
        effect_sizes = get_effect_sizes(X=strain_feat[[FEATURE]],
                                        y=strain_meta['antioxidant'],
                                        control=control_antioxidant,
                                        effect_type=None,
                                        linked_test='ANOVA')

        # compile
        test_results = pd.concat([stats, effect_sizes, pvals, reject], axis=1)
        test_results.columns = ['stats', 'effect_size', 'pvals', 'reject']
        test_results['significance'] = sig_asterix(test_results['pvals'])
        test_results = test_results.sort_values(by=['pvals'], ascending=True)  # rank pvals

        # save results
        anova_path = Path(stats_dir) / 'ANOVA_{}_variation_across_antioxidants.csv'.format(strain)
        test_results.to_csv(anova_path, header=True, index=True)

        print("Performing t-tests comparing each antioxidant treatment to None " +
              "(pooled window data)")
        stats_t, pvals_t, reject_t = univariate_tests(X=strain_feat[[FEATURE]],
                                                      y=strain_meta['antioxidant'],
                                                      test='t-test',
                                                      control=control_antioxidant,
                                                      comparison_type='binary_each_group',
                                                      multitest_correction=fdr_method,
                                                      alpha=pval_threshold)
        effect_sizes_t = get_effect_sizes(X=strain_feat[[FEATURE]],
                                          y=strain_meta['antioxidant'],
                                          control=control_antioxidant,
                                          effect_type=None,
                                          linked_test='t-test')

        # compile + save t-test results
        stats_t.columns = ['stats_' + str(c) for c in stats_t.columns]
        pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
        reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
        effect_sizes_t.columns = ['effect_size_' + str(c) for c in effect_sizes_t.columns]
        ttest_results = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
        ttest_save_path = stats_dir / 't-test_{}_antioxidant_results.csv'.format(strain)
        ttest_save_path.parent.mkdir(exist_ok=True, parents=True)
        ttest_results.to_csv(ttest_save_path, header=True, index=True)

        # 2. Is there any variation in fraction paused wrt window (time) across the videos?
        #    - ANOVA on pooled antioxidant data, then pairwise t-tests for each window
        print("Performing ANOVA on pooled antioxidant data for significant variation in " +
              "fraction of worms paused across (bluelight) window summaries for %s..." % strain)

        # perform ANOVA (correct for multiple comparisons)
        stats, pvals, reject = univariate_tests(X=strain_feat[[FEATURE]],
                                                y=strain_meta['window'],
                                                test='ANOVA',
                                                control=control_window,
                                                comparison_type='multiclass',
                                                multitest_correction=fdr_method,
                                                alpha=pval_threshold,
                                                n_permutation_test=None)

        # get effect sizes
        effect_sizes = get_effect_sizes(X=strain_feat[[FEATURE]],
                                        y=strain_meta['window'],
                                        control=control_window,
                                        effect_type=None,
                                        linked_test='ANOVA')

        # compile
        test_results = pd.concat([stats, effect_sizes, pvals, reject], axis=1)
        test_results.columns = ['stats', 'effect_size', 'pvals', 'reject']
        test_results['significance'] = sig_asterix(test_results['pvals'])
        test_results = test_results.sort_values(by=['pvals'], ascending=True)  # rank pvals

        # save results
        anova_path = Path(stats_dir) / 'ANOVA_{}_variation_across_windows.csv'.format(strain)
        test_results.to_csv(anova_path, header=True, index=True)

        print("Performing t-tests comparing each window with the first (pooled antioxidant data)")
        stats_t, pvals_t, reject_t = univariate_tests(X=strain_feat[[FEATURE]],
                                                      y=strain_meta['window'],
                                                      test='t-test',
                                                      control=control_window,
                                                      comparison_type='binary_each_group',
                                                      multitest_correction=fdr_method,
                                                      alpha=pval_threshold)
        effect_sizes_t = get_effect_sizes(X=strain_feat[[FEATURE]],
                                          y=strain_meta['window'],
                                          control=control_window,
                                          effect_type=None,
                                          linked_test='t-test')

        # compile + save t-test results
        stats_t.columns = ['stats_' + str(c) for c in stats_t.columns]
        pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
        reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
        effect_sizes_t.columns = ['effect_size_' + str(c) for c in effect_sizes_t.columns]
        ttest_results = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
        ttest_save_path = stats_dir / 't-test_{}_window_results.csv'.format(strain)
        ttest_save_path.parent.mkdir(exist_ok=True, parents=True)
        ttest_results.to_csv(ttest_save_path, header=True, index=True)

    # Pairwise t-tests - is there a difference between strain vs control?
    control_meta = metadata[metadata['gene_name'] == control_strain]
    control_feat = features.reindex(control_meta.index)
    control_df = control_meta.join(control_feat[[FEATURE]])

    for strain in strain_list[1:]:  # skip control_strain at first index position
        strain_meta = metadata[metadata['gene_name'] == strain]
        strain_feat = features.reindex(strain_meta.index)
        strain_df = strain_meta.join(strain_feat[[FEATURE]])

        # 3. Is there a difference between strain vs control at any window?
        print("\nPairwise t-tests for each window (pooled antioxidants) comparing fraction " +
              "of worms paused on %s vs control:" % strain)
        stats, pvals, reject = pairwise_ttest(control_df,
                                              strain_df,
                                              feature_list=[FEATURE],
                                              group_by='window',
                                              fdr_method=fdr_method,
                                              fdr=pval_threshold)

        # compile table of results
        stats.columns = ['stats_' + str(c) for c in stats.columns]
        pvals.columns = ['pvals_' + str(c) for c in pvals.columns]
        reject.columns = ['reject_' + str(c) for c in reject.columns]
        test_results = pd.concat([stats, pvals, reject], axis=1)

        # save results
        ttest_strain_path = stats_dir / 'pairwise_ttests' / 'window' /\
            '{}_window_results.csv'.format(strain)
        ttest_strain_path.parent.mkdir(parents=True, exist_ok=True)
        test_results.to_csv(ttest_strain_path, header=True, index=True)

        # for each antioxidant treatment condition...
        for antiox in antiox_list:
            print("Pairwise t-tests for each window comparing fraction of worms paused " +
                  "on %s vs control with '%s'" % (strain, antiox))
            antiox_control_df = control_df[control_df['antioxidant'] == antiox]
            antiox_strain_df = strain_df[strain_df['antioxidant'] == antiox]

            stats, pvals, reject = pairwise_ttest(antiox_control_df,
                                                  antiox_strain_df,
                                                  feature_list=[FEATURE],
                                                  group_by='window',
                                                  fdr_method=fdr_method,
                                                  fdr=pval_threshold)

            # compile table of results
            stats.columns = ['stats_' + str(c) for c in stats.columns]
            pvals.columns = ['pvals_' + str(c) for c in pvals.columns]
            reject.columns = ['reject_' + str(c) for c in reject.columns]
            test_results = pd.concat([stats, pvals, reject], axis=1)

            # save results
            ttest_strain_path = stats_dir / 'pairwise_ttests' / 'window' /\
                '{0}_{1}_window_results.csv'.format(strain, antiox)
            ttest_strain_path.parent.mkdir(parents=True, exist_ok=True)
            test_results.to_csv(ttest_strain_path, header=True, index=True)

        # 4. Is there a difference between strain vs control for any antioxidant?
        print("\nPairwise t-tests for each antioxidant (pooled windows) comparing fraction " +
              "of worms paused on %s vs control:" % strain)
        stats, pvals, reject = pairwise_ttest(control_df,
                                              strain_df,
                                              feature_list=[FEATURE],
                                              group_by='antioxidant',
                                              fdr_method=fdr_method,
                                              fdr=pval_threshold)

        # compile table of results
        stats.columns = ['stats_' + str(c) for c in stats.columns]
        pvals.columns = ['pvals_' + str(c) for c in pvals.columns]
        reject.columns = ['reject_' + str(c) for c in reject.columns]
        test_results = pd.concat([stats, pvals, reject], axis=1)

        # save results
        ttest_strain_path = stats_dir / 'pairwise_ttests' / 'antioxidant' /\
            '{}_antioxidant_results.csv'.format(strain)
        ttest_strain_path.parent.mkdir(parents=True, exist_ok=True)
        test_results.to_csv(ttest_strain_path, header=True, index=True)

        # For each window...
        for window in window_list:
            print("Pairwise t-tests for each antioxidant comparing fraction of worms " +
                  "paused on %s vs control at window %d" % (strain, window))
            window_control_df = control_df[control_df['window'] == window]
            window_strain_df = strain_df[strain_df['window'] == window]

            stats, pvals, reject = pairwise_ttest(window_control_df,
                                                  window_strain_df,
                                                  feature_list=[FEATURE],
                                                  group_by='antioxidant',
                                                  fdr_method=fdr_method,
                                                  fdr=pval_threshold)

            # compile table of results
            stats.columns = ['stats_' + str(c) for c in stats.columns]
            pvals.columns = ['pvals_' + str(c) for c in pvals.columns]
            reject.columns = ['reject_' + str(c) for c in reject.columns]
            test_results = pd.concat([stats, pvals, reject], axis=1)

            # save results
            ttest_strain_path = stats_dir / 'pairwise_ttests' / 'antioxidant' /\
                '{0}_window{1}_antioxidant_results.csv'.format(strain, window)
            ttest_strain_path.parent.mkdir(parents=True, exist_ok=True)
            test_results.to_csv(ttest_strain_path, header=True, index=True)

    return
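# --- Illustration (not part of the pipeline) ---
# Hedged sketch of the alternative mentioned in the docstring above: a single
# linear model with a strain*antioxidant interaction term, rather than many
# t-tests. This uses statsmodels (an assumption; the pipeline itself does not),
# and 'df' is assumed to be a tidy frame with one row per well/video containing
# the 'gene_name', 'antioxidant' and 'motion_mode_paused_fraction' columns:
def _demo_interaction_model(df):
    import statsmodels.formula.api as smf

    # motion_mode ~ strain + antioxidant + strain:antioxidant interaction
    model = smf.ols('motion_mode_paused_fraction ~ C(gene_name) * C(antioxidant)',
                    data=df).fit()
    return model.summary()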
def antioxidant_stats(features, metadata, args):
    """ Perform statistical analyses on Keio antioxidant rescue experiment results:
        - ANOVA tests for significant feature variation between strains (for each
          antioxidant treatment in turn)
        - ANOVA tests for significant feature variation in antioxidant treatment
          (for each strain in turn)
        - t-tests for each feature comparing each strain vs control for paired
          antioxidant treatment conditions
        - t-tests for each feature comparing each strain antioxidant treatment to
          negative control (no antioxidant)

        Inputs
        ------
        features, metadata : pd.DataFrame
            Clean feature summaries and accompanying metadata

        args : Object
            Python object with the following attributes:
            - drop_size_features : bool
            - norm_features_only : bool
            - percentile_to_use : str
            - remove_outliers : bool
            - control_dict : dict
            - n_top_feats : int
            - tierpsy_top_feats_dir (if n_top_feats) : str
            - test : str
            - f_test : bool
            - pval_threshold : float
            - fdr_method : str
            - n_sig_features : int
    """

    # categorical variables to investigate: 'gene_name' and 'antioxidant'
    print("\nInvestigating variation in worm behaviour on hit strains treated with " +
          "different antioxidants")

    # assert there will be no errors due to case-sensitivity
    assert len(metadata[STRAIN_COLNAME].unique()) == len(
        metadata[STRAIN_COLNAME].str.upper().unique())
    assert len(metadata[TREATMENT_COLNAME].unique()) == len(
        metadata[TREATMENT_COLNAME].str.upper().unique())

    assert not features.isna().any().any()

    strain_list = list(metadata[STRAIN_COLNAME].unique())
    antioxidant_list = list(metadata[TREATMENT_COLNAME].unique())
    assert CONTROL_STRAIN in strain_list and CONTROL_TREATMENT in antioxidant_list

    # print mean sample size
    sample_size = df_summary_stats(metadata, columns=[STRAIN_COLNAME, TREATMENT_COLNAME])
    print("Mean sample size of %s: %d" % (STRAIN_COLNAME,
                                          int(sample_size['n_samples'].mean())))

    # construct save paths (args.save_dir / topfeats? etc)
    save_dir = get_save_dir(args)
    stats_dir = save_dir / "Stats" / args.fdr_method

    ### For each antioxidant treatment in turn...
    for antiox in antioxidant_list:
        print("\n%s" % antiox)
        meta_antiox = metadata[metadata[TREATMENT_COLNAME] == antiox]
        feat_antiox = features.reindex(meta_antiox.index)

        ### ANOVA tests for significant variation between strains

        # make path to save ANOVA results
        test_path_uncorrected = stats_dir / '{}_uncorrected.csv'.format(args.test + '_' + antiox)
        test_path = stats_dir / '{}_results.csv'.format(args.test + '_' + antiox)
        test_path.parent.mkdir(exist_ok=True, parents=True)

        if len(meta_antiox[STRAIN_COLNAME].unique()) > 2:
            # perform ANOVA + record results before & after correcting for multiple comparisons
            stats, pvals, reject = univariate_tests(X=feat_antiox,
                                                    y=meta_antiox[STRAIN_COLNAME],
                                                    test=args.test,
                                                    control=CONTROL_STRAIN,
                                                    comparison_type='multiclass',
                                                    multitest_correction=None,  # uncorrected
                                                    alpha=args.pval_threshold,
                                                    n_permutation_test=None)  # 'all'

            # get effect sizes
            effect_sizes = get_effect_sizes(X=feat_antiox,
                                            y=meta_antiox[STRAIN_COLNAME],
                                            control=CONTROL_STRAIN,
                                            effect_type=None,
                                            linked_test=args.test)

            # compile + save results (uncorrected)
            test_results = pd.concat([stats, effect_sizes, pvals, reject], axis=1)
            test_results.columns = ['stats', 'effect_size', 'pvals', 'reject']
            test_results['significance'] = sig_asterix(test_results['pvals'])
            test_results = test_results.sort_values(by=['pvals'], ascending=True)  # rank pvals
            test_results.to_csv(test_path_uncorrected, header=True, index=True)

            # correct for multiple comparisons
            reject_corrected, pvals_corrected = _multitest_correct(pvals,
                                                                   multitest_method=args.fdr_method,
                                                                   fdr=args.pval_threshold)

            # compile + save results (corrected)
            test_results = pd.concat([stats, effect_sizes, pvals_corrected, reject_corrected],
                                     axis=1)
            test_results.columns = ['stats', 'effect_size', 'pvals', 'reject']
            test_results['significance'] = sig_asterix(test_results['pvals'])
            test_results = test_results.sort_values(by=['pvals'], ascending=True)  # rank pvals
            test_results.to_csv(test_path, header=True, index=True)

            print("%s differences in '%s' across strains on %s (%s, P<%.2f, %s)" %
                  (("SIGNIFICANT" if reject_corrected.loc[FEATURE, args.test]
                    else "No significant"),
                   FEATURE, antiox, args.test, args.pval_threshold, args.fdr_method))
        else:
            print("\nWARNING: Not enough %s groups for %s (n=%d)" %
                  (STRAIN_COLNAME, args.test, len(strain_list)))

        ### t-tests comparing each strain vs control for each antioxidant treatment condition
        if len(meta_antiox[STRAIN_COLNAME].unique()) == 2 or (
                len(meta_antiox[STRAIN_COLNAME].unique()) > 2 and
                reject_corrected.loc[FEATURE, args.test]):

            # t-test to use
            t_test = 't-test' if args.test == 'ANOVA' else 'Mann-Whitney'  # aka. Wilcoxon rank-sum

            ttest_path_uncorrected = stats_dir / '{}_uncorrected.csv'.format(t_test + '_' + antiox)
            ttest_path = stats_dir / '{}_results.csv'.format(t_test + '_' + antiox)
            ttest_path.parent.mkdir(exist_ok=True, parents=True)

            # perform t-tests (without correction for multiple testing)
            stats_t, pvals_t, reject_t = univariate_tests(X=feat_antiox,
                                                          y=meta_antiox[STRAIN_COLNAME],
                                                          control=CONTROL_STRAIN,
                                                          test=t_test,
                                                          comparison_type='binary_each_group',
                                                          multitest_correction=None,
                                                          alpha=0.05)

            # get effect sizes for comparisons
            effect_sizes_t = get_effect_sizes(X=feat_antiox,
                                              y=meta_antiox[STRAIN_COLNAME],
                                              control=CONTROL_STRAIN,
                                              effect_type=None,
                                              linked_test=t_test)

            # compile + save t-test results (uncorrected)
            stats_t.columns = ['stats_' + str(c) for c in stats_t.columns]
            pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
            reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
            effect_sizes_t.columns = ['effect_size_' + str(c) for c in effect_sizes_t.columns]
            ttest_uncorrected = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
            ttest_uncorrected.to_csv(ttest_path_uncorrected, header=True, index=True)

            # correct for multiple comparisons
            pvals_t.columns = [c.split("_")[-1] for c in pvals_t.columns]
            reject_t, pvals_t = _multitest_correct(pvals_t,
                                                   multitest_method=args.fdr_method,
                                                   fdr=args.pval_threshold)

            # compile + save t-test results (corrected)
            pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
            reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
            ttest_corrected = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
            ttest_corrected.to_csv(ttest_path, header=True, index=True)

            nsig = reject_t.loc[FEATURE].sum()
            print("%d %ss differ from %s in '%s' on %s (%s, P<%.2f, %s)" %
                  (nsig, STRAIN_COLNAME, CONTROL_STRAIN, FEATURE, antiox, t_test,
                   args.pval_threshold, args.fdr_method))

    ### For each strain in turn...
    for strain in strain_list:
        print("\n%s" % strain)
        meta_strain = metadata[metadata[STRAIN_COLNAME] == strain]
        feat_strain = features.reindex(meta_strain.index)

        ### ANOVA tests for significant feature variation in antioxidant treatment

        # make path to save ANOVA results
        test_path_uncorrected = stats_dir / '{}_uncorrected.csv'.format(args.test + '_' + strain)
        test_path = stats_dir / '{}_results.csv'.format(args.test + '_' + strain)
        test_path.parent.mkdir(exist_ok=True, parents=True)

        if len(meta_strain[TREATMENT_COLNAME].unique()) > 2:
            # perform ANOVA + record results before & after correcting for multiple comparisons
            stats, pvals, reject = univariate_tests(X=feat_strain,
                                                    y=meta_strain[TREATMENT_COLNAME],
                                                    test=args.test,
                                                    control=CONTROL_TREATMENT,
                                                    comparison_type='multiclass',
                                                    multitest_correction=None,  # uncorrected
                                                    alpha=args.pval_threshold,
                                                    n_permutation_test=None)  # 'all'

            # get effect sizes
            effect_sizes = get_effect_sizes(X=feat_strain,
                                            y=meta_strain[TREATMENT_COLNAME],
                                            control=CONTROL_TREATMENT,
                                            effect_type=None,
                                            linked_test=args.test)

            # compile + save results (uncorrected)
            test_results = pd.concat([stats, effect_sizes, pvals, reject], axis=1)
            test_results.columns = ['stats', 'effect_size', 'pvals', 'reject']
            test_results['significance'] = sig_asterix(test_results['pvals'])
            test_results = test_results.sort_values(by=['pvals'], ascending=True)  # rank pvals
            test_results.to_csv(test_path_uncorrected, header=True, index=True)

            # correct for multiple comparisons
            reject_corrected, pvals_corrected = _multitest_correct(pvals,
                                                                   multitest_method=args.fdr_method,
                                                                   fdr=args.pval_threshold)

            # compile + save results (corrected)
            test_results = pd.concat([stats, effect_sizes, pvals_corrected, reject_corrected],
                                     axis=1)
            test_results.columns = ['stats', 'effect_size', 'pvals', 'reject']
            test_results['significance'] = sig_asterix(test_results['pvals'])
            test_results = test_results.sort_values(by=['pvals'], ascending=True)  # rank pvals
            test_results.to_csv(test_path, header=True, index=True)

            print("%s differences in '%s' across %ss for %s (%s, P<%.2f, %s)" %
                  (("SIGNIFICANT" if reject_corrected.loc[FEATURE, args.test] else "No"),
                   FEATURE, TREATMENT_COLNAME, strain, args.test, args.pval_threshold,
                   args.fdr_method))
        else:
            print("\nWARNING: Not enough %s groups for %s (n=%d)" %
                  (TREATMENT_COLNAME, args.test, len(antioxidant_list)))

        ### t-tests comparing each antioxidant treatment to no antioxidant for each strain
        if len(meta_strain[TREATMENT_COLNAME].unique()) == 2 or (
                len(meta_strain[TREATMENT_COLNAME].unique()) > 2 and
                reject_corrected.loc[FEATURE, args.test]):

            # t-test to use
            t_test = 't-test' if args.test == 'ANOVA' else 'Mann-Whitney'  # aka. Wilcoxon rank-sum

            ttest_path_uncorrected = stats_dir / '{}_uncorrected.csv'.format(t_test + '_' + strain)
            ttest_path = stats_dir / '{}_results.csv'.format(t_test + '_' + strain)
            ttest_path.parent.mkdir(exist_ok=True, parents=True)

            # perform t-tests (without correction for multiple testing)
            stats_t, pvals_t, reject_t = univariate_tests(X=feat_strain,
                                                          y=meta_strain[TREATMENT_COLNAME],
                                                          control=CONTROL_TREATMENT,
                                                          test=t_test,
                                                          comparison_type='binary_each_group',
                                                          multitest_correction=None,
                                                          alpha=0.05)

            # get effect sizes for comparisons
            effect_sizes_t = get_effect_sizes(X=feat_strain,
                                              y=meta_strain[TREATMENT_COLNAME],
                                              control=CONTROL_TREATMENT,
                                              effect_type=None,
                                              linked_test=t_test)

            # compile + save t-test results (uncorrected)
            stats_t.columns = ['stats_' + str(c) for c in stats_t.columns]
            pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
            reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
            effect_sizes_t.columns = ['effect_size_' + str(c) for c in effect_sizes_t.columns]
            ttest_uncorrected = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
            ttest_uncorrected.to_csv(ttest_path_uncorrected, header=True, index=True)

            # correct for multiple comparisons
            pvals_t.columns = [c.split("_")[-1] for c in pvals_t.columns]
            reject_t, pvals_t = _multitest_correct(pvals_t,
                                                   multitest_method=args.fdr_method,
                                                   fdr=args.pval_threshold)

            # compile + save t-test results (corrected)
            pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
            reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
            ttest_corrected = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
            ttest_corrected.to_csv(ttest_path, header=True, index=True)

            nsig = reject_t.loc[FEATURE].sum()
            print("%d %ss differ from %s in '%s' for %s (%s, P<%.2f, %s)" %
                  (nsig, TREATMENT_COLNAME, CONTROL_TREATMENT, FEATURE, strain, t_test,
                   args.pval_threshold, args.fdr_method))

    ### Pairwise t-tests comparing strain vs control behaviour on each antioxidant
    print("\nPerforming pairwise t-tests:")

    # subset for control data
    control_strain_meta = metadata[metadata[STRAIN_COLNAME] == CONTROL_STRAIN]
    control_strain_feat = features.reindex(control_strain_meta.index)
    control_df = control_strain_meta.join(control_strain_feat)

    for strain in strain_list:
        if strain == CONTROL_STRAIN:
            continue

        # subset for strain data
        strain_meta = metadata[metadata[STRAIN_COLNAME] == strain]
        strain_feat = features.reindex(strain_meta.index)
        strain_df = strain_meta.join(strain_feat)

        # perform pairwise t-tests comparing strain with control for each antioxidant treatment
        stats, pvals, reject = pairwise_ttest(control_df,
                                              strain_df,
                                              feature_list=[FEATURE],
                                              group_by=TREATMENT_COLNAME,
                                              fdr_method=args.fdr_method,
                                              fdr=args.pval_threshold)

        # compile table of results
        stats.columns = ['stats_' + str(c) for c in stats.columns]
        pvals.columns = ['pvals_' + str(c) for c in pvals.columns]
        reject.columns = ['reject_' + str(c) for c in reject.columns]
        test_results = pd.concat([stats, pvals, reject], axis=1)

        # save results
        ttest_strain_path = stats_dir / 'pairwise_ttests' / '{}_results.csv'.format(
            strain + "_vs_" + CONTROL_STRAIN)
        ttest_strain_path.parent.mkdir(parents=True, exist_ok=True)
        test_results.to_csv(ttest_strain_path, header=True, index=True)

        for antiox in antioxidant_list:
            print("%s difference in '%s' between %s vs %s on %s (paired t-test, P=%.3f, %s)" %
                  (("SIGNIFICANT" if reject.loc[FEATURE, 'reject_{}'.format(antiox)] else "No"),
                   FEATURE, strain, CONTROL_STRAIN, antiox,
                   pvals.loc[FEATURE, 'pvals_{}'.format(antiox)], args.fdr_method))
def keio_stats(features, metadata, args):
    """ Perform statistical analyses on Keio screen results:
        - ANOVA tests for significant between-strain variation among all strains
          for each feature
        - t-tests for significant differences between each strain and control for
          each feature
        - k-significant feature selection for agreement with ANOVA significant
          feature set

        Inputs
        ------
        features, metadata : pd.DataFrame
            Clean feature summaries and accompanying metadata

        args : Object
            Python object with the following attributes:
            - drop_size_features : bool
            - norm_features_only : bool
            - percentile_to_use : str
            - remove_outliers : bool
            - omit_strains : list
            - grouping_variable : str
            - control_dict : dict
            - collapse_control : bool
            - n_top_feats : int
            - tierpsy_top_feats_dir (if n_top_feats) : str
            - test : str
            - f_test : bool
            - pval_threshold : float
            - fdr_method : str
            - n_sig_features : int
    """

    # categorical variable to investigate, eg. 'gene_name'
    grouping_var = args.grouping_variable
    print("\nInvestigating '%s' variation" % grouping_var)

    # assert there will be no errors due to case-sensitivity
    assert len(metadata[grouping_var].unique()) == len(
        metadata[grouping_var].str.upper().unique())

    # Subset results (rows) to omit selected strains
    if args.omit_strains is not None:
        features, metadata = subset_results(features, metadata, grouping_var,
                                            args.omit_strains)

    # Load Tierpsy Top feature set + subset (columns) for top feats only
    if args.n_top_feats is not None:
        top_feats_path = Path(args.tierpsy_top_feats_dir) / "tierpsy_{}.csv".format(
            str(args.n_top_feats))
        topfeats = load_topfeats(top_feats_path,
                                 add_bluelight=args.align_bluelight,
                                 remove_path_curvature=True,
                                 header=None)

        # Drop features that are not in results
        top_feats_list = [feat for feat in list(topfeats) if feat in features.columns]
        features = features[top_feats_list]

    assert not features.isna().any().any()

    strain_list = list(metadata[grouping_var].unique())
    control = args.control_dict[grouping_var]  # control strain to use
    assert control in strain_list

    if args.collapse_control:
        print("Collapsing control data (mean of each day)")
        features, metadata = average_plate_control_data(features,
                                                        metadata,
                                                        control=control,
                                                        grouping_var=grouping_var,
                                                        plate_var='imaging_plate_id')

    _ = df_summary_stats(metadata)  # summary df  # TODO: plot from?

    # Record mean sample size per group
    mean_sample_size = int(np.round(metadata.join(features).groupby(
        [grouping_var], as_index=False).size().mean()))
    print("Mean sample size: %d" % mean_sample_size)

    # construct save paths (args.save_dir / topfeats? etc)
    save_dir = get_save_dir(args)
    stats_dir = save_dir / grouping_var / "Stats" / args.fdr_method
    plot_dir = save_dir / grouping_var / "Plots" / args.fdr_method

    #%% F-test for equal variances

    # Compare variance in samples with control (and correct for multiple comparisons).
    # Sample size matters in that unequal variances do not pose a problem for a t-test
    # with equal sample sizes. So as long as your sample sizes are equal, you don't
    # have to worry about homogeneity of variances. If they are not equal, perform
    # F-tests first to see if variance is equal before doing a t-test.
    if args.f_test:
        levene_stats_path = stats_dir / 'levene_results.csv'
        levene_stats = levene_f_test(features, metadata, grouping_var,
                                     p_value_threshold=args.pval_threshold,
                                     multitest_method=args.fdr_method,
                                     saveto=levene_stats_path,
                                     del_if_exists=False)
        # if p < 0.05 then variances are not equal, and sample size matters
        prop_eqvar = (levene_stats['pval'] > args.pval_threshold).sum() / len(levene_stats['pval'])
        print("Percentage equal variance %.1f%%" % (prop_eqvar * 100))

    #%% ANOVA / Kruskal-Wallis tests for significantly different features across groups

    test_path_uncorrected = stats_dir / '{}_results_uncorrected.csv'.format(args.test)
    test_path = stats_dir / '{}_results.csv'.format(args.test)

    if not (test_path.exists() and test_path_uncorrected.exists()):
        test_path.parent.mkdir(exist_ok=True, parents=True)

        if (args.test == "ANOVA" or args.test == "Kruskal"):
            if len(strain_list) > 2:
                # perform ANOVA + record results before & after correcting for multiple comparisons
                stats, pvals, reject = univariate_tests(X=features,
                                                        y=metadata[grouping_var],
                                                        control=control,
                                                        test=args.test,
                                                        comparison_type='multiclass',
                                                        multitest_correction=None,  # uncorrected
                                                        alpha=args.pval_threshold,
                                                        n_permutation_test='all')

                # get effect sizes
                effect_sizes = get_effect_sizes(X=features,
                                                y=metadata[grouping_var],
                                                control=control,
                                                effect_type=None,
                                                linked_test=args.test)

                # compile + save results (uncorrected)
                test_results = pd.concat([stats, effect_sizes, pvals, reject], axis=1)
                test_results.columns = ['stats', 'effect_size', 'pvals', 'reject']
                test_results['significance'] = sig_asterix(test_results['pvals'])
                test_results = test_results.sort_values(by=['pvals'], ascending=True)  # rank pvals
                test_results.to_csv(test_path_uncorrected, header=True, index=True)

                # correct for multiple comparisons
                reject_corrected, pvals_corrected = _multitest_correct(pvals,
                                                                       multitest_method=args.fdr_method,
                                                                       fdr=args.pval_threshold)

                # compile + save results (corrected)
                test_results = pd.concat([stats, effect_sizes, pvals_corrected,
                                          reject_corrected], axis=1)
                test_results.columns = ['stats', 'effect_size', 'pvals', 'reject']
                test_results['significance'] = sig_asterix(test_results['pvals'])
                test_results = test_results.sort_values(by=['pvals'], ascending=True)  # rank pvals
                test_results.to_csv(test_path, header=True, index=True)

                # use reject mask to find significant feature set
                fset = pvals.loc[reject[args.test]].sort_values(by=args.test,
                                                                ascending=True).index.to_list()
                #assert set(fset) == set(anova_corrected['pvals'].index[np.where(
                #    anova_corrected['pvals'] < args.pval_threshold)[0]])

                if len(fset) > 0:
                    print("%d significant features found by %s for '%s' (P<%.2f, %s)" %
                          (len(fset), args.test, grouping_var, args.pval_threshold,
                           args.fdr_method))
                    anova_sigfeats_path = stats_dir / '{}_sigfeats.txt'.format(args.test)
                    write_list_to_file(fset, anova_sigfeats_path)
            else:
                fset = []
                print("\nWARNING: Not enough groups for %s for '%s' (n=%d groups)" %
                      (args.test, grouping_var, len(strain_list)))

        #%% Linear Mixed Models (LMMs), accounting for day-to-day variation
        # NB: Ideally report: parameter | beta | lower-95 | upper-95 | random effect (SD)
        elif args.test == 'LMM':
            with warnings.catch_warnings():
                # Filter warnings as parameter is often on the boundary
                warnings.filterwarnings("ignore")
                #warnings.simplefilter("ignore", ConvergenceWarning)
                (signif_effect, low_effect, error, mask, pvals) =\
                    compounds_with_low_effect_univariate(feat=features,
                                                         drug_name=metadata[grouping_var],
                                                         drug_dose=None,
                                                         random_effect=metadata[args.lmm_random_effect],
                                                         control=control,
                                                         test=args.test,
                                                         comparison_type='multiclass',
                                                         multitest_method=args.fdr_method)
            assert len(error) == 0

            # save pvals
            pvals.to_csv(test_path_uncorrected, header=True, index=True)

            # save significant features -- if any strain is significant for any feature
            fset = pvals.columns[(pvals < args.pval_threshold).any()].to_list()
            if len(fset) > 0:
                lmm_sigfeats_path = stats_dir / '{}_sigfeats.txt'.format(args.test)
                write_list_to_file(fset, lmm_sigfeats_path)

            # save significant effect strains
            if len(signif_effect) > 0:
                print("%d significant features found (%d significant %ss vs %s control, "
                      "after accounting for %s variation, %s, P<%.2f, %s)" %
                      (len(fset), len(signif_effect), grouping_var.replace('_', ' '),
                       control, args.lmm_random_effect.split('_yyyymmdd')[0], args.test,
                       args.pval_threshold, args.fdr_method))
                signif_effect_path = stats_dir / '{}_signif_effect_strains.txt'.format(args.test)
                write_list_to_file(signif_effect, signif_effect_path)
            else:
                print("No significant differences found between %ss after accounting "
                      "for %s variation (%s, P<%.2f, %s)" %
                      (grouping_var.replace('_', ' '),
                       args.lmm_random_effect.split('_yyyymmdd')[0], args.test,
                       args.pval_threshold, args.fdr_method))

        else:
            raise IOError("Test '{}' not recognised".format(args.test))

    #%% t-tests / Mann-Whitney tests

    # t-test to use
    t_test = 't-test' if args.test == 'ANOVA' else 'Mann-Whitney'  # aka. Wilcoxon rank-sum

    ttest_path_uncorrected = stats_dir / '{}_results_uncorrected.csv'.format(t_test)
    ttest_path = stats_dir / '{}_results.csv'.format(t_test)

    if not (ttest_path_uncorrected.exists() and ttest_path.exists()):
        ttest_path.parent.mkdir(exist_ok=True, parents=True)

        if len(fset) > 0 or len(strain_list) == 2:
            # perform t-tests (without correction for multiple testing)
            stats_t, pvals_t, reject_t = univariate_tests(X=features,
                                                          y=metadata[grouping_var],
                                                          control=control,
                                                          test=t_test,
                                                          comparison_type='binary_each_group',
                                                          multitest_correction=None,
                                                          alpha=0.05)

            # get effect sizes for comparisons
            effect_sizes_t = get_effect_sizes(X=features,
                                              y=metadata[grouping_var],
                                              control=control,
                                              effect_type=None,
                                              linked_test=t_test)

            # compile + save t-test results (uncorrected)
            stats_t.columns = ['stats_' + str(c) for c in stats_t.columns]
            pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
            reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
            effect_sizes_t.columns = ['effect_size_' + str(c) for c in effect_sizes_t.columns]
            ttest_uncorrected = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
            ttest_uncorrected.to_csv(ttest_path_uncorrected, header=True, index=True)

            # correct for multiple comparisons
            pvals_t.columns = [c.split("_")[-1] for c in pvals_t.columns]
            reject_t, pvals_t = _multitest_correct(pvals_t,
                                                   multitest_method=args.fdr_method,
                                                   fdr=args.pval_threshold)

            # compile + save t-test results (corrected)
            pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
            reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
            ttest_corrected = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)
            ttest_corrected.to_csv(ttest_path, header=True, index=True)

            # record t-test significant features (not ordered)
            fset_ttest = pvals_t[np.asmatrix(reject_t)].index.unique().to_list()
            #assert set(fset_ttest) == set(pvals_t.index[(pvals_t < args.pval_threshold).sum(axis=1) > 0])
            print("%d significant features found for any %s vs %s (%s, P<%.2f)" %
                  (len(fset_ttest), grouping_var, control, t_test, args.pval_threshold))

            if len(fset_ttest) > 0:
                ttest_sigfeats_path = stats_dir / '{}_sigfeats.txt'.format(t_test)
                write_list_to_file(fset_ttest, ttest_sigfeats_path)

    #%% K significant features

    ksig_uncorrected_path = stats_dir / 'k_significant_features_uncorrected.csv'
    ksig_corrected_path = stats_dir / 'k_significant_features.csv'

    if not (ksig_uncorrected_path.exists() and ksig_corrected_path.exists()):
        ksig_corrected_path.parent.mkdir(exist_ok=True, parents=True)
        fset_ksig, (scores, pvalues_ksig), support = k_significant_feat(feat=features,
                                                                        y_class=metadata[grouping_var],
                                                                        k=len(fset),
                                                                        score_func='f_classif',
                                                                        scale=None,
                                                                        feat_names=None,
                                                                        plot=False,
                                                                        k_to_plot=None,
                                                                        close_after_plotting=True,
                                                                        saveto=None,  # k_sigfeat_dir
                                                                        figsize=None,
                                                                        title=None,
                                                                        xlabel=None)

        # compile + save k-significant features (uncorrected)
        ksig_table = pd.concat([pd.Series(scores), pd.Series(pvalues_ksig)], axis=1)
        ksig_table.columns = ['scores', 'pvals']
        ksig_table.index = fset_ksig
        ksig_table.to_csv(ksig_uncorrected_path, header=True, index=True)

        # correct for multiple comparisons
        _, ksig_table['pvals'] = _multitest_correct(ksig_table['pvals'],
                                                    multitest_method=args.fdr_method,
                                                    fdr=args.pval_threshold)

        # save k-significant features (corrected)
        ksig_table.to_csv(ksig_corrected_path, header=True, index=True)

    #%% mRMR feature selection: minimum Redundancy, Maximum Relevance

    mrmr_dir = plot_dir / 'mrmr'
    mrmr_dir.mkdir(exist_ok=True, parents=True)
    mrmr_results_path = mrmr_dir / "mrmr_results.csv"

    if not mrmr_results_path.exists():
        estimator = Pipeline([('scaler', StandardScaler()),
                              ('estimator', LogisticRegression())])
        y = metadata[grouping_var].values
        (mrmr_feat_set,
         mrmr_scores,
         mrmr_support) = mRMR_feature_selection(features,
                                                y_class=y,
                                                k=10,
                                                redundancy_func='pearson_corr',
                                                relevance_func='kruskal',
                                                n_bins=10,
                                                mrmr_criterion='MID',
                                                plot=True,
                                                k_to_plot=5,
                                                close_after_plotting=True,
                                                saveto=mrmr_dir,
                                                figsize=None)

        # save results
        mrmr_table = pd.concat([pd.Series(mrmr_feat_set), pd.Series(mrmr_scores)], axis=1)
        mrmr_table.columns = ['feature', 'score']
        mrmr_table.to_csv(mrmr_results_path, header=True, index=False)

        n_cv = 5
        cv_scores_mrmr = cross_val_score(estimator, features[mrmr_feat_set], y, cv=n_cv)
        cv_scores_mrmr = pd.DataFrame(cv_scores_mrmr, columns=['cv_score'])
        cv_scores_mrmr.to_csv(mrmr_dir / "cv_scores.csv", header=True, index=False)
        print('MRMR CV Score: %f (n=%d)' % (np.mean(cv_scores_mrmr), n_cv))
    else:
        # load mrmr results
        mrmr_table = pd.read_csv(mrmr_results_path)
        mrmr_feat_set = mrmr_table['feature'].to_list()

    print("\nTop %d features found by MRMR:" % len(mrmr_feat_set))
    for feat in mrmr_feat_set:
        print(feat)
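# --- Illustration (not part of the pipeline) ---
# Hedged sketch of k-significant feature selection on synthetic data, mirroring
# the call shape used above (the import path is assumed from tierpsytools, and
# all feature values and class labels below are invented):
def _demo_k_significant_feat():
    import numpy as np
    import pandas as pd
    from tierpsytools.analysis.significant_features import k_significant_feat

    rng = np.random.default_rng(1)
    features = pd.DataFrame({'feat_a': rng.normal(size=40),
                             'feat_b': np.r_[rng.normal(0, 1, 20), rng.normal(2, 1, 20)]})
    y_class = pd.Series(['control'] * 20 + ['mutant'] * 20)

    fset, (scores, pvalues), support = k_significant_feat(feat=features,
                                                          y_class=y_class,
                                                          k=1,
                                                          score_func='f_classif',
                                                          plot=False)
    return fset  # expected to rank 'feat_b' first, as only it separates the classes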
def single_feature_window_stats(metadata,
                                features,
                                group_by,
                                control,
                                save_dir,
                                windows=None,
                                feat='motion_mode_paused_fraction',
                                pvalue_threshold=0.05,
                                fdr_method='fdr_by'):
    """ Pairwise t-tests for each window comparing a feature of worm behaviour on
        mutant strains vs control

        Parameters
        ----------
        metadata : pandas.DataFrame
        features : pandas.DataFrame
            Dataframe of compiled window summaries
        group_by : str
            Column name of variable containing control and other groups to compare,
            eg. 'gene_name'
        control : str
            Name of control group in 'group_by' column in metadata
        save_dir : str
            Path to directory to save results files
        windows : list
            List of window numbers at which to compare strains (corrected for
            multiple testing)
        feat : str
            Feature to test
        pvalue_threshold : float
            P-value significance threshold
        fdr_method : str
            Multiple testing correction method to use
    """

    import pandas as pd
    from pathlib import Path
    from statistical_testing.stats_helper import pairwise_ttest
    from statistical_testing.perform_keio_stats import df_summary_stats
    from visualisation.plotting_helper import sig_asterix
    from write_data.write import write_list_to_file
    from tierpsytools.analysis.statistical_tests import univariate_tests, get_effect_sizes

    # categorical variables to investigate: 'gene_name' and 'window'
    print("\nInvestigating variation in fraction of worms paused between hit strains " +
          "and control (for each window)")

    # check there will be no errors due to case-sensitivity
    assert len(metadata[group_by].unique()) == len(
        metadata[group_by].str.upper().unique())

    # subset for list of windows
    if windows is None:
        windows = sorted(metadata['window'].unique())
    else:
        assert all(w in sorted(metadata['window'].unique()) for w in windows)
        metadata = metadata[metadata['window'].isin(windows)]

    features = features[[feat]].reindex(metadata.index)

    # print mean sample size
    sample_size = df_summary_stats(metadata, columns=[group_by, 'window'])
    print("Mean sample size of %s/window: %d" % (group_by,
                                                 int(sample_size['n_samples'].mean())))

    control_meta = metadata[metadata[group_by] == control]
    control_feat = features.reindex(control_meta.index)
    control_df = control_meta.join(control_feat)

    n = len(metadata[group_by].unique())
    strain_list = [s for s in metadata[group_by].unique() if s != control]

    fset = []
    if n > 2:
        # Perform ANOVA - is there variation among strains at each window?
        anova_path = Path(save_dir) / 'ANOVA' / 'ANOVA_{}window_results.csv'.format(len(windows))
        anova_path.parent.mkdir(parents=True, exist_ok=True)

        stats, pvals, reject = univariate_tests(X=features,
                                                y=metadata[group_by],
                                                control=control,
                                                test='ANOVA',
                                                comparison_type='multiclass',
                                                multitest_correction=fdr_method,
                                                alpha=pvalue_threshold,
                                                n_permutation_test=None)

        # get effect sizes
        effect_sizes = get_effect_sizes(X=features,
                                        y=metadata[group_by],
                                        control=control,
                                        effect_type=None,
                                        linked_test='ANOVA')

        # compile + save results
        test_results = pd.concat([stats, effect_sizes, pvals, reject], axis=1)
        test_results.columns = ['stats', 'effect_size', 'pvals', 'reject']
        test_results['significance'] = sig_asterix(test_results['pvals'])
        test_results = test_results.sort_values(by=['pvals'], ascending=True)  # rank by p-value
        test_results.to_csv(anova_path, header=True, index=True)

        # use reject mask to find significant feature set
        fset = pvals.loc[reject['ANOVA']].sort_values(by='ANOVA',
                                                      ascending=True).index.to_list()

        if len(fset) > 0:
            print("%d significant features found by ANOVA for '%s' (P<%.2f, %s)" %
                  (len(fset), group_by, pvalue_threshold, fdr_method))
            anova_sigfeats_path = anova_path.parent / 'ANOVA_sigfeats.txt'
            write_list_to_file(fset, anova_sigfeats_path)

    if n == 2 or len(fset) > 0:
        # pairwise t-tests
        for strain in strain_list:
            print("\nPairwise t-tests for each window comparing fraction of worms paused " +
                  "on %s vs control" % strain)
            ttest_strain_path = Path(save_dir) / 'pairwise_ttests' /\
                '{}_window_results.csv'.format(strain)
            ttest_strain_path.parent.mkdir(parents=True, exist_ok=True)

            strain_meta = metadata[metadata[group_by] == strain]
            strain_feat = features.reindex(strain_meta.index)
            strain_df = strain_meta.join(strain_feat[[feat]])

            stats, pvals, reject = pairwise_ttest(control_df,
                                                  strain_df,
                                                  feature_list=[feat],
                                                  group_by='window',
                                                  fdr_method=fdr_method,
                                                  fdr=pvalue_threshold)

            # compile table of results
            stats.columns = ['stats_' + str(c) for c in stats.columns]
            pvals.columns = ['pvals_' + str(c) for c in pvals.columns]
            reject.columns = ['reject_' + str(c) for c in reject.columns]
            test_results = pd.concat([stats, pvals, reject], axis=1)

            # save results
            test_results.to_csv(ttest_strain_path, header=True, index=True)

            for window in windows:
                print("%s difference in '%s' between %s vs %s in window %s (t-test, P=%.3f, %s)" %
                      (("SIGNIFICANT" if reject.loc[feat, 'reject_{}'.format(window)] else "No"),
                       feat, strain, control, window,
                       pvals.loc[feat, 'pvals_{}'.format(window)], fdr_method))

    return
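# --- Illustration (not part of the pipeline) ---
# Hedged follow-up sketch: reading back one strain's saved window results and
# listing the windows where the feature differs significantly from control.
# The file layout follows the to_csv calls above; the strain name 'fepD' and the
# default feature are illustrative assumptions.
def _demo_read_window_results(save_dir, strain='fepD', feat='motion_mode_paused_fraction'):
    import pandas as pd
    from pathlib import Path

    res = pd.read_csv(Path(save_dir) / 'pairwise_ttests' /
                      '{}_window_results.csv'.format(strain), index_col=0)
    reject_cols = [c for c in res.columns if c.startswith('reject_')]
    sig_windows = [c.replace('reject_', '') for c in reject_cols if bool(res.loc[feat, c])]
    return sig_windows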
def control_variation(feat,
                      meta,
                      args,
                      variables=['date_yyyymmdd', 'instrument_name', 'imaging_run_number'],
                      n_sig_features=None):
    """ Analyse variation in control data with respect to each categorical variable
        in 'variables'

        Inputs
        ------
        feat, meta : pd.DataFrame
            Matching features summaries and metadata for control data
        args : Object
            Python object with the following attributes:
            - remove_outliers : bool
            - grouping_variable : str
            - control_dict : dict
            - test : str
            - pval_threshold : float
            - fdr_method : str
            - n_sig_features : int
            - n_top_feats : int
            - drop_size_features : bool
            - norm_features_only : bool
            - percentile_to_use : str
        variables : list
            List of categorical random variables with which to analyse variation
            in the control data
    """

    assert set(feat.index) == set(meta.index)

    save_dir = get_save_dir(args) / "control"

    # stats test to use
    assert args.test in ['ANOVA', 'Kruskal', 'LMM']
    t_test = 't-test' if args.test == 'ANOVA' else 'Mann-Whitney'  # aka. Wilcoxon rank-sum

    for grouping_var in tqdm(variables):

        # convert grouping variable column to factor (categorical)
        meta[grouping_var] = meta[grouping_var].astype(str)

        # get control group, e.g. for date_yyyymmdd
        control_group = str(args.control_dict[grouping_var])
        print("\nInvestigating variation in '%s' (control: '%s')" % (grouping_var, control_group))

        # record mean sample size per group
        mean_sample_size = int(np.round(meta.groupby([grouping_var]).size().mean()))
        print("Mean sample size: %d" % mean_sample_size)

        group_list = list(meta[grouping_var].unique())

        stats_dir = save_dir / "Stats" / grouping_var
        plot_dir = save_dir / "Plots" / grouping_var

        ##### STATISTICS #####

        stats_path = stats_dir / '{}_results.csv'.format(args.test)  # LMM/ANOVA/Kruskal
        ttest_path = stats_dir / '{}_results.csv'.format(t_test)

        fset = []  # initialise so the t-test gate below is safe for all test types

        if not (stats_path.exists() and ttest_path.exists()):
            stats_path.parent.mkdir(exist_ok=True, parents=True)
            ttest_path.parent.mkdir(exist_ok=True, parents=True)

            ### ANOVA / Kruskal-Wallis tests for significantly different features across groups
            if args.test in ("ANOVA", "Kruskal"):
                if len(group_list) > 2:
                    stats, pvals, reject = univariate_tests(X=feat,
                                                            y=meta[grouping_var],
                                                            control=control_group,
                                                            test=args.test,
                                                            comparison_type='multiclass',
                                                            multitest_correction=args.fdr_method,
                                                            alpha=0.05)

                    # get effect sizes
                    effect_sizes = get_effect_sizes(X=feat,
                                                    y=meta[grouping_var],
                                                    control=control_group,
                                                    effect_type=None,
                                                    linked_test=args.test)

                    anova_table = pd.concat([stats, effect_sizes, pvals, reject], axis=1)
                    anova_table.columns = ['stats', 'effect_size', 'pvals', 'reject']
                    anova_table['significance'] = sig_asterix(anova_table['pvals'])

                    # sort pvals + record significant features
                    anova_table = anova_table.sort_values(by=['pvals'], ascending=True)
                    fset = list(anova_table['pvals'].index[np.where(
                        anova_table['pvals'] < args.pval_threshold)[0]])

                    # save statistics results + significant feature set to file
                    anova_table.to_csv(stats_path, header=True, index=True)
                    if len(fset) > 0:
                        anova_sigfeats_path = Path(str(stats_path).replace('_results.csv',
                                                                           '_sigfeats.txt'))
                        write_list_to_file(fset, anova_sigfeats_path)
                        print("\n%d significant features found by %s for '%s' (P<%.2f, %s)" %
                              (len(fset), args.test, grouping_var, args.pval_threshold,
                               args.fdr_method))
                else:
                    print("\nWARNING: Not enough groups for %s for '%s' (n=%d groups)" %
                          (args.test, grouping_var, len(group_list)))
            ### t-tests / Mann-Whitney tests
            if len(fset) > 0 or len(group_list) == 2:
                stats_t, pvals_t, reject_t = univariate_tests(X=feat,
                                                              y=meta[grouping_var],
                                                              control=control_group,
                                                              test=t_test,
                                                              comparison_type='binary_each_group',
                                                              multitest_correction=args.fdr_method,
                                                              alpha=0.05)

                effect_sizes_t = get_effect_sizes(X=feat,
                                                  y=meta[grouping_var],
                                                  control=control_group,
                                                  effect_type=None,
                                                  linked_test=t_test)

                stats_t.columns = ['stats_' + str(c) for c in stats_t.columns]
                pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
                reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
                effect_sizes_t.columns = ['effect_size_' + str(c) for c in effect_sizes_t.columns]
                ttest_table = pd.concat([stats_t, effect_sizes_t, pvals_t, reject_t], axis=1)

                # record t-test significant feature set (NOT ORDERED)
                fset_ttest = list(pvals_t.index[(pvals_t < args.pval_threshold).sum(axis=1) > 0])

                # save t-test results to file
                ttest_table.to_csv(ttest_path, header=True, index=True)

                if len(fset_ttest) > 0:
                    ttest_sigfeats_path = Path(str(ttest_path).replace('_results.csv',
                                                                       '_sigfeats.txt'))
                    write_list_to_file(fset_ttest, ttest_sigfeats_path)
                    print("%d significant features found for any %s vs %s (%s, P<%.2f)" %
                          (len(fset_ttest), grouping_var, control_group, t_test,
                           args.pval_threshold))

                # barplot of number of significantly different features for each strain
                barplot_sigfeats(test_pvalues_df=pvals_t,
                                 saveDir=plot_dir,
                                 p_value_threshold=args.pval_threshold,
                                 test_name=t_test)

        ### Load statistics results

        # read ANOVA results and record significant features
        print("\nLoading statistics results")
        if len(group_list) > 2:
            anova_table = pd.read_csv(stats_path, index_col=0)
            pvals = anova_table.sort_values(by='pvals', ascending=True)['pvals']
            fset = pvals[pvals < args.pval_threshold].index.to_list()
            print("%d significant features found by %s (P<%.2f)" %
                  (len(fset), args.test, args.pval_threshold))

        # read t-test results and record significant features (NOT ORDERED)
        ttest_table = pd.read_csv(ttest_path, index_col=0)
        pvals_t = ttest_table[[c for c in ttest_table.columns if "pvals_" in c]]
        fset_ttest = pvals_t[(pvals_t < args.pval_threshold).sum(axis=1) > 0].index.to_list()
        print("%d significant features found by %s (P<%.2f)" %
              (len(fset_ttest), t_test, args.pval_threshold))

        # use t-test significant feature set if comparing just 2 strains
        if len(group_list) == 2:
            fset = fset_ttest

        if not n_sig_features:
            if args.n_sig_features is not None:
                n_sig_features = args.n_sig_features
            else:
                n_sig_features = len(fset)

        ##### Plotting #####

        superplot_dir = plot_dir / 'superplots'

        if len(fset) > 1:
            for feature in tqdm(fset[:n_sig_features]):
                # plot variation in the grouping variable with respect to 'date_yyyymmdd'
                superplot(feat, meta, feature,
                          x1=grouping_var,
                          x2=None if grouping_var == 'date_yyyymmdd' else 'date_yyyymmdd',
                          saveDir=superplot_dir,
                          pvals=pvals_t if grouping_var == 'date_yyyymmdd' else None,
                          pval_threshold=args.pval_threshold,
                          show_points=True, plot_means=True, dodge=True)

                # plot variation in the grouping variable with respect to 'instrument_name'
                superplot(feat, meta, feature,
                          x1=grouping_var,
                          x2=None if grouping_var == 'instrument_name' else 'instrument_name',
                          saveDir=superplot_dir,
                          pvals=pvals_t if grouping_var == 'instrument_name' else None,
                          pval_threshold=args.pval_threshold,
                          show_points=True, plot_means=True, dodge=True)

                # plot variation in the grouping variable with respect to 'imaging_run_number'
                superplot(feat, meta, feature,
                          x1=grouping_var,
                          x2=None if grouping_var == 'imaging_run_number' else 'imaging_run_number',
                          saveDir=superplot_dir,
                          pvals=pvals_t if grouping_var == 'imaging_run_number' else None,
                          pval_threshold=args.pval_threshold,
                          show_points=True, plot_means=True, dodge=True)
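        # The selection idiom used above, (pvals_t < threshold).sum(axis=1) > 0, keeps
        # any feature that is significant in at least one group-vs-control comparison.
        # A tiny worked example (hypothetical values, illustration only):
        #
        #     pv = pd.DataFrame({'pvals_A': [0.01, 0.20], 'pvals_B': [0.30, 0.40]},
        #                       index=['speed_50th', 'curvature_mean'])
        #     keep = pv[(pv < 0.05).sum(axis=1) > 0].index.to_list()  # -> ['speed_50th']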
        # Boxplots of significant features by ANOVA/LMM (all groups)
        # boxplots_grouped(feat_meta_df=meta.join(feat),
        #                  group_by=grouping_var,
        #                  control_group=str(control_group),
        #                  test_pvalues_df=pvals_t.T, # ranked by test pvalue significance
        #                  feature_set=fset,
        #                  saveDir=(plot_dir / 'grouped_boxplots'),
        #                  max_feats2plt=args.n_sig_features,
        #                  max_groups_plot_cap=None,
        #                  p_value_threshold=args.pval_threshold,
        #                  drop_insignificant=False,
        #                  sns_colour_palette="tab10",
        #                  figsize=[6, (len(group_list)/3 if len(group_list)>10 else 12)])

        # Individual boxplots of significant features by pairwise t-test (each group vs control)
        # boxplots_sigfeats(feat_meta_df=meta.join(feat),
        #                   test_pvalues_df=pvals_t,
        #                   group_by=grouping_var,
        #                   control_strain=control_group,
        #                   feature_set=fset, #['speed_norm_50th_bluelight'],
        #                   saveDir=plot_dir / 'paired_boxplots',
        #                   max_feats2plt=args.n_sig_features,
        #                   p_value_threshold=args.pval_threshold,
        #                   drop_insignificant=True,
        #                   verbose=False)

        # from tierpsytools.analysis.significant_features import plot_feature_boxplots
        # plot_feature_boxplots(feat_to_plot=fset,
        #                       y_class=grouping_var,
        #                       scores=pvalues.rank(axis=1),
        #                       feat=feat,
        #                       pvalues=np.asarray(pvalues).flatten(),
        #                       saveto=None,
        #                       close_after_plotting=False)

        ##### Hierarchical Clustering Analysis #####

        print("\nPerforming hierarchical clustering analysis...")
        assert not feat.isna().sum(axis=1).any()
        assert not (feat.std(axis=1) == 0).any()

        # Z-normalise data (scipy.stats.zscore, applied column-wise)
        featZ = feat.apply(zscore, axis=0)
        # featZ = (feat - feat.mean()) / feat.std()  # subtract mean, divide by std

        # from tierpsytools.preprocessing.scaling_class import scalingClass
        # scaler = scalingClass(scaling='standardize')
        # featZ = scaler.fit_transform(feat)

        # NOT NEEDED?
        # # Drop features with NaN values after normalising
        # n_cols = len(featZ.columns)
        # featZ.dropna(axis=1, inplace=True)
        # n_dropped = n_cols - len(featZ.columns)
        # if n_dropped > 0:
        #     print("Dropped %d features after normalisation (NaN)" % n_dropped)
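        # NB: scipy.stats.zscore standardises using the population standard deviation
        # (ddof=0), so it is not exactly equivalent to the commented-out pandas
        # one-liner above, whose .std() defaults to ddof=1. A minimal sanity check
        # (illustrative only):
        #
        #     z1 = feat.apply(zscore, axis=0)                # ddof=0
        #     z2 = (feat - feat.mean()) / feat.std(ddof=0)   # matches z1
        #     assert np.allclose(z1, z2)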
        ### Control clustermap

        # control data is clustered and the feature order is stored and applied to the full data
        if len(group_list) > 1 and len(group_list) < 50 and grouping_var != 'date_yyyymmdd':
            control_clustermap_path = plot_dir / 'heatmaps' / (grouping_var +
                                                               '_date_clustermap.pdf')
            cg = plot_clustermap(featZ, meta,
                                 group_by=([grouping_var] if grouping_var == 'date_yyyymmdd'
                                           else [grouping_var, 'date_yyyymmdd']),
                                 col_linkage=None,
                                 method=METHOD,  # [linkage, complete, average, weighted, centroid]
                                 metric=METRIC,
                                 figsize=[15, 8],
                                 sub_adj={'bottom': 0.02, 'left': 0.02, 'top': 1, 'right': 0.85},
                                 label_size=12,
                                 show_xlabels=False,
                                 saveto=control_clustermap_path)

            # col_linkage = cg.dendrogram_col.calculated_linkage
            clustered_features = np.array(featZ.columns)[cg.dendrogram_col.reordered_ind]
        else:
            clustered_features = None

        ## Save z-normalised values
        # z_stats = featZ.join(meta[grouping_var]).groupby(by=grouping_var).mean().T
        # z_stats.columns = ['z-mean_' + v for v in z_stats.columns.to_list()]
        # z_stats.to_csv(z_stats_path, header=True, index=None)

        # Clustermap of full data
        full_clustermap_path = plot_dir / 'heatmaps' / (grouping_var + '_clustermap.pdf')
        fg = plot_clustermap(featZ, meta,
                             group_by=grouping_var,
                             col_linkage=None,
                             method=METHOD,
                             metric=METRIC,
                             figsize=[15, 8],
                             sub_adj={'bottom': 0.02, 'left': 0.02, 'top': 1, 'right': 0.9},
                             label_size=12,
                             saveto=full_clustermap_path)

        # If there was no control clustering (due to no day variation), use the feature
        # order from clustering all strains to order the barcode heatmaps
        if clustered_features is None:
            clustered_features = np.array(featZ.columns)[fg.dendrogram_col.reordered_ind]

        if len(group_list) > 2:
            pvals_heatmap = anova_table.loc[clustered_features, 'pvals']
        elif len(group_list) == 2:
            pvals_heatmap = pvals_t.loc[clustered_features, pvals_t.columns[0]]
        pvals_heatmap.name = 'P < {}'.format(args.pval_threshold)

        assert all(f in featZ.columns for f in pvals_heatmap.index)

        # Plot barcode heatmap (grouping by date)
        if len(group_list) > 1 and len(group_list) < 50 and grouping_var != 'date_yyyymmdd':
            heatmap_date_path = plot_dir / 'heatmaps' / (grouping_var + '_date_heatmap.pdf')
            plot_barcode_heatmap(featZ=featZ[clustered_features],
                                 meta=meta,
                                 group_by=[grouping_var, 'date_yyyymmdd'],
                                 pvalues_series=pvals_heatmap,
                                 p_value_threshold=args.pval_threshold,
                                 selected_feats=fset if len(fset) > 0 else None,
                                 saveto=heatmap_date_path,
                                 figsize=[20, 7],
                                 sns_colour_palette="Pastel1")

        # Plot group-mean heatmap (averaged across days)
        heatmap_path = plot_dir / 'heatmaps' / (grouping_var + '_heatmap.pdf')
        plot_barcode_heatmap(featZ=featZ[clustered_features],
                             meta=meta,
                             group_by=[grouping_var],
                             pvalues_series=pvals_heatmap,
                             p_value_threshold=args.pval_threshold,
                             selected_feats=fset if len(fset) > 0 else None,
                             saveto=heatmap_path,
                             figsize=[20, (int(len(group_list) / 4) if len(group_list) > 10
                                           else 6)],
                             sns_colour_palette="Pastel1")

        ##### Principal Components Analysis #####

        print("Performing principal components analysis")

        if args.remove_outliers:
            outlier_path = plot_dir / 'mahalanobis_outliers.pdf'
            feat, inds = remove_outliers_pca(df=feat,
                                             features_to_analyse=None,
                                             saveto=outlier_path)
            meta = meta.reindex(feat.index)  # reindex metadata
            featZ = feat.apply(zscore, axis=0)  # re-normalise data

            # Drop features with NaN values after normalising
            n_cols = len(featZ.columns)
            featZ.dropna(axis=1, inplace=True)
            n_dropped = n_cols - len(featZ.columns)
            if n_dropped > 0:
                print("Dropped %d features after normalisation (NaN)" % n_dropped)

        # from tierpsytools.analysis.decomposition import plot_pca
        pca_dir = plot_dir / 'PCA'
        _ = plot_pca(featZ, meta,
                     group_by=grouping_var,
                     control=control_group,
                     var_subset=None,
                     saveDir=pca_dir,
                     PCs_to_keep=10,
                     n_feats2print=10,
                     sns_colour_palette="plasma",
                     n_dims=2,
                     label_size=15,
                     figsize=[9, 8],
                     sub_adj={'bottom': 0.13, 'left': 0.12, 'top': 0.98, 'right': 0.98},
                     # legend_loc='upper right',
                     # n_colours=20,
                     hypercolor=False)
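# Example usage of control_variation (a minimal sketch; the file paths and the
# SimpleNamespace attribute values below are hypothetical placeholders for this
# project's parsed args object, which also needs the attributes listed in the
# function docstring):
#
#     from types import SimpleNamespace
#
#     feat = pd.read_csv('control_features.csv', index_col=0)
#     meta = pd.read_csv('control_metadata.csv', index_col=0)
#     args = SimpleNamespace(test='ANOVA',
#                            pval_threshold=0.05,
#                            fdr_method='fdr_by',
#                            n_sig_features=10,
#                            remove_outliers=False,
#                            control_dict={'date_yyyymmdd': '20210420',
#                                          'instrument_name': 'Hydra01',
#                                          'imaging_run_number': '1'})
#     control_variation(feat, meta, args,
#                       variables=['date_yyyymmdd', 'instrument_name',
#                                  'imaging_run_number'])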