# Convert metadata column dtypes, ie. stringsAsFactors, no floats, Δ, etc metadata = fix_dtypes(metadata) metadata['food_type'] = [f.replace("Δ","_") for f in metadata['food_type']] features, metadata = clean_summary_results(features, metadata) # Load feature list from file if args.feature_list_from_csv is not None: assert Path(args.feature_list_from_csv).exists() feature_list = pd.read_csv(args.feature_list_from_csv) feature_list = list(feature_list[feature_list.columns[0]].unique()) elif args.n_top_feats is not None: top_feats_path = Path(args.tierpsy_top_feats_dir) / "tierpsy_{}.csv".format(str(args.n_top_feats)) topfeats = load_topfeats(top_feats_path, add_bluelight=True, remove_path_curvature=True, header=None) # Drop features that are not in results feature_list = [feat for feat in list(topfeats) if feat in features.columns] features = features[feature_list] print("%d features loaded." % len(feature_list)) # # Subset data for given imaging run # from filter_data.clean_feature_summaries import subset_results # run_feats, run_meta = subset_results(features, metadata, 'imaging_run_number', [IMAGING_RUN]) # Time-series plots of day/run variation for selected features for feat in tqdm(feature_list): superplot(features, metadata,
def keio_stats(features, metadata, args):
    """ Perform statistical analyses on Keio screen results and save them to disk:
        - ANOVA / Kruskal-Wallis (or LMM) tests for significant between-strain variation
          for each feature
        - t-tests / Mann-Whitney tests for significant differences between each strain and
          the control for each feature
        - k-significant feature selection for agreement with the ANOVA significant feature set
        - mRMR feature selection, scored by cross-validation

        Each stage is skipped when its result files already exist under the save directory.

        Inputs
        ------
        features, metadata : pd.DataFrame
            Clean feature summaries and accompanying metadata (must not contain NaN features)

        args : Object
            Python object with the following attributes read by this function:
            - omit_strains : list or None
            - grouping_variable : str
            - control_dict : dict (maps grouping variable -> control group name)
            - collapse_control : bool
            - n_top_feats : int or None
            - tierpsy_top_feats_dir (if n_top_feats) : str
            - align_bluelight (if n_top_feats) : bool
            - test : str ('ANOVA', 'Kruskal' or 'LMM')
            - lmm_random_effect (if test == 'LMM') : str
            - f_test : bool
            - pval_threshold : float
            - fdr_method : str
            The object is also handed to get_save_dir(), which may read further attributes
            (eg. drop_size_features, norm_features_only, percentile_to_use, remove_outliers,
            n_sig_features) -- not visible from here, confirm against get_save_dir.

        Returns
        -------
        None (results are written to CSV / text files)
    """

    # categorical variable to investigate, eg.'gene_name'
    grouping_var = args.grouping_variable
    print("\nInvestigating '%s' variation" % grouping_var)

    # assert there will be no errors due to case-sensitivity
    assert len(metadata[grouping_var].unique()) == len(metadata[grouping_var].str.upper().unique())

    # Subset results (rows) to omit selected strains
    # NB: omit=True is required to DROP the listed strains (without it the listed groups are
    #     the ones that are kept, as in the hit-strain subsetting in compare_strains_keio)
    if args.omit_strains is not None:
        features, metadata = subset_results(features,
                                            metadata,
                                            column=grouping_var,
                                            groups=args.omit_strains,
                                            omit=True)

    # Load Tierpsy Top feature set + subset (columns) for top feats only
    if args.n_top_feats is not None:
        top_feats_path = Path(args.tierpsy_top_feats_dir) / "tierpsy_{}.csv".format(str(args.n_top_feats))
        topfeats = load_topfeats(top_feats_path,
                                 add_bluelight=args.align_bluelight,
                                 remove_path_curvature=True,
                                 header=None)

        # Drop features that are not in results
        top_feats_list = [feat for feat in list(topfeats) if feat in features.columns]
        features = features[top_feats_list]

    assert not features.isna().any().any()

    strain_list = list(metadata[grouping_var].unique())
    control = args.control_dict[grouping_var] # control strain to use
    assert control in strain_list

    if args.collapse_control:
        print("Collapsing control data (mean of each day)")
        features, metadata = average_plate_control_data(features,
                                                        metadata,
                                                        control=control,
                                                        grouping_var=grouping_var,
                                                        plate_var='imaging_plate_id')

    _ = df_summary_stats(metadata) # summary df # TODO: plot from?

    # Record mean sample size per group (row count per group; independent of pandas version)
    mean_sample_size = int(np.round(metadata.groupby(grouping_var).size().mean()))
    print("Mean sample size: %d" % mean_sample_size)

    # construct save paths (args.save_dir / topfeats? etc)
    save_dir = get_save_dir(args)
    stats_dir = save_dir / grouping_var / "Stats" / args.fdr_method
    plot_dir = save_dir / grouping_var / "Plots" / args.fdr_method

    #%% F-test for equal variances

    # Compare variance in samples with control (and correct for multiple comparisons)
    # Sample size matters in that unequal variances don't pose a problem for a t-test with
    # equal sample sizes. So as long as your sample sizes are equal, you don't have to worry about
    # homogeneity of variances. If they are not equal, perform F-tests first to see if variance is
    # equal before doing a t-test
    if args.f_test:
        levene_stats_path = stats_dir / 'levene_results.csv'
        levene_stats = levene_f_test(features, metadata, grouping_var,
                                     p_value_threshold=args.pval_threshold,
                                     multitest_method=args.fdr_method,
                                     saveto=levene_stats_path,
                                     del_if_exists=False)
        # if p < threshold then variances are not equal, and sample size matters
        prop_eqvar = (levene_stats['pval'] > args.pval_threshold).sum() / len(levene_stats['pval'])
        print("Percentage equal variance %.1f%%" % (prop_eqvar * 100))

    #%% ANOVA / Kruskal-Wallis tests for significantly different features across groups

    test_path_uncorrected = stats_dir / '{}_results_uncorrected.csv'.format(args.test)
    test_path = stats_dir / '{}_results.csv'.format(args.test)

    if not (test_path.exists() and test_path_uncorrected.exists()):
        test_path.parent.mkdir(exist_ok=True, parents=True)

        if (args.test == "ANOVA" or args.test == "Kruskal"):
            if len(strain_list) > 2:
                # perform ANOVA + record results before & after correcting for multiple comparisons
                test_stats, pvals, reject = univariate_tests(X=features,
                                                             y=metadata[grouping_var],
                                                             control=control,
                                                             test=args.test,
                                                             comparison_type='multiclass',
                                                             multitest_correction=None, # uncorrected
                                                             alpha=args.pval_threshold,
                                                             n_permutation_test='all')

                # get effect sizes
                effect_sizes = get_effect_sizes(X=features,
                                                y=metadata[grouping_var],
                                                control=control,
                                                effect_type=None,
                                                linked_test=args.test)

                # compile + save results (uncorrected)
                _compile_test_results(test_stats, effect_sizes, pvals, reject,
                                      saveto=test_path_uncorrected)

                # correct for multiple comparisons
                reject_corrected, pvals_corrected = _multitest_correct(pvals,
                                                                       multitest_method=args.fdr_method,
                                                                       fdr=args.pval_threshold)

                # compile + save results (corrected)
                _compile_test_results(test_stats, effect_sizes, pvals_corrected, reject_corrected,
                                      saveto=test_path)

                # use reject mask to find significant feature set
                # NOTE(review): this uses the UNCORRECTED p-values / reject mask, although the
                # message below quotes args.fdr_method -- confirm which set is intended
                fset = pvals.loc[reject[args.test]].sort_values(by=args.test, ascending=True).index.to_list()

                if len(fset) > 0:
                    print("%d significant features found by %s for '%s' (P<%.2f, %s)" % (len(fset),
                          args.test, grouping_var, args.pval_threshold, args.fdr_method))
                    anova_sigfeats_path = stats_dir / '{}_sigfeats.txt'.format(args.test)
                    write_list_to_file(fset, anova_sigfeats_path)
            else:
                fset = []
                print("\nWARNING: Not enough groups for %s for '%s' (n=%d groups)" %\
                      (args.test, grouping_var, len(strain_list)))

        #%% Linear Mixed Models (LMMs), accounting for day-to-day variation
        # NB: Ideally report: parameter | beta | lower-95 | upper-95 | random effect (SD)
        elif args.test == 'LMM':
            with warnings.catch_warnings():
                # Filter warnings as parameter is often on the boundary
                warnings.filterwarnings("ignore")
                #warnings.simplefilter("ignore", ConvergenceWarning)
                (signif_effect, low_effect, error, mask, pvals
                 ) = compounds_with_low_effect_univariate(feat=features,
                                                          drug_name=metadata[grouping_var],
                                                          drug_dose=None,
                                                          random_effect=metadata[args.lmm_random_effect],
                                                          control=control,
                                                          test=args.test,
                                                          comparison_type='multiclass',
                                                          multitest_method=args.fdr_method)
            assert len(error) == 0

            # save pvals
            pvals.to_csv(test_path_uncorrected, header=True, index=True)

            # save significant features -- if any strain significant for any feature
            fset = pvals.columns[(pvals < args.pval_threshold).any()].to_list()
            if len(fset) > 0:
                lmm_sigfeats_path = stats_dir / '{}_sigfeats.txt'.format(args.test)
                write_list_to_file(fset, lmm_sigfeats_path)

            # report outcome either way (previously the "no differences" text was unreachable)
            group_label = grouping_var.replace('_',' ')
            if len(signif_effect) > 0:
                summary = "%d significant features found (%d significant %ss vs %s control, "\
                          % (len(fset), len(signif_effect), group_label, control)
            else:
                summary = "No significant differences found between %s " % group_label
            print(summary + "after accounting for %s variation, %s, P<%.2f, %s)"\
                  % (args.lmm_random_effect.split('_yyyymmdd')[0], args.test,
                     args.pval_threshold, args.fdr_method))

            # save significant effect strains
            if len(signif_effect) > 0:
                signif_effect_path = stats_dir / '{}_signif_effect_strains.txt'.format(args.test)
                write_list_to_file(signif_effect, signif_effect_path)

        else:
            raise IOError("Test '{}' not recognised".format(args.test))
    else:
        # Results were saved by a previous run: recover the significant feature set from disk
        # so that the t-test and k-significant steps below do not hit an undefined 'fset'
        fset = _load_cached_sigfeats(test_path_uncorrected, args.pval_threshold)

    #%% t-tests / Mann-Whitney tests

    # t-test to use
    t_test = 't-test' if args.test == 'ANOVA' else 'Mann-Whitney' # aka. Wilcoxon rank-sum
    ttest_path_uncorrected = stats_dir / '{}_results_uncorrected.csv'.format(t_test)
    ttest_path = stats_dir / '{}_results.csv'.format(t_test)

    if not (ttest_path_uncorrected.exists() and ttest_path.exists()):
        ttest_path.parent.mkdir(exist_ok=True, parents=True)

        if len(fset) > 0 or len(strain_list) == 2:
            # perform t-tests (without correction for multiple testing)
            stats_t, pvals_t, reject_t = univariate_tests(X=features,
                                                          y=metadata[grouping_var],
                                                          control=control,
                                                          test=t_test,
                                                          comparison_type='binary_each_group',
                                                          multitest_correction=None,
                                                          alpha=args.pval_threshold)
            # get effect sizes for comparisons
            effect_sizes_t = get_effect_sizes(X=features,
                                              y=metadata[grouping_var],
                                              control=control,
                                              effect_type=None,
                                              linked_test=t_test)

            # Prefixed copies are used for saving only; 'pvals_t' keeps the raw group names so
            # that names containing underscores are never truncated before correction
            stats_out = stats_t.add_prefix('stats_')
            effect_sizes_out = effect_sizes_t.add_prefix('effect_size_')

            # compile + save t-test results (uncorrected)
            ttest_uncorrected = pd.concat([stats_out,
                                           effect_sizes_out,
                                           pvals_t.add_prefix('pvals_'),
                                           reject_t.add_prefix('reject_')], axis=1)
            ttest_uncorrected.to_csv(ttest_path_uncorrected, header=True, index=True)

            # correct for multiple comparisons
            reject_t, pvals_t = _multitest_correct(pvals_t,
                                                   multitest_method=args.fdr_method,
                                                   fdr=args.pval_threshold)

            # compile + save t-test results (corrected)
            pvals_t.columns = ['pvals_' + str(c) for c in pvals_t.columns]
            reject_t.columns = ['reject_' + str(c) for c in reject_t.columns]
            ttest_corrected = pd.concat([stats_out, effect_sizes_out, pvals_t, reject_t], axis=1)
            ttest_corrected.to_csv(ttest_path, header=True, index=True)

            # record t-test significant features (in table order): any group rejected
            any_rejected = np.asarray(reject_t).astype(bool).any(axis=1)
            fset_ttest = pvals_t.index[any_rejected].to_list()
            print("%d significant features found for any %s vs %s (%s, P<%.2f)" %\
                  (len(fset_ttest), grouping_var, control, t_test, args.pval_threshold))

            if len(fset_ttest) > 0:
                ttest_sigfeats_path = stats_dir / '{}_sigfeats.txt'.format(t_test)
                write_list_to_file(fset_ttest, ttest_sigfeats_path)

    #%% K significant features

    ksig_uncorrected_path = stats_dir / 'k_significant_features_uncorrected.csv'
    ksig_corrected_path = stats_dir / 'k_significant_features.csv'
    if not (ksig_uncorrected_path.exists() and ksig_corrected_path.exists()):
        ksig_corrected_path.parent.mkdir(exist_ok=True, parents=True)
        fset_ksig, (scores, pvalues_ksig), support = k_significant_feat(feat=features,
                                                                        y_class=metadata[grouping_var],
                                                                        k=len(fset),
                                                                        score_func='f_classif',
                                                                        scale=None,
                                                                        feat_names=None,
                                                                        plot=False,
                                                                        k_to_plot=None,
                                                                        close_after_plotting=True,
                                                                        saveto=None, #k_sigfeat_dir
                                                                        figsize=None,
                                                                        title=None,
                                                                        xlabel=None)
        # compile + save k-significant features (uncorrected)
        ksig_table = pd.concat([pd.Series(scores), pd.Series(pvalues_ksig)], axis=1)
        ksig_table.columns = ['scores','pvals']
        ksig_table.index = fset_ksig
        ksig_table.to_csv(ksig_uncorrected_path, header=True, index=True)

        # Correct for multiple comparisons
        _, ksig_table['pvals'] = _multitest_correct(ksig_table['pvals'],
                                                    multitest_method=args.fdr_method,
                                                    fdr=args.pval_threshold)

        # save k-significant features (corrected)
        ksig_table.to_csv(ksig_corrected_path, header=True, index=True)

    #%% mRMR feature selection: minimum Redunduncy, Maximum Relevance #####

    mrmr_dir = plot_dir / 'mrmr'
    mrmr_dir.mkdir(exist_ok=True, parents=True)
    mrmr_results_path = mrmr_dir / "mrmr_results.csv"

    if not mrmr_results_path.exists():
        estimator = Pipeline([('scaler', StandardScaler()), ('estimator', LogisticRegression())])
        y = metadata[grouping_var].values
        (mrmr_feat_set,
         mrmr_scores,
         mrmr_support) = mRMR_feature_selection(features, y_class=y, k=10,
                                                redundancy_func='pearson_corr',
                                                relevance_func='kruskal',
                                                n_bins=10, mrmr_criterion='MID',
                                                plot=True, k_to_plot=5,
                                                close_after_plotting=True,
                                                saveto=mrmr_dir, figsize=None)
        # save results
        mrmr_table = pd.concat([pd.Series(mrmr_feat_set), pd.Series(mrmr_scores)], axis=1)
        mrmr_table.columns = ['feature','score']
        mrmr_table.to_csv(mrmr_results_path, header=True, index=False)

        n_cv = 5
        cv_scores_mrmr = cross_val_score(estimator, features[mrmr_feat_set], y, cv=n_cv)
        cv_scores_mrmr = pd.DataFrame(cv_scores_mrmr, columns=['cv_score'])
        cv_scores_mrmr.to_csv(mrmr_dir / "cv_scores.csv", header=True, index=False)
        # mean of the single score column as a plain float (safe for '%f' formatting)
        print('MRMR CV Score: %f (n=%d)' % (cv_scores_mrmr['cv_score'].mean(), n_cv))
    else:
        # load mrmr results
        mrmr_table = pd.read_csv(mrmr_results_path)
        mrmr_feat_set = mrmr_table['feature'].to_list()

    print("\nTop %d features found by MRMR:" % len(mrmr_feat_set))
    for feat in mrmr_feat_set:
        print(feat)


def _compile_test_results(test_stats, effect_sizes, pvals, reject, saveto):
    """ Join per-feature test outputs into one table, rank it by p-value, save it to CSV
        and return it """
    results = pd.concat([test_stats, effect_sizes, pvals, reject], axis=1)
    results.columns = ['stats','effect_size','pvals','reject']
    results['significance'] = sig_asterix(results['pvals'])
    results = results.sort_values(by=['pvals'], ascending=True) # rank pvals
    results.to_csv(saveto, header=True, index=True)
    return results


def _load_cached_sigfeats(results_path, pval_threshold):
    """ Recover the significant feature list from a results table saved by keio_stats
        (p-values as first written to the 'uncorrected' results file) """
    table = pd.read_csv(results_path, index_col=0)
    if 'pvals' in table.columns:
        # ANOVA / Kruskal layout: one row per feature, with a 'pvals' column
        ranked = table['pvals'].sort_values(ascending=True)
        return ranked[ranked < pval_threshold].index.to_list()
    # LMM layout: features are columns; significant if any row falls below the threshold
    return table.columns[(table < pval_threshold).any()].to_list()
def compare_strains_keio(features, metadata, args):
    """ Compare Keio single-gene deletion mutants with wild-type BW25113 control and look to see
        if they significantly alter N2 C. elegans behaviour while feeding.

        Loads the statistics previously saved under the save directory (ANOVA / Kruskal results,
        k-significant features, t-test / Mann-Whitney results), ranks strains by number of
        significant features and by lowest p-value, and produces ranking plots, errorbar plots,
        boxplots, clustermaps, a barcode heatmap, PCA, tSNE and UMAP plots.

        Inputs
        ------
        features, metadata : pd.DataFrame
            Matching features summaries and metadata (identical index sets)

        args : Object
            Python object with the following attributes read by this function:
            - omit_strains : list or None
            - grouping_variable : str
            - control_dict : dict (maps grouping variable -> control group name)
            - analyse_control : bool
            - collapse_control : bool
            - n_top_feats : int or None
            - tierpsy_top_feats_dir (if n_top_feats) : str
            - test : str
            - use_corrected_pvals : bool
            - use_k_sig_feats_overlap : bool
            - pval_threshold : float
            - fdr_method : str
            - n_sig_features : int
            - remove_outliers : bool
            The object is also handed to get_save_dir() and control_variation(), which may read
            further attributes (eg. drop_size_features, norm_features_only, percentile_to_use)
            -- not visible from here, confirm against those functions.

        Returns
        -------
        None (plots and strain lists are written to disk)
    """

    assert set(features.index) == set(metadata.index)

    # categorical variable to investigate, eg.'gene_name'
    grouping_var = args.grouping_variable
    n_strains = len(metadata[grouping_var].unique())
    assert n_strains == len(metadata[grouping_var].str.upper().unique()) # check case-sensitivity
    print("\nInvestigating '%s' variation (%d samples)" % (grouping_var, n_strains))

    # Subset results (rows) to omit selected strains
    if args.omit_strains is not None:
        features, metadata = subset_results(features,
                                            metadata,
                                            column=grouping_var,
                                            groups=args.omit_strains,
                                            omit=True)

    control = args.control_dict[grouping_var] # control strain to use

    # Load Tierpsy Top feature set + subset (columns) for top feats only
    if args.n_top_feats is not None:
        top_feats_path = Path(args.tierpsy_top_feats_dir) / "tierpsy_{}.csv".format(str(args.n_top_feats))
        topfeats = load_topfeats(top_feats_path,
                                 add_bluelight=True,
                                 remove_path_curvature=True,
                                 header=None)

        # Drop features that are not in results
        top_feats_list = [feat for feat in list(topfeats) if feat in features.columns]
        features = features[top_feats_list]

    ##### Control variation #####

    control_metadata = metadata[metadata[grouping_var] == control]
    control_features = features.reindex(control_metadata.index)

    # Clean data after subset - to remove features with zero std
    control_feat_clean, control_meta_clean = clean_summary_results(control_features,
                                                                   control_metadata,
                                                                   max_value_cap=False,
                                                                   imputeNaN=False)
    if args.analyse_control:
        control_variation(control_feat_clean,
                          control_meta_clean,
                          args,
                          variables=[k for k in args.control_dict.keys() if k != grouping_var],
                          n_sig_features=10)

    if args.collapse_control:
        print("\nCollapsing control data (mean of each day)")
        features, metadata = average_plate_control_data(features, metadata)

    # Record mean sample size per group (row count per group; independent of pandas version)
    mean_sample_size = int(np.round(metadata.groupby(grouping_var).size().mean()))
    print("Mean sample size: %d" % mean_sample_size)

    save_dir = get_save_dir(args)
    stats_dir = save_dir / grouping_var / "Stats" / args.fdr_method
    plot_dir = save_dir / grouping_var / "Plots" / args.fdr_method

    # label used in messages and file names for the kind of p-values in use
    pval_label = 'uncorrected' if not args.use_corrected_pvals else args.fdr_method
    fdr_label = 'uncorrected' if args.fdr_method is None else args.fdr_method

    ##### STATISTICS #####

    if not args.use_corrected_pvals:
        anova_path = stats_dir / '{}_results_uncorrected.csv'.format(args.test)
    else:
        anova_path = stats_dir / '{}_results.csv'.format(args.test)

    # load results + record significant features
    print("\nLoading statistics results")
    anova_table = pd.read_csv(anova_path, index_col=0)
    pvals = anova_table.sort_values(by='pvals', ascending=True)['pvals'] # rank features by p-value
    fset = pvals[pvals < args.pval_threshold].index.to_list()
    print("\n%d significant features found by %s (P<%.2f, %s)" %
          (len(fset), args.test, args.pval_threshold, pval_label))

    ### k-significant features

    if len(fset) > 0:
        # Compare k sigfeat and ANOVA significant feature set overlap
        if not args.use_corrected_pvals:
            k_sigfeats_path = stats_dir / "k_significant_features_uncorrected.csv"
        else:
            k_sigfeats_path = stats_dir / "k_significant_features.csv"

        ksig_table = pd.read_csv(k_sigfeats_path, index_col=0)
        fset_ksig = ksig_table[ksig_table['pvals'] < args.pval_threshold].index.to_list()

        fset_overlap = set(fset).intersection(set(fset_ksig))
        prop_overlap = len(fset_overlap) / len(fset)
        print("%.1f%% overlap with k-significant features" % (prop_overlap * 100))

        if prop_overlap < 0.5 and len(fset) > 100:
            print("WARNING: Inconsistency in statistics for feature set agreement between " +
                  "%s and k significant features!" % args.test)

        if args.use_k_sig_feats_overlap:
            # .loc needs a list: recent pandas versions reject a set as an indexer
            fset = list(ksig_table.loc[list(fset_overlap)].sort_values(
                by='pvals', ascending=True).index)

        ### t-test

        t_test = 't-test' if args.test == 'ANOVA' else 'Mann-Whitney' # aka. Wilcoxon rank-sum

        if not args.use_corrected_pvals:
            ttest_path = stats_dir / '{}_results_uncorrected.csv'.format(t_test)
        else:
            ttest_path = stats_dir / '{}_results.csv'.format(t_test)

        # read t-test results + record significant features (NOT ORDERED)
        ttest_table = pd.read_csv(ttest_path, index_col=0)
        pvals_t = ttest_table[[c for c in ttest_table if "pvals_" in c]].copy()
        pvals_t.columns = [c.split('pvals_')[-1] for c in pvals_t.columns]
        fset_ttest = pvals_t[(pvals_t < args.pval_threshold).any(axis=1)].index.to_list()
        print("%d significant features found by %s (P<%.2f, %s)" %
              (len(fset_ttest), t_test, args.pval_threshold, pval_label))
    else:
        print("No significant features found for %s by %s" % (grouping_var, args.test))

    ##### PLOTTING #####

    # Stays empty when there are no significant features, so that the subsetting and
    # PCA/tSNE/UMAP colouring below still run instead of failing on an undefined name
    hit_strains = []

    if len(fset) > 0:
        # Rank strains by number of sigfeats by t-test
        ranked_nsig = (pvals_t < args.pval_threshold).sum(axis=0).sort_values(ascending=False)
        # Select top hit strains by n sigfeats (select strains with > 5 sigfeats as hit strains?)
        hit_strains_nsig = ranked_nsig[ranked_nsig > 0].index.to_list()
        print("%d significant strains (with 1 or more significant features)" % len(hit_strains_nsig))
        if len(hit_strains_nsig) > 0:
            write_list_to_file(hit_strains_nsig, stats_dir / 'hit_strains.txt')

        # Rank strains by lowest p-value for any feature
        ranked_pval = pvals_t.min(axis=0).sort_values(ascending=True)
        # Select the N_LOWEST_PVAL strains with the lowest p-value for any feature
        hit_strains_pval = ranked_pval.index[:N_LOWEST_PVAL].to_list()
        write_list_to_file(hit_strains_pval,
                           stats_dir / 'lowest{}_pval.txt'.format(N_LOWEST_PVAL))

        print("\nPlotting ranked strains by number of significant features")
        plt.ioff()
        _plot_ranked_strains(ranked_nsig,
                             ylabel="Number of significant features",
                             saveto=plot_dir / ('ranked_number_sigfeats' + '_' + fdr_label + '.png'))

        print("Plotting ranked strains by lowest p-value of any feature")
        # tick labels follow 'ranked_pval' (the plotted series), not the n-sigfeats ranking
        _plot_ranked_strains(ranked_pval,
                             ylabel="Lowest p-value by t-test",
                             saveto=plot_dir / ('ranked_lowest_pval' + '_' + fdr_label + '.png'),
                             hline=args.pval_threshold)

        print("\nMaking errorbar plots")
        errorbar_sigfeats(features, metadata,
                          group_by=grouping_var,
                          fset=fset,
                          control=control,
                          rank_by='mean',
                          max_feats2plt=args.n_sig_features,
                          figsize=[20, 10],
                          fontsize=5,
                          ms=8,
                          elinewidth=1.5,
                          fmt='.',
                          tight_layout=[0.01, 0.01, 0.99, 0.99],
                          saveDir=plot_dir / 'errorbar')

        # If no sigfeats, subset for top strains ranked by lowest p-value by t-test for any feature
        if len(hit_strains_nsig) == 0:
            print("\nSaving lowest %d strains ranked by p-value for any feature" % N_LOWEST_PVAL)
            # NOTE(review): file name is hard-coded to 'Top100' regardless of N_LOWEST_PVAL and
            # duplicates the 'lowest{N}_pval.txt' list saved above -- kept for compatibility
            write_list_to_file(hit_strains_pval, stats_dir / 'Top100_lowest_pval.txt')
            hit_strains = hit_strains_pval
        else:
            hit_strains = hit_strains_nsig

        # Individual boxplots of significant features by pairwise t-test (each group vs control)
        boxplots_sigfeats(features,
                          y_class=metadata[grouping_var],
                          control=control,
                          pvals=pvals_t,
                          z_class=metadata['date_yyyymmdd'],
                          feature_set=None,
                          saveDir=plot_dir / 'paired_boxplots',
                          p_value_threshold=args.pval_threshold,
                          drop_insignificant=len(hit_strains) > 0,
                          max_sig_feats=args.n_sig_features,
                          max_strains=N_LOWEST_PVAL if len(hit_strains_nsig) == 0 else None,
                          sns_colour_palette="tab10",
                          verbose=False)

    if SUBSET_HIT_STRAINS:
        strain_list = [control] + hit_strains[:TOP_N_HITS]
        print("Subsetting for Top%d hit strains" % (len(strain_list) - 1))
        features, metadata = subset_results(features, metadata, column=grouping_var,
                                            groups=strain_list, verbose=False)

    ##### Hierarchical Clustering Analysis #####

    # labelled clustermaps are only drawn for small feature sets; n_top_feats may be None
    # (all features), in which case the labelled versions are skipped
    plot_labelled = args.n_top_feats is not None and args.n_top_feats <= 256
    control_group_by = ([grouping_var] if grouping_var == 'date_yyyymmdd'
                        else [grouping_var, 'date_yyyymmdd'])

    # Z-normalise control data
    # NOTE(review): uses 'control_features', not the cleaned 'control_feat_clean' computed
    # above -- zero-variance features would become NaN here; confirm which is intended
    control_featZ = control_features.apply(zscore, axis=0)

    ### Control clustermap

    # control data is clustered and feature order is stored and applied to full data
    print("\nPlotting control clustermap")

    control_clustermap_path = plot_dir / 'heatmaps' / 'date_clustermap.pdf'
    cg = plot_clustermap(control_featZ, control_metadata,
                         group_by=control_group_by,
                         method=METHOD,
                         metric=METRIC,
                         figsize=[20, 6],
                         sub_adj={'bottom': 0.05, 'left': 0, 'top': 1, 'right': 0.85},
                         saveto=control_clustermap_path,
                         label_size=15,
                         show_xlabels=False)
    # control clustermap with labels
    if plot_labelled:
        control_clustermap_path = plot_dir / 'heatmaps' / 'date_clustermap_label.pdf'
        cg = plot_clustermap(control_featZ, control_metadata,
                             group_by=control_group_by,
                             method=METHOD,
                             metric=METRIC,
                             figsize=[20, 10],
                             sub_adj={'bottom': 0.5, 'left': 0, 'top': 1, 'right': 0.85},
                             saveto=control_clustermap_path,
                             label_size=(15, 15),
                             show_xlabels=True)

    # feature order from the (last drawn) control clustermap, re-used for the barcode heatmap
    control_clustered_features = np.array(control_featZ.columns)[cg.dendrogram_col.reordered_ind]

    ### Full clustermap

    # Z-normalise data for all strains
    featZ = features.apply(zscore, axis=0)

    # Clustermap of full data
    print("Plotting all strains clustermap")
    full_clustermap_path = plot_dir / 'heatmaps' / (grouping_var + '_clustermap.pdf')
    plot_clustermap(featZ, metadata,
                    group_by=grouping_var,
                    row_colours=None,
                    method=METHOD,
                    metric=METRIC,
                    figsize=[20, 30],
                    sub_adj={'bottom': 0.01, 'left': 0, 'top': 1, 'right': 0.95},
                    saveto=full_clustermap_path,
                    label_size=8,
                    show_xlabels=False)
    if plot_labelled:
        full_clustermap_path = plot_dir / 'heatmaps' / (grouping_var + '_clustermap_label.pdf')
        plot_clustermap(featZ, metadata,
                        group_by=grouping_var,
                        row_colours=None,
                        method=METHOD,
                        metric=METRIC,
                        figsize=[20, 40],
                        sub_adj={'bottom': 0.18, 'left': 0, 'top': 1, 'right': 0.95},
                        saveto=full_clustermap_path,
                        label_size=(15, 10),
                        show_xlabels=True)

    pvals_heatmap = anova_table.loc[control_clustered_features, 'pvals']
    pvals_heatmap.name = 'P < {}'.format(args.pval_threshold)

    assert all(f in featZ.columns for f in pvals_heatmap.index)

    # Plot heatmap (averaged for each sample)
    if len(metadata[grouping_var].unique()) < 250:
        print("\nPlotting barcode heatmap")
        heatmap_path = plot_dir / 'heatmaps' / (grouping_var + '_heatmap.pdf')
        plot_barcode_heatmap(featZ=featZ[control_clustered_features],
                             meta=metadata,
                             group_by=[grouping_var],
                             pvalues_series=pvals_heatmap,
                             p_value_threshold=args.pval_threshold,
                             selected_feats=None, # fset if len(fset) > 0 else None
                             saveto=heatmap_path,
                             figsize=[20, 30],
                             sns_colour_palette="Pastel1",
                             label_size=10)

    ##### Principal Components Analysis #####

    pca_dir = plot_dir / 'PCA'

    # remove outlier samples from PCA
    if args.remove_outliers:
        outlier_path = pca_dir / 'mahalanobis_outliers.pdf'
        features, _ = remove_outliers_pca(df=features, saveto=outlier_path)
        metadata = metadata.reindex(features.index) # reindex metadata
        featZ = features.apply(zscore, axis=0) # re-normalise data

    # Drop features with NaN values after normalising
    n_cols = len(featZ.columns)
    featZ.dropna(axis=1, inplace=True)
    n_dropped = n_cols - len(featZ.columns)
    if n_dropped > 0:
        print("Dropped %d features after normalisation (NaN)" % n_dropped)

    coloured_strains_pca = [control] + hit_strains[:15]
    coloured_strains_pca = [s for s in coloured_strains_pca
                            if s in metadata[grouping_var].unique()]

    _ = plot_pca(featZ, metadata,
                 group_by=grouping_var,
                 control=control,
                 var_subset=coloured_strains_pca,
                 saveDir=pca_dir,
                 PCs_to_keep=10,
                 n_feats2print=10,
                 kde=False,
                 sns_colour_palette="plasma",
                 n_dims=2,
                 label_size=8,
                 sub_adj={'bottom': 0.13, 'left': 0.13, 'top': 0.95, 'right': 0.88},
                 legend_loc=[1.02, 0.6],
                 hypercolor=False)

    # add details of COG category information to metadata
    # (using hard-coded dict of info from Baba et al. 2006 paper)
    # NOTE(review): this overwrites the 'COG_category' column; if 'metadata' is still the
    # caller's object the change is visible to the caller -- confirm this is intended
    metadata['COG_category'] = metadata['COG_category'].map(COG_category_dict)
    cog_categories = list(metadata['COG_category'].dropna().unique())

    # plot pca coloured by Keio COG category
    _ = plot_pca(featZ, metadata,
                 group_by='COG_category',
                 control=None,
                 var_subset=cog_categories,
                 saveDir=pca_dir / 'COG',
                 PCs_to_keep=10,
                 n_feats2print=10,
                 kde=False,
                 n_dims=2,
                 hypercolor=False,
                 label_size=8,
                 figsize=[12, 8],
                 sub_adj={'bottom': 0.1, 'left': 0.1, 'top': 0.95, 'right': 0.7},
                 legend_loc=[1.02, 0.6],
                 sns_colour_palette="plasma")

    ##### t-distributed Stochastic Neighbour Embedding #####

    print("\nPerforming tSNE")
    tsne_dir = plot_dir / 'tSNE'
    perplexities = [mean_sample_size] # NB: should be roughly equal to group size

    _ = plot_tSNE(featZ, metadata,
                  group_by=grouping_var,
                  var_subset=coloured_strains_pca,
                  saveDir=tsne_dir,
                  perplexities=perplexities,
                  figsize=[8, 8],
                  label_size=8,
                  marker_size=20,
                  sns_colour_palette="plasma")

    print("\nPerforming tSNE")
    _ = plot_tSNE(featZ, metadata,
                  group_by='COG_category',
                  var_subset=cog_categories,
                  saveDir=tsne_dir / 'COG_category',
                  perplexities=perplexities,
                  figsize=[8, 8],
                  label_size=8,
                  marker_size=20,
                  sns_colour_palette="plasma")

    ##### Uniform Manifold Projection #####

    print("\nPerforming UMAP")
    umap_dir = plot_dir / 'UMAP'
    n_neighbours = [mean_sample_size] # NB: should be roughly equal to group size
    min_dist = 0.1 # Minimum distance parameter

    _ = plot_umap(featZ, metadata,
                  group_by=grouping_var,
                  var_subset=coloured_strains_pca,
                  saveDir=umap_dir,
                  n_neighbours=n_neighbours,
                  min_dist=min_dist,
                  figsize=[8, 8],
                  label_size=8,
                  marker_size=20,
                  sns_colour_palette="plasma")


def _plot_ranked_strains(ranked, ylabel, saveto, hline=None):
    """ Line plot of a ranked per-strain series, saved to 'saveto'; x tick labels are taken from
        the plotted series itself and hidden when there are more than 250 strains """
    Path(saveto).parent.mkdir(exist_ok=True, parents=True)
    plt.close('all')
    _, ax = plt.subplots(figsize=(20, 6))
    ax.plot(ranked)
    if hline is not None:
        plt.axhline(y=hline, c='dimgray', ls='--')
    if len(ranked.index) > 250:
        ax.set_xticklabels([])
    else:
        ax.set_xticklabels(ranked.index.to_list(), rotation=90, fontsize=5)
    plt.xlabel("Strains (ranked)", fontsize=12, labelpad=10)
    plt.ylabel(ylabel, fontsize=12, labelpad=10)
    plt.subplots_adjust(left=0.08, right=0.98, bottom=0.15)
    plt.savefig(saveto, dpi=600)
    plt.close()
# Load the cleaned feature summaries together with their matching metadata
print("Loading metadata and feature summary results...")
features = pd.read_csv(FEATURES_PATH)
metadata = pd.read_csv(METADATA_PATH,
                       dtype={'comments': str, 'source_plate_id': str})

# Optionally restrict the feature columns to the Tierpsy top-N feature set
if N_TOP_FEATS is not None:
    top_feats_path = Path(TOP_FEATS_DIR) / "tierpsy_{}.csv".format(str(N_TOP_FEATS))
    #TODO: check tierpsytools for equiv func
    topfeats = load_topfeats(top_feats_path,
                             add_bluelight=True,
                             remove_path_curvature=True,
                             header=None)

    # Keep only the top features that are actually present in the results
    top_feats_list = []
    for feat in list(topfeats):
        if feat in features.columns:
            top_feats_list.append(feat)
    features = features[top_feats_list]

# Output location is named after the number of strains and features analysed
n_strains = len(metadata['gene_name'].unique())
n_feats = len(features.columns)
save_path = Path(STRAIN_LIST_SAVE_DIR) / ("%d_strains_%d_features" % (n_strains, n_feats))

##### Hierarchical clustering #####