Path(args.results_dir).parent / "Analysis" / "Superplots") combined_feats_path = Path(args.results_dir) / "full_features.csv" combined_fnames_path = Path(args.results_dir) / "full_filenames.csv" # NB: leaves the df in a "long format" that seaborn 'likes' features, metadata = read_hydra_metadata(feat_file=combined_feats_path, fname_file=combined_fnames_path, meta_file=args.compiled_metadata_path, add_bluelight=True) # Convert metadata column dtypes, ie. stringsAsFactors, no floats, Δ, etc metadata = fix_dtypes(metadata) metadata['food_type'] = [f.replace("Δ","_") for f in metadata['food_type']] features, metadata = clean_summary_results(features, metadata) # Load feature list from file if args.feature_list_from_csv is not None: assert Path(args.feature_list_from_csv).exists() feature_list = pd.read_csv(args.feature_list_from_csv) feature_list = list(feature_list[feature_list.columns[0]].unique()) elif args.n_top_feats is not None: top_feats_path = Path(args.tierpsy_top_feats_dir) / "tierpsy_{}.csv".format(str(args.n_top_feats)) topfeats = load_topfeats(top_feats_path, add_bluelight=True, remove_path_curvature=True, header=None) # Drop features that are not in results feature_list = [feat for feat in list(topfeats) if feat in features.columns] features = features[feature_list]
# compile window summaries features, metadata = process_feature_summaries(metadata_path=metadata_path, results_dir=RES_DIR, compile_day_summaries=True, imaging_dates=IMAGING_DATES, align_bluelight=False, window_summaries=True, n_wells=N_WELLS) # clean results features, metadata = clean_summary_results(features, metadata, feature_columns=None, nan_threshold_row=NAN_THRESHOLD_ROW, nan_threshold_col=NAN_THRESHOLD_COL, max_value_cap=1e15, imputeNaN=True, min_nskel_per_video=MIN_NSKEL_PER_VIDEO, min_nskel_sum=MIN_NSKEL_SUM, drop_size_related_feats=False, norm_feats_only=False, percentile_to_use=None) assert not features.isna().sum(axis=1).any() assert not (features.std(axis=1) == 0).any() # save features metadata.to_csv(META_PATH, index=False) features.to_csv(FEAT_PATH, index=False) else: # load clean metadata and features
imaging_dates=args.dates, add_well_annotations=args.add_well_annotations) # Process feature summary results features, metadata = process_feature_summaries(metadata_path, RESULTS_DIR, compile_day_summaries=args.compile_day_summaries, imaging_dates=args.dates, align_bluelight=args.align_bluelight) # Clean: remove data with too many NaNs/zero std and impute remaining NaNs features, metadata = clean_summary_results(features, metadata, feature_columns=None, imputeNaN=args.impute_nans, nan_threshold=args.nan_threshold, max_value_cap=args.max_value_cap, drop_size_related_feats=args.drop_size_features, norm_feats_only=args.norm_features_only, percentile_to_use=args.percentile_to_use) # Load supplementary info + append to metadata if not 'COG category' in metadata.columns: supplementary_7 = load_supplementary_7(args.path_sup_info) updated_metadata = append_supplementary_7(metadata, supplementary_7) # # Calculate duration on food + duration in L1 diapause # metadata = duration_on_food(metadata) # metadata = duration_L1_diapause(metadata) #%% Subset results
for s in metadata['gene_name'] ] #['BW\u0394'+g if not g == 'BW' else 'wild_type' for g in metadata['gene_name']] # Create is_bad_well column - refer to manual metadata for bad 35mm petri plates metadata['is_bad_well'] = False # Clean results - Remove bad well data + features with too many NaNs/zero std # + impute remaining NaNs features, metadata = clean_summary_results( features, metadata, feature_columns=None, nan_threshold_row=args.nan_threshold_row, nan_threshold_col=args.nan_threshold_col, max_value_cap=args.max_value_cap, imputeNaN=args.impute_nans, min_nskel_per_video=args.min_nskel_per_video, min_nskel_sum=args.min_nskel_sum, drop_size_related_feats=args.drop_size_features, norm_feats_only=args.norm_features_only, percentile_to_use=args.percentile_to_use) assert not features.isna().sum(axis=1).any() assert not (features.std(axis=1) == 0).any() if ALL_WINDOWS: WINDOW_LIST = list(WINDOW_FRAME_DICT.keys()) args.save_dir = Path(args.save_dir) / 'all_windows' perform_fast_effect_stats(features, metadata, WINDOW_LIST, args)
def analyse_acute_rescue(features, metadata, save_dir, control_strain, control_antioxidant,
                         control_window, fdr_method='fdr_by', pval_threshold=0.05,
                         remove_outliers=False):
    """Plot and annotate acute-rescue results for hit strains vs the control strain.

    For each non-control strain this produces boxplots of the module-level
    ``FEATURE`` across bluelight windows and across antioxidant treatments
    (annotated with pre-computed pairwise t-test p-values loaded from CSV under
    ``save_dir/Stats``), then runs clustering (clustermap/barcode heatmap on the
    Tierpsy-16 feature subset) and dimensionality-reduction plots (PCA, tSNE,
    UMAP) over all strains.

    Parameters
    ----------
    features, metadata : pd.DataFrame
        Matching feature summaries and metadata (index-aligned).
    save_dir : str or Path
        Root directory containing "Stats" (read) and "Plots" (written) subdirs.
    control_strain, control_antioxidant, control_window :
        Control levels for 'gene_name', 'antioxidant' and 'window' columns;
        each is placed first in its ordering list.
    fdr_method : str
        Multiple-testing correction label — only used to locate stats/plot
        subdirectories here; the statistics themselves were computed elsewhere.
    pval_threshold : float
        NOTE(review): accepted but never read in this function body — confirm
        whether it should gate the p-value annotations.
    remove_outliers : bool
        If True, drop Mahalanobis outliers before PCA.

    Relies on module-level globals (not visible in this block): ``FEATURE``,
    ``scale_outliers_box``, ``WINDOW_FRAME_DICT``, ``ALL_WINDOWS``, plus the
    imported helpers (``df_summary_stats``, ``plot_clustermap``, ``plot_pca``,
    etc.). Returns None; all output is written to disk.
    """
    stats_dir = Path(save_dir) / "Stats" / fdr_method
    plot_dir = Path(save_dir) / "Plots" / fdr_method

    # Order each categorical axis so the control level comes first
    strain_list = [control_strain] + [s for s in metadata['gene_name'].unique()
                                      if s != control_strain]
    antiox_list = [control_antioxidant] + [a for a in metadata['antioxidant'].unique()
                                           if a != control_antioxidant]
    window_list = [control_window] + [w for w in metadata['window'].unique()
                                      if w != control_window]

    # categorical variables to investigate: 'gene_name', 'antioxidant' and 'window'
    print("\nInvestigating difference in fraction of worms paused between hit strain and control " +
          "(for each window), in the presence/absence of antioxidants:\n")

    # print mean sample size across strain/antioxidant/window groups
    sample_size = df_summary_stats(metadata, columns=['gene_name', 'antioxidant', 'window'])
    print("Mean sample size of strain/antioxidant for each window: %d" %\
          (int(sample_size['n_samples'].mean())))

    # plot dates as different colours (in loop)
    date_lut = dict(zip(list(metadata['date_yyyymmdd'].unique()),
                        sns.color_palette('Set1',
                                          n_colors=len(metadata['date_yyyymmdd'].unique()))))

    for strain in strain_list[1:]:  # skip control_strain at first index postion
        # Subset to this strain + control only for the paired comparison plots
        plot_meta = metadata[np.logical_or(metadata['gene_name']==strain,
                                           metadata['gene_name']==control_strain)]
        plot_feat = features.reindex(plot_meta.index)
        plot_df = plot_meta.join(plot_feat[[FEATURE]])

        # Is there a difference between strain vs control at any window? (pooled antioxidant data)
        print("Plotting windows for %s vs control" % strain)
        plt.close('all')
        # widen the figure when there are many windows to label
        fig, ax = plt.subplots(figsize=((len(window_list) if len(window_list) >= 20 else 12), 8))
        ax = sns.boxplot(x='window', y=FEATURE, hue='gene_name', hue_order=strain_list,
                         order=window_list, data=plot_df, palette='Set3', dodge=True, ax=ax)
        # overlay per-sample points, coloured by imaging date (one stripplot per date)
        for date in date_lut.keys():
            date_df = plot_df[plot_df['date_yyyymmdd']==date]
            ax = sns.stripplot(x='window', y=FEATURE, hue='gene_name', order=window_list,
                               hue_order=strain_list, data=date_df,
                               palette={control_strain:date_lut[date], strain:date_lut[date]},
                               alpha=0.7, size=4, dodge=True, ax=ax)
        # the date stripplots each add legend entries — keep only the first n_labs (boxplot hues)
        n_labs = len(plot_df['gene_name'].unique())
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles[:n_labs], labels[:n_labs], fontsize=15, frameon=False,
                  loc='upper right')

        # scale plot to omit outliers (>2.5*IQR from mean)
        if scale_outliers_box:
            grouped_strain = plot_df.groupby('window')
            y_bar = grouped_strain[FEATURE].median() # median is less skewed by outliers
            # Computing IQR
            Q1 = grouped_strain[FEATURE].quantile(0.25)
            Q3 = grouped_strain[FEATURE].quantile(0.75)
            IQR = Q3 - Q1
            plt.ylim(-0.02, max(y_bar) + 3 * max(IQR))

        # load t-test results + annotate p-values on plot
        # NOTE(review): the CSV read below is loop-invariant (path depends only on
        # strain) — it is re-read once per window; could be hoisted above the loop.
        for ii, window in enumerate(window_list):
            ttest_strain_path = stats_dir / 'pairwise_ttests' / 'window' /\
                '{}_window_results.csv'.format(strain)
            ttest_strain_table = pd.read_csv(ttest_strain_path, index_col=0, header=0)
            strain_pvals_t = ttest_strain_table[[c for c in ttest_strain_table if "pvals_" in c]]
            strain_pvals_t.columns = [c.split('pvals_')[-1] for c in strain_pvals_t.columns]
            p = strain_pvals_t.loc[FEATURE, str(window)]
            # sanity check: tick label order matches window_list order
            text = ax.get_xticklabels()[ii]
            assert text.get_text() == str(window)
            p_text = 'P<0.001' if p < 0.001 else 'P=%.3f' % p
            #y = (y_bar[antiox] + 2 * IQR[antiox]) if scale_outliers_box else plot_df[feature].max()
            #h = (max(IQR) / 10) if scale_outliers_box else (y - plot_df[feature].min()) / 50
            # x in data coords, y in axes coords — bracket sits just under the top edge
            trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
            plt.plot([ii-.3, ii-.3, ii+.3, ii+.3],
                     [0.98, 0.99, 0.99, 0.98], #[y+h, y+2*h, y+2*h, y+h],
                     lw=1.5, c='k', transform=trans)
            ax.text(ii, 1.01, p_text, fontsize=9, ha='center', va='bottom',
                    transform=trans, rotation=(0 if len(window_list) <= 20 else 90))
        # NOTE(review): len(window_list)+1 tick positions but only len(window_list)
        # labels are supplied below — looks like an off-by-one; confirm intended.
        ax.set_xticks(range(len(window_list)+1))
        # relabel windows with their start time in minutes (frame/60)
        xlabels = [str(int(WINDOW_FRAME_DICT[w][0]/60)) for w in window_list]
        ax.set_xticklabels(xlabels)
        x_text = 'Time (minutes)' if ALL_WINDOWS else 'Time of bluelight 10-second burst (minutes)'
        ax.set_xlabel(x_text, fontsize=15, labelpad=10)
        ax.set_ylabel(FEATURE.replace('_',' '), fontsize=15, labelpad=10)
        fig_savepath = plot_dir / 'window_boxplots' / strain / (FEATURE + '.png')
        fig_savepath.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(fig_savepath)

        # Is there a difference between strain vs control for any antioxidant? (pooled window data)
        plt.close('all')
        fig, ax = plt.subplots(figsize=(10,8))
        ax = sns.boxplot(x='antioxidant', y=FEATURE, hue='gene_name', hue_order=strain_list,
                         data=plot_df, palette='Set3', dodge=True, order=antiox_list)
        ax = sns.swarmplot(x='antioxidant', y=FEATURE, hue='gene_name', hue_order=strain_list,
                           data=plot_df, color='k', alpha=0.7, size=4, dodge=True,
                           order=antiox_list)
        # de-duplicate legend entries added by the swarmplot overlay
        n_labs = len(plot_df['gene_name'].unique())
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles[:n_labs], labels[:n_labs], fontsize=15, frameon=False,
                  loc='upper right')
        ax.set_xlabel('antioxidant', fontsize=15, labelpad=10)
        ax.set_ylabel(FEATURE.replace('_',' '), fontsize=15, labelpad=10)

        # scale plot to omit outliers (>2.5*IQR from mean)
        if scale_outliers_box:
            grouped_strain = plot_df.groupby('antioxidant')
            y_bar = grouped_strain[FEATURE].median() # median is less skewed by outliers
            # Computing IQR
            Q1 = grouped_strain[FEATURE].quantile(0.25)
            Q3 = grouped_strain[FEATURE].quantile(0.75)
            IQR = Q3 - Q1
            plt.ylim(min(y_bar) - 2.5 * max(IQR), max(y_bar) + 2.5 * max(IQR))

        # annotate p-values (strain vs control, per antioxidant)
        for ii, antiox in enumerate(antiox_list):
            ttest_strain_path = stats_dir / 'pairwise_ttests' / 'antioxidant' /\
                '{}_antioxidant_results.csv'.format(strain)
            ttest_strain_table = pd.read_csv(ttest_strain_path, index_col=0, header=0)
            strain_pvals_t = ttest_strain_table[[c for c in ttest_strain_table if "pvals_" in c]]
            strain_pvals_t.columns = [c.split('pvals_')[-1] for c in strain_pvals_t.columns]
            p = strain_pvals_t.loc[FEATURE, antiox]
            # sanity check: tick label order matches antiox_list order
            text = ax.get_xticklabels()[ii]
            assert text.get_text() == antiox
            p_text = 'P < 0.001' if p < 0.001 else 'P = %.3f' % p
            #y = (y_bar[antiox] + 2 * IQR[antiox]) if scale_outliers_box else plot_df[feature].max()
            #h = (max(IQR) / 10) if scale_outliers_box else (y - plot_df[feature].min()) / 50
            trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
            plt.plot([ii-.2, ii-.2, ii+.2, ii+.2],
                     [0.8, 0.81, 0.81, 0.8], #[y+h, y+2*h, y+2*h, y+h],
                     lw=1.5, c='k', transform=trans)
            ax.text(ii, 0.82, p_text, fontsize=9, ha='center', va='bottom', transform=trans)
        fig_savepath = plot_dir / 'antioxidant_boxplots' / strain / (FEATURE + '.png')
        fig_savepath.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(fig_savepath)

    # Plot for each strain separately to see whether antioxidants had an effect at all
    # NOTE(review): this loop reads ``plot_df``, which at this point still holds the
    # subset from the LAST iteration of the loop above (last hit strain + control) —
    # other strains will select empty frames. Presumably this should be rebuilt from
    # ``metadata``/``features`` per strain; confirm intended behaviour.
    for strain in strain_list:
        plt.close('all')
        fig, ax = plt.subplots(figsize=(10,8))
        ax = sns.boxplot(x='antioxidant', y=FEATURE, order=antiox_list, dodge=True,
                         data=plot_df[plot_df['gene_name']==strain])
        ax = sns.swarmplot(x='antioxidant', y=FEATURE, order=antiox_list, dodge=True,
                           data=plot_df[plot_df['gene_name']==strain],
                           alpha=0.7, size=4, color='k')
        n_labs = len(plot_df['antioxidant'].unique())
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles[:n_labs], labels[:n_labs], fontsize=15, frameon=False,
                  loc='upper right')
        ax.set_xlabel('antioxidant', fontsize=15, labelpad=10)
        ax.set_ylabel(FEATURE.replace('_',' '), fontsize=15, labelpad=10)

        # scale plot to omit outliers (>2.5*IQR from mean)
        if scale_outliers_box:
            grouped_strain = plot_df.groupby('antioxidant')
            y_bar = grouped_strain[FEATURE].median() # median is less skewed by outliers
            # Computing IQR
            Q1 = grouped_strain[FEATURE].quantile(0.25)
            Q3 = grouped_strain[FEATURE].quantile(0.75)
            IQR = Q3 - Q1
            plt.ylim(min(y_bar) - 1 * max(IQR), max(y_bar) + 2.5 * max(IQR))

        # annotate p-values (each antioxidant vs the control antioxidant, within strain)
        for ii, antiox in enumerate(antiox_list):
            if antiox == control_antioxidant:
                continue
            # load antioxidant results for strain
            ttest_strain_path = stats_dir / 't-test_{}_antioxidant_results.csv'.format(strain)
            ttest_strain_table = pd.read_csv(ttest_strain_path, index_col=0, header=0)
            strain_pvals_t = ttest_strain_table[[c for c in ttest_strain_table if "pvals_" in c]]
            strain_pvals_t.columns = [c.split('pvals_')[-1] for c in strain_pvals_t.columns]
            p = strain_pvals_t.loc[FEATURE, antiox]
            text = ax.get_xticklabels()[ii]
            assert text.get_text() == antiox
            p_text = 'P < 0.001' if p < 0.001 else 'P = %.3f' % p
            trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
            #plt.plot([ii-.2, ii-.2, ii+.2, ii+.2], [0.98, 0.99, 0.98, 0.99], lw=1.5, c='k', transform=trans)
            ax.text(ii, 1.01, p_text, fontsize=9, ha='center', va='bottom', transform=trans)
        plt.title(strain, fontsize=18, pad=30)
        # NOTE(review): this path is identical to the strain-vs-control antioxidant
        # boxplot saved in the loop above, so for hit strains this figure overwrites
        # that one — confirm whether a distinct filename/subdir was intended.
        fig_savepath = plot_dir / 'antioxidant_boxplots' / strain / (FEATURE + '.png')
        fig_savepath.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(fig_savepath)

    # Hierarchical Clustering Analysis
    # - Clustermap of features by strain, to see if data cluster into groups
    # - Control data is clustered first, feature order is stored and ordering applied to
    #   full data for comparison

    # subset for Tierpsy top16 features only
    features = select_feat_set(features, tierpsy_set_name='tierpsy_16', append_bluelight=False)

    # Ensure no NaNs or features with zero standard deviation before normalisation
    assert not features.isna().sum(axis=0).any()
    assert not (features.std(axis=0) == 0).any()

    # Extract data for control
    control_feat_df = features[metadata['gene_name']==control_strain]
    control_meta_df = metadata.reindex(control_feat_df.index)

    # re-clean after subsetting, to drop features left with zero std (no imputation)
    control_feat_df, control_meta_df = clean_summary_results(features=control_feat_df,
                                                             metadata=control_meta_df,
                                                             imputeNaN=False)

    #zscores = (df-df.mean())/df.std() # minus mean, divide by std
    controlZ_feat_df = control_feat_df.apply(zscore, axis=0)

    # plot clustermap for control
    control_clustermap_path = plot_dir / 'heatmaps' / '{}_clustermap.pdf'.format(control_strain)
    cg = plot_clustermap(featZ=controlZ_feat_df,
                         meta=control_meta_df,
                         row_colours=True,
                         group_by=['gene_name','antioxidant'],
                         col_linkage=None,
                         method='complete',#[linkage, complete, average, weighted, centroid]
                         figsize=(20,10),
                         show_xlabels=True,
                         label_size=15,
                         sub_adj={'bottom':0.6,'left':0,'top':1,'right':0.85},
                         saveto=control_clustermap_path,
                         bluelight_col_colours=False)

    # extract clustered feature order (applied to the full-data heatmap below)
    clustered_features = np.array(controlZ_feat_df.columns)[cg.dendrogram_col.reordered_ind]

    featZ_df = features.apply(zscore, axis=0)

    # Save stats table to CSV
    # if not stats_path.exists():
    #     # Add z-normalised values
    #     z_stats = featZ_df.join(meta_df[GROUPING_VAR]).groupby(by=GROUPING_VAR).mean().T
    #     z_mean_cols = ['z-mean ' + v for v in z_stats.columns.to_list()]
    #     z_stats.columns = z_mean_cols
    #     stats_table = stats_table.join(z_stats)
    #     first_cols = [m for m in stats_table.columns if 'mean' in m]
    #     last_cols = [c for c in stats_table.columns if c not in first_cols]
    #     first_cols.extend(last_cols)
    #     stats_table = stats_table[first_cols].reset_index()
    #     first_cols.insert(0, 'feature')
    #     stats_table.columns = first_cols
    #     stats_table['feature'] = [' '.join(f.split('_')) for f in stats_table['feature']]
    #     stats_table = stats_table.sort_values(by='{} p-value'.format((T_TEST_NAME if
    #         len(run_strain_list) == 2 else TEST_NAME)), ascending=True)
    #     stats_table_path = stats_dir / 'stats_summary_table.csv'
    #     stats_table.to_csv(stats_table_path, header=True, index=None)

    # Clustermap of full data - antioxidants
    full_clustermap_path = plot_dir / 'heatmaps' / '{}_clustermap.pdf'.format('gene_antioxidant')
    _ = plot_clustermap(featZ=featZ_df,
                        meta=metadata,
                        group_by=['gene_name','antioxidant'],
                        col_linkage=None,
                        method='complete',
                        figsize=(20,10),
                        show_xlabels=True,
                        label_size=15,
                        sub_adj={'bottom':0.6,'left':0,'top':1,'right':0.85},
                        saveto=full_clustermap_path,
                        bluelight_col_colours=False)

    # Heatmap of strain/antioxidant treatment, ordered by control clustered feature order
    heatmap_date_path = plot_dir / 'heatmaps' / 'gene_antioxidant_heatmap.pdf'
    plot_barcode_heatmap(featZ=featZ_df[clustered_features],
                         meta=metadata,
                         group_by=['gene_name','antioxidant'],
                         pvalues_series=None,
                         saveto=heatmap_date_path,
                         figsize=(20,6),
                         sns_colour_palette="Pastel1")

    # Clustermap of full data - windows
    full_clustermap_path = plot_dir / 'heatmaps' / '{}_clustermap.pdf'.format('gene_window')
    _ = plot_clustermap(featZ=featZ_df,
                        meta=metadata,
                        group_by=['gene_name','window'],
                        col_linkage=None,
                        method='complete',
                        figsize=(20,10),
                        show_xlabels=True,
                        label_size=15,
                        sub_adj={'bottom':0.6,'left':0,'top':1,'right':0.85},
                        saveto=full_clustermap_path,
                        bluelight_col_colours=False)

    # Principal Components Analysis (PCA)
    if remove_outliers:
        # drop Mahalanobis-distance outlier samples, then re-normalise
        outlier_path = plot_dir / 'mahalanobis_outliers.pdf'
        features, inds = remove_outliers_pca(df=features,
                                             features_to_analyse=None,
                                             saveto=outlier_path)
        metadata = metadata.reindex(features.index)
        featZ_df = features.apply(zscore, axis=0)

    # project data + plot PCA
    #from tierpsytools.analysis.decomposition import plot_pca
    pca_dir = plot_dir / 'PCA'
    _ = plot_pca(featZ=featZ_df,
                 meta=metadata,
                 group_by='gene_name',
                 n_dims=2,
                 control=control_strain,
                 var_subset=None,
                 saveDir=pca_dir,
                 PCs_to_keep=10,
                 n_feats2print=10,
                 sns_colour_palette="Set1",
                 figsize=(12,8),
                 sub_adj={'bottom':0.1,'left':0.1,'top':0.95,'right':0.7},
                 legend_loc=[1.02,0.6],
                 hypercolor=False)

    # t-distributed Stochastic Neighbour Embedding (tSNE)
    tsne_dir = plot_dir / 'tSNE'
    perplexities = [5,15,30] # NB: perplexity parameter should be roughly equal to group size
    _ = plot_tSNE(featZ=featZ_df,
                  meta=metadata,
                  group_by='gene_name',
                  var_subset=None,
                  saveDir=tsne_dir,
                  perplexities=perplexities,
                  figsize=(8,8),
                  label_size=15,
                  size=20,
                  sns_colour_palette="Set1")

    # Uniform Manifold Projection (UMAP)
    umap_dir = plot_dir / 'UMAP'
    n_neighbours = [5,15,30] # NB: n_neighbours parameter should be roughly equal to group size
    min_dist = 0.1 # Minimum distance parameter
    _ = plot_umap(featZ=featZ_df,
                  meta=metadata,
                  group_by='gene_name',
                  var_subset=None,
                  saveDir=umap_dir,
                  n_neighbours=n_neighbours,
                  min_dist=min_dist,
                  figsize=(8,8),
                  label_size=15,
                  size=20,
                  sns_colour_palette="Set1")

    # PCA coloured by two variables at once (strain x antioxidant)
    _ = plot_pca_2var(featZ=featZ_df,
                      meta=metadata,
                      var1='gene_name',
                      var2='antioxidant',
                      saveDir=pca_dir,
                      PCs_to_keep=10,
                      n_feats2print=10,
                      sns_colour_palette="Set1",
                      label_size=15,
                      figsize=[9,8],
                      sub_adj={'bottom':0,'left':0,'top':1,'right':1})

    return
def compare_strains_keio(features, metadata, args): """ Compare Keio single-gene deletion mutants with wild-type BW25113 control and look to see if they signfiicantly alter N2 C. elegans behaviour while feeding. Subset results to omit selected strains (optional) Inputs ------ features, metadata : pd.DataFrame Matching features summaries and metadata args : Object Python object with the following attributes: - drop_size_features : bool - norm_features_only : bool - percentile_to_use : str - remove_outliers : bool - omit_strains : list - grouping_variable : str - control_dict : dict - collapse_control : bool - n_top_feats : int - tierpsy_top_feats_dir (if n_top_feats) : str - test : str - f_test : bool - pval_threshold : float - fdr_method : str - n_sig_features : int """ assert set(features.index) == set(metadata.index) # categorical variable to investigate, eg.'gene_name' grouping_var = args.grouping_variable n_strains = len(metadata[grouping_var].unique()) assert n_strains == len( metadata[grouping_var].str.upper().unique()) # check case-sensitivity print("\nInvestigating '%s' variation (%d samples)" % (grouping_var, n_strains)) # Subset results (rows) to omit selected strains if args.omit_strains is not None: features, metadata = subset_results(features, metadata, column=grouping_var, groups=args.omit_strains, omit=True) control = args.control_dict[grouping_var] # control strain to use # Load Tierpsy Top feature set + subset (columns) for top feats only if args.n_top_feats is not None: top_feats_path = Path( args.tierpsy_top_feats_dir) / "tierpsy_{}.csv".format( str(args.n_top_feats)) topfeats = load_topfeats(top_feats_path, add_bluelight=True, remove_path_curvature=True, header=None) # Drop features that are not in results top_feats_list = [ feat for feat in list(topfeats) if feat in features.columns ] features = features[top_feats_list] ##### Control variation ##### control_metadata = metadata[metadata[grouping_var] == control] control_features = 
features.reindex(control_metadata.index) # Clean data after subset - to remove features with zero std control_feat_clean, control_meta_clean = clean_summary_results( control_features, control_metadata, max_value_cap=False, imputeNaN=False) if args.analyse_control: control_variation(control_feat_clean, control_meta_clean, args, variables=[ k for k in args.control_dict.keys() if k != grouping_var ], n_sig_features=10) if args.collapse_control: print("\nCollapsing control data (mean of each day)") features, metadata = average_plate_control_data(features, metadata) # Record mean sample size per group mean_sample_size = int( np.round( metadata.join(features).groupby([grouping_var], as_index=False).size().mean())) print("Mean sample size: %d" % mean_sample_size) save_dir = get_save_dir(args) stats_dir = save_dir / grouping_var / "Stats" / args.fdr_method plot_dir = save_dir / grouping_var / "Plots" / args.fdr_method ##### STATISTICS ##### # ============================================================================= # ##### Pairplot Tierpsy Features - Pairwise correlation matrix ##### # if args.n_top_feats == 16: # g = sns.pairplot(features, height=1.5) # for ax in g.axes.flatten(): # # rotate x and y axis labels # ax.set_xlabel(ax.get_xlabel(), rotation = 90) # ax.set_ylabel(ax.get_ylabel(), rotation = 0) # plt.subplots_adjust(left=0.3, bottom=0.3) # plt.show() # ============================================================================= if not args.use_corrected_pvals: anova_path = stats_dir / '{}_results_uncorrected.csv'.format(args.test) else: anova_path = stats_dir / '{}_results.csv'.format(args.test) # load results + record significant features print("\nLoading statistics results") anova_table = pd.read_csv(anova_path, index_col=0) pvals = anova_table.sort_values( by='pvals', ascending=True)['pvals'] # rank features by p-value fset = pvals[pvals < args.pval_threshold].index.to_list() print( "\n%d significant features found by %s (P<%.2f, %s)" % (len(fset), 
args.test, args.pval_threshold, ('uncorrected' if not args.use_corrected_pvals else args.fdr_method))) ### k-significant features if len(fset) > 0: # Compare k sigfeat and ANOVA significant feature set overlap if not args.use_corrected_pvals: k_sigfeats_path = stats_dir / "k_significant_features_uncorrected.csv" else: k_sigfeats_path = stats_dir / "k_significant_features.csv" ksig_table = pd.read_csv(k_sigfeats_path, index_col=0) fset_ksig = ksig_table[ ksig_table['pvals'] < args.pval_threshold].index.to_list() fset_overlap = set(fset).intersection(set(fset_ksig)) prop_overlap = len(fset_overlap) / len(fset) print("%.1f%% overlap with k-significant features" % (prop_overlap * 100)) if prop_overlap < 0.5 and len(fset) > 100: print( "WARNING: Inconsistency in statistics for feature set agreement between " + "%s and k significant features!" % args.test) if args.use_k_sig_feats_overlap: fset = list(ksig_table.loc[fset_overlap].sort_values( by='pvals', ascending=True).index) ### t-test t_test = 't-test' if args.test == 'ANOVA' else 'Mann-Whitney' # aka. 
Wilcoxon rank-sum if not args.use_corrected_pvals: ttest_path = stats_dir / '{}_results_uncorrected.csv'.format( t_test) else: ttest_path = stats_dir / '{}_results.csv'.format(t_test) # read t-test results + record significant features (NOT ORDERED) ttest_table = pd.read_csv(ttest_path, index_col=0) pvals_t = ttest_table[[c for c in ttest_table if "pvals_" in c]] pvals_t.columns = [c.split('pvals_')[-1] for c in pvals_t.columns] fset_ttest = pvals_t[(pvals_t < args.pval_threshold).sum( axis=1) > 0].index.to_list() print("%d significant features found by %s (P<%.2f, %s)" % (len(fset_ttest), t_test, args.pval_threshold, ('uncorrected' if not args.use_corrected_pvals else args.fdr_method))) else: print("No significant features found for %s by %s" % (grouping_var, args.test)) ##### PLOTTING ##### if len(fset) > 0: # Rank strains by number of sigfeats by t-test ranked_nsig = (pvals_t < args.pval_threshold).sum(axis=0).sort_values( ascending=False) # Select top hit strains by n sigfeats (select strains with > 5 sigfeats as hit strains?) 
hit_strains_nsig = ranked_nsig[ranked_nsig > 0].index.to_list() #hit_nuo = ranked_nsig[[i for i in ranked_nsig[ranked_nsig > 0].index if 'nuo' in i]] # if no sigfaets, subset for top strains ranked by lowest p-value by t-test for any feature print("%d significant strains (with 1 or more significant features)" % len(hit_strains_nsig)) if len(hit_strains_nsig) > 0: write_list_to_file(hit_strains_nsig, stats_dir / 'hit_strains.txt') # Rank strains by lowest p-value for any feature ranked_pval = pvals_t.min(axis=0).sort_values(ascending=True) # Select top 100 hit strains by lowest p-value for any feature hit_strains_pval = ranked_pval[ ranked_pval < args.pval_threshold].index.to_list() hit_strains_pval = ranked_pval.index[:N_LOWEST_PVAL].to_list() write_list_to_file( hit_strains_pval, stats_dir / 'lowest{}_pval.txt'.format(N_LOWEST_PVAL)) print("\nPlotting ranked strains by number of significant features") ranked_nsig_path = plot_dir / ( 'ranked_number_sigfeats' + '_' + ('uncorrected' if args.fdr_method is None else args.fdr_method) + '.png') plt.ioff() plt.close('all') fig, ax = plt.subplots(figsize=(20, 6)) ax.plot(ranked_nsig) if len(ranked_nsig.index) > 250: ax.set_xticklabels([]) else: ax.set_xticklabels(ranked_nsig.index.to_list(), rotation=90, fontsize=5) plt.xlabel("Strains (ranked)", fontsize=12, labelpad=10) plt.ylabel("Number of significant features", fontsize=12, labelpad=10) plt.subplots_adjust(left=0.08, right=0.98, bottom=0.15) plt.savefig(ranked_nsig_path, dpi=600) print("Plotting ranked strains by lowest p-value of any feature") lowest_pval_path = plot_dir / ( 'ranked_lowest_pval' + '_' + ('uncorrected' if args.fdr_method is None else args.fdr_method) + '.png') plt.close('all') fig, ax = plt.subplots(figsize=(20, 6)) ax.plot(ranked_pval) plt.axhline(y=args.pval_threshold, c='dimgray', ls='--') if len(ranked_nsig.index) > 250: ax.set_xticklabels([]) else: ax.set_xticklabels(ranked_nsig.index.to_list(), rotation=90, fontsize=5) plt.xlabel("Strains 
(ranked)", fontsize=12, labelpad=10) plt.ylabel("Lowest p-value by t-test", fontsize=12, labelpad=10) plt.subplots_adjust(left=0.08, right=0.98, bottom=0.15) plt.savefig(lowest_pval_path, dpi=600) plt.close() print("\nMaking errorbar plots") errorbar_sigfeats(features, metadata, group_by=grouping_var, fset=fset, control=control, rank_by='mean', max_feats2plt=args.n_sig_features, figsize=[20, 10], fontsize=5, ms=8, elinewidth=1.5, fmt='.', tight_layout=[0.01, 0.01, 0.99, 0.99], saveDir=plot_dir / 'errorbar') # ============================================================================= # print("Making boxplots") # boxplots_grouped(feat_meta_df=metadata.join(features), # group_by=grouping_var, # control_group=control, # test_pvalues_df=(pvals_t.T if len(fset) > 0 else None), # feature_set=fset, # max_feats2plt=args.n_sig_features, # max_groups_plot_cap=None, # p_value_threshold=args.pval_threshold, # drop_insignificant=False, # sns_colour_palette="tab10", # figsize=[6,130], # saveDir=plot_dir / ('boxplots' + '_' + ( # 'uncorrected' if args.fdr_method is None else args.fdr_method) + # '.png')) # ============================================================================= # If no sigfeats, subset for top strains ranked by lowest p-value by t-test for any feature if len(hit_strains_nsig) == 0: print( "\Saving lowest %d strains ranked by p-value for any feature" % N_LOWEST_PVAL) write_list_to_file(hit_strains_pval, stats_dir / 'Top100_lowest_pval.txt') hit_strains = hit_strains_pval elif len(hit_strains_nsig) > 0: hit_strains = hit_strains_nsig # Individual boxplots of significant features by pairwise t-test (each group vs control) boxplots_sigfeats( features, y_class=metadata[grouping_var], control=control, pvals=pvals_t, z_class=metadata['date_yyyymmdd'], feature_set=None, saveDir=plot_dir / 'paired_boxplots', p_value_threshold=args.pval_threshold, drop_insignificant=True if len(hit_strains) > 0 else False, max_sig_feats=args.n_sig_features, max_strains=N_LOWEST_PVAL 
if len(hit_strains_nsig) == 0 else None, sns_colour_palette="tab10", verbose=False)

# Optionally restrict all downstream analysis to the control strain plus the
# top-ranked hit strains; otherwise analyse every strain present in metadata.
# NOTE(review): assumes hit_strains is already sorted by significance — confirm upstream.
if SUBSET_HIT_STRAINS:
    strain_list = [control] + hit_strains[:TOP_N_HITS]
    print("Subsetting for Top%d hit strains" % (len(strain_list) - 1))
    # subset_results filters both features and metadata rows by grouping_var membership
    features, metadata = subset_results(features, metadata,
                                        column=grouping_var,
                                        groups=strain_list,
                                        verbose=False)
else:
    strain_list = list(metadata[grouping_var].unique())

# =============================================================================
# # NOT NECESSARY FOR ALL STRAINS - LOOK AT CONTROL ONLY FOR THIS
# # superplots of variation with respect to 'date_yyyymmdd'
# print("\nPlotting superplots of date variation for significant features")
# for feat in tqdm(fset[:args.n_sig_features]):
#     # plot day variation
#     superplot(features, metadata, feat,
#               x1='date_yyyymmdd',
#               x2=None,
#               saveDir=plot_dir / 'superplots',
#               figsize=[24,6],
#               show_points=False,
#               plot_means=True,
#               dodge=False)
#     # plot run number vs day variation
#     superplot(features, metadata, feat,
#               x1='date_yyyymmdd',
#               x2='imaging_run_number',
#               saveDir=plot_dir / 'superplots',
#               figsize=[24,6],
#               show_points=False,
#               plot_means=True,
#               dodge=True)
#     # plot plate number variation
#     superplot(features, metadata, feat,
#               x1='date_yyyymmdd',
#               x2='source_plate_id',
#               saveDir=plot_dir / 'superplots',
#               figsize=[24,6],
#               show_points=False,
#               plot_means=True,
#               dodge=True)
#     # plot instrument name variation
#     superplot(features, metadata, feat,
#               x1='date_yyyymmdd',
#               x2='instrument_name',
#               saveDir=plot_dir / 'superplots',
#               figsize=[24,6],
#               show_points=False,
#               plot_means=True,
#               dodge=True)
# =============================================================================

# from tierpsytools.analysis.significant_features import plot_feature_boxplots
# plot_feature_boxplots(feat_to_plot=features,
#                       y_class=metadata[grouping_var],
#                       scores=pvals_t.rank(axis=1),
#                       pvalues=np.asarray(pvals_t).flatten(),
#                       saveto=None,
#                       close_after_plotting=True)

##### Hierarchical Clustering Analysis #####

# Z-normalise control data (column-wise z-score of each feature)
control_featZ = control_features.apply(zscore, axis=0)

#featZ = (features-features.mean())/features.std() # minus mean, divide by std
#from tierpsytools.preprocessing.scaling_class import scalingClass
#scaler = scalingClass(scaling='standardize')
#featZ = scaler.fit_transform(features)

### Control clustermap

# control data is clustered and feature order is stored and applied to full data
print("\nPlotting control clustermap")

control_clustermap_path = plot_dir / 'heatmaps' / 'date_clustermap.pdf'
# Group by date only when the grouping variable IS the date, otherwise by both
cg = plot_clustermap(control_featZ, control_metadata,
                     group_by=([grouping_var] if grouping_var == 'date_yyyymmdd'
                               else [grouping_var, 'date_yyyymmdd']),
                     method=METHOD,
                     metric=METRIC,
                     figsize=[20, 6],
                     sub_adj={'bottom': 0.05, 'left': 0, 'top': 1, 'right': 0.85},
                     saveto=control_clustermap_path,
                     label_size=15,
                     show_xlabels=False)

# control clustermap with labels — only legible for smaller feature sets
if args.n_top_feats <= 256:
    control_clustermap_path = plot_dir / 'heatmaps' / 'date_clustermap_label.pdf'
    cg = plot_clustermap(control_featZ, control_metadata,
                         group_by=([grouping_var] if grouping_var == 'date_yyyymmdd'
                                   else [grouping_var, 'date_yyyymmdd']),
                         method=METHOD,
                         metric=METRIC,
                         figsize=[20, 10],
                         sub_adj={'bottom': 0.5, 'left': 0, 'top': 1, 'right': 0.85},
                         saveto=control_clustermap_path,
                         label_size=(15, 15),
                         show_xlabels=True)

#col_linkage = cg.dendrogram_col.calculated_linkage
# Feature (column) order from the control dendrogram, re-used for the full-data heatmap
control_clustered_features = np.array(control_featZ.columns)[cg.dendrogram_col.reordered_ind]

### Full clustermap

# Z-normalise data for all strains
featZ = features.apply(zscore, axis=0)

## Save z-normalised values
# z_stats = featZ.join(hit_metadata[grouping_var]).groupby(by=grouping_var).mean().T
# z_stats.columns = ['z-mean_' + v for v in z_stats.columns.to_list()]
# z_stats.to_csv(z_stats_path, header=True, index=None)

# Clustermap of full data
print("Plotting all strains clustermap")
full_clustermap_path = plot_dir / 'heatmaps' / (grouping_var + '_clustermap.pdf')
fg = plot_clustermap(featZ, metadata,
                     group_by=grouping_var,
                     row_colours=None,
                     method=METHOD,
                     metric=METRIC,
                     figsize=[20, 30],
                     sub_adj={'bottom': 0.01, 'left': 0, 'top': 1, 'right': 0.95},
                     saveto=full_clustermap_path,
                     label_size=8,
                     show_xlabels=False)

# labelled full clustermap — only legible for smaller feature sets
if args.n_top_feats <= 256:
    full_clustermap_path = plot_dir / 'heatmaps' / (grouping_var + '_clustermap_label.pdf')
    fg = plot_clustermap(featZ, metadata,
                         group_by=grouping_var,
                         row_colours=None,
                         method=METHOD,
                         metric=METRIC,
                         figsize=[20, 40],
                         sub_adj={'bottom': 0.18, 'left': 0, 'top': 1, 'right': 0.95},
                         saveto=full_clustermap_path,
                         label_size=(15, 10),
                         show_xlabels=True)

# clustered feature order for all strains (computed but unused below)
_ = np.array(featZ.columns)[fg.dendrogram_col.reordered_ind]

# ANOVA p-values reordered to match the control-clustered feature order
pvals_heatmap = anova_table.loc[control_clustered_features, 'pvals']
pvals_heatmap.name = 'P < {}'.format(args.pval_threshold)

# sanity check: every feature with a p-value exists in the z-scored matrix
assert all(f in featZ.columns for f in pvals_heatmap.index)

# Plot heatmap (averaged for each sample) — skipped for very large strain sets
if len(metadata[grouping_var].unique()) < 250:
    print("\nPlotting barcode heatmap")
    heatmap_path = plot_dir / 'heatmaps' / (grouping_var + '_heatmap.pdf')
    plot_barcode_heatmap(featZ=featZ[control_clustered_features],
                         meta=metadata,
                         group_by=[grouping_var],
                         pvalues_series=pvals_heatmap,
                         p_value_threshold=args.pval_threshold,
                         selected_feats=None, # fset if len(fset) > 0 else None
                         saveto=heatmap_path,
                         figsize=[20, 30],
                         sns_colour_palette="Pastel1",
                         label_size=10)

##### Principal Components Analysis #####

pca_dir = plot_dir / 'PCA'

# remove outlier samples from PCA (Mahalanobis distance), then re-normalise
if args.remove_outliers:
    outlier_path = pca_dir / 'mahalanobis_outliers.pdf'
    features, inds = remove_outliers_pca(df=features, saveto=outlier_path)
    metadata = metadata.reindex(features.index) # reindex metadata
    featZ = features.apply(zscore, axis=0) # re-normalise data

# Drop features with NaN values after normalising
# NOTE(review): original indentation was lost in extraction — this NaN-drop may
# have been nested inside the remove_outliers branch; confirm against upstream.
n_cols = len(featZ.columns)
featZ.dropna(axis=1, inplace=True)
n_dropped = n_cols - len(featZ.columns)
if n_dropped > 0:
    print("Dropped %d features after normalisation (NaN)" % n_dropped)

# Colour the control plus up to 15 top hits in the PCA/tSNE/UMAP plots,
# keeping only strains actually present after any subsetting above
coloured_strains_pca = [control] + hit_strains[:15]
coloured_strains_pca = [s for s in coloured_strains_pca if s in metadata[grouping_var].unique()]

#from tierpsytools.analysis.decomposition import plot_pca
_ = plot_pca(featZ, metadata,
             group_by=grouping_var,
             control=control,
             var_subset=coloured_strains_pca,
             saveDir=pca_dir,
             PCs_to_keep=10,
             n_feats2print=10,
             kde=False,
             sns_colour_palette="plasma",
             n_dims=2,
             label_size=8,
             sub_adj={'bottom': 0.13, 'left': 0.13, 'top': 0.95, 'right': 0.88},
             legend_loc=[1.02, 0.6],
             hypercolor=False)

# add details of COG category information to metadata
# (using hard-coded dict of info from Baba et al. 2006 paper)
metadata['COG_category'] = metadata['COG_category'].map(COG_category_dict)

# plot pca coloured by Keio COG category
_ = plot_pca(featZ, metadata,
             group_by='COG_category',
             control=None,
             var_subset=list(metadata['COG_category'].dropna().unique()),
             saveDir=pca_dir / 'COG',
             PCs_to_keep=10,
             n_feats2print=10,
             kde=False,
             n_dims=2,
             hypercolor=False,
             label_size=8,
             figsize=[12, 8],
             sub_adj={'bottom': 0.1, 'left': 0.1, 'top': 0.95, 'right': 0.7},
             legend_loc=[1.02, 0.6],
             sns_colour_palette="plasma")

##### t-distributed Stochastic Neighbour Embedding #####

print("\nPerforming tSNE")
tsne_dir = plot_dir / 'tSNE'
perplexities = [mean_sample_size] # NB: should be roughly equal to group size
_ = plot_tSNE(featZ, metadata,
              group_by=grouping_var,
              var_subset=coloured_strains_pca,
              saveDir=tsne_dir,
              perplexities=perplexities,
              figsize=[8, 8],
              label_size=8,
              marker_size=20,
              sns_colour_palette="plasma")

# NOTE(review): the three lines below duplicate the setup immediately above
# (same print, same tsne_dir, same perplexities) — harmless but redundant.
print("\nPerforming tSNE")
tsne_dir = plot_dir / 'tSNE'
perplexities = [mean_sample_size] # NB: should be roughly equal to group size
# tSNE coloured by Keio COG category
_ = plot_tSNE(featZ, metadata,
              group_by='COG_category',
              var_subset=list(metadata['COG_category'].dropna().unique()),
              saveDir=tsne_dir / 'COG_category',
              perplexities=perplexities,
              figsize=[8, 8],
              label_size=8,
              marker_size=20,
              sns_colour_palette="plasma")

##### Uniform Manifold Projection #####

print("\nPerforming UMAP")
umap_dir = plot_dir / 'UMAP'
n_neighbours = [mean_sample_size] # NB: should be roughly equal to group size
min_dist = 0.1 # Minimum distance parameter
_ = plot_umap(featZ, metadata,
              group_by=grouping_var,
              var_subset=coloured_strains_pca,
              saveDir=umap_dir,
              n_neighbours=n_neighbours,
              min_dist=min_dist,
              figsize=[8, 8],
              label_size=8,
              marker_size=20,
              sns_colour_palette="plasma")
metadata = pd.read_csv(METADATA_PATH, dtype={'comments':str, 'source_plate_id':str}) # Subset for control data only control_strain = args.control_dict[args.grouping_variable] # control strain to use control_features, control_metadata = subset_results(features, metadata, column=args.grouping_variable, groups=[control_strain]) # Subset for imaging dates of interest if args.dates is not None: dates = [int(d) for d in args.dates] control_features, control_metadata = subset_results(control_features, control_metadata, column='date_yyyymmdd', groups=dates) # Clean data after subset - to remove features with zero std control_features, control_metadata = clean_summary_results(control_features, control_metadata, max_value_cap=False, imputeNaN=False) # Load Tierpsy Top feature set + subset (columns) for top feats only if args.n_top_feats is not None: top_feats_path = Path(args.tierpsy_top_feats_dir) / "tierpsy_{}.csv".format(str(args.n_top_feats)) topfeats = load_topfeats(top_feats_path, add_bluelight=True, remove_path_curvature=True, header=None) # Drop features that are not in results top_feats_list = [feat for feat in list(topfeats) if feat in control_features.columns] control_features = control_features[top_feats_list] print("Investigating variation in '%s' (control %s)" % (control_strain, args.grouping_variable)) control_variation(control_features, control_metadata,