#%%     t-distributed Stochastic Neighbour Embedding (tSNE)

        tsne_dir = plot_dir / 'tSNE'
        perplexities = [5,15,30]
        
        tSNE_df = plot_tSNE(featZ=featZ_df,
                            meta=meta_df,
                            group_by=GROUPING_VAR,
                            var_subset=None,
                            saveDir=tsne_dir,
                            perplexities=perplexities,
                             # NB: perplexity parameter should be roughly equal to group size
                            sns_colour_palette="plasma")
       
        #%%     Uniform Manifold Approximation and Projection (UMAP)

        umap_dir = plot_dir / 'UMAP'
        n_neighbours = [5,15,30]
        min_dist = 0.1 # Minimum distance parameter
        
        umap_df = plot_umap(featZ=featZ_df,
                            meta=meta_df,
                            group_by=GROUPING_VAR,
                            var_subset=None,
                            saveDir=umap_dir,
                            n_neighbours=n_neighbours,
                            # NB: n_neighbours parameter should be roughly equal to group size
                            min_dist=min_dist,
                            sns_colour_palette="tab10")
Пример #2
0
def _plot_ranked_strains(ranked, save_path, ylabel, threshold=None):
    """ Plot a ranked per-strain series as a line plot and save it to file.

        Inputs
        ------
        ranked : pd.Series
            Values to plot, indexed by strain name (expected to be sorted by the caller)
        save_path : Path
            Destination of the figure; the parent directory is created if missing
        ylabel : str
            Label for the y-axis
        threshold : float, optional
            If provided, a dashed horizontal line is drawn at this y-value
    """
    save_path = Path(save_path)
    save_path.parent.mkdir(parents=True, exist_ok=True)

    plt.close('all')
    fig, ax = plt.subplots(figsize=(20, 6))
    ax.plot(ranked)
    if threshold is not None:
        ax.axhline(y=threshold, c='dimgray', ls='--')

    # Tick labels come from the series being plotted, so they always follow its own ordering
    if len(ranked.index) > 250:
        ax.set_xticklabels([])
    else:
        ax.set_xticklabels(ranked.index.to_list(), rotation=90, fontsize=5)
    ax.set_xlabel("Strains (ranked)", fontsize=12, labelpad=10)
    ax.set_ylabel(ylabel, fontsize=12, labelpad=10)
    plt.subplots_adjust(left=0.08, right=0.98, bottom=0.15)
    plt.savefig(save_path, dpi=600)
    plt.close(fig)


def compare_strains_keio(features, metadata, args):
    """ Compare Keio single-gene deletion mutants with wild-type BW25113 control and look to see if
        they significantly alter N2 C. elegans behaviour while feeding.

        Statistics results are NOT computed here: they are loaded from CSV files that must already
        exist under '<save_dir>/<grouping_variable>/Stats/<fdr_method>'. Plots (ranked strains,
        errorbars, boxplots, clustermaps, heatmap, PCA, tSNE, UMAP) are saved under
        '<save_dir>/<grouping_variable>/Plots/<fdr_method>', and lists of hit strains are written
        to the stats directory. Nothing is returned.

        Subset results to omit selected strains (optional)

        Inputs
        ------
        features, metadata : pd.DataFrame
            Matching features summaries and metadata (must share the same index)

        args : Object
            Python object with the following attributes:
            - drop_size_features : bool
            - norm_features_only : bool
            - percentile_to_use : str
            - remove_outliers : bool
            - omit_strains : list
            - grouping_variable : str
            - control_dict : dict
            - collapse_control : bool
            - analyse_control : bool
            - n_top_feats : int (or None to keep all features)
            - tierpsy_top_feats_dir (if n_top_feats) : str
            - test : str
            - f_test : bool
            - pval_threshold : float
            - fdr_method : str
            - use_corrected_pvals : bool
            - use_k_sig_feats_overlap : bool
            - n_sig_features : int

        Notes
        -----
        Relies on the module-level constants N_LOWEST_PVAL, SUBSET_HIT_STRAINS, TOP_N_HITS,
        METHOD, METRIC and COG_category_dict.
    """

    assert set(features.index) == set(metadata.index)

    # categorical variable to investigate, eg.'gene_name'
    grouping_var = args.grouping_variable
    n_strains = len(metadata[grouping_var].unique())
    assert n_strains == len(
        metadata[grouping_var].str.upper().unique())  # check case-sensitivity
    print("\nInvestigating '%s' variation (%d samples)" %
          (grouping_var, n_strains))

    # Subset results (rows) to omit selected strains
    if args.omit_strains is not None:
        features, metadata = subset_results(features,
                                            metadata,
                                            column=grouping_var,
                                            groups=args.omit_strains,
                                            omit=True)

    control = args.control_dict[grouping_var]  # control strain to use

    # Load Tierpsy Top feature set + subset (columns) for top feats only
    if args.n_top_feats is not None:
        top_feats_path = Path(
            args.tierpsy_top_feats_dir) / "tierpsy_{}.csv".format(
                str(args.n_top_feats))
        topfeats = load_topfeats(top_feats_path,
                                 add_bluelight=True,
                                 remove_path_curvature=True,
                                 header=None)

        # Drop features that are not in results
        top_feats_list = [
            feat for feat in list(topfeats) if feat in features.columns
        ]
        features = features[top_feats_list]

    ##### Control variation #####

    control_metadata = metadata[metadata[grouping_var] == control]
    control_features = features.reindex(control_metadata.index)

    # Clean data after subset - to remove features with zero std
    control_feat_clean, control_meta_clean = clean_summary_results(
        control_features,
        control_metadata,
        max_value_cap=False,
        imputeNaN=False)
    if args.analyse_control:
        control_variation(control_feat_clean,
                          control_meta_clean,
                          args,
                          variables=[
                              k for k in args.control_dict.keys()
                              if k != grouping_var
                          ],
                          n_sig_features=10)

    if args.collapse_control:
        print("\nCollapsing control data (mean of each day)")
        features, metadata = average_plate_control_data(features, metadata)

    # Record mean sample size per group. Group sizes are counted as a Series so that the mean is
    # a scalar (averaging the 'as_index=False' DataFrame would include the string group column)
    mean_sample_size = int(
        np.round(metadata.groupby(grouping_var).size().mean()))
    print("Mean sample size: %d" % mean_sample_size)

    save_dir = get_save_dir(args)
    stats_dir = save_dir / grouping_var / "Stats" / args.fdr_method
    plot_dir = save_dir / grouping_var / "Plots" / args.fdr_method

    # Results file suffix + printed label depend on whether corrected p-values are used
    results_suffix = '' if args.use_corrected_pvals else '_uncorrected'
    pval_label = args.fdr_method if args.use_corrected_pvals else 'uncorrected'
    # Tag used in the ranked-plot file names
    plot_tag = 'uncorrected' if args.fdr_method is None else args.fdr_method

    ##### STATISTICS #####

    anova_path = stats_dir / '{}_results{}.csv'.format(args.test,
                                                       results_suffix)

    # load results + record significant features
    print("\nLoading statistics results")
    anova_table = pd.read_csv(anova_path, index_col=0)
    pvals = anova_table.sort_values(
        by='pvals', ascending=True)['pvals']  # rank features by p-value
    fset = pvals[pvals < args.pval_threshold].index.to_list()
    print("\n%d significant features found by %s (P<%.2f, %s)" %
          (len(fset), args.test, args.pval_threshold, pval_label))

    ### k-significant features

    if len(fset) > 0:
        # Compare k sigfeat and ANOVA significant feature set overlap
        k_sigfeats_path = stats_dir / "k_significant_features{}.csv".format(
            results_suffix)

        ksig_table = pd.read_csv(k_sigfeats_path, index_col=0)
        fset_ksig = ksig_table[
            ksig_table['pvals'] < args.pval_threshold].index.to_list()

        fset_overlap = set(fset).intersection(set(fset_ksig))
        prop_overlap = len(fset_overlap) / len(fset)
        print("%.1f%% overlap with k-significant features" %
              (prop_overlap * 100))

        if prop_overlap < 0.5 and len(fset) > 100:
            print(
                "WARNING: Inconsistency in statistics for feature set agreement between "
                + "%s and k significant features!" % args.test)

        if args.use_k_sig_feats_overlap:
            # .loc does not accept a set as an indexer in recent pandas, so pass a list
            fset = list(ksig_table.loc[list(fset_overlap)].sort_values(
                by='pvals', ascending=True).index)

        ### t-test

        t_test = 't-test' if args.test == 'ANOVA' else 'Mann-Whitney'  # aka. Wilcoxon rank-sum

        ttest_path = stats_dir / '{}_results{}.csv'.format(
            t_test, results_suffix)

        # read t-test results + record significant features (NOT ORDERED)
        ttest_table = pd.read_csv(ttest_path, index_col=0)
        pvals_t = ttest_table[[c for c in ttest_table if "pvals_" in c]]
        pvals_t.columns = [c.split('pvals_')[-1] for c in pvals_t.columns]
        fset_ttest = pvals_t[(pvals_t < args.pval_threshold).sum(
            axis=1) > 0].index.to_list()
        print("%d significant features found by %s (P<%.2f, %s)" %
              (len(fset_ttest), t_test, args.pval_threshold, pval_label))

    else:
        print("No significant features found for %s by %s" %
              (grouping_var, args.test))

    ##### PLOTTING #####

    # Hit strains stay empty when there are no significant features, so that the PCA/tSNE/UMAP
    # section below can still run (colouring the control only)
    hit_strains = []

    if len(fset) > 0:
        # Rank strains by number of sigfeats by t-test
        ranked_nsig = (pvals_t < args.pval_threshold).sum(axis=0).sort_values(
            ascending=False)
        # Select hit strains: those with at least one significant feature
        hit_strains_nsig = ranked_nsig[ranked_nsig > 0].index.to_list()
        print("%d significant strains (with 1 or more significant features)" %
              len(hit_strains_nsig))
        if len(hit_strains_nsig) > 0:
            write_list_to_file(hit_strains_nsig, stats_dir / 'hit_strains.txt')

        # Rank strains by lowest p-value for any feature
        ranked_pval = pvals_t.min(axis=0).sort_values(ascending=True)
        # Select the N_LOWEST_PVAL strains with the lowest p-value for any feature
        # (regardless of whether they pass the significance threshold)
        hit_strains_pval = ranked_pval.index[:N_LOWEST_PVAL].to_list()
        write_list_to_file(
            hit_strains_pval,
            stats_dir / 'lowest{}_pval.txt'.format(N_LOWEST_PVAL))

        print("\nPlotting ranked strains by number of significant features")
        plt.ioff()
        _plot_ranked_strains(
            ranked_nsig,
            plot_dir / ('ranked_number_sigfeats' + '_' + plot_tag + '.png'),
            ylabel="Number of significant features")

        print("Plotting ranked strains by lowest p-value of any feature")
        _plot_ranked_strains(
            ranked_pval,
            plot_dir / ('ranked_lowest_pval' + '_' + plot_tag + '.png'),
            ylabel="Lowest p-value by t-test",
            threshold=args.pval_threshold)

        print("\nMaking errorbar plots")
        errorbar_sigfeats(features,
                          metadata,
                          group_by=grouping_var,
                          fset=fset,
                          control=control,
                          rank_by='mean',
                          max_feats2plt=args.n_sig_features,
                          figsize=[20, 10],
                          fontsize=5,
                          ms=8,
                          elinewidth=1.5,
                          fmt='.',
                          tight_layout=[0.01, 0.01, 0.99, 0.99],
                          saveDir=plot_dir / 'errorbar')

        # If no strains have sigfeats, fall back to the top strains ranked by lowest p-value by
        # t-test for any feature
        if len(hit_strains_nsig) == 0:
            print(
                "\nSaving lowest %d strains ranked by p-value for any feature" %
                N_LOWEST_PVAL)
            write_list_to_file(hit_strains_pval,
                               stats_dir / 'Top100_lowest_pval.txt')
            hit_strains = hit_strains_pval
        else:
            hit_strains = hit_strains_nsig

        # Individual boxplots of significant features by pairwise t-test (each group vs control)
        boxplots_sigfeats(
            features,
            y_class=metadata[grouping_var],
            control=control,
            pvals=pvals_t,
            z_class=metadata['date_yyyymmdd'],
            feature_set=None,
            saveDir=plot_dir / 'paired_boxplots',
            p_value_threshold=args.pval_threshold,
            drop_insignificant=len(hit_strains) > 0,
            max_sig_feats=args.n_sig_features,
            max_strains=N_LOWEST_PVAL if len(hit_strains_nsig) == 0 else None,
            sns_colour_palette="tab10",
            verbose=False)

        if SUBSET_HIT_STRAINS:
            strain_list = [control] + hit_strains[:TOP_N_HITS]
            print("Subsetting for Top%d hit strains" % (len(strain_list) - 1))
            features, metadata = subset_results(features,
                                                metadata,
                                                column=grouping_var,
                                                groups=strain_list,
                                                verbose=False)
        else:
            strain_list = list(metadata[grouping_var].unique())

    # NOTE: superplots of date/run/plate/instrument variation are deliberately not produced for
    # all strains here - that variation is examined for the control only

    ##### Hierarchical Clustering Analysis #####

    # Z-normalise control data
    # NOTE(review): this uses 'control_features' rather than the cleaned 'control_feat_clean';
    # features with zero std in the control would z-score to NaN - confirm this is intended
    control_featZ = control_features.apply(zscore, axis=0)

    # Labelled clustermaps are only legible for small feature sets. When n_top_feats is None
    # (all features kept) they are skipped rather than comparing None with an int
    label_clustermaps = args.n_top_feats is not None and args.n_top_feats <= 256

    ### Control clustermap

    # control data is clustered and feature order is stored and applied to full data
    print("\nPlotting control clustermap")

    control_group_by = ([grouping_var] if grouping_var == 'date_yyyymmdd' else
                        [grouping_var, 'date_yyyymmdd'])

    control_clustermap_path = plot_dir / 'heatmaps' / 'date_clustermap.pdf'
    cg = plot_clustermap(
        control_featZ,
        control_metadata,
        group_by=control_group_by,
        method=METHOD,
        metric=METRIC,
        figsize=[20, 6],
        sub_adj={
            'bottom': 0.05,
            'left': 0,
            'top': 1,
            'right': 0.85
        },
        saveto=control_clustermap_path,
        label_size=15,
        show_xlabels=False)
    # control clustermap with labels
    if label_clustermaps:
        control_clustermap_path = plot_dir / 'heatmaps' / 'date_clustermap_label.pdf'
        cg = plot_clustermap(
            control_featZ,
            control_metadata,
            group_by=control_group_by,
            method=METHOD,
            metric=METRIC,
            figsize=[20, 10],
            sub_adj={
                'bottom': 0.5,
                'left': 0,
                'top': 1,
                'right': 0.85
            },
            saveto=control_clustermap_path,
            label_size=(15, 15),
            show_xlabels=True)

    # feature order from the control clustering (re-used for the barcode heatmap below)
    control_clustered_features = np.array(
        control_featZ.columns)[cg.dendrogram_col.reordered_ind]

    ### Full clustermap

    # Z-normalise data for all strains
    featZ = features.apply(zscore, axis=0)

    # Clustermap of full data
    print("Plotting all strains clustermap")
    full_clustermap_path = plot_dir / 'heatmaps' / (grouping_var +
                                                    '_clustermap.pdf')
    plot_clustermap(featZ,
                    metadata,
                    group_by=grouping_var,
                    row_colours=None,
                    method=METHOD,
                    metric=METRIC,
                    figsize=[20, 30],
                    sub_adj={
                        'bottom': 0.01,
                        'left': 0,
                        'top': 1,
                        'right': 0.95
                    },
                    saveto=full_clustermap_path,
                    label_size=8,
                    show_xlabels=False)

    if label_clustermaps:
        full_clustermap_path = plot_dir / 'heatmaps' / (
            grouping_var + '_clustermap_label.pdf')
        plot_clustermap(featZ,
                        metadata,
                        group_by=grouping_var,
                        row_colours=None,
                        method=METHOD,
                        metric=METRIC,
                        figsize=[20, 40],
                        sub_adj={
                            'bottom': 0.18,
                            'left': 0,
                            'top': 1,
                            'right': 0.95
                        },
                        saveto=full_clustermap_path,
                        label_size=(15, 10),
                        show_xlabels=True)

    pvals_heatmap = anova_table.loc[control_clustered_features, 'pvals']
    pvals_heatmap.name = 'P < {}'.format(args.pval_threshold)

    assert all(f in featZ.columns for f in pvals_heatmap.index)

    # Plot heatmap (averaged for each sample)
    if len(metadata[grouping_var].unique()) < 250:
        print("\nPlotting barcode heatmap")
        heatmap_path = plot_dir / 'heatmaps' / (grouping_var + '_heatmap.pdf')
        plot_barcode_heatmap(
            featZ=featZ[control_clustered_features],
            meta=metadata,
            group_by=[grouping_var],
            pvalues_series=pvals_heatmap,
            p_value_threshold=args.pval_threshold,
            selected_feats=None,  # fset if len(fset) > 0 else None
            saveto=heatmap_path,
            figsize=[20, 30],
            sns_colour_palette="Pastel1",
            label_size=10)

    ##### Principal Components Analysis #####

    pca_dir = plot_dir / 'PCA'

    # remove outlier samples from PCA
    if args.remove_outliers:
        outlier_path = pca_dir / 'mahalanobis_outliers.pdf'
        features, inds = remove_outliers_pca(df=features, saveto=outlier_path)
        metadata = metadata.reindex(features.index)  # reindex metadata
        featZ = features.apply(zscore, axis=0)  # re-normalise data

        # Drop features with NaN values after normalising
        n_cols = len(featZ.columns)
        featZ.dropna(axis=1, inplace=True)
        n_dropped = n_cols - len(featZ.columns)
        if n_dropped > 0:
            print("Dropped %d features after normalisation (NaN)" % n_dropped)

    # Colour the control + up to 15 hit strains (only those still present in the metadata)
    strains_present = metadata[grouping_var].unique()
    coloured_strains_pca = [
        s for s in [control] + hit_strains[:15] if s in strains_present
    ]

    _ = plot_pca(featZ,
                 metadata,
                 group_by=grouping_var,
                 control=control,
                 var_subset=coloured_strains_pca,
                 saveDir=pca_dir,
                 PCs_to_keep=10,
                 n_feats2print=10,
                 kde=False,
                 sns_colour_palette="plasma",
                 n_dims=2,
                 label_size=8,
                 sub_adj={
                     'bottom': 0.13,
                     'left': 0.13,
                     'top': 0.95,
                     'right': 0.88
                 },
                 legend_loc=[1.02, 0.6],
                 hypercolor=False)

    # add details of COG category information to metadata
    # (using hard-coded dict of info from Baba et al. 2006 paper)
    metadata['COG_category'] = metadata['COG_category'].map(COG_category_dict)

    # plot pca coloured by Keio COG category
    _ = plot_pca(featZ,
                 metadata,
                 group_by='COG_category',
                 control=None,
                 var_subset=list(metadata['COG_category'].dropna().unique()),
                 saveDir=pca_dir / 'COG',
                 PCs_to_keep=10,
                 n_feats2print=10,
                 kde=False,
                 n_dims=2,
                 hypercolor=False,
                 label_size=8,
                 figsize=[12, 8],
                 sub_adj={
                     'bottom': 0.1,
                     'left': 0.1,
                     'top': 0.95,
                     'right': 0.7
                 },
                 legend_loc=[1.02, 0.6],
                 sns_colour_palette="plasma")

    ##### t-distributed Stochastic Neighbour Embedding #####

    tsne_dir = plot_dir / 'tSNE'
    perplexities = [mean_sample_size]  # NB: should be roughly equal to group size

    # tSNE coloured by strain
    print("\nPerforming tSNE")
    _ = plot_tSNE(featZ,
                  metadata,
                  group_by=grouping_var,
                  var_subset=coloured_strains_pca,
                  saveDir=tsne_dir,
                  perplexities=perplexities,
                  figsize=[8, 8],
                  label_size=8,
                  marker_size=20,
                  sns_colour_palette="plasma")

    # tSNE coloured by Keio COG category
    print("\nPerforming tSNE")
    _ = plot_tSNE(featZ,
                  metadata,
                  group_by='COG_category',
                  var_subset=list(metadata['COG_category'].dropna().unique()),
                  saveDir=tsne_dir / 'COG_category',
                  perplexities=perplexities,
                  figsize=[8, 8],
                  label_size=8,
                  marker_size=20,
                  sns_colour_palette="plasma")

    ##### Uniform Manifold Approximation and Projection #####

    print("\nPerforming UMAP")
    umap_dir = plot_dir / 'UMAP'
    n_neighbours = [mean_sample_size]  # NB: should be roughly equal to group size
    min_dist = 0.1  # Minimum distance parameter
    _ = plot_umap(featZ,
                  metadata,
                  group_by=grouping_var,
                  var_subset=coloured_strains_pca,
                  saveDir=umap_dir,
                  n_neighbours=n_neighbours,
                  min_dist=min_dist,
                  figsize=[8, 8],
                  label_size=8,
                  marker_size=20,
                  sns_colour_palette="plasma")
Пример #3
0
def _load_ttest_pvals(results_path):
    """ Read a t-test results CSV and return only its p-value columns.

        The first CSV column is used as the index. Columns whose name contains 'pvals_' are 
        kept and renamed to the text that follows that prefix.
    """
    results = pd.read_csv(results_path, index_col=0, header=0)
    pval_cols = [c for c in results if "pvals_" in c]
    return results[pval_cols].rename(columns=lambda c: c.split('pvals_')[-1])


def analyse_acute_rescue(features, 
                         metadata,
                         save_dir,
                         control_strain, 
                         control_antioxidant, 
                         control_window,
                         fdr_method='fdr_by',
                         pval_threshold=0.05,
                         remove_outliers=False):
    """ Plot a single feature (module-level FEATURE) for each strain vs the control strain, by 
        window and by antioxidant, annotated with previously saved t-test p-values, then plot 
        clustermaps, a barcode heatmap, PCA, tSNE and UMAP of the Tierpsy-16 feature set.
        
        Figures are saved under `save_dir / "Plots" / fdr_method`; p-values are read from CSV
        files under `save_dir / "Stats" / fdr_method`:
            - pairwise_ttests/window/{strain}_window_results.csv
            - pairwise_ttests/antioxidant/{strain}_antioxidant_results.csv
            - t-test_{strain}_antioxidant_results.csv
        These files must already exist; this function does not compute any statistics.
        
        Besides its arguments, the function reads the module-level names FEATURE, 
        scale_outliers_box, WINDOW_FRAME_DICT and ALL_WINDOWS.
        
        Inputs
        ------
        features, metadata : pd.DataFrame
            Matching features summaries and metadata. Metadata must contain the columns 
            'gene_name', 'antioxidant', 'window' and 'date_yyyymmdd'
        save_dir : str or Path
            Root directory for the "Stats" and "Plots" sub-directories
        control_strain, control_antioxidant, control_window
            Control values of 'gene_name', 'antioxidant' and 'window'; each is placed first in 
            the corresponding plotting order
        fdr_method : str
            Name of the sub-directory (of "Stats" and "Plots") to read from / write to
        pval_threshold : float
            Not used by this function (kept for interface compatibility)
        remove_outliers : bool
            If True, remove_outliers_pca is applied to the features before PCA/tSNE/UMAP
            
        Returns
        -------
        None
    """
 
    stats_dir = Path(save_dir) / "Stats" / fdr_method
    plot_dir = Path(save_dir) / "Plots" / fdr_method

    # plotting orders, always with the control value first
    strain_list = [control_strain] + [s for s in metadata['gene_name'].unique() 
                                      if s != control_strain]
    antiox_list = [control_antioxidant] + [a for a in metadata['antioxidant'].unique() 
                                           if a != control_antioxidant]
    window_list = [control_window] + [w for w in metadata['window'].unique() 
                                      if w != control_window]

    # categorical variables to investigate: 'gene_name', 'antioxidant' and 'window'
    print("\nInvestigating difference in fraction of worms paused between hit strain and control " +
          "(for each window), in the presence/absence of antioxidants:\n")    

    # print mean sample size
    sample_size = df_summary_stats(metadata, columns=['gene_name', 'antioxidant', 'window'])
    print("Mean sample size of strain/antioxidant for each window: %d" %\
          (int(sample_size['n_samples'].mean())))
            
    # one colour per imaging date (used for the strip-plot points in the window boxplots)
    dates = metadata['date_yyyymmdd'].unique()
    date_lut = dict(zip(list(dates), sns.color_palette('Set1', n_colors=len(dates))))
    
    # transform-independent figure width: one unit per window once there are 20 or more
    window_fig_width = len(window_list) if len(window_list) >= 20 else 12
        
    for strain in strain_list[1:]: # skip control_strain at first index position
        # Only the pair being compared is used as hue levels. The per-date strip-plot palette 
        # below defines colours for these two strains only, and seaborn requires a dict 
        # palette to cover every hue level, so passing all strains would fail as soon as the 
        # metadata holds more than two strains.
        pair_order = [control_strain, strain]
        
        is_pair = (metadata['gene_name'] == strain) | (metadata['gene_name'] == control_strain)
        plot_meta = metadata[is_pair]
        plot_feat = features.reindex(plot_meta.index)
        plot_df = plot_meta.join(plot_feat[[FEATURE]])
        
        # Is there a difference between strain vs control at any window? (pooled antioxidant data)
        print("Plotting windows for %s vs control" % strain)
        plt.close('all')
        fig, ax = plt.subplots(figsize=(window_fig_width, 8))
        ax = sns.boxplot(x='window', y=FEATURE, hue='gene_name', hue_order=pair_order, 
                         order=window_list, data=plot_df, palette='Set3', dodge=True, ax=ax)
        for date, date_colour in date_lut.items():
            date_df = plot_df[plot_df['date_yyyymmdd'] == date]   
            ax = sns.stripplot(x='window', y=FEATURE, hue='gene_name', order=window_list,
                               hue_order=pair_order, data=date_df, 
                               palette={control_strain: date_colour, strain: date_colour}, 
                               alpha=0.7, size=4, dodge=True, ax=ax)
        # keep only the first legend entries (one per strain); the strip plots add duplicates
        n_labs = len(plot_df['gene_name'].unique())
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles[:n_labs], labels[:n_labs], fontsize=15, frameon=False, loc='upper right')
                
        # limit y-axis to (largest window median + 3 * largest window IQR) so that extreme 
        # outliers do not compress the boxes
        if scale_outliers_box:
            grouped_window = plot_df.groupby('window')
            y_bar = grouped_window[FEATURE].median() # median is less skewed by outliers
            IQR = grouped_window[FEATURE].quantile(0.75) - grouped_window[FEATURE].quantile(0.25)
            plt.ylim(-0.02, max(y_bar) + 3 * max(IQR))
            
        # load t-test results (once per strain) + annotate p-values on plot
        window_pvals = _load_ttest_pvals(stats_dir / 'pairwise_ttests' / 'window' /
                                         '{}_window_results.csv'.format(strain))
        # x in data coordinates (category position), y in axes-fraction coordinates
        trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
        pval_rotation = 0 if len(window_list) <= 20 else 90
        for ii, window in enumerate(window_list):
            p = window_pvals.loc[FEATURE, str(window)]
            text = ax.get_xticklabels()[ii]
            assert text.get_text() == str(window)
            p_text = 'P<0.001' if p < 0.001 else 'P=%.3f' % p
            plt.plot([ii-.3, ii-.3, ii+.3, ii+.3], 
                     [0.98, 0.99, 0.99, 0.98], 
                     lw=1.5, c='k', transform=trans)
            ax.text(ii, 1.01, p_text, fontsize=9, ha='center', va='bottom', transform=trans,
                    rotation=pval_rotation)
            
        # relabel windows by their start time in minutes: one tick per window, so that the 
        # number of tick locations matches the number of labels
        ax.set_xticks(range(len(window_list)))
        xlabels = [str(int(WINDOW_FRAME_DICT[w][0]/60)) for w in window_list]
        ax.set_xticklabels(xlabels)
        x_text = 'Time (minutes)' if ALL_WINDOWS else 'Time of bluelight 10-second burst (minutes)'
        ax.set_xlabel(x_text, fontsize=15, labelpad=10)
        ax.set_ylabel(FEATURE.replace('_',' '), fontsize=15, labelpad=10)
        
        fig_savepath = plot_dir / 'window_boxplots' / strain / (FEATURE + '.png')
        fig_savepath.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(fig_savepath)
    
    
        # Is there a difference between strain vs control for any antioxidant? (pooled window data)
        plt.close('all')
        fig, ax = plt.subplots(figsize=(10,8))
        ax = sns.boxplot(x='antioxidant', y=FEATURE, hue='gene_name', hue_order=pair_order, 
                         data=plot_df, palette='Set3', dodge=True, order=antiox_list, ax=ax)
        ax = sns.swarmplot(x='antioxidant', y=FEATURE, hue='gene_name', hue_order=pair_order, 
                           data=plot_df, color='k', alpha=0.7, size=4, dodge=True, 
                           order=antiox_list, ax=ax)
        n_labs = len(plot_df['gene_name'].unique())
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles[:n_labs], labels[:n_labs], fontsize=15, frameon=False, loc='upper right')
        ax.set_xlabel('antioxidant', fontsize=15, labelpad=10)
        ax.set_ylabel(FEATURE.replace('_',' '), fontsize=15, labelpad=10)
        
        # limit y-axis to 2.5 * largest IQR either side of the smallest/largest group median
        if scale_outliers_box:
            grouped_antiox = plot_df.groupby('antioxidant')
            y_bar = grouped_antiox[FEATURE].median() # median is less skewed by outliers
            IQR = grouped_antiox[FEATURE].quantile(0.75) - grouped_antiox[FEATURE].quantile(0.25)
            plt.ylim(min(y_bar) - 2.5 * max(IQR), max(y_bar) + 2.5 * max(IQR))
            
        # load t-test results (once per strain) + annotate p-values
        antiox_pvals = _load_ttest_pvals(stats_dir / 'pairwise_ttests' / 'antioxidant' /
                                         '{}_antioxidant_results.csv'.format(strain))
        trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
        for ii, antiox in enumerate(antiox_list):
            p = antiox_pvals.loc[FEATURE, antiox]
            text = ax.get_xticklabels()[ii]
            assert text.get_text() == antiox
            p_text = 'P < 0.001' if p < 0.001 else 'P = %.3f' % p
            plt.plot([ii-.2, ii-.2, ii+.2, ii+.2], 
                     [0.8, 0.81, 0.81, 0.8], 
                     lw=1.5, c='k', transform=trans)
            ax.text(ii, 0.82, p_text, fontsize=9, ha='center', va='bottom', transform=trans)
                
        fig_savepath = plot_dir / 'antioxidant_boxplots' / strain / (FEATURE + '.png')
        fig_savepath.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(fig_savepath)
        
    # Plot for each strain separately to see whether antioxidants had an effect at all
    for strain in strain_list:
        # subset the data for this strain directly from the inputs (the frame built in the 
        # loop above only ever holds the control and the last non-control strain)
        strain_meta = metadata[metadata['gene_name'] == strain]
        strain_feat = features.reindex(strain_meta.index)
        strain_df = strain_meta.join(strain_feat[[FEATURE]])
            
        plt.close('all')
        fig, ax = plt.subplots(figsize=(10,8))
        ax = sns.boxplot(x='antioxidant', y=FEATURE, order=antiox_list, 
                         dodge=True, data=strain_df, ax=ax)
        ax = sns.swarmplot(x='antioxidant', y=FEATURE, order=antiox_list, 
                           dodge=True, data=strain_df,
                           alpha=0.7, size=4, color='k', ax=ax)        
        n_labs = len(strain_df['antioxidant'].unique())
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles[:n_labs], labels[:n_labs], fontsize=15, frameon=False, loc='upper right')
        ax.set_xlabel('antioxidant', fontsize=15, labelpad=10)
        ax.set_ylabel(FEATURE.replace('_',' '), fontsize=15, labelpad=10)
        
        # limit y-axis to 1 * largest IQR below the smallest group median and 2.5 * largest 
        # IQR above the largest group median (computed on this strain's data)
        if scale_outliers_box:
            grouped_antiox = strain_df.groupby('antioxidant')
            y_bar = grouped_antiox[FEATURE].median() # median is less skewed by outliers
            IQR = grouped_antiox[FEATURE].quantile(0.75) - grouped_antiox[FEATURE].quantile(0.25)
            plt.ylim(min(y_bar) - 1 * max(IQR), max(y_bar) + 2.5 * max(IQR))
            
        # annotate p-values for every antioxidant except the control (first in antiox_list);
        # results are only loaded if there is at least one non-control antioxidant
        if len(antiox_list) > 1:
            strain_pvals = _load_ttest_pvals(stats_dir / 
                                             't-test_{}_antioxidant_results.csv'.format(strain))
            trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
            for ii, antiox in enumerate(antiox_list[1:], start=1):
                p = strain_pvals.loc[FEATURE, antiox]
                text = ax.get_xticklabels()[ii]
                assert text.get_text() == antiox
                p_text = 'P < 0.001' if p < 0.001 else 'P = %.3f' % p
                ax.text(ii, 1.01, p_text, fontsize=9, ha='center', va='bottom', transform=trans)
                
        plt.title(strain, fontsize=18, pad=30)
        # NOTE(review): for non-control strains this is the same path as the strain-vs-control 
        # antioxidant boxplot saved in the loop above, so that figure is overwritten here.
        # Confirm whether the two plots should be saved under different file names.
        fig_savepath = plot_dir / 'antioxidant_boxplots' / strain / (FEATURE + '.png')
        fig_savepath.parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(fig_savepath)
        
        
    # Hierarchical Clustering Analysis
    #   - Clustermap of features by strain, to see if data cluster into groups
    #   - Control data is clustered first, feature order is stored and ordering applied to 
    #     full data for comparison
    
    # subset for Tierpsy top16 features only
    features = select_feat_set(features, tierpsy_set_name='tierpsy_16', append_bluelight=False)
    
    # Ensure no NaNs or features with zero standard deviation before normalisation
    assert not features.isna().sum(axis=0).any()
    assert not (features.std(axis=0) == 0).any()
       
    # Extract data for control
    control_feat_df = features[metadata['gene_name'] == control_strain]
    control_meta_df = metadata.reindex(control_feat_df.index)
    
    control_feat_df, control_meta_df = clean_summary_results(features=control_feat_df,
                                                             metadata=control_meta_df,
                                                             imputeNaN=False)
    
    # z-normalise each feature (column) of the control data
    controlZ_feat_df = control_feat_df.apply(zscore, axis=0)

    # plot clustermap for control        
    control_clustermap_path = plot_dir / 'heatmaps' / '{}_clustermap.pdf'.format(control_strain)
    cg = plot_clustermap(featZ=controlZ_feat_df,
                         meta=control_meta_df,
                         row_colours=True,
                         group_by=['gene_name','antioxidant'],
                         col_linkage=None,
                         method='complete',#[linkage, complete, average, weighted, centroid]
                         figsize=(20,10),
                         show_xlabels=True,
                         label_size=15,
                         sub_adj={'bottom':0.6,'left':0,'top':1,'right':0.85},
                         saveto=control_clustermap_path,
                         bluelight_col_colours=False)

    # extract clustered feature order
    clustered_features = np.array(controlZ_feat_df.columns)[cg.dendrogram_col.reordered_ind]
     
    # z-normalise each feature (column) of the full data
    featZ_df = features.apply(zscore, axis=0)
    
    # Clustermap of full data - antioxidants  
    full_clustermap_path = plot_dir / 'heatmaps' / '{}_clustermap.pdf'.format('gene_antioxidant')
    _ = plot_clustermap(featZ=featZ_df,
                        meta=metadata, 
                        group_by=['gene_name','antioxidant'],
                        col_linkage=None,
                        method='complete',
                        figsize=(20,10),
                        show_xlabels=True,
                        label_size=15,
                        sub_adj={'bottom':0.6,'left':0,'top':1,'right':0.85},
                        saveto=full_clustermap_path,
                        bluelight_col_colours=False)

    # Heatmap of strain/antioxidant treatment, ordered by control clustered feature order
    heatmap_date_path = plot_dir / 'heatmaps' / 'gene_antioxidant_heatmap.pdf'
    plot_barcode_heatmap(featZ=featZ_df[clustered_features], 
                         meta=metadata, 
                         group_by=['gene_name','antioxidant'], 
                         pvalues_series=None,
                         saveto=heatmap_date_path,
                         figsize=(20,6),
                         sns_colour_palette="Pastel1")    
      
    # Clustermap of full data - windows  
    full_clustermap_path = plot_dir / 'heatmaps' / '{}_clustermap.pdf'.format('gene_window')
    _ = plot_clustermap(featZ=featZ_df,
                        meta=metadata, 
                        group_by=['gene_name','window'],
                        col_linkage=None,
                        method='complete',
                        figsize=(20,10),
                        show_xlabels=True,
                        label_size=15,
                        sub_adj={'bottom':0.6,'left':0,'top':1,'right':0.85},
                        saveto=full_clustermap_path,
                        bluelight_col_colours=False)
                  
    # Principal Components Analysis (PCA)

    if remove_outliers:
        # drop outlying samples, then re-align metadata and re-normalise the remaining data
        outlier_path = plot_dir / 'mahalanobis_outliers.pdf'
        features, inds = remove_outliers_pca(df=features, 
                                            features_to_analyse=None, 
                                            saveto=outlier_path)
        metadata = metadata.reindex(features.index)
        featZ_df = features.apply(zscore, axis=0)
  
    # project data + plot PCA
    pca_dir = plot_dir / 'PCA'
    _ = plot_pca(featZ=featZ_df, 
                 meta=metadata, 
                 group_by='gene_name', 
                 n_dims=2,
                 control=control_strain,
                 var_subset=None, 
                 saveDir=pca_dir,
                 PCs_to_keep=10,
                 n_feats2print=10,
                 sns_colour_palette="Set1",
                 figsize=(12,8),
                 sub_adj={'bottom':0.1,'left':0.1,'top':0.95,'right':0.7},
                 legend_loc=[1.02,0.6],
                 hypercolor=False) 
         
    # t-distributed Stochastic Neighbour Embedding (tSNE)

    tsne_dir = plot_dir / 'tSNE'
    perplexities = [5,15,30] # NB: perplexity parameter should be roughly equal to group size
    
    # NOTE(review): other plot_tSNE/plot_umap call sites in this file pass `marker_size` 
    # rather than `size` - confirm which keyword the imported helpers accept.
    _ = plot_tSNE(featZ=featZ_df,
                  meta=metadata,
                  group_by='gene_name',
                  var_subset=None,
                  saveDir=tsne_dir,
                  perplexities=perplexities,
                  figsize=(8,8),
                  label_size=15,
                  size=20,
                  sns_colour_palette="Set1")
   
    # Uniform Manifold Projection (UMAP)

    umap_dir = plot_dir / 'UMAP'
    n_neighbours = [5,15,30] # NB: n_neighbours parameter should be roughly equal to group size
    min_dist = 0.1 # Minimum distance parameter
    
    _ = plot_umap(featZ=featZ_df,
                  meta=metadata,
                  group_by='gene_name',
                  var_subset=None,
                  saveDir=umap_dir,
                  n_neighbours=n_neighbours,
                  min_dist=min_dist,
                  figsize=(8,8),
                  label_size=15,
                  size=20,
                  sns_colour_palette="Set1")
    
    # PCA coloured by two variables (strain and antioxidant)
    _ = plot_pca_2var(featZ=featZ_df, 
                      meta=metadata, 
                      var1='gene_name',
                      var2='antioxidant',
                      saveDir=pca_dir,
                      PCs_to_keep=10,
                      n_feats2print=10,
                      sns_colour_palette="Set1",
                      label_size=15,
                      figsize=[9,8],
                      sub_adj={'bottom':0,'left':0,'top':1,'right':1})
Пример #4
0
def compare_keio_rescue(features, metadata, args):
    """ Compare Keio single-gene deletion mutants with wild-type BW25113 control under different 
        antioxidant treatment conditions, and look to see if the addition of antioxidants can rescue
        the worm phenotype on these mutant strains, effectively bringing the worms back to wild-type
        behaviour.
        
        - Plot boxplots for each strain, comparing each pairwise antioxidant condition vs the control
          (for all features)
        - 
        
        Inputs
        ------
        features, metadata : pd.DataFrame
            Matching features summaries and metadata
        
        args : Object 
            Python object with the following attributes:
            - drop_size_features : bool
            - norm_features_only : bool
            - percentile_to_use : str
            - remove_outliers : bool
            - omit_strains : list
            - control_dict : dict
            - n_top_feats : int
            - tierpsy_top_feats_dir (if n_top_feats) : str
            - test : str
            - pval_threshold : float
            - fdr_method : str
            - n_sig_features : int
    """

    assert set(features.index) == set(metadata.index)

    strain_list = list(metadata[STRAIN_COLNAME].unique())
    antioxidant_list = list(metadata[TREATMENT_COLNAME].unique())
    assert CONTROL_STRAIN in strain_list and CONTROL_TREATMENT in antioxidant_list
    strain_list = [CONTROL_STRAIN] + [s for s in sorted(strain_list) if s != CONTROL_STRAIN]
    antioxidant_list = [CONTROL_TREATMENT] + [a for a in sorted(antioxidant_list) if a != CONTROL_TREATMENT]
    n_strains = len(strain_list)
    n_antiox = len(antioxidant_list)
    
    # assert there will be no errors due to case-sensitivity
    assert len(metadata[STRAIN_COLNAME].unique()) == len(metadata[STRAIN_COLNAME].str.upper().unique())
    assert len(metadata[TREATMENT_COLNAME].unique()) == len(metadata[TREATMENT_COLNAME].str.upper().unique())
    
    assert not features.isna().any().any()
    n_feats = features.shape[1]

    # construct save paths
    save_dir = get_save_dir(args)
    stats_dir =  save_dir / "Stats" / args.fdr_method
    plot_dir = save_dir / "Plots" / args.fdr_method
    plot_dir.mkdir(exist_ok=True, parents=True)

    # Print mean sample size
    sample_size = df_summary_stats(metadata, columns=[STRAIN_COLNAME, TREATMENT_COLNAME])
    ss_savepath = save_dir / 'sample_sizes.csv'
    sample_size.to_csv(ss_savepath, index=False)  

    # add combined treatment column (for heatmap/PCA)
    metadata['treatment_combination'] = [str(s) + '_' + str(a) for s, a in 
                                         zip(metadata[STRAIN_COLNAME], metadata[TREATMENT_COLNAME])]

    # Subset for control data for strain and for treatment
    control_strain_meta = metadata[metadata[STRAIN_COLNAME] == CONTROL_STRAIN]
    control_strain_feat = features.reindex(control_strain_meta.index)
    
    #control_antiox_meta = metadata[metadata[TREATMENT_COLNAME] == CONTROL_TREATMENT]
    #control_antiox_feat = features.reindex(control_antiox_meta.index)
                
# =============================================================================
#     ##### Control variation ##### 
#     # Clean data after subset - to remove features with zero std
#     control_strain_feat, control_strain_meta = clean_summary_results(control_strain_feat, 
#                                                                      control_strain_meta, 
#                                                                      max_value_cap=False,
#                                                                      imputeNaN=False)  
#     
#     control_antiox_feat, control_antiox_meta = clean_summary_results(control_antiox_feat, 
#                                                                      control_antiox_meta, 
#                                                                      max_value_cap=False,
#                                                                      imputeNaN=False)                   
#     if args.analyse_control:
#         control_variation(control_strain_feat, control_strain_meta, args,
#                           variables=[TREATMENT_COLNAME], n_sig_features=10)
#         control_variation(control_antiox_feat, control_antiox_meta, args,
#                           variables=[STRAIN_COLNAME], n_sig_features=10)                            
# =============================================================================
       
    print("\nComparing %d %ss with %d %s treatments for %d features" %\
          (n_strains, STRAIN_COLNAME, n_antiox, TREATMENT_COLNAME, n_feats))

    t_test = 't-test' if args.test == 'ANOVA' else 'Mann-Whitney' # aka. Wilcoxon rank-sum

    ##### FOR EACH STRAIN #####
    
    for strain in tqdm(strain_list[1:]):
        print("\nPlotting results for %s:" % strain)
        strain_meta = metadata[metadata[STRAIN_COLNAME]==strain]
        strain_feat = features.reindex(strain_meta.index)
        
        # Load ANOVA results for strain
        if not args.use_corrected_pvals:
            anova_strain_path = stats_dir / '{}_uncorrected.csv'.format((args.test + '_' + strain))
        else:
            anova_strain_path = stats_dir / '{}_results.csv'.format((args.test + '_' + strain))
        anova_strain_table = pd.read_csv(anova_strain_path, index_col=0)            
        strain_pvals = anova_strain_table.sort_values(by='pvals', ascending=True)['pvals'] # rank features by p-value
        #strain_fset = strain_pvals[strain_pvals < args.pval_threshold].index.to_list()  
        
        # load antioxidant t-test results
        ttest_strain_path = stats_dir / 'pairwise_ttests' / '{}_results.csv'.format(strain + 
                            "_vs_" + CONTROL_STRAIN)
        ttest_strain_table = pd.read_csv(ttest_strain_path, index_col=0)
        strain_pvals_t = ttest_strain_table[[c for c in ttest_strain_table if "pvals_" in c]] 
        strain_pvals_t.columns = [c.split('pvals_')[-1] for c in strain_pvals_t.columns]       
        #strain_fset_t = strain_pvals_t[(strain_pvals_t < args.pval_threshold).sum(axis=1) > 0].index.to_list()
           
        # Plot ranked n significant features by t-test for each antioxidant treatment
        ranked_antiox_nsig = (strain_pvals_t < args.pval_threshold).sum(axis=0).sort_values(ascending=False)
        ranked_antiox_nsig_path = plot_dir / ('{}_ranked_number_sigfeats_'.format(strain) + ('uncorrected' 
                                              if args.fdr_method is None else args.fdr_method) + '.png')
        plt.close('all')
        fig, ax = plt.subplots() #figsize=(20,6)
        ax.plot(ranked_antiox_nsig)
        ax.set_xticklabels(ranked_antiox_nsig.index.to_list(), rotation=90, fontsize=5)
        plt.xlabel("Antioxidant (ranked)", fontsize=12, labelpad=10)
        plt.ylabel("Number of significant features", fontsize=12, labelpad=10)
        plt.tight_layout()
        plt.savefig(ranked_antiox_nsig_path, dpi=600)
        
        # Plot ranked lowest pval by t-test for each antioxidant treatment
        ranked_antiox_pval = strain_pvals_t.min(axis=0).sort_values(ascending=True)
        lowest_antiox_pval_path = plot_dir / ('{}_ranked_lowest_pval_'.format(strain) + ('uncorrected' 
                                              if args.fdr_method is None else args.fdr_method) + '.png')
        plt.close('all')
        fig, ax = plt.subplots()
        ax.plot(ranked_antiox_pval)
        plt.axhline(y=args.pval_threshold, c='dimgray', ls='--')
        ax.set_xticklabels(ranked_antiox_nsig.index.to_list(), rotation=90, fontsize=5)
        plt.xlabel("Antioxidant (ranked)", fontsize=12, labelpad=10)
        plt.ylabel("Lowest p-value by t-test", fontsize=12, labelpad=10)
        plt.tight_layout()
        plt.savefig(lowest_antiox_pval_path, dpi=600)
        plt.close()

# =============================================================================
#         print("\nMaking errorbar plots")
#         errorbar_sigfeats(strain_feat, strain_meta, 
#                           group_by=TREATMENT_COLNAME, 
#                           fset=strain_pvals.index, 
#                           control=CONTROL_TREATMENT, 
#                           rank_by='mean',
#                           max_feats2plt=args.n_sig_features, 
#                           figsize=[20,10], 
#                           fontsize=15,
#                           ms=20,
#                           elinewidth=7,
#                           fmt='.',
#                           tight_layout=[0.01,0.01,0.99,0.99],
#                           saveDir=plot_dir / 'errorbar' / strain)
# =============================================================================
                
        if strain != CONTROL_STRAIN:
            # stick together length-wise
            plot_meta = pd.concat([control_strain_meta, strain_meta], ignore_index=True)
            plot_feat = pd.concat([control_strain_feat, strain_feat], ignore_index=True)
            # stick together width-wise
            plot_df = plot_meta.join(plot_feat)
    
            # Plot boxplots for top 10 features comparing strain vs wild-type for each antioxidant treatment            
            for f, feature in enumerate(tqdm(strain_pvals.index)):
                            
                plt.close('all')
                fig, ax = plt.subplots(figsize=(10,8))
                ax = sns.boxplot(x=TREATMENT_COLNAME, y=feature, hue=STRAIN_COLNAME, data=plot_df,
                                 palette='Set3', dodge=True, order=antioxidant_list)
                ax = sns.swarmplot(x=TREATMENT_COLNAME, y=feature, hue=STRAIN_COLNAME, data=plot_df,
                                 color='k', alpha=0.7, size=4, dodge=True, order=antioxidant_list)
                n_labs = len(plot_df[STRAIN_COLNAME].unique())
                handles, labels = ax.get_legend_handles_labels()
                ax.legend(handles[:n_labs], labels[:n_labs], fontsize=15, frameon=False, loc='upper right')
                ax.set_xlabel(TREATMENT_COLNAME, fontsize=15, labelpad=10)
                ax.set_ylabel(feature.replace('_',' '), fontsize=15, labelpad=10)
                
                # scale plot to omit outliers (>2.5*IQR from mean)
                if scale_outliers_box:
                    grouped_strain = plot_df.groupby('antioxidant')
                    y_bar = grouped_strain[feature].median() # median is less skewed by outliers
                    # Computing IQR
                    Q1 = grouped_strain[feature].quantile(0.25)
                    Q3 = grouped_strain[feature].quantile(0.75)
                    IQR = Q3 - Q1
                    plt.ylim(min(y_bar) - 2.5 * max(IQR), max(y_bar) + 2.5 * max(IQR))
                    
                # annotate p-values
                for ii, antiox in enumerate(antioxidant_list):
                    try:
                        p = strain_pvals_t.loc[feature, antiox]
                        text = ax.get_xticklabels()[ii]
                        assert text.get_text() == antiox
                        p_text = 'P < 0.001' if p < 0.001 else 'P = %.3f' % p
                        #y = (y_bar[antiox] + 2 * IQR[antiox]) if scale_outliers_box else plot_df[feature].max()
                        #h = (max(IQR) / 10) if scale_outliers_box else (y - plot_df[feature].min()) / 50
                        trans = transforms.blended_transform_factory(ax.transData, ax.transAxes)
                        plt.plot([ii-.2, ii-.2, ii+.2, ii+.2], 
                                 [0.8, 0.81, 0.81, 0.8], #[y+h, y+2*h, y+2*h, y+h], 
                                 lw=1.5, c='k', transform=trans)
                        ax.text(ii, 0.82, p_text, fontsize=9, ha='center', va='bottom', transform=trans)
                    except Exception as e:
                        print(e)
                        
                fig_savepath = plot_dir / 'antioxidant_boxplots' / strain / ('{}_'.format(f+1) + 
                                                                             feature + '.png')
                fig_savepath.parent.mkdir(parents=True, exist_ok=True)
                plt.savefig(fig_savepath)
            
    ##### FOR EACH ANTIOXIDANT #####
    
    for antiox in antioxidant_list:
        print("\nPlotting results for %s:" % antiox)
        #antiox_meta = metadata[metadata[TREATMENT_COLNAME]==antiox]
        #antiox_feat = features.reindex(antiox_meta.index)

        # Load ANOVA results for antioxidant
        # if not args.use_corrected_pvals:
        #     anova_antiox_path = stats_dir / '{}_uncorrected.csv'.format((args.test + '_' + antiox))
        # else:
        #     anova_antiox_path = stats_dir / '{}_results.csv'.format((args.test + '_' + antiox))
        #anova_antiox_table = pd.read_csv(anova_antiox_path, index_col=0)            
        #antiox_pvals = anova_antiox_table.sort_values(by='pvals', ascending=True)['pvals'] # rank features by p-value
        #antiox_fset = antiox_pvals[antiox_pvals < args.pval_threshold].index.to_list()  
        
        # Load t-test results
        if not args.use_corrected_pvals:
            ttest_antiox_path = stats_dir / '{}_uncorrected.csv'.format((t_test + '_' + antiox))
        else:
            ttest_antiox_path = stats_dir / '{}_results.csv'.format((t_test + '_' + antiox))
        ttest_antiox_table = pd.read_csv(ttest_antiox_path, index_col=0)
        antiox_pvals_t = ttest_antiox_table[[c for c in ttest_antiox_table if "pvals_" in c]] 
        antiox_pvals_t.columns = [c.split('pvals_')[-1] for c in antiox_pvals_t.columns]       
        #antiox_fset_t = antiox_pvals_t[(antiox_pvals_t < args.pval_threshold).sum(axis=1) > 0].index.to_list()
     
        # Plot the number of significant features by t-test for each strain (ranked, descending)
        ranked_strain_nsig = (antiox_pvals_t < args.pval_threshold).sum(axis=0).sort_values(ascending=False)
        ranked_strain_nsig_path = plot_dir / ('{}_ranked_number_sigfeats_'.format(antiox) + ('uncorrected' 
                                              if args.fdr_method is None else args.fdr_method) + '.png')
        plt.close('all')
        fig, ax = plt.subplots() #figsize=(20,6)
        ax.plot(ranked_strain_nsig)
        ax.set_xticklabels(ranked_strain_nsig.index.to_list(), rotation=90, fontsize=5)
        plt.xlabel("Strain (ranked)", fontsize=12, labelpad=10)
        plt.ylabel("Number of significant features", fontsize=12, labelpad=10)
        plt.tight_layout()
        plt.savefig(ranked_strain_nsig_path, dpi=600)
        
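        # Plot the lowest p-value by t-test (across features) for each strain, ranked ascending
        # NOTE(review): the x tick labels below are taken from ranked_strain_nsig, not 
        # ranked_strain_pval, so they may not match the plotted order - confirm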
        ranked_strain_pval = antiox_pvals_t.min(axis=0).sort_values(ascending=True)
        lowest_strain_pval_path = plot_dir / ('{}_ranked_lowest_pval_'.format(antiox) + ('uncorrected' 
                                              if args.fdr_method is None else args.fdr_method) + '.png')
        plt.close('all')
        fig, ax = plt.subplots()
        ax.plot(ranked_strain_pval)
        plt.axhline(y=args.pval_threshold, c='dimgray', ls='--')
        ax.set_xticklabels(ranked_strain_nsig.index.to_list(), rotation=90, fontsize=5)
        plt.xlabel("Strain (ranked)", fontsize=12, labelpad=10)
        plt.ylabel("Lowest p-value by t-test", fontsize=12, labelpad=10)
        plt.tight_layout()
        plt.savefig(lowest_strain_pval_path, dpi=600)
        plt.close()
        
        # Plot boxplots for top 10 features comparing antioxidant vs None for each strain
        # TODO

# =============================================================================
#         print("Making boxplots")
#         boxplots_grouped(feat_meta_df=metadata.join(features), 
#                           group_by=grouping_var,
#                           control_group=control,
#                           test_pvalues_df=(pvals_t.T if len(fset) > 0 else None), 
#                           feature_set=fset,
#                           max_feats2plt=args.n_sig_features, 
#                           max_groups_plot_cap=None,
#                           p_value_threshold=args.pval_threshold,
#                           drop_insignificant=False,
#                           sns_colour_palette="tab10",
#                           figsize=[6,130], 
#                           saveDir=plot_dir / ('boxplots' + '_' + (
#                                   'uncorrected' if args.fdr_method is None else args.fdr_method) + 
#                                   '.png'))
# =============================================================================

        # # If no sigfeats, subset for top strains ranked by lowest p-value by t-test for any feature    
        # if len(hit_strains_nsig) == 0:
        #     print("\Saving lowest %d strains ranked by p-value for any feature" % N_LOWEST_PVAL)
        #     write_list_to_file(hit_strains_pval, stats_dir / 'Top100_lowest_pval.txt')
        #     hit_strains = hit_strains_pval
        # elif len(hit_strains_nsig) > 0:
        #     hit_strains = hit_strains_nsig

        # # Individual boxplots of significant features by pairwise t-test (each group vs control)
        # boxplots_sigfeats(features,
        #                   y_class=metadata[grouping_var],
        #                   control=control,
        #                   pvals=pvals_t, 
        #                   z_class=metadata['date_yyyymmdd'],
        #                   feature_set=None,
        #                   saveDir=plot_dir / 'paired_boxplots',
        #                   p_value_threshold=args.pval_threshold,
        #                   drop_insignificant=True if len(hit_strains) > 0 else False,
        #                   max_sig_feats=args.n_sig_features,
        #                   max_strains=N_LOWEST_PVAL if len(hit_strains_nsig) == 0 else None,
        #                   sns_colour_palette="tab10",
        #                   verbose=False)
        
        # if SUBSET_HIT_STRAINS:
        #     strain_list = [control] + hit_strains[:TOP_N_HITS]
        #     print("Subsetting for Top%d hit strains" % (len(strain_list)-1))
        #     features, metadata = subset_results(features, metadata, column=grouping_var,
        #                                         groups=strain_list, verbose=False)   
        # else:
        #     strain_list = list(metadata[grouping_var].unique())
                   
    ##### Hierarchical Clustering Analysis #####

    # Z-normalise control data
    control_strain_featZ = control_strain_feat.apply(zscore, axis=0)
    
    ### Control clustermap
    
    # control data is clustered and feature order is stored and applied to full data
    print("\nPlotting clustermap for %s control" % CONTROL_STRAIN)
    
    control_clustermap_path = plot_dir / 'heatmaps' / 'control_clustermap.pdf'
    cg = plot_clustermap(control_strain_featZ, control_strain_meta,
                         group_by='treatment_combination',
                         method=METHOD, 
                         metric=METRIC,
                         figsize=[20,6],
                         sub_adj={'bottom':0.05,'left':0,'top':1,'right':0.85},
                         saveto=control_clustermap_path,
                         label_size=15,
                         show_xlabels=False)
    # control clustermap with labels
    if args.n_top_feats <= 256:
        control_clustermap_path = plot_dir / 'heatmaps' / 'control_clustermap_label.pdf'
        cg = plot_clustermap(control_strain_featZ, control_strain_meta,
                             group_by='treatment_combination',
                             method=METHOD, 
                             metric=METRIC,
                             figsize=[20,10],
                             sub_adj={'bottom':0.7,'left':0,'top':1,'right':0.85},
                             saveto=control_clustermap_path,
                             label_size=(15,15),
                             show_xlabels=True)

    #col_linkage = cg.dendrogram_col.calculated_linkage
    control_clustered_features = np.array(control_strain_featZ.columns)[cg.dendrogram_col.reordered_ind]

    ### Full clustermap
    # TODO: all strains, for each treatment
    # TODO: all treatments, for each strain
    # all strains/treatments together
    
    # Z-normalise data for all strains
    featZ = features.apply(zscore, axis=0)
                    
    ## Save z-normalised values
    # z_stats = featZ.join(hit_metadata[grouping_var]).groupby(by=grouping_var).mean().T
    # z_stats.columns = ['z-mean_' + v for v in z_stats.columns.to_list()]
    # z_stats.to_csv(z_stats_path, header=True, index=None)
    
    # Clustermap of full data   
    print("Plotting all strains clustermap")    
    full_clustermap_path = plot_dir / 'heatmaps' / 'full_clustermap.pdf'
    fg = plot_clustermap(featZ, metadata, 
                         group_by='treatment_combination',
                         row_colours=None,
                         method=METHOD, 
                         metric=METRIC,
                         figsize=[20,30],
                         sub_adj={'bottom':0.01,'left':0,'top':1,'right':0.95},
                         saveto=full_clustermap_path,
                         label_size=8,
                         show_xlabels=False)
    if args.n_top_feats <= 256:
        full_clustermap_path = plot_dir / 'heatmaps' / 'full_clustermap_label.pdf'
        fg = plot_clustermap(featZ, metadata, 
                             group_by='treatment_combination',
                             row_colours=None,
                             method=METHOD, 
                             metric=METRIC,
                             figsize=[20,40],
                             sub_adj={'bottom':0.18,'left':0,'top':1,'right':0.95},
                             saveto=full_clustermap_path,
                             label_size=(15,10),
                             show_xlabels=True)
    
    # clustered feature order for all strains
    _ = np.array(featZ.columns)[fg.dendrogram_col.reordered_ind]
    
    pvals_heatmap = anova_strain_table.loc[control_clustered_features, 'pvals']
    pvals_heatmap.name = 'P < {}'.format(args.pval_threshold)

    assert all(f in featZ.columns for f in pvals_heatmap.index)
            
    # Plot heatmap (averaged for each sample)
    if len(metadata['treatment_combination'].unique()) < 250:
        print("\nPlotting barcode heatmap")
        heatmap_path = plot_dir / 'heatmaps' / 'full_heatmap.pdf'
        plot_barcode_heatmap(featZ=featZ[control_clustered_features], 
                             meta=metadata, 
                             group_by='treatment_combination', 
                             pvalues_series=pvals_heatmap,
                             p_value_threshold=args.pval_threshold,
                             selected_feats=None, # fset if len(fset) > 0 else None
                             saveto=heatmap_path,
                             figsize=[20,30],
                             sns_colour_palette="Pastel1",
                             label_size=15,
                             sub_adj={'top':0.95,'bottom':0.01,'left':0.15,'right':0.92})        
                    
    ##### Principal Components Analysis #####

    pca_dir = plot_dir / 'PCA'
    
    # remove outlier samples (identified by remove_outliers_pca) from the data before PCA
    if args.remove_outliers:
        outlier_path = pca_dir / 'mahalanobis_outliers.pdf'
        features, inds = remove_outliers_pca(df=features, saveto=outlier_path)
        metadata = metadata.reindex(features.index) # reindex metadata
        featZ = features.apply(zscore, axis=0) # re-normalise data

        # Drop features with NaN values after normalising
        n_cols = len(featZ.columns)
        featZ.dropna(axis=1, inplace=True)
        n_dropped = n_cols - len(featZ.columns)
        if n_dropped > 0:
            print("Dropped %d features after normalisation (NaN)" % n_dropped)

    # plot PCA 
    # Total of 50 treatment combinations, so plot fes/fepD/entA/wild_type only
    treatment_list = sorted(list(metadata['treatment_combination'].unique()))
    treatment_subset = [i for i in treatment_list if i.split('_')[0] in ['fes','fepD','entA','wild']]
    _ = plot_pca(featZ, metadata, 
                 group_by='treatment_combination', 
                 control=CONTROL_STRAIN + '_' + CONTROL_TREATMENT,
                 var_subset=treatment_subset, 
                 saveDir=pca_dir,
                 PCs_to_keep=10,
                 n_feats2print=10,
                 kde=False,
                 sns_colour_palette="plasma",
                 n_dims=2,
                 label_size=8,
                 sub_adj={'bottom':0.13,'left':0.13,'top':0.95,'right':0.88},
                 legend_loc=[1.02,0.6],
                 hypercolor=False)

    # add details of COG category information to metadata 
    # (using hard-coded dict of info from Baba et al. 2006 paper)
    metadata['COG_category'] = metadata['COG_category'].map(COG_category_dict)
    
    # plot pca coloured by Keio COG category    
    _ = plot_pca(featZ, metadata, 
                 group_by='COG_category', 
                 control=None,
                 var_subset=list(metadata['COG_category'].dropna().unique()), 
                 saveDir=pca_dir / 'COG',
                 PCs_to_keep=10,
                 n_feats2print=10,
                 kde=False,
                 n_dims=2,
                 hypercolor=False,
                 label_size=8,
                 figsize=[12,8],
                 sub_adj={'bottom':0.1,'left':0.1,'top':0.95,'right':0.7},
                 legend_loc=[1.02,0.6],
                 sns_colour_palette="plasma")

    ##### t-distributed Stochastic Neighbour Embedding #####   
    mean_sample_size = int(sample_size['n_samples'].mean())

    print("\nPerforming tSNE")
    tsne_dir = plot_dir / 'tSNE'
    perplexities = [mean_sample_size] # NB: should be roughly equal to group size    
    _ = plot_tSNE(featZ, metadata,
                  group_by='treatment_combination',
                  var_subset=treatment_subset,
                  saveDir=tsne_dir,
                  perplexities=perplexities,
                  figsize=[8,8],
                  label_size=7,
                  marker_size=30,
                  sns_colour_palette="plasma")
    
    print("\nPerforming tSNE")
    tsne_dir = plot_dir / 'tSNE'
    perplexities = [mean_sample_size] # NB: should be roughly equal to group size    
    _ = plot_tSNE(featZ, metadata,
                  group_by='COG_category',
                  var_subset=list(metadata['COG_category'].dropna().unique()),
                  saveDir=tsne_dir / 'COG_category',
                  perplexities=perplexities,
                  figsize=[8,8],
                  label_size=7,
                  marker_size=30,
                  sns_colour_palette="plasma")

    ##### Uniform Manifold Approximation and Projection (UMAP) #####
    
    print("\nPerforming UMAP")
    umap_dir = plot_dir / 'UMAP'
    n_neighbours = [mean_sample_size] # NB: should be roughly equal to group size
    min_dist = 0.1 # Minimum distance parameter    
    _ = plot_umap(featZ, metadata,
                  group_by='treatment_combination',
                  var_subset=treatment_subset,
                  saveDir=umap_dir,
                  n_neighbours=n_neighbours,
                  min_dist=min_dist,
                  figsize=[8,8],
                  label_size=7,
                  marker_size=30,
                  sns_colour_palette="plasma")