def plot_variance(main_dir='./', plot_dir='./plots/', log=True,
                  genus_only=True):
    """
    Plot a 100-bin histogram of variances and save it as a PDF.

    The data comes from sort_by_variance(); the `variance` column of its
    result (after reset_index) is what gets binned.

    :param main_dir: forwarded to sort_by_variance().
    :param plot_dir: directory handed to concat_dir_and_filename() to
        build the output path.
    :param log: when True, the y axis (bin counts) is drawn on a log
        scale and the title / y label say so.
    :param genus_only: forwarded to sort_by_variance().
    :return: None.  The figure is written to
        'distribution_of_sample-wise_variances.pdf' under plot_dir.
    """
    df = sort_by_variance(main_dir=main_dir, genus_only=genus_only)

    fig, ax = plt.subplots()
    df.reset_index().variance.plot(ax=ax, kind='hist', bins=100)

    # Only advertise a log scale in the labels when one is really applied;
    # previously the text said "(log scale)" even with log=False.
    if log:
        ax.set_yscale('log')
        title = 'distribution of variances (log scale)'
        y_label = "frequency (log scale)"
    else:
        title = 'distribution of variances'
        y_label = "frequency"

    ax.set_title(title)
    ax.set_xlabel("variance for genera's abundance across all samples")
    ax.set_ylabel(y_label)

    filepath = concat_dir_and_filename(
        plot_dir, 'distribution_of_sample-wise_variances.pdf')
    # Save through the figure handle rather than pyplot's implicit
    # "current figure" so the right figure is always written.
    fig.savefig(filepath)
def plot_pca_results(top_percent=20, genus_only=False, facet_row=True,
                     uniform_axes=True, main_dir='./', plot_dir='./plots/',
                     savefig=True):
    """
    Scatter-plot the first two principal components, coloured by week.

    run_pca() supplies the PCA input, the transformed data and the
    per-component variances; the first two transformed columns are plotted
    against each other and the variances are shown as percentages in the
    axis labels.

    :param top_percent: forwarded to run_pca(); also embedded in the output
        file name.
    :param genus_only: forwarded to run_pca(); selects the 'genus_only' or
        'all_taxa' part of the output file name.
    :param facet_row: when True, facet the grid by 'oxy' (rows) and 'rep'
        (columns) and end the file name with '--faceted.pdf'.
    :param uniform_axes: when True, force both axes to one shared range
        covering both plotted components.
    :param main_dir: forwarded to run_pca() and
        colnames_to_sample_info_array().
    :param plot_dir: directory handed to concat_dir_and_filename() to
        build the output path.
    :param savefig: when True, write the figure to the generated file name.
    :return: the seaborn FacetGrid that was drawn, so callers can tweak or
        save it themselves (especially useful with savefig=False).
    """
    # Get data that is ready for PCA, plus the PCA results themselves.
    pca_input, data_transformed, variances = run_pca(
        main_dir=main_dir, top_percent=top_percent, genus_only=genus_only)

    # Import the sample info needed to map features to sample IDs.
    sample_info = colnames_to_sample_info_array(pca_input, main_dir=main_dir)

    # Axis labels double as the DataFrame column names for the components.
    x_axis_label = 'principal component 1 ({0:.0%})'.format(variances[0])
    y_axis_label = 'principal component 2 ({0:.0%})'.format(variances[1])

    # Put the transformed data side by side with the sample descriptions.
    # Presumably sample_info provides the 'week', 'oxy' and 'rep' columns
    # used below -- verify against colnames_to_sample_info_array().
    components = pd.DataFrame({x_axis_label: data_transformed[:, 0],
                               y_axis_label: data_transformed[:, 1]})
    plot_data = pd.concat([components, sample_info], axis=1)

    # Custom colour palette.  Per the original author's note, conditions
    # were switched at week ten: seven early samples in the original
    # condition and the four latest samples in an alternative condition
    # (14 - 4 + 1 = 11 colours in total).
    color_palette = build_color_palette(num_items=14 - 4 + 1,
                                        weeks_before_switch=7)

    # Update matplotlib params for bigger fonts and ticks.
    mpl.rcParams.update({
        'font.size': 16,
        'axes.titlesize': 17,
        'axes.labelsize': 15,
        'xtick.labelsize': 10,
        'ytick.labelsize': 13,
        'font.weight': 600,
        'axes.labelweight': 600,
        'axes.titleweight': 600})
    # NOTE(review): sns.set() applies seaborn's own rc settings after the
    # update above and may override some of those font sizes -- confirm the
    # rendered fonts are what is intended.
    sns.set(style="ticks")

    # No plt.figure(...) call here: FacetGrid creates its own figure, so a
    # separately opened figure would only be left behind empty.

    plot_args = {}
    if facet_row:
        # NOTE(review): confirm that column faceting by 'rep' is meant to
        # be conditional on facet_row together with the 'oxy' rows.
        plot_args['row'] = 'oxy'
        plot_args['col'] = 'rep'
    if uniform_axes:
        # One shared range for both axes, rounded outward to 2 decimals.
        pc_colnames = [col for col in plot_data.columns
                       if 'principal component' in col]
        max_value = plot_data[pc_colnames].max(axis=0).max()
        min_value = plot_data[pc_colnames].min(axis=0).min()
        axis_max = math.ceil(max_value * 100) / 100.0
        axis_min = math.floor(min_value * 100) / 100.0
        plot_args['xlim'] = (axis_min, axis_max)
        plot_args['ylim'] = (axis_min, axis_max)
    print(plot_args)

    # NOTE(review): newer seaborn releases rename `size` to `height` and
    # `.fig` to `.figure`; kept as-is to match the version this file targets.
    g = sns.FacetGrid(plot_data, hue='week', palette=color_palette,
                      size=3, aspect=1, **plot_args)
    g = g.map(plt.scatter, x_axis_label, y_axis_label,
              edgecolor="w", s=60).add_legend()

    # Build the file name from the options that shaped the plot.
    filename = concat_dir_and_filename(
        plot_dir, 'pca_of_top_{}_percent--'.format(top_percent))
    filename += 'genus_only' if genus_only else 'all_taxa'
    if uniform_axes:
        filename += '_unif_axes_'
    filename += '--faceted.pdf' if facet_row else '.pdf'

    if savefig:
        g.fig.savefig(filename)

    return g