def plot_frequency(n=200, filename="test.png", path="static/"):
    """
    Draw the histogram of the distribution of n tweets by date and save it.

    The tweets returned by ``gather_tweets(n)`` are loaded into a pandas
    DataFrame; the plot maps its 'Date' column to the x axis and its
    'Author' column to the fill colour, with x-axis breaks every week.

    Parameters
    ----------
    n: int
        An integer specifying how many tweets should be analysed.
    filename: str
        Name of the image file to write. Defaults to "test.png", the name
        that used to be hard-coded.
    path: str
        Directory the image is written into. Defaults to "static/".

    Returns
    -------
    None
        The histogram is saved as an image file; nothing is returned.
    """
    # Imports are kept local to the function, as in the original code.
    from plotnine import (ggplot, aes, geom_histogram, scale_x_datetime,
                          labs, theme_minimal, ggsave)
    from Mod_1_API import gather_tweets
    from mizani.breaks import date_breaks
    import pandas

    df = pandas.DataFrame(gather_tweets(n))

    plot1 = (
        ggplot(df, aes(x='Date', fill='Author'))
        + geom_histogram()
        + scale_x_datetime(breaks=date_breaks('1 week'))
        + labs(x="Time in weeks", y="Number of tweets by source")
        + theme_minimal()
    )

    ggsave(plot=plot1, filename=filename, path=path)
def create_scatterplots(dataframe, unique_id='unique_id'):
    '''
    Save one scatterplot PNG per column of a dataframe.

    Each column of `dataframe` is plotted on the x axis against `unique_id`
    on the y axis. The index is reset first, so an index level named
    `unique_id` is available as an ordinary column for plotting.

    Inputs:
        dataframe: a pandas dataframe
        unique_id: the value mapped to the y aesthetic; by default the
            string 'unique_id' (presumably the name of the identifier
            column/index level - verify against callers)

    Outputs:
        None. Files named '<column>scatterplot.png' are written.
    '''
    flattened = dataframe.reset_index()

    for col in dataframe.columns:
        out_name = ''.join((str(col), 'scatterplot', '.png'))

        mapping = p9.aes(x=col, y=unique_id)
        scatter = p9.ggplot(flattened, mapping) + p9.geom_point()

        print('Saving scatterplot: ' + out_name)
        p9.ggsave(filename=out_name, plot=scatter, device='png')
def plot_bar_predictions(data, filenamePlot, x, y, facet, plot_size):
    """Build a faceted bar chart with a log10 y axis, save it and return it.

    Args:
        data: data passed to ``p9.ggplot``.
        filenamePlot: text inserted into the output name
            ``images/plot_Bar_<filenamePlot>.png``.
        x: value mapped to the x aesthetic.
        y: value mapped to the y aesthetic; also used as the bar label.
        facet: facet specification handed to ``p9.facet_wrap``.
        plot_size: used for both width and height of the saved image,
            in inches.

    Returns:
        The plot object that was saved.
    """
    chart = p9.ggplot(data, p9.aes(x=x, y=y))

    # Components are added in the same left-to-right order as before.
    components = [
        p9.geom_bar(stat='identity'),
        p9.geom_text(p9.aes(label=y), size=7, va='bottom'),
        p9.facet_wrap(facet),
        p9.scales.scale_y_log10(),
        p9.theme(axis_text_x=p9.element_text(angle=90, size=7.5)),
        p9.theme(subplots_adjust={'wspace': 0.25}),
    ]
    for component in components:
        chart = chart + component

    target = 'images/plot_Bar_' + filenamePlot + '.png'
    p9.ggsave(chart, target,
              height=plot_size, width=plot_size, units='in', dpi=300)

    return chart
plot_background=element_rect(fill="white"), panel_background=element_rect(fill="white"), panel_grid_major_x=element_line(color="lightgrey"), panel_grid_major_y=element_line(color="lightgrey"), axis_line=element_line(color="grey"), legend_key=element_rect(fill='white', colour='white'), legend_title=element_text(family='sans-serif', size=15), legend_text=element_text(family='sans-serif', size=12), plot_title=element_text(family='sans-serif', size=15), axis_text=element_text(family='sans-serif', size=12), axis_title=element_text(family='sans-serif', size=15) ) \ + scale_color_manual(['#1976d2', '#b3e5fc']) \ print(panel_A) ggsave(plot=panel_A, filename=svcca_file, device="svg", dpi=300) ggsave(plot=panel_A, filename=svcca_png_file, device="svg", dpi=300) # ### Uncorrected PCA # In[14]: lst_num_partitions = [lst_num_partitions[i] for i in pca_ind] all_data_df = pd.DataFrame() # Get batch 1 data partition_1_file = os.path.join(compendia_dir, "Partition_1_0.txt.xz") partition_1 = pd.read_table(partition_1_file, header=0, index_col=0, sep='\t')
name='Filtration Step', values=['#1b9e77', '#d95f02', '#7570b3', '#e7298a'], labels=[ 'All Variants', 'Common Variants', 'Depth (< {} reads)'.format(replicate_filter_min_depth_count), 'Depth (> {} reads)'.format(replicate_filter_max_depth_count) ]) + gg.xlab('Sample') + gg.ylab('Final Number of Variants') + gg.theme_bw() + gg.theme(axis_text_x=gg.element_text(angle='90'), axis_text=gg.element_text(size=8), axis_title=gg.element_text(size=14))) p # In[13]: figure_file = os.path.join('figures', 'replicates_filtration_results.pdf') gg.ggsave(p, figure_file, height=5.5, width=6.5, dpi=500) # In[14]: p = (gg.ggplot( filter_counts_df, gg.aes(x='lane', y='COSMIC_count', fill='filter_min_depth_count')) + gg.geom_bar(stat='identity', position='dodge') + gg.geom_text( gg.aes(y=10, label='log_mut_count'), size=5, colour='white') + gg.scale_fill_gradient(low='blue', high='red', name='All Variants') + gg.facet_wrap('~ final_id') + gg.xlab('Lane') + gg.ylab('Number of COSMIC Variants') + gg.theme_bw() + gg.theme(axis_text_x=gg.element_text(angle='90'), axis_text=gg.element_text(size=8), axis_title=gg.element_text(size=14))) p
str(min_acceptable_range), " to ", str(max_acceptable_range), "\n\n") # save csv file outlierfile = filename.replace('.csv', '_outliers.csv') data_output.to_csv(outlierfile, index=False) # plot overlay of IQR and mod-Z score outliers p = ( p9.ggplot(data=data_output, mapping=p9.aes(x='age_rounded', y='value', group='age_rounded')) + p9.geom_jitter(mapping=p9.aes(color='z_outlier', outlier_alpha=0.1)) + p9.geom_boxplot(outlier_size=0, outlier_stroke=0) + p9.ggtitle( "Outliers detected via the IQR method (boxplot)\nand modified z-score method (dotplot)" ) + p9.ylim(-10, 175)) print(p) plotfile = filename.replace('.csv', '_outlierplot') p9.ggsave(plot=p, filename=plotfile) # plot regression x = data_stats_regression['age_rounded'] y = data_stats_regression['median'] plt.plot(x, y, 'o') plt.plot(x, r.func_linear(x, *linear_coeff)) plt.plot(x, r.func_log(x, *log10_coeff)) plt.plot(x, r.func_ln(x, *ln_coeff)) plt.title( "Regression performed on medians of age 1, 3 and 5\ndata with outliers removed" ) plt.show()
show_legend=False) \ + labs(x = "Number of Partitions", y = "Similarity score (SVCCA)", title = "Similarity across varying numbers of partitions") \ + theme(plot_title=element_text(weight='bold'), plot_background=element_rect(fill="white"), panel_background=element_rect(fill="white"), panel_grid_major_x=element_line(color="lightgrey"), panel_grid_major_y=element_line(color="lightgrey"), axis_line=element_line(color="grey"), legend_key=element_rect(fill='white', colour='white') ) \ + scale_color_manual(['#b3e5fc']) \ print(g) ggsave(plot=g, filename=svcca_uncorrected_file, dpi=300) # In[9]: # Plot - black lst_num_experiments = list(all_svcca.index[0:int(len(all_svcca.index) / 2)]) threshold = pd.DataFrame(pd.np.tile(permuted_score, (len(lst_num_experiments), 1)), index=lst_num_experiments, columns=['score']) g = ggplot(all_svcca[all_svcca['Group'] == 'uncorrected']) + geom_line(all_svcca[all_svcca['Group'] == 'uncorrected'], aes(x=lst_num_experiments, y='score', color='Group'), size=1.5) \ + geom_point(aes(x=lst_num_experiments, y='score'),
# Concatenate input and simulated dataframes together combined_data_df = pd.concat( [input_data_UMAPencoded_df, simulated_data_UMAPencoded_df]) # Plot g_input_sim = ggplot(combined_data_df, aes(x='1', y='2')) + geom_point(aes(color='dataset'), alpha=0.3) + labs(x = "UMAP 1", y = "UMAP 2", title = "UMAP of original and simulated data") + theme_bw() + theme( legend_title_align = "center", plot_background=element_rect(fill='white'), legend_key=element_rect(fill='white', colour='white'), plot_title=element_text(weight='bold') ) \ + guides(colour=guide_legend(override_aes={'alpha': 1})) \ + scale_colour_manual(["grey", '#87CEFA']) print(g_input_sim) ggsave(plot=g_input_sim, filename=umap_overlay_file, dpi=300) # ## 2. Visualize effects of multiple experiments in PCA space # In[13]: get_ipython().run_cell_magic( 'time', '', '\nall_data_df = pd.DataFrame()\n\n# Get batch 1 data\npartition_1_file = os.path.join(\n partition_dir,\n "Partition_1.txt.xz")\n\npartition_1 = pd.read_table(\n partition_1_file,\n header=0,\n index_col=0,\n sep=\'\\t\')\n\n\nfor i in lst_num_partitions:\n print(\'Plotting PCA of 1 partition vs {} partition...\'.format(i))\n \n # Simulated data with all samples in a single partition\n original_data_df = partition_1.copy()\n \n # Add grouping column for plotting\n original_data_df[\'num_partitions\'] = \'1\'\n \n # Get data with additional partitions added\n partition_other_file = os.path.join(\n partition_dir,\n "Partition_"+str(i)+".txt.xz")\n\n partition_other = pd.read_table(\n partition_other_file,\n header=0,\n index_col=0,\n sep=\'\\t\')\n \n # Simulated data with i partitions\n partition_data_df = partition_other\n \n # Add grouping column for plotting\n partition_data_df[\'num_partitions\'] = \'multiple\'\n \n # Concatenate datasets together\n combined_data_df = pd.concat([original_data_df, partition_data_df])\n\n # PCA projection\n pca = PCA(n_components=2)\n\n # Encode expression data into 2D PCA space\n combined_data_numeric_df = 
combined_data_df.drop([\'num_partitions\'], axis=1)\n combined_data_PCAencoded = pca.fit_transform(combined_data_numeric_df)\n\n\n combined_data_PCAencoded_df = pd.DataFrame(combined_data_PCAencoded,\n index=combined_data_df.index,\n columns=[\'PC1\', \'PC2\']\n )\n \n # Variance explained\n print(pca.explained_variance_ratio_) \n \n # Add back in batch labels (i.e. labels = "batch_"<how many batch effects were added>)\n combined_data_PCAencoded_df[\'num_partitions\'] = combined_data_df[\'num_partitions\']\n \n # Add column that designates which batch effect comparision (i.e. comparison of 1 batch vs 5 batches\n # is represented by label = 5)\n combined_data_PCAencoded_df[\'comparison\'] = str(i)\n \n # Concatenate ALL comparisons\n all_data_df = pd.concat([all_data_df, combined_data_PCAencoded_df])\n \n # Plot individual comparisons\n print(ggplot(combined_data_PCAencoded_df, aes(x=\'PC1\', y=\'PC2\')) \\\n + geom_point(aes(color=\'num_partitions\'), alpha=0.2) \\\n + labs(x = "PC 1", y = "PC 2", title = "Partition 1 and Partition {}".format(i))\\\n + theme_bw() \\\n + theme(\n legend_title_align = "center",\n plot_background=element_rect(fill=\'white\'),\n legend_key=element_rect(fill=\'white\', colour=\'white\'), \n plot_title=element_text(weight=\'bold\')\n ) \\\n + guides(colour=guide_legend(override_aes={\'alpha\': 1})) \\\n + scale_colour_manual(["grey", \'#b3e5fc\'])\n ) ' ) # In[14]: # Convert 'num_experiments' into categories to preserve the ordering lst_num_partitions_str = [str(i) for i in lst_num_partitions] num_partitions_cat = pd.Categorical(all_data_df['num_partitions'],
# In[16]: # Plot threshold = pd.DataFrame(pd.np.tile(permuted_score, (len(lst_num_experiments), 1)), index=lst_num_experiments, columns=['score']) g = ggplot(similarity_score_df, aes(x=lst_num_experiments, y='score')) + geom_line() + geom_line(aes(x=lst_num_experiments, y='score'), threshold, linetype='dashed') + labs(x = "Number of Experiments", y = "Similarity score (SVCCA)", title = "Similarity after correcting for experiment variation") \ + theme_bw() \ + theme(plot_title=element_text(weight='bold')) print(g) ggsave(plot=g, filename=svcca_file, dpi=300) # In[17]: # Plot - black threshold = pd.DataFrame(pd.np.tile(permuted_score, (len(lst_num_experiments), 1)), index=lst_num_experiments, columns=['score']) g = ggplot(similarity_score_df, aes(x=lst_num_experiments, y='score')) + geom_line(colour="white") + geom_line(aes(x=lst_num_experiments, y='score'), threshold, colour="white", linetype='dashed') + labs(x = "Number of Experiments", y = "Similarity score (SVCCA)", title = "Similarity after correcting for experiment variation") \ + theme(plot_title=element_text(weight='bold', colour="white"), plot_background=element_rect(fill="black"), panel_background=element_rect(fill="black"),
# Label every row with the dataset it came from
input_data_UMAPencoded_df['dataset'] = 'original'
simulated_data_UMAPencoded_df['dataset'] = 'simulated'

# Stack the original and simulated embeddings into a single dataframe
combined_data_df = pd.concat(
    [input_data_UMAPencoded_df, simulated_data_UMAPencoded_df]
)

# Overlay plot: the original rows form the base layer and the simulated
# rows are added as the last layer, each with its own fixed colour.
g_input_sim = (
    ggplot(combined_data_df[combined_data_df['dataset'] == 'original'],
           aes(x='1', y='2'))
    + geom_point(color='#d5a6bd', alpha=0.15)
    + labs(x="UMAP 1", y="UMAP 2", title="Original and simulated data")
    + theme_bw()
    + theme(
        legend_title_align="center",
        plot_background=element_rect(fill='white'),
        legend_key=element_rect(fill='white', colour='white'),
        plot_title=element_text(family='sans-serif', size=15),
        axis_text=element_text(family='sans-serif', size=12),
        axis_title=element_text(family='sans-serif', size=15)
    )
    + geom_point(combined_data_df[combined_data_df['dataset'] == 'simulated'],
                 alpha=0.09,
                 color='#cccccc')
)

print(g_input_sim)
ggsave(plot=g_input_sim, filename=umap_overlay_file, dpi=300)
comparison[["no_column", "data_length"]] = comparison[["no_column", "data_length"]].apply(pd.to_numeric, downcast="integer") ### Visual Exploration saveformat = "png" ## Select # pandas plot = (gg.ggplot(pandas_sel, gg.aes("factor(no_column)", "factor(data_length)")) + gg.geom_tile(gg.aes(fill="q50")) + gg.geom_text(gg.aes(label="q50"), color="white", size=9) + gg.labs(y="# Rows", x="# Columns", title="Pandas median selection time") + gg.facet_grid("pos_col ~ sel_col") + gg.theme_bw() + gg.theme(legend_position=None)) gg.ggsave(plot, filename=os.path.join(path_n, "output", f"select_results_pandas.{saveformat}"), width=15, height=10) # data.table plot = (gg.ggplot(datatable_sel, gg.aes("factor(no_column)", "factor(data_length)")) + gg.geom_tile(gg.aes(fill="q50")) + gg.geom_text(gg.aes(label="q50"), color="white", size=9) + gg.labs(y="# Rows", x="# Columns", title="data.table median selection time") + gg.facet_grid("pos_col ~ sel_col") + gg.theme_bw() + gg.theme(legend_position=None)) gg.ggsave(plot, filename=os.path.join(path_n, "output", f"select_results_datatable.{saveformat}"), width=15, height=10) # comparison plot = (gg.ggplot(comparison[comparison.operation == "select"], gg.aes("factor(no_column)", "factor(data_length)")) + gg.geom_tile(gg.aes(fill="q50")) +
gg.geom_rug(gg.aes(color="Metadata_cell_line"), show_legend={'color': False}) + \ gg.theme_bw() + \ gg.theme( subplots_adjust={"wspace": 0.2}, axis_text=gg.element_text(size=7), axis_title=gg.element_text(size=9), strip_text=gg.element_text(size=6, color="black"), strip_background=gg.element_rect(colour="black", fill="#fdfff4"), ) + \ gg.xlim([-0.5, 1]) + \ gg.xlab("Median Correlation of All Guides Across Genes") + \ gg.ylab("Density") + \ gg.facet_wrap("~replicate_type", nrow=2, scales="free") + \ gg.scale_fill_manual(name="Cell Line", values=["#1b9e77", "#d95f02", "#7570b3"]) + \ gg.scale_color_manual(name="Cell Line", values=["#1b9e77", "#d95f02", "#7570b3"]) ) file = os.path.join("figures", "median-guide-correlation-density") for extension in ['.png', '.pdf']: gg.ggsave(cor_density_gg, filename='{}{}'.format(file, extension), dpi=500, height=2, width=3, units='in') cor_density_gg
def main():
    """Draw a violin plot of one column taken from each input file.

    Every file in ``args.infile`` is read with pandas, column ``args.col``
    (1-based) is extracted, optionally renamed from ``args.col_names``, and
    the results are melted and stacked into one long dataframe with columns
    ``args.x_name`` / ``args.y_name``. The plot is drawn with plotnine when
    ``args.use_p9`` is set, otherwise with seaborn, and written to
    ``<outpref>.violin.<img>``.

    ``args.y_lim`` and ``args.size`` are optional. When ``args.size`` is not
    given the plotting library's default figure size is used; previously a
    missing size crashed both branches on ``size[0]`` / ``tuple(size)``.
    """
    args = UserInput()

    # Optional numeric settings; None means "not supplied".
    y_lim = np.array(args.y_lim, dtype=np.float32) if args.y_lim else None
    size = np.array(args.size, dtype=np.float32) if args.size else None

    ###################################
    df_list = [
        pd.read_csv(f, sep=args.sep, skipinitialspace=True)
        for f in args.infile
    ]

    # Keep only the selected column (args.col is 1-based) of every input
    # and convert it to long format.
    lg_list = []
    for idx, df in enumerate(df_list):
        xdf = pd.DataFrame(df.iloc[:, int(args.col) - 1])
        if args.col_names:
            xdf.columns = [args.col_names[idx]]
        lg_list.append(pd.melt(xdf))

    lg_df = pd.concat(lg_list)
    lg_df.columns = [args.x_name, args.y_name]
    print(lg_df)

    out_file = '{0}.violin.{1}'.format(args.outpref, args.img)

    ## plotnine method
    if args.use_p9:
        import plotnine as p9

        Quant = [.25, .5, .75]

        # Without an explicit limit, span the observed data range.
        if y_lim is not None:
            set_ylim = p9.ylim(y_lim)
        else:
            set_ylim = p9.ylim(
                [lg_df[args.y_name].min(), lg_df[args.y_name].max()])

        df_plot = (
            p9.ggplot(
                lg_df,
                p9.aes(x=args.x_name, y=args.y_name, fill=args.x_name))
            + p9.geom_violin(
                width=.75, draw_quantiles=Quant, show_legend=False)
            + p9.ggtitle(args.title)
            + p9.theme_classic()
            + set_ylim
            + p9.scale_x_discrete(limits=args.col_names)
            + p9.theme(text=p9.element_text(size=12, color='black'),
                       axis_text_x=p9.element_text(angle=33),
                       panel_grid_major_y=p9.element_line(color='gray',
                                                          alpha=.5))
        )

        # width/height of None let plotnine fall back to its default size.
        if size is not None:
            width, height = size[0], size[1]
        else:
            width, height = None, None

        p9.ggsave(filename=out_file,
                  plot=df_plot,
                  dpi=int(args.dpi),
                  format=args.img,
                  width=width,
                  height=height,
                  units='in',
                  verbose=False)

    else:
        ## Seaborn method
        import seaborn as sns

        sns.set(style='whitegrid')
        ax = sns.violinplot(x=args.x_name,
                            y=args.y_name,
                            data=lg_df,
                            linewidth=1,
                            inner='box')
        if args.title:
            ax.set_title(args.title)
        if y_lim is not None:
            ax.set(ylim=y_lim)

        # savefig() has no figsize parameter, so the requested size is
        # applied to the figure itself before saving.
        if size is not None:
            ax.figure.set_size_inches(float(size[0]), float(size[1]))

        plt.savefig(out_file, format=args.img, dpi=int(args.dpi))
        plt.clf()
# Print summary statistics of the conditional and decision attributes
print('\nConditional attributes description: ')
print(conditional_attributes.describe())
print('\nDecision attribute description: ')
print(decision_attribute.describe())

# Make sure the directory for the plot files exists. makedirs() with
# exist_ok=True replaces the isdir()/mkdir() pair and avoids the
# check-then-create race between the two calls.
os.makedirs('analyze', exist_ok=True)

# One histogram per conditional attribute, filled by the decision attribute
# (assumes iterating conditional_attributes yields column names - confirm)
for i in conditional_attributes:
    plot = (ggplot(dataset, aes(x=i, fill=decision_attribute.name))
            + geom_histogram(stat="count"))
    filename = '{0}-vs-class.png'.format(i)
    ggsave(plot=plot, filename=filename, dpi=300, scale=1, verbose=False,
           path='analyze')

# The reverse view: decision attribute on x, filled by each attribute
for i in conditional_attributes:
    plot = (ggplot(dataset, aes(x=decision_attribute.name, fill=i))
            + geom_histogram(stat="count"))
    filename = 'class-vs-{0}.png'.format(str(i))
    ggsave(plot=plot, filename=filename, dpi=300, scale=1, verbose=False,
           path='analyze')

# Encoding attributes: every column is label-encoded in place
encoder = LabelEncoder()
for i in dataset.columns:
    dataset[i] = encoder.fit_transform(dataset[i])

# The first six columns are taken as features, the 'class' column as target
normalize_conditional_attributes = dataset.iloc[:, :6]
normalize_decision_attribute = dataset['class']

# Splitting dataset
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from plotnine import ggplot, geom_point, aes, ggsave

# Load the numeric input table and embed it with t-SNE (default settings)
df = pd.read_csv(
    r"C:\Users\Peter\Documents\PhD Semester 1\BIOS 611\HW5\charpowersnumerical.csv"
)
embedder = TSNE()
df_tsne = embedder.fit_transform(df)

# Write the raw embedding to a text file
np.savetxt(
    r"C:\Users\Peter\Documents\PhD Semester 1\BIOS 611\HW5\tsne.txt",
    df_tsne,
    fmt="%s",
)

# The plot is drawn from a separate CSV with columns V1, V2 and V3
tsneplot = pd.read_csv(
    r"C:\Users\Peter\Documents\PhD Semester 1\BIOS 611\HW5\tsnealign.csv"
)

# Scatter V1 against V2, coloured by V3 treated as a categorical factor
tsneplotted = (
    ggplot(tsneplot, aes(x='V1', y='V2', color='factor(V3)'))
    + geom_point()
)

ggsave(
    plot=tsneplotted,
    filename=r"C:\Users\Peter\Documents\PhD Semester 1\BIOS 611\HW5\pythontsne.png",
)
def test_ggsave(self):
    """Call ggsave(p) with no filename and check the default file appears.

    The expected name is whatever ``p._save_filename('pdf')`` reports.
    """
    ggsave(p)
    expected_name = p._save_filename('pdf')
    assert_exist_and_clean(expected_name, "default filename")
# Stack the original and simulated embeddings into a single dataframe
combined_data_df = pd.concat(
    [input_data_UMAPencoded_df, simulated_data_UMAPencoded_df]
)

# Faceted UMAP plot: all points coloured by experiment_id, one panel per
# dataset; rows whose experiment_id equals example_id are added again in
# red as the last layer.
fig = (
    ggplot(combined_data_df, aes(x='1', y='2'))
    + geom_point(aes(color='experiment_id'), alpha=0.1)
    + facet_wrap('~dataset')
    + labs(x='UMAP 1',
           y='UMAP 2',
           title='UMAP of original and simulated data (gene space)')
    + theme_bw()
    + theme(
        legend_title_align="center",
        plot_background=element_rect(fill='white'),
        legend_key=element_rect(fill='white', colour='white'),
        legend_title=element_text(family='sans-serif', size=15),
        legend_text=element_text(family='sans-serif', size=12),
        plot_title=element_text(family='sans-serif', size=15),
        axis_text=element_text(family='sans-serif', size=12),
        axis_title=element_text(family='sans-serif', size=15)
    )
    # Legend keys are drawn fully opaque even though the points are not
    + guides(colour=guide_legend(override_aes={'alpha': 1}))
    + scale_color_manual(['red', '#bdbdbd'])
    + geom_point(
        data=combined_data_df[combined_data_df['experiment_id'] == example_id],
        alpha=0.1,
        color='red')
)

print(fig)
ggsave(plot=fig, filename=experiment_simulated_file)