class TestAesthetics:
    p = (ggplot(df, aes('x')) +
         geom_boxplot(aes(y='y'), size=2) +
         geom_boxplot(df[:2 * m], aes(y='y+25', fill='x'), size=2) +
         geom_boxplot(df[2 * m:], aes(y='y+30', color='x'), size=2) +
         geom_boxplot(df[2 * m:], aes(y='y+55', linetype='x'), size=2))

    def test_aesthetics(self):
        assert self.p == 'aesthetics'

    def test_aesthetics_coordflip(self):
        assert self.p + coord_flip() == 'aesthetics+coord_flip'
def plot_boxplot_series(df, normalisation_method=None):
    """
    Treating each column as a separate boxplot and each row as an independent
    observation (i.e. different company), render a series of box plots to
    identify a shift in performance across the observations.
    normalisation_method should be one of the values present in
    SectorSentimentSearchForm.normalisation_choices
    """
    # normalise the data according to the chosen method, then plot it
    if normalisation_method is None or normalisation_method == "1":
        normalized_df = df
        y_label = "Percentage change"
    elif normalisation_method == "2":
        normalized_df = (df - df.min()) / (df.max() - df.min())
        y_label = "Percentage change (min/max. scaled)"
    else:
        normalized_df = df / df.max(axis=0)  # div by max if all else fails...
        y_label = "Percentage change (normalised by dividing by max)"

    n_inches = len(df.columns) / 5
    melted = normalized_df.melt(ignore_index=False).dropna()
    plot = (p9.ggplot(melted, p9.aes(x="fetch_date", y="value"))
            + p9.geom_boxplot(outlier_colour="blue")
            + p9.coord_flip())
    return user_theme(plot, y_axis_label=y_label, figure_size=(12, n_inches))
def _make_plots(df_plt, out_file_base, y='AUC', facet_grid='', h_line=''):
    len_x = len(np.unique(df_plt['resolution']))
    if 'sparsity_l1' in df_plt.columns:
        df_plt['Sparsity'] = df_plt['sparsity_l1']
        len_x2 = len(np.unique(df_plt['Sparsity']))
    else:
        len_x2 = 0
    if len_x2 > 1:
        gplt = plt9.ggplot(df_plt, plt9.aes(
            fill='Sparsity',
            x='resolution',
            y=y,
        ))
        gplt = gplt + plt9.geom_boxplot(alpha=0.8, outlier_alpha=0)
        gplt = gplt + plt9.geom_jitter(
            plt9.aes(color='Sparsity'), alpha=0.25, width=0.2)
    else:
        gplt = plt9.ggplot(df_plt, plt9.aes(x='resolution', y=y))
        gplt = gplt + plt9.geom_boxplot(alpha=0.8, outlier_alpha=0)
        gplt = gplt + plt9.geom_jitter(alpha=0.25, width=0.2)
    gplt = gplt + plt9.theme_bw(base_size=12)
    if facet_grid != '':
        gplt = gplt + plt9.facet_grid('{} ~ .'.format(facet_grid))
    if y == 'f1-score':
        gplt = gplt + plt9.labs(x='Resolution', y='F1 score', title='')
    elif y in ['AUC', 'MCC']:
        gplt = gplt + plt9.labs(x='Resolution', y=y, title='')
    else:
        gplt = gplt + plt9.labs(
            x='Resolution', y=y.capitalize().replace('_', ' '), title='')
    gplt = gplt + plt9.theme(
        # legend_position='none',
        axis_text_x=plt9.element_text(angle=-45, hjust=0))
    if len_x2 != 0 and len_x2 < 9:
        gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
    if h_line != '':
        gplt = gplt + plt9.geom_hline(
            plt9.aes(yintercept=h_line), linetype='dashdot')
    gplt.save(
        '{}-resolution__{}.png'.format(out_file_base, y.replace('-', '_')),
        dpi=300,
        width=4 * ((len_x + len_x2) / 4),
        height=5,
        limitsize=False)
def plot_score(df, plot_fn):
    f = (p9.ggplot(df, p9.aes(x="emotion_cat", y="score"))
         + p9.geom_boxplot()
         + p9.labs(x="Model", y="EMOTION FEEL Score")
         + p9.theme_538()
         + p9.theme(legend_position="top",
                    legend_direction="horizontal",
                    figure_size=(10, 5))
         + p9.theme(plot_background=p9.element_rect(
             fill=BG_COLOR, color=BG_COLOR, size=1)))
    f.save(plot_fn)
def plot_replicate_correlation(
    df,
    batch,
    plate,
    facet_string=None,
    split_samples=False,
    output_file_base=None,
    output_file_extensions=[".png", ".pdf", ".svg"],
    dpi=500,
    height=4,
    width=5,
    return_plot=False,
):
    correlation_gg = (
        gg.ggplot(
            df,
            gg.aes(x="group_replicate", y="similarity_metric",
                   fill="group_replicate"),
        )
        + gg.geom_boxplot(
            alpha=0.3, outlier_alpha=0, width=0.8, notchwidth=0.25, fatten=1.5
        )
        + gg.geom_jitter(shape=".", size=0.001, alpha=0.3, width=0.3, height=0)
        + gg.scale_fill_manual(
            name="Replicate",
            labels={"True": "True", "False": "False"},
            values=["#B99638", "#2DB898"],
        )
        + gg.xlab("Replicates")
        + gg.ylab("Pearson Correlation")
        + gg.ggtitle("{}: {}".format(batch, plate))
        + gg.theme_bw()
        + gg.theme(
            subplots_adjust={"wspace": 0.2},
            title=gg.element_text(size=5),
            axis_text=gg.element_text(size=4),
            axis_title=gg.element_text(size=5),
            legend_text=gg.element_text(size=4),
            legend_title=gg.element_text(size=5),
            strip_text=gg.element_text(size=4, color="black"),
            strip_background=gg.element_rect(colour="black", fill="#fdfff4"),
        )
    )

    if split_samples:
        assert facet_string, "To split samples, specify a facet_string"
        correlation_gg += gg.facet_wrap(facet_string)

    if output_file_base:
        save_figure(
            correlation_gg, output_file_base, output_file_extensions,
            dpi, height, width
        )

    if return_plot:
        return correlation_gg
def plot_boxplot_series(df, normalisation_method=None):
    """
    Treating each column as a separate boxplot and each row as an independent
    observation (i.e. different company), render a series of box plots to
    identify a shift in performance across the observations.
    normalisation_method should be one of the values present in
    SectorSentimentSearchForm.normalisation_choices
    """
    # compute star performers: those above the column mean on a given day,
    # counted over all days
    count = defaultdict(int)
    avg = df.mean(axis=0)  # column means are the same for every iteration
    for col in df.columns:
        winners = df[df[col] > avg[col]][col]
        for winner in winners.index:
            count[winner] += 1
    winner_results = []
    for asx_code, n_wins in count.items():
        x = df.loc[asx_code].sum()
        # avoid "dead cat bounce" stocks which fall spectacularly and then
        # post major increases in percentage terms
        if x > 0.0:
            winner_results.append((asx_code, n_wins, x))

    # and plot the normalised data
    if normalisation_method is None or normalisation_method == "1":
        normalized_df = df
        y_label = "Percentage change"
    elif normalisation_method == "2":
        normalized_df = (df - df.min()) / (df.max() - df.min())
        y_label = "Percentage change (min/max. scaled)"
    else:
        normalized_df = df / df.max(axis=0)  # div by max if all else fails...
        y_label = "Percentage change (normalised by dividing by max)"

    n_inches = len(df.columns) / 5
    melted = normalized_df.melt(ignore_index=False).dropna()
    plot = (
        p9.ggplot(melted, p9.aes(x="fetch_date", y="value"))
        + p9.geom_boxplot(outlier_colour="blue")
        + p9.theme(
            axis_text_x=p9.element_text(size=7),
            axis_text_y=p9.element_text(size=7),
            figure_size=(12, n_inches),
        )
        + p9.labs(x="Date (YYYY-MM-DD)", y=y_label)
        + p9.coord_flip()
    )
    return (
        plot_as_inline_html_data(plot),
        list(reversed(sorted(winner_results, key=lambda t: t[2]))),
    )
def test_weight():
    # The boxes of the two plots should differ slightly due to the
    # method used to calculate weighted percentiles. There is no
    # standard method for calculating weighted percentiles.
    df = pd.DataFrame({
        'x': list('a' * 11 + 'b' * 5),
        'y': np.hstack([[1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 15],
                        [1, 2, 3, 4, 15]]),
        'weight': np.hstack([np.ones(11), [1, 2, 3, 4, 1]])
    })

    p = (ggplot(df, aes(x='x', y='y', weight='weight')) +
         geom_boxplot())
    assert p == 'weight'
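# A minimal sketch (not part of the test suite) of one way to sanity-check
# weighted boxplot statistics: for integer weights, repeating each row
# `weight` times and computing unweighted quartiles gives a reference point,
# though plotnine's weighted-percentile method need not match it exactly.
import numpy as np
import pandas as pd

wdf = pd.DataFrame({
    'y': [1, 2, 3, 4, 15],
    'weight': [1, 2, 3, 4, 1],
})
# expand each row according to its weight, then take plain quantiles
expanded = wdf.loc[wdf.index.repeat(wdf['weight']), 'y']
print(expanded.quantile([0.25, 0.5, 0.75]))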
def plot_market_cap_distribution(ld: LazyDictionary) -> p9.ggplot:
    df = ld["market_cap_df"]
    assert {"market", "market_cap", "bin"}.issubset(df.columns)
    pos_market_cap_only = df[df["market_cap"] > 0.0]
    plot = (
        p9.ggplot(pos_market_cap_only)
        + p9.geom_boxplot(p9.aes(x="market", y="market_cap"))
        + p9.facet_wrap("bin", scales="free_y")
        + p9.scales.scale_y_log10()
    )
    return user_theme(
        plot,
        y_axis_label="Market cap. ($AUD Millions)",
        subplots_adjust={"wspace": 0.30},
    )
def plot_preprocessing_boxplot_bymodel(dataframe,
                                       models_labels,
                                       metrics_labels,
                                       groups_labels,
                                       figure_size=(14, 4)):
    """
    Plot a grid of boxplots: metrics on the x axis, boxes filled by group,
    with one facet row per model.
    """
    return (
        # Define the plot.
        p9.ggplot(dataframe, p9.aes(x='variable', y='value', fill='group'))
        # Add the boxplots.
        + p9.geom_boxplot(position='dodge')
        # Rename the x axis.
        + p9.scale_x_discrete(
            name='Metric',
            labels=lambda l: [metrics_labels[x] for x in l])
        # Rename the y axis.
        + p9.scale_y_continuous(
            name='Value',
            expand=(0, 0.05),
            # breaks=[-0.25, 0, 0.25, 0.5, 0.75, 1],
            limits=[-0.25, 1],
            labels=lambda l: ['{:.2f}'.format(x) for x in l])
        # Define the colors for the metrics for color-blind people.
        + p9.scale_fill_brewer(
            name='Group',
            labels=lambda l: [groups_labels[x] for x in l],
            type='qual',
            palette='Set2')
        # Place the plots in a grid, renaming the labels.
        + p9.facet_grid(
            'model ~ .',
            scales='free_y',
            labeller=p9.labeller(rows=lambda x: f'{models_labels[x]}'))
        # Define the theme for the plot.
        + p9.theme(
            # Remove the x and y axis names.
            axis_title_x=p9.element_blank(),
            axis_title_y=p9.element_blank(),
            # Set the size of x and y tick labels font.
            axis_text_x=p9.element_text(size=7),
            axis_text_y=p9.element_text(size=7),
            # Place the legend on top, without title, and reduce the margin.
            legend_title=p9.element_blank(),
            legend_position='top',
            legend_box_margin=2,
            # Set the size for the figure.
            figure_size=figure_size,
        ))
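# A minimal, hypothetical usage sketch for the function above. The input is
# assumed (from the aes/facet calls) to be a long-format dataframe with
# 'variable', 'value', 'group' and 'model' columns; the *_labels dicts map
# raw codes to display names. The example data is purely illustrative.
import pandas as pd
import plotnine as p9  # the function body assumes plotnine is aliased as p9

example = pd.DataFrame({
    'variable': ['acc', 'acc', 'f1', 'f1'] * 2,
    'value': [0.70, 0.80, 0.60, 0.65, 0.72, 0.81, 0.61, 0.66],
    'group': ['raw', 'scaled'] * 4,
    'model': ['svm'] * 4 + ['lr'] * 4,
})
plot = plot_preprocessing_boxplot_bymodel(
    example,
    models_labels={'svm': 'SVM', 'lr': 'Logistic Regression'},
    metrics_labels={'acc': 'Accuracy', 'f1': 'F1'},
    groups_labels={'raw': 'Raw', 'scaled': 'Scaled'},
)
plot.save('preprocessing_boxplots.png', dpi=300)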
def create(self, file_path: str) -> None:
    metrics = self._data["metric"].unique()
    for metric in metrics:
        data = self._data[self._data["metric"] == metric]
        # bound the y axis by the 2nd and 98th percentiles so extreme
        # outliers do not dominate the plot
        q_hi, q_lo = np.percentile(data["value"], [98, 2])
        (ggplot(data, aes(x="category", y="value"))
         + geom_boxplot(outlier_shape="")
         + coord_cartesian(ylim=(q_lo * 0.8, q_hi * 1.2))
         #+ facet_wrap(facets="metric", scales="free", ncol=3)
         + ggtitle(metric)
         #+ ggtitle("QMOOD Quality Attributes")
         + xlab("Category")
         + ylab("Value")
         + theme_classic(base_size=28, base_family="Helvetica")
         #+ theme(subplots_adjust={"wspace": 0.25, "hspace": 0.2})
         ).save(f"{file_path}.{metric}.pdf", width=24, height=24)
def grid_search_models(X, y):
    # get only exons 4-12
    X2 = X[:, 3:12]
    X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.3)

    # SVM
    svc = SVC()
    param_grid = {'C': [0.5, 1, 2, 3, 5, 6, 7, 8, 9, 10],
                  'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
                  'degree': [2, 3, 4, 5, 6]}
    grid_search_svc = GridSearchCV(svc, param_grid, scoring='accuracy')
    grid_search_svc.fit(X_train, y_train)

    # logistic regression
    lr = LogisticRegression()
    param_grid = {'penalty': ['l1', 'l2'], 'C': [0.5, 1, 2, 3, 4, 5, 8, 10]}
    grid_search_lr = GridSearchCV(lr, param_grid, scoring='accuracy')
    grid_search_lr.fit(X_train, y_train)

    # decision tree
    dt = DecisionTreeClassifier()
    param_grid = {'max_depth': [3, 10, 20, 30],
                  'max_leaf_nodes': [2, 4, 6, 8],
                  'min_samples_leaf': [1, 2, 3],
                  'min_samples_split': [2, 4, 6]}
    grid_search_dt = RandomizedSearchCV(dt, param_grid, cv=10,
                                        scoring='accuracy')
    grid_search_dt.fit(X_train, y_train)

    # plot performances
    data = {
        'Model': ['SVM'] * 10 + ['LogisticRegression'] * 10
                 + ['DecisionTree'] * 10,
        'Accuracy':
            list(cross_val_score(grid_search_svc.best_estimator_,
                                 X_train, y_train, cv=10)) +
            list(cross_val_score(grid_search_lr.best_estimator_,
                                 X_train, y_train, cv=10)) +
            list(cross_val_score(grid_search_dt.best_estimator_,
                                 X_train, y_train, cv=10))
    }
    data = pd.DataFrame(data)
    data['Model'] = pd.Categorical(
        data['Model'],
        categories=['SVM', 'LogisticRegression', 'DecisionTree'],
        ordered=True)
    p = (pn.ggplot(data, pn.aes('Model', 'Accuracy'))
         + pn.geom_boxplot()
         + pn.ylim(0, 1))
    p.save('./plots/tumor_genotype_prediction/accuracy-model.png')
def create_boxplot(box_df):
    """This function should create a boxplot from the dataframe created in melt_data

    Input
    -----
    box_df: pandas.DataFrame
        The dataframe returned by melt_data

    Returns
    -------
    plot: plotnine.ggplot
        A boxplot visualizing the data in box_df
    """
    plot = ggplot(box_df,
                  aes(x='treated/control',
                      y='blood_pressure',
                      fill='treated/control')) + geom_boxplot()
    return plot
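# A minimal sketch of calling create_boxplot. The column names follow the
# function's aes() mapping; this hand-built frame merely stands in for the
# output of melt_data, which is not shown here.
import pandas as pd
from plotnine import ggplot, aes, geom_boxplot

box_df = pd.DataFrame({
    'treated/control': ['treated', 'treated', 'control', 'control'],
    'blood_pressure': [118, 122, 135, 141],
})
print(create_boxplot(box_df))  # printing a ggplot object renders it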
def plot_market_cap_distribution(stocks, ymd: str, ymd_start_of_timeframe: str):
    #print(ymd)
    latest_quotes = valid_quotes_only(ymd)
    earliest_quotes = valid_quotes_only(ymd_start_of_timeframe)
    asx_codes = set(stocks)
    latest_df = make_quote_df(latest_quotes, asx_codes, ymd)
    earliest_df = make_quote_df(earliest_quotes, asx_codes,
                                ymd_start_of_timeframe)
    df = latest_df.append(earliest_df)
    #print(df)
    small_text = p9.element_text(size=7)
    plot = (p9.ggplot(df)
            + p9.geom_boxplot(p9.aes(x='market', y='market_cap'))
            + p9.facet_wrap("bin", scales="free_y")
            + p9.labs(x='', y='Market cap. ($AUD Millions)')
            + p9.theme(subplots_adjust={'wspace': 0.30},
                       axis_text_x=small_text,
                       axis_text_y=small_text))
    return plot_as_inline_html_data(plot)
def test_params():
    p = (ggplot(df, aes('x')) +
         geom_boxplot(df[:m], aes(y='y'), size=2, notch=True) +
         geom_boxplot(df[m:2*m], aes(y='y'), size=2, notch=True,
                      notchwidth=0.8) +

         # outliers
         geom_boxplot(df[2*m:3*m], aes(y='y'), size=2,
                      outlier_size=4, outlier_color='green') +
         geom_boxplot(df[2*m:3*m], aes(y='y+25'), size=2,
                      outlier_size=4, outlier_alpha=0.5) +
         geom_boxplot(df[2*m:3*m], aes(y='y+60'), size=2,
                      outlier_size=4, outlier_shape='D') +

         # position dodge
         geom_boxplot(df[3*m:4*m], aes(y='y', fill='factor(y%2)')) +
         theme(subplots_adjust={'right': 0.85})
         )

    assert p == 'params'
def plot_box_plots(var, draws, measurements, variable_id_map):
    """Return a plotnine boxplot (geom_boxplot) of the given variable."""
    plot = p9.ggplot(data=draws[var]) + p9.geom_boxplot(
        p9.aes(x=variable_id_map[var], y=var, fill=variable_id_map[var]),
        outlier_shape="",
    )
    if not measurements[var].empty:
        plot += p9.geoms.geom_point(
            p9.aes(y="measurement", x=variable_id_map[var]),
            data=measurements[var])
    if var != "flux":
        plot += p9.scale_y_log10()
    plot += p9.facet_wrap("~experiments") + p9.themes.theme(
        panel_spacing_y=0.05,
        panel_spacing_x=0.35,
        axis_title=p9.element_text(size=10),
        axis_text=p9.themes.element_text(size=11),
    )
    if var == "flux":
        plot += p9.scale_y_continuous(
            breaks=np.arange(-0.001, 0.002, 0.00025),
            limits=[-0.001, 0.002])
        plot += p9.theme(
            axis_text_x=p9.themes.element_text(rotation=90, size=6))
    return plot
def test_dodge2():
    p = (ggplot(df3, aes('x', 'y', color='c'))
         + geom_boxplot(position='dodge2', size=2))

    assert p + _theme == 'dodge2'
# ggbarse.save('gse75386_gad1_barchart_stat.pdf', format='pdf',
#              height=1, width=6)

## mean bars +/- standard error using seaborn
plt.close()
# plt.figure(figsize=(6, 1))
sns.barplot(data=gse75386, y='class', x='Gad1',
            color='slategray', ci=68)
# plt.savefig('gse75386_gad1_barchart_stat.pdf',
#             format='pdf', bbox_inches='tight')

## -----------------------------------------------------------------
## GSE75386 boxplot + stripchart
## -----------------------------------------------------------------
plt.close()
ggbox = gg.ggplot(gse75386, gg.aes(x='class', y='Gad1')) +\
    gg.geom_boxplot(stat='boxplot', outlier_size=0.0001) +\
    gg.geom_point(alpha=0.5) +\
    gg.coord_flip()
print(ggbox)
# ggbox.save('gse75386_gad1_boxplot.pdf', format='pdf', height=1, width=6)

plt.close()
# plt.figure(figsize=(6, 1))
sns.boxplot(data=gse75386, y='class', x='Gad1', color='white')
sns.stripplot(data=gse75386, y='class', x='Gad1', color='black')
# plt.savefig('gse75386_gad1_boxplot.pdf',
#             format='pdf', bbox_inches='tight')

## -----------------------------------------------------------------
## GSE75386 scatterplot
## -----------------------------------------------------------------
xseq_2 = np.linspace(np.min(x), np.max(x), 80)
results_2 = linregress(x, y)
print(results_2)
# -

x_line = np.array([
    published_date_distances["version_count"].min(),
    published_date_distances["version_count"].max(),
])
y_line = x_line * results_2.slope + results_2.intercept

g = (p9.ggplot(
         published_date_distances,
         p9.aes(x="factor(version_count)", y="time_to_published"),
     )
     + p9.geom_boxplot(fill="#a6cee3")
     + p9.geom_line(
         mapping=p9.aes(x="version_count", y="time_to_published"),
         stat="smooth",
         method="lm",
         linetype="dashed",
         se=False,
         alpha=1,
         size=0.7,
         inherit_aes=False,
     )
     + p9.scale_y_timedelta(labels=timedelta_format("d"))
     + p9.annotate(
         "text",
         x=9,
         y=timedelta(days=1470),
         label=f"Y={results_2.slope:.2f}*X+{results_2.intercept:.2f}",
     )
     + p9.labs(x="# of Preprint Versions",
               y="Time Elapsed Until Preprint is Published")
     + p9.theme_seaborn(
projected_documents.shape

projected_documents_df = pd.DataFrame(
    projected_documents,
    columns=[f"PC_{dim+1}" for dim in range(n_components)]
).assign(
    category=document_categories_df.category.tolist(),
    document=document_categories_df.document.tolist(),
)
projected_documents_df

g = (
    p9.ggplot(projected_documents_df)
    + p9.aes(x="factor(category)", y="PC_1")
    + p9.geom_boxplot(
        fill="#a6cee3",
        outlier_size=1,
        outlier_alpha=0.65,
        fatten=1.5,
    )
    + p9.coord_flip()
    + p9.scale_x_discrete(
        limits=(
            projected_documents_df.groupby("category")
            .agg({"PC_1": "median"})
            .sort_values("PC_1", ascending=False)
            .reset_index()
            .category.tolist()[::-1]
        )
    )
    + p9.labs(x="Article Category", y="PC1")
    + p9.theme_seaborn(context="paper", style="ticks", font="Arial",
                       font_scale=2)
    + p9.theme(figure_size=(11, 8.5))
    best_line, age, linear_coeff, log10_coeff, ln_coeff) + zero_z_score
print("\n\nThe predicted acceptable range at age ", str(age), " is from ",
      str(min_acceptable_range), " to ", str(max_acceptable_range), "\n\n")

# save csv file
outlierfile = filename.replace('.csv', '_outliers.csv')
data_output.to_csv(outlierfile, index=False)

# plot overlay of IQR and mod-Z score outliers
p = (p9.ggplot(data=data_output,
               mapping=p9.aes(x='age_rounded', y='value',
                              group='age_rounded'))
     + p9.geom_jitter(mapping=p9.aes(color='z_outlier'), alpha=0.1)
     + p9.geom_boxplot(outlier_size=0, outlier_stroke=0)
     + p9.ggtitle(
         "Outliers detected via the IQR method (boxplot)\nand modified z-score method (dotplot)"
     )
     + p9.ylim(-10, 175))
print(p)
plotfile = filename.replace('.csv', '_outlierplot')
p9.ggsave(plot=p, filename=plotfile)

# plot regression
x = data_stats_regression['age_rounded']
y = data_stats_regression['median']
plt.plot(x, y, 'o')
plt.plot(x, r.func_linear(x, *linear_coeff))
plt.plot(x, r.func_log(x, *log10_coeff))
plt.plot(x, r.func_ln(x, *ln_coeff))
plt.title(
    "Regression performed on medians of age 1, 3 and 5\ndata with outliers removed"
# In[8]:

projected_documents_df = (pd.DataFrame(
    projected_documents,
    columns=[f"PC_{dim+1}" for dim in range(n_components)]
).assign(category=document_categories_df.category.tolist(),
         document=document_categories_df.document.tolist()))
projected_documents_df

# In[9]:

g = (
    p9.ggplot(projected_documents_df)
    + p9.aes(x="factor(category)", y="PC_1")
    + p9.geom_boxplot(
        fill="#a6cee3",
        outlier_size=1,
        outlier_alpha=0.65,
        fatten=1.5,
    )
    + p9.coord_flip()
    + p9.scale_x_discrete(
        limits=(projected_documents_df.groupby("category")
                .agg({"PC_1": "median"})
                .sort_values("PC_1", ascending=False)
                .reset_index()
                .category.tolist()[::-1]))
    + p9.labs(x="Article Category", y="PC1")
    + p9.theme(figure_size=(6.66, 5))
    + p9.theme_seaborn(context="paper", style="ticks", font="Arial",
                       font_scale=1))
g.save("output/pca_plots/figures/category_box_plot_pc1.png", dpi=250)
g.save(
    "output/pca_plots/svg_files/category_box_plot/category_box_plot_pc1.svg",
    dpi=250)
print(g)
def test_dodge2_varwidth():
    p = (ggplot(df3, aes('x', 'y', color='c'))
         + geom_boxplot(
             position=position_dodge2(preserve='single'),
             varwidth=True,
             size=2))

    assert p + _theme == 'dodge2_varwidth'
 + geom_histogram()).save(filename="MonthlyCharges_Hist.png", dpi=300)

(ggplot(dat, aes(x='TotalCharges'))
 + geom_histogram()).save(filename="TotalCharges_Hist.png", dpi=300)

# Neither follows a normal distribution. Log transformation could help,
# but these are odd.
dat["LogTotalCharges"] = np.log(dat["TotalCharges"] + 1)
dat["LogMonthlyCharges"] = np.log(dat["MonthlyCharges"] + 1)

(ggplot(dat, aes(x='LogMonthlyCharges')) + geom_histogram())
(ggplot(dat, aes(x='LogTotalCharges')) + geom_histogram())

# Doesn't really help, so leave this for now.
dat = dat.drop(columns=["LogTotalCharges", "LogMonthlyCharges"])

dat["Churn_label"] = dat["Churn"].astype(str)
(ggplot(dat, aes(x="Churn_label", y='MonthlyCharges'))
 + geom_boxplot()).save(filename="MonthlyChargesChurn_Box.png", dpi=300)
(ggplot(dat, aes(x="Churn_label", y='TotalCharges'))
 + geom_boxplot()).save(filename="TotalChargesChurn_Box.png", dpi=300)
dat = dat.drop(columns="Churn_label")
    + p9.geom_point()
    + facet
)
(plot + p9.scale_y_log10() if use_y_log10 else plot).save(
    args.output_prefix + ".elapsed-vs-size.png"
)

# %%
plot = (
    p9.ggplot(data=data,
              mapping=p9.aes(x="MiB", y="CpuNanosPerByte", color="ApiName"))
    + p9.geom_point()
    + facet
)
(plot + p9.scale_y_log10() if use_y_log10 else plot).save(
    args.output_prefix + ".cpu-vs-size.png"
)

# %%
(
    p9.ggplot(data=data, mapping=p9.aes(x="MiB", y="MiBs", color="ApiName"))
    + p9.geom_point()
    + facet
).save(args.output_prefix + ".tp-vs-size.png")

# %%
(
    p9.ggplot(data=data,
              mapping=p9.aes(x="ApiName", y="MiBs", color="ApiName"))
    + p9.geom_boxplot()
    + facet
).save(args.output_prefix + ".tp-vs-api.png")
df, separated_peaks = er.proof_artificial(
    model, ad_partial,
    region_length=parameters['pad_to'],
    nb_datasets=parameters['artificial_nb_datasets'],
    nb_tfs=parameters['artificial_nb_tfs'],
    n_iter=500,
    squish_factor=parameters['squish_factor'])

arti_end = time.time()
print('Artificial data generalisation completed in ' +
      str(arti_end - arti_start) + ' s')

# The plots
a = ggplot(df, aes(x="type", y="rebuilt_value", fill="tf_group"))
a1 = a + geom_violin(position=position_dodge(1), width=1)
a2 = a + geom_boxplot(position=position_dodge(1), width=0.5)

b = (ggplot(df, aes(x="brothers", y="rebuilt_value", group="brothers"))
     + scale_fill_grey()
     + geom_boxplot(width=0.4))

a2.save(filename=plot_output_path +
        'artifical_data_systematisation_value_per_type.png',
        height=10, width=14, units='in', dpi=400, verbose=False)
b.save(filename=plot_output_path +
       'artifical_data_systematisation_value_per_brothers.png',
       height=10, width=14,
    treatment_replace)
print(scores_df.shape)
scores_df.head(3)

# In[ ]:


# In[6]:

scores_df.Metadata_treatment.value_counts()

# In[7]:

clone_a_gg = (
    gg.ggplot(scores_df, gg.aes(y="Clone A", x="Metadata_clone_number"))
    + gg.geom_boxplot(gg.aes(fill="data_fit"))
    + gg.facet_wrap("~shuffle_label")
    + gg.xlab("Cell Line")
    + gg.ylab("Clone A Probability")
    + gg.theme_bw()
    + gg.theme(legend_key=gg.element_rect(color="black", fill="white"),
               strip_text=gg.element_text(size=6, color="black"),
               strip_background=gg.element_rect(colour="black",
                                                fill="#fdfff4"))
)

file = pathlib.Path("figures", "predictions", "clone_a_single_cell_proba.png")
clone_a_gg.save(file, height=3, width=6, dpi=400)

clone_a_gg

# In[8]:

clone_e_gg = (
    gg.ggplot(scores_df, gg.aes(y="Clone E", x="Metadata_clone_number")) +
def plot(self):
    """Plot the figures using plotnine"""
    df = pandas.DataFrame(self.data, columns=self.datacols)
    with capture_c_msg("datar", prefix=f"[r]{self.title}[/r]: "):
        df.columns = make_unique(df.columns.tolist())

    if self.savedata:
        datafile = self.outprefix + ".csv"
        logger.info(
            "[r]%s[/r]: Saving data to: %r",
            self.title,
            datafile,
            extra={"markup": True},
        )
        df.to_csv(datafile, index=False)

    if df.shape[0] == 0:
        logger.warning("No data points to plot")
        return

    aes_for_geom_fill = None
    aes_for_geom_color = None
    theme_elems = p9.theme(axis_text_x=p9.element_text(angle=60, hjust=2))
    if df.shape[1] > 2:
        aes_for_geom_fill = p9.aes(fill=df.columns[2])
        aes_for_geom_color = p9.aes(color=df.columns[2])
    plt = p9.ggplot(df, p9.aes(y=df.columns[0], x=df.columns[1]))
    if self.figtype == "scatter":
        plt = plt + p9.geom_point(aes_for_geom_color)
        theme_elems = None
    elif self.figtype == "line":
        pass
    elif self.figtype == "bar":
        plt = plt + p9.geom_bar(p9.aes(fill=df.columns[0]))
    elif self.figtype == "col":
        plt = plt + p9.geom_col(aes_for_geom_fill)
    elif self.figtype == "pie":
        logger.warning("Pie chart is not supported by plotnine yet, "
                       "plotting bar chart instead.")
        col0 = df.iloc[:, 0]
        if df.shape[1] > 2:
            plt = plt + p9.geom_bar(
                p9.aes(x=df.columns[2], y=col0.name, fill=df.columns[2]),
                stat="identity",
                # aes_for_geom_fill,
                # x=df.Group,
                # y=col0,
                # label=paste0(round_(100 * col0 / sum_(col0), 1), "%"),
                # show_legend=False,
                # position=p9.position_adjust_text(),
            )
        else:
            col0 = factor(col0, levels=rev(unique(as_character(col0))))
            fills = rev(levels(col0))
            sums = map(lambda x: sum(col0 == x), fills)
            print(col0)
            print(fills)
            plt = (p9.ggplot(df, p9.aes(x=df.columns[1]))
                   + p9.geom_bar(p9.aes(fill=df.columns[0]))
                   + p9.geom_label(
                       x=1,
                       y=cumsum(sums) - sums / 2,
                       label=paste0(round(sums / sum(sums) * 100, 1), "%"),
                       show_legend=False,
                   ))
            theme_elems = p9.theme(
                axis_title_x=p9.element_blank(),
                axis_title_y=p9.element_blank(),
                axis_text_y=p9.element_blank(),
            )
    elif self.figtype == "violin":
        plt = plt + p9.geom_violin(aes_for_geom_fill)
    elif self.figtype == "boxplot":
        plt = plt + p9.geom_boxplot(aes_for_geom_fill)
    elif self.figtype in ("histogram", "density"):
        plt = p9.ggplot(df, p9.aes(x=df.columns[0]))
        geom = getattr(p9, f"geom_{self.figtype}")
        if df.columns[1] != "ONE":
            plt = plt + geom(p9.aes(fill=df.columns[1]), alpha=0.6)
            theme_elems = None
        else:
            plt = plt + geom(alpha=0.6)
            theme_elems = p9.theme(legend_position="none")
    elif self.figtype == "freqpoly":
        plt = p9.ggplot(df, p9.aes(x=df.columns[0]))
        if df.columns[1] != "ONE":
            plt = plt + p9.geom_freqpoly(p9.aes(fill=df.columns[1]))
        else:
            plt = plt + p9.geom_freqpoly()
        theme_elems = None
    else:
        raise ValueError(f"Unknown figure type: {self.figtype}")
    plt = plt + p9.ggtitle(self.title)
    self.save_plot(plt, theme_elems)
 + p9.xlab("age at diagnosis (days)")
 + p9.theme_bw()
 + p9.theme(text=p9.element_text(size=16))
)

## Challenge: create a scatterplot from smoke_complete showing
# age at diagnosis vs years smoked with points colored by gender
# and appropriate axis labels

#### Plotting distributions ####

# boxplot
(p9.ggplot(smoke_complete,
           p9.aes(x="vital_status", y="cigarettes_per_day"))
 + p9.geom_boxplot()
)

# change color of boxes and move aes to geom layer
(p9.ggplot(smoke_complete)
 + p9.geom_boxplot(p9.aes(x="vital_status", y="cigarettes_per_day"),
                   color="tomato")
)

# adding colored points to black box and whisker plot
(p9.ggplot(smoke_complete,
           p9.aes(x="vital_status", y="cigarettes_per_day"))
 + p9.geom_boxplot()
 + p9.geom_jitter(alpha=0.2, color="blue")
)
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

housing_data = pd.read_csv(
    "C:\\Users\\sofia.dejesus\\Documents\\02_book\\kc_house_data.csv")

#housing_data.head()
#print(housing_data.shape)
housing_data.describe(include=[np.number])
housing_data.head()
housing_data.describe()

# Checking for missing values in data
housing_data.isnull().sum()

# Pairplotting for some data
coln = ['price', 'sqft_living', 'zipcode', 'sqft_above']
sns.pairplot(housing_data[coln], height=4)
plt.savefig('pairplotting.png', dpi=300)
plt.show()

from plotnine import ggplot, aes, geom_boxplot

(ggplot(housing_data)
 + aes(x='sqft_living', y='sqft_above')
 + geom_boxplot())
def MDplot(Data, Names=None, Ordering='Default', Scaling=None,
           Fill='darkblue', RobustGaussian=True, GaussianColor='magenta',
           Gaussian_lwd=1.5, BoxPlot=False, BoxColor='darkred',
           MDscaling='width', LineColor='black', LineSize=0.01,
           QuantityThreshold=40, UniqueValuesThreshold=12,
           SampleSize=500000, SizeOfJitteredPoints=1, OnlyPlotOutput=True,
           ValueColumn=None, ClassColumn=None):
    """
    Plots a mirrored density plot for each numeric column

    Args:
        Data (dataframe): dataframe containing data. Each column is one
                          variable (wide table format; for long table format
                          see ValueColumn and ClassColumn)
        Names (list): list of column names (used if data is not a dataframe)
        Ordering (str): 'Default', 'Columnwise', 'Alphabetical' or 'Statistics'
        Scaling (str): scaling method, one of: Percentalize, CompleteRobust,
                       Robust, Log
        Fill (str): color of MD-Plot
        RobustGaussian (bool): draw a gaussian distribution if column is
                               gaussian
        GaussianColor (str): color for gaussian distribution
        Gaussian_lwd (float): line width of gaussian distribution
        BoxPlot (bool): draw box-plot
        BoxColor (str): color for box-plots
        MDscaling (str): scale of ggplot violin
        LineColor (str): line color of ggplot violin
        LineSize (float): line width of ggplot violin
        QuantityThreshold (int): minimal number of rows
        UniqueValuesThreshold (int): minimal number of unique values per
                                     column
        SampleSize (int): number of samples used if number of rows is larger
                          than SampleSize
        SizeOfJitteredPoints (int): size of jittered points
        OnlyPlotOutput (bool): if True, return only the ggplot object;
                               if False, return a dictionary containing the
                               ggplot object and additional info
        ValueColumn (str): name of the column of values to be plotted
                           (data in long table format)
        ClassColumn (str): name of the column with class identifiers for the
                           value column (data in long table format)

    Returns:
        ggplot object, or dictionary containing the ggplot object and
        additional info
    """
    if not isinstance(Data, pd.DataFrame):
        try:
            if Names is not None:
                Data = pd.DataFrame(Data, columns=Names)
            else:
                Data = pd.DataFrame(Data)
                lstCols = list(Data.columns)
                dctCols = {}
                for strCol in lstCols:
                    dctCols[strCol] = "C_" + str(strCol)
                Data = Data.rename(columns=dctCols)
        except:
            raise Exception("Data cannot be converted into pandas dataframe")
    else:
        Data = Data.reset_index(drop=True)

    if ValueColumn is not None and ClassColumn is not None:
        lstCols = list(Data.columns)
        if ValueColumn not in lstCols:
            raise Exception("ValueColumn not contained in dataframe")
        if ClassColumn not in lstCols:
            raise Exception("ClassColumn not contained in dataframe")
        lstClasses = list(Data[ClassColumn].unique())
        DataWide = pd.DataFrame()
        for strClass in lstClasses:
            if len(DataWide) == 0:
                DataWide = Data[Data[ClassColumn] == strClass].copy()\
                    .reset_index(drop=True)
                DataWide = DataWide.rename(columns={ValueColumn: strClass})
                DataWide = DataWide[[strClass]]
            else:
                dfTemp = Data[Data[ClassColumn] == strClass].copy()\
                    .reset_index(drop=True)
                dfTemp = dfTemp.rename(columns={ValueColumn: strClass})
                dfTemp = dfTemp[[strClass]]
                DataWide = DataWide.join(dfTemp, how='outer')
        Data = DataWide.copy()

    lstCols = list(Data.columns)
    for strCol in lstCols:
        if not is_numeric_dtype(Data[strCol]):
            print("Deleting non numeric column: " + strCol)
            Data = Data.drop([strCol], axis=1)
        elif abs(Data[strCol].sum()) == np.inf:
            print("Deleting infinite column: " + strCol)
            Data = Data.drop([strCol], axis=1)

    Data = Data.rename_axis("index", axis="index")\
        .rename_axis("variable", axis="columns")
    dvariables = Data.shape[1]
    nCases = Data.shape[0]

    if nCases > SampleSize:
        print('Data has more cases than "SampleSize". Drawing a sample for '
              'faster computation. You can omit this by setting '
              '"SampleSize=len(data)".')
        sampledIndex = np.sort(np.random.choice(list(Data.index),
                                                size=SampleSize,
                                                replace=False))
        Data = Data.loc[sampledIndex]

    nPerVar = Data.apply(lambda x: len(x.dropna()))
    nUniquePerVar = Data.apply(lambda x: len(list(x.dropna().unique())))

    # renaming numeric column names to non-numeric names
    lstCols = list(Data.columns)
    dctCols = {}
    for strCol in lstCols:
        try:
            float(strCol)
            dctCols[strCol] = "C_" + str(strCol)
        except:
            dctCols[strCol] = str(strCol)
    Data = Data.rename(columns=dctCols)

    if Scaling == "Percentalize":
        Data = Data.apply(lambda x: 100 * (x - x.min()) / (x.max() - x.min()))
    if Scaling == "CompleteRobust":
        Data = robust_normalization(Data, centered=True, capped=True)
    if Scaling == "Robust":
        Data = robust_normalization(Data, centered=False, capped=False)
    if Scaling == "Log":
        Data = signed_log(Data, base="Ten")
        if RobustGaussian == True:
            RobustGaussian = False
            print("log with robust gaussian does not work, because mean and "
                  "variance is not a valid description for log normal data")

    # ___________________________________________Robust Gaussian and Statistics
    if RobustGaussian == True or Ordering == "Statistics":
        Data = Data.applymap(lambda x: np.nan if abs(x) == np.inf else x)

        if nCases < 50:
            warnings.warn("Sample is maybe too small for statistical testing")

        factor = pd.Series([0.25, 0.75]).apply(lambda x: abs(norm.ppf(x)))\
            .sum()
        std = Data.std()
        dfQuartile = Data.apply(
            lambda x: mquantiles(x, [0.25, 0.75], alphap=0.5, betap=0.5))
        dfQuartile = dfQuartile.append(dfQuartile.loc[1] - dfQuartile.loc[0],
                                       ignore_index=True)
        dfQuartile.index = ["low", "hi", "iqr"]
        dfMinMax = Data.apply(
            lambda x: mquantiles(x, [0.001, 0.999], alphap=0.5, betap=0.5))
        dfMinMax.index = ["min", "max"]

        shat = pd.Series()
        mhat = pd.Series()
        nonunimodal = pd.Series()
        skewed = pd.Series()
        bimodalprob = pd.Series()
        isuniformdist = pd.Series()
        nSample = max([10000, nCases])
        normaldist = np.empty((nSample, dvariables))
        normaldist[:] = np.nan
        normaldist = pd.DataFrame(normaldist, columns=lstCols)

        for strCol in lstCols:
            shat[strCol] = min([std[strCol],
                                dfQuartile[strCol].loc["iqr"] / factor])
            mhat[strCol] = trim_mean(Data[strCol].dropna(), 0.1)

            if nCases > 45000 and nPerVar[strCol] > 8:
                # statistical testing does not work with too many cases
                sampledIndex = np.sort(np.random.choice(list(Data.index),
                                                        size=45000,
                                                        replace=False))
                vec = Data[strCol].loc[sampledIndex]
                if nUniquePerVar[strCol] > UniqueValuesThreshold:
                    nonunimodal[strCol] = dip.diptst(vec.dropna(),
                                                     numt=100)[1]
                    skewed[strCol] = skewtest(vec)[1]
                    args = (dfMinMax[strCol].loc["min"],
                            dfMinMax[strCol].loc["max"]
                            - dfMinMax[strCol].loc["min"])
                    isuniformdist[strCol] = kstest(vec, "uniform", args)[1]
                    bimodalprob[strCol] = bimodal(vec)["Bimodal"]
                else:
                    print("Not enough unique values for statistical testing, "
                          "thus output of testing is ignored.")
                    nonunimodal[strCol] = 1
                    skewed[strCol] = 1
                    isuniformdist[strCol] = 0
                    bimodalprob[strCol] = 0
            elif nPerVar[strCol] < 8:
                warnings.warn("Sample of finite values too small to calculate "
                              "agostino.test or dip.test for " + strCol)
                nonunimodal[strCol] = 1
                skewed[strCol] = 1
                isuniformdist[strCol] = 0
                bimodalprob[strCol] = 0
            else:
                if nUniquePerVar[strCol] > UniqueValuesThreshold:
                    nonunimodal[strCol] = dip.diptst(Data[strCol].dropna(),
                                                     numt=100)[1]
                    skewed[strCol] = skewtest(Data[strCol])[1]
                    args = (dfMinMax[strCol].loc["min"],
                            dfMinMax[strCol].loc["max"]
                            - dfMinMax[strCol].loc["min"])
                    isuniformdist[strCol] = kstest(Data[strCol], "uniform",
                                                   args)[1]
                    bimodalprob[strCol] = bimodal(Data[strCol])["Bimodal"]
                else:
                    print("Not enough unique values for statistical testing, "
                          "thus output of testing is ignored.")
                    nonunimodal[strCol] = 1
                    skewed[strCol] = 1
                    isuniformdist[strCol] = 0
                    bimodalprob[strCol] = 0

            if isuniformdist[strCol] < 0.05 and nonunimodal[strCol] > 0.05 \
               and skewed[strCol] > 0.05 and bimodalprob[strCol] < 0.05 \
               and nPerVar[strCol] > QuantityThreshold \
               and nUniquePerVar[strCol] > UniqueValuesThreshold:
                normaldist[strCol] = np.random.normal(mhat[strCol],
                                                      shat[strCol], nSample)
                normaldist[strCol] = normaldist[strCol]\
                    .apply(lambda x: np.nan if x < Data[strCol].min()
                           or x > Data[strCol].max() else x)

        nonunimodal[nonunimodal == 0] = 0.0000000001
        skewed[skewed == 0] = 0.0000000001
        effectStrength = (-10 * np.log(skewed)
                          - 10 * np.log(nonunimodal)) / 2

    # _________________________________________________________________Ordering
    if Ordering == "Default":
        bimodalprob = pd.Series()
        for strCol in lstCols:
            if nCases > 45000 and nPerVar[strCol] > 8:
                sampledIndex = np.sort(np.random.choice(list(Data.index),
                                                        size=45000,
                                                        replace=False))
                vec = Data[strCol].loc[sampledIndex]
                bimodalprob[strCol] = bimodal(vec)["Bimodal"]
            elif nPerVar[strCol] < 8:
                bimodalprob[strCol] = 0
            else:
                bimodalprob[strCol] = bimodal(Data[strCol])["Bimodal"]
        if len(list(bimodalprob.unique())) < 2 and dvariables > 1 \
           and RobustGaussian == True:
            rangfolge = list(effectStrength.sort_values(ascending=False).index)
            print("Using statistics for ordering instead of default")
        else:
            rangfolge = list(bimodalprob.sort_values(ascending=False).index)

    if Ordering == "Columnwise":
        rangfolge = lstCols

    if Ordering == "Alphabetical":
        rangfolge = lstCols.copy()
        rangfolge.sort()

    if Ordering == "Statistics":
        rangfolge = list(effectStrength.sort_values(ascending=False).index)

    # ___________________________________________________________Data Reshaping
    if nPerVar.min() < QuantityThreshold \
       or nUniquePerVar.min() < UniqueValuesThreshold:
        warnings.warn("Some columns have less than " + str(QuantityThreshold)
                      + " data points or less than "
                      + str(UniqueValuesThreshold) + " unique values. "
                      "Changing from MD-plot to Jitter-Plot for these "
                      "columns.")
        dataDensity = Data.copy()
        mm = Data.median()
        for strCol in lstCols:
            if nPerVar[strCol] < QuantityThreshold \
               or nUniquePerVar[strCol] < UniqueValuesThreshold:
                if mm[strCol] != 0:
                    dataDensity[strCol] = mm[strCol] \
                        * np.random.uniform(-0.001, 0.001, nCases) \
                        + mm[strCol]
                else:
                    dataDensity[strCol] = np.random.uniform(-0.001, 0.001,
                                                            nCases)
        # generates a scatter plot in the cases where the pdf cannot be
        # estimated
        dataJitter = dataDensity.copy()
        # delete all scatters for features where distributions can be
        # estimated
        for strCol in lstCols:
            if nPerVar[strCol] >= QuantityThreshold \
               and nUniquePerVar[strCol] >= UniqueValuesThreshold:
                dataJitter[strCol] = np.nan
        # apply ordering
        dataframe = dataDensity[rangfolge].reset_index()\
            .melt(id_vars=["index"])
    else:
        dataframe = Data[rangfolge].reset_index().melt(id_vars=["index"])

    dctCols = {"index": "ID", "variable": "Variables", "value": "Values"}
    dataframe = dataframe.rename(columns=dctCols)

    # _________________________________________________________________Plotting
    plot = p9.ggplot(dataframe,
                     p9.aes(x="Variables", group="Variables", y="Values")) \
        + p9.scale_x_discrete(limits=rangfolge)

    plot = plot + p9.geom_violin(stat=stat_pde_density(scale=MDscaling),
                                 fill=Fill, colour=LineColor,
                                 size=LineSize, trim=True) \
        + p9.theme(axis_text_x=p9.element_text(rotation=90))

    if nPerVar.min() < QuantityThreshold \
       or nUniquePerVar.min() < UniqueValuesThreshold:
        dataframejitter = dataJitter[rangfolge].reset_index()\
            .melt(id_vars=["index"])
        dataframejitter = dataframejitter.rename(columns=dctCols)
        plot = plot + p9.geom_jitter(
            size=SizeOfJitteredPoints,
            data=dataframejitter,
            colour=LineColor,
            mapping=p9.aes(x="Variables", group="Variables", y="Values"),
            position=p9.position_jitter(0.15))

    if RobustGaussian == True:
        dfTemp = normaldist[rangfolge].reset_index().melt(id_vars=["index"])
        dfTemp = dfTemp.rename(columns=dctCols)
        if dfTemp["Values"].isnull().all() == False:
            plot = plot + p9.geom_violin(
                data=dfTemp,
                mapping=p9.aes(x="Variables", group="Variables", y="Values"),
                colour=GaussianColor,
                alpha=0,
                scale=MDscaling,
                size=Gaussian_lwd,
                na_rm=True,
                trim=True,
                fill=None,
                position="identity",
                width=1)

    if BoxPlot == True:
        plot = plot + p9.stat_boxplot(geom="errorbar", width=0.5,
                                      color=BoxColor) \
            + p9.geom_boxplot(width=1, outlier_colour=None, alpha=0,
                              fill='#ffffff', color=BoxColor,
                              position="identity")

    if OnlyPlotOutput == True:
        return plot
    else:
        print(plot)
        return {"Ordering": rangfolge,
                "DataOrdered": Data[rangfolge],
                "ggplotObj": plot}
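# A minimal usage sketch for MDplot, assuming the surrounding module (with
# its helpers such as robust_normalization, bimodal and stat_pde_density) is
# importable; the random wide-format frame here is purely illustrative.
import numpy as np
import pandas as pd

demo = pd.DataFrame({
    'normal': np.random.normal(0, 1, 1000),
    'bimodal': np.concatenate([np.random.normal(-2, 0.5, 500),
                               np.random.normal(2, 0.5, 500)]),
})
# overlay boxplots on the mirrored density plots; printing renders the plot
print(MDplot(demo, BoxPlot=True))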