def derplot(adata=None, filename='derplot', embedding='tsne', feature='sample_type_tech', size=(12, 12), save=False, draw=False, psize=1):
    """Scatter the cells of `adata` on an embedding, coloured by `feature`.

    Builds a classic-themed plotnine scatter of the two embedding columns
    (``<embedding>0`` vs ``<embedding>1``) from ``adata.obs`` and, when
    `save` is true, writes it to
    ``<filename>.<embedding>.<feature>.derplot.png``.  Start/finish timing
    is printed either way.  NOTE(review): the `draw` argument is currently
    unused — confirm whether display-on-screen was ever intended.
    """
    start = datetime.datetime.now()
    p.options.figure_size = size

    savename = '.'.join([filename, embedding, feature, 'derplot', 'png'])
    print(
        start.strftime("%H:%M:%S"),
        'Starting ... \t',
        savename,
    )

    p.theme_set(p.theme_classic())
    mapping = p.aes(embedding + '0', embedding + '1', color=feature)
    pt = (
        p.ggplot(mapping, adata.obs)
        + p.geom_point(size=psize, alpha=1, stroke=0)
        # enlarge legend markers so the colours are readable
        + p.guides(color=p.guide_legend(override_aes={'size': 15}))
    )

    if save:
        pt.save(savename, format='png', dpi=200)

    end = datetime.datetime.now()
    delta = end - start
    print(start.strftime("%H:%M:%S"), str(int(delta.total_seconds())),
          's to make: \t', savename)
def test_annotation_stripes_double():
    """Vertical and horizontal stripes layered on one plot."""
    data = mtcars.assign(gear=pd.Categorical(mtcars.gear),
                         am=pd.Categorical(mtcars.am))
    p = ggplot(data)
    p = p + annotation_stripes(fills=["#0000FF", "#FF0000"], alpha=0.3,
                               direction='vertical')
    p = p + annotation_stripes(fills=["#AAAAAA", "#FFFFFF"], alpha=0.3,
                               direction='horizontal')
    p = p + geom_jitter(aes("gear", "wt", shape="gear", color="am"),
                        random_state=5)
    # work around #229
    p = p + scale_shape_discrete(guide=guide_legend(order=1))
    assert p == "annotation_stripes_double"
def test_annotation_stripes_coord_flip():
    """Stripes must follow the panel when the coordinates are flipped."""
    data = mtcars.assign(gear=pd.Categorical(mtcars.gear),
                         am=pd.Categorical(mtcars.am))
    p = (ggplot(data)
         + annotation_stripes(fills=["#AAAAAA", "#FFFFFF", "#7F7FFF"],
                              alpha=0.3)
         + geom_jitter(aes("gear", "wt", shape="gear", color="am"),
                       random_state=5))
    # mark each stripe boundary explicitly
    for cut in (0.5, 1.5, 2.5, 3.5):
        p = p + geom_vline(xintercept=cut, color="black")
    # work around #229
    p = p + scale_shape_discrete(guide=guide_legend(order=1))
    p = p + coord_flip()
    assert p == "annotation_stripes_coord_flip"
def plot_qq(df, color_var, facet_var=None, title=''):
    """Build a QQ plot of observed vs expected -log10 p-values.

    Inspired by https://www.cureffi.org/2012/08/15/qq-plots-with-matplotlib/
    """
    # the most significant (i.e. largest -log10) p-value fixes both axes,
    # keeping the plot square
    axis_max = max(df['pval_neglog10'])

    # expected p-values are computed per colour group (and per facet, when
    # faceting is requested)
    group_cols = color_var if facet_var is None else [color_var, facet_var]
    pvals = (df.groupby(by=group_cols)
               .apply(calculate_expected_pval)
               .reset_index(level=group_cols, drop=True))

    n_colors = pvals[color_var].nunique()

    qqplot = plt9.ggplot(
        pvals,
        plt9.aes(x='expected_pval_neglog10', y='pval_neglog10',
                 color=color_var))
    qqplot += plt9.geom_point(size=0.1, alpha=0.25)
    # identity line: points above it are more significant than expected
    qqplot += plt9.geom_abline(slope=1, intercept=0, color='black',
                               linetype='dashed')
    qqplot += plt9.theme_bw()
    # Dark2 is a qualitative palette with only 8 colours
    if n_colors < 9:
        qqplot += plt9.scale_colour_brewer(palette='Dark2', type='qual')
    qqplot += plt9.labs(x='Expected (-log10 p-value)',
                        y='Observed (-log10 p-value)',
                        title=title,
                        color='')
    qqplot += plt9.lims(x=(0, axis_max), y=(0, axis_max))
    if facet_var is not None:
        qqplot += plt9.facet_wrap('~ {}'.format(facet_var), ncol=5)
        qqplot += plt9.theme(
            strip_text=plt9.element_text(size=5),
            axis_text_x=plt9.element_text(angle=-45, hjust=0))
    # legend markers are drawn bigger and fully opaque so the colours show
    qqplot += plt9.guides(color=plt9.guide_legend(override_aes={
        'size': 2.0,
        'alpha': 1.0
    }))
    return qqplot
def wraplot(adata=None, filename='wraplot', embedding='tsne', feature='sample_type_tech', size=(12, 12), color=None, save=False, draw=False, psize=1):
    """Facetted embedding plot: one panel per level of `feature`.

    Each panel draws every cell in light grey as background context, then the
    panel's own cells coloured by `color` (defaults to `feature` when not
    given).  When `save` is true the figure is written to a PNG whose name
    encodes filename/embedding/feature; timing is printed either way.
    NOTE(review): the `draw` argument is currently unused.
    """
    start = datetime.datetime.now()
    p.options.figure_size = size

    # Fix 1: use identity comparison (`is None`) instead of `== None`.
    # Fix 2: build `savename` once per branch; the original computed the
    # str(color) name unconditionally and then discarded it when color was
    # None.  Resulting names are unchanged.
    if color is None:
        color = feature
        savename = filename + '.' + embedding + '.' + feature + '.wraplot.png'
    else:
        savename = filename + '.' + embedding + '.' + feature + '.' + str(
            color) + '.png'

    print(
        start.strftime("%H:%M:%S"),
        'Starting ... \t',
        savename,
    )

    pt = (p.ggplot(p.aes(x=embedding + '0', y=embedding + '1', color=color),
                   adata.obs)
          # grey background layer: all cells, so each facet shows the full
          # embedding for context (dropping `feature` stops it facetting)
          + p.geom_point(color='lightgrey', shape='.',
                         data=adata.obs.drop(feature, axis=1))
          + p.geom_point(shape='.', size=psize, alpha=1, stroke=0)
          + p.theme_minimal()
          + p.facet_wrap('~' + feature)
          + p.guides(color=p.guide_legend(override_aes={'size': 10})))

    if save:
        pt.save(savename, format='png', dpi=200)

    end = datetime.datetime.now()
    delta = end - start
    print(start.strftime("%H:%M:%S"), str(int(delta.total_seconds())),
          's to make: \t', savename)
def generate_map(data, region, value_field, iso_field='iso', scale_params=None, plot_na_dots=False, tolerance=None, plot_size=8, out_region_color='#f0f0f0', na_color='#aaaaaa', line_color='#666666', projection=None):
    """
    This function returns a map plot with the specified options.

    :param pandas.DataFrame data: Data to be plotted.
    :param str region: Region to center the map around. Countries outside
        the chosen region will be obscured.
    :param str value_field: Column of *data* with the values to be plotted.
    :param str iso_field: Column of *data* with the ISO3 codes for each
        country.
    :param dict scale_params: Dictionary of parameters to be passed to the
        ggplot corresponding color scale (continuous or discrete).
    :param bool plot_na_dots: Whether to plot the dots for small countries
        if said country doesn't have data available.
    :param int tolerance: Coordinate tolerance for polygon simplification,
        a higher number will result in simpler polygons and faster
        rendering (see DEFAULT_TOLERANCES).
    :param int plot_size: Size of the plot, which determines the relative
        sizes of the elements within.
    :param str out_region_color: Hex color of the countries that are out of
        the specified region.
    :param str na_color: Hex color of the countries with no data available.
    :param str line_color: Color of the country borders.
    :param str projection: Kind of map projection to be used in the map.
        Currently, Oceania (XOX) is only available in ESPG:4326 to enable
        wrapping.
    :returns: a ggplot-like plot with the map
    :rtype: plotnine.ggplot
    """
    if projection is None:
        # Oceania must stay in EPSG:4326 so it can wrap around longitude 180º
        if region == 'XOX':
            projection = 'epsg4326'
        else:
            projection = 'robinson'
    if projection not in PROJECTION_DICT.keys():
        raise ValueError('Projection "{}" not valid'.format(projection))
    if scale_params is None:
        scale_params = {}
    if region not in REGION_BOUNDS[projection]:
        raise ValueError(
            '"region" not available. Valid regions are: {}'.format(
                ', '.join(REGION_BOUNDS[projection].keys())))
    if tolerance is None:
        tolerance = DEFAULT_TOLERANCES[projection][region]

    countries = GeoDataFrame.from_file(
        os.path.join(os.path.dirname(__file__), 'data/world-countries.shp'))

    # To plot Oceania we need the original EPSG:4326 to wrap around the 180º
    # longitude. In other cases transform to the desired projection.
    if region == 'XOX':
        countries.crs['lon_wrap'] = '180'  # Wrap around longitude 180º
        XOX_countries = countries['continent'] == 'XOX'
        countries[XOX_countries] = countries[XOX_countries].to_crs(
            countries.crs)
        centroids = countries[XOX_countries].apply(
            lambda row: row['geometry'].centroid, axis=1)
        countries.loc[XOX_countries, 'lon'] = [c.x for c in centroids]
        countries.loc[XOX_countries, 'lat'] = [c.y for c in centroids]
    else:
        if projection != 'epsg4326':
            countries = countries.to_crs(PROJECTION_DICT[projection])
        centroids = countries.apply(lambda row: row['geometry'].centroid,
                                    axis=1)
        countries['lon'] = [c.x for c in centroids]
        countries['lat'] = [c.y for c in centroids]

    # simplify country polygons for faster rendering
    countries['geometry'] = countries['geometry'].simplify(tolerance)

    # axis limits (in projected coordinates) and aspect ratio of the region
    upper_left, lower_right = REGION_BOUNDS[projection][region]
    limits_x = [upper_left[0], lower_right[0]]
    limits_y = [lower_right[1], upper_left[1]]
    ratio = (limits_x[1] - limits_x[0]) / (limits_y[1] - limits_y[0])

    plot_data = pd.merge(countries, data, how='left', left_on='iso',
                         right_on=iso_field)

    # flag countries whose polygon area is small relative to the region;
    # those are drawn as dots instead of (invisible) polygons
    map_bounds = REGION_BOUNDS['epsg4326'][region]
    map_area = ((map_bounds[1][0] - map_bounds[0][0]) *
                (map_bounds[0][1] - map_bounds[1][1]))
    plot_data['plot_dot'] = (plot_data['pol_area'] < DOT_THRESHOLD * map_area)
    if not plot_na_dots:
        plot_data['plot_dot'] &= ~pd.isnull(plot_data[value_field])

    # split countries into in-region (with data), in-region (missing data)
    # and out-of-region; 'XWX' appears to mean "whole world" here
    if region != 'XWX':
        in_region = ((~pd.isnull(plot_data[value_field])) &
                     (plot_data['continent'] == region))
        in_region_missing = ((pd.isnull(plot_data[value_field])) &
                             (plot_data['continent'] == region))
        out_region = plot_data['continent'] != region
    else:
        in_region = ~pd.isnull(plot_data[value_field])
        in_region_missing = pd.isnull(plot_data[value_field])
        out_region = np.repeat(False, len(plot_data))

    if plot_data[value_field].dtype == 'object':
        # Assume discrete values
        fill_scale = scale_fill_brewer(**scale_params, drop=False)
    else:
        # Assume continuous values
        fill_scale = scale_fill_gradient(**scale_params)

    plot_data_values = plot_data[in_region]
    plot_data_missing = plot_data[in_region_missing]
    plot_data_out_region = plot_data[out_region]

    dots_region = plot_data_values[plot_data_values['plot_dot']]
    dots_region_missing = plot_data_missing[plot_data_missing['plot_dot']]
    dots_out_region = plot_data_out_region[plot_data_out_region['plot_dot']]

    plt = (
        ggplot() +
        geom_map(plot_data_values, aes(fill=value_field), color=line_color,
                 size=0.3) +
        geom_map(plot_data_missing, aes(color='plot_dot'), fill=na_color,
                 size=0.3) +
        geom_map(plot_data_out_region, fill=out_region_color,
                 color=line_color, size=0.3) +
        geom_point(dots_region, aes(x='lon', y='lat', fill=value_field),
                   size=3, stroke=.1, color=line_color) +
        geom_point(dots_region_missing, aes(x='lon', y='lat'), fill=na_color,
                   size=3, stroke=.1, color=line_color) +
        geom_point(dots_out_region, aes(x='lon', y='lat'),
                   fill=out_region_color, size=3, stroke=.1,
                   color=line_color) +
        scale_x_continuous(breaks=[], limits=limits_x) +
        scale_y_continuous(breaks=[], limits=limits_y) +
        theme(
            figure_size=(plot_size * ratio, plot_size),
            panel_background=element_rect(fill='white', color='black'),
            # panel_border=element_rect(fill='white',
            #                           color='black',
            #                           size=.1),
            legend_background=element_rect(fill="white", color='black',
                                           size=.5),
            legend_box_just='left') +
        xlab('') + ylab(''))

    # a discrete fill scale with zero rows would raise, so only add it when
    # there is data to colour
    if len(plot_data_values.index) > 0:
        plt += fill_scale

    # the missing-data layer maps color to 'plot_dot' (always False there),
    # giving a single legend entry that is relabelled "No data available"
    plt += scale_color_manual(name=' ', values=[line_color], breaks=[False],
                              labels=['No data available'])

    if plot_data[value_field].dtype == 'object':
        plt += guides(fill=guide_legend(override_aes={'shape': None}))

    return {
        'plot': plt,
        'ratio': ratio,
    }
# Scatter the samples in UMAP space, coloured by sample group
fig = pn.ggplot(normalized_all_data_UMAPencoded_df, pn.aes(x="1", y="2"))
fig += pn.geom_point(pn.aes(color="sample group"), alpha=0.4)
fig += pn.labs(x="UMAP 1", y="UMAP 2",
               title="Gene expression data in gene space")
fig += pn.theme_bw()
fig += pn.theme(
    legend_title_align="center",
    plot_background=pn.element_rect(fill="white"),
    legend_key=pn.element_rect(fill="white", colour="white"),
    legend_title=pn.element_text(family="sans-serif", size=15),
    legend_text=pn.element_text(family="sans-serif", size=12),
    plot_title=pn.element_text(family="sans-serif", size=15),
    axis_text=pn.element_text(family="sans-serif", size=12),
    axis_title=pn.element_text(family="sans-serif", size=15),
)
fig += pn.scale_color_manual(["#bdbdbd", "red", "blue"])
# points are translucent; force opaque legend markers
fig += pn.guides(colour=pn.guide_legend(override_aes={"alpha": 1}))
# NOTE(review): x is clipped to [9, 10] — presumably zooming on a cluster of
# interest; points outside this window are dropped. Confirm this is intended.
fig += pn.scales.xlim(9, 10)
print(fig)
# -

# Based on a UMAP of the normalized gene expression data, it looks like there isn't a clear separation between WT and mutant samples, though there are only 2 samples per group so this type of clustering observation is limited.
#
# **Takeaway:**
#
# In trying to understand why there are these flat-tops to some of the volcano plots and why some volcano plots are completely flat, we found:
# 1. This behavior is _not_ a result of how we are plotting in python (there was some speculation about there being an issue with the numpy library used)
# 2. The latent space shifting we're doing seems to roughly preserve differences between groups (as seen in [this notebook](https://github.com/greenelab/simulate-expression-compendia/blob/master/Pseudo_experiments/create_heatmap.ipynb) where the structure of the samples is preserved but there is a different set of related genes that are DE. More information can be found in Figure 3D in [this paper](https://academic.oup.com/gigascience/article/9/11/giaa117/5952607)), but this signal can be muddled/noisy depending on where the experiment was shifted to (i.e.
the representation that is found in that location can cause the experiment to have a more compressed difference between groups) as seen in the heatmaps. The heatmap of the two simulation experiments shows that some experiments have a noisier distinction between groups (WT vs mutant), whereas the other simulation experiment has a more distinct difference, with cleaner within-group structure. This definitely points to the need to understand how this simulation process works and how biology is represented in the latent space. This will definitely be a project for the future. For now we at least have an explanation for why we are observing these shapes in the volcano plots.
def density_plot(df, x, group=None, facet_x=None, facet_y=None, position='overlay', sort_groups=True, base_size=10, figure_size=(6, 3), **stat_kwargs):
    '''
    Plot a 1-d density plot

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str
      quoted expression to be plotted on the x axis
    group : str
      quoted expression to be used as group (ie color)
    facet_x : str
      quoted expression to be used as facet
    facet_y : str
      quoted expression to be used as facet
    position : str
      if groups are present, choose between `stack` or `overlay`
    sort_groups : bool
      reverse the fill legend so stacking order matches the legend
    base_size : int
      base size for theme_ez
    figure_size : tuple of int
      figure size
    stat_kwargs : kwargs
      kwargs for the density stat

    Returns
    -------
    g : EZPlot
      EZplot object
    '''

    if position not in ['overlay', 'stack']:
        log.error("position not recognized")
        raise NotImplementedError("position not recognized")

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'],
                          [x, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)

    # fix special cases
    if x == '.index':
        groups['x'] = '.index'
        names['x'] = dataframe.index.name if dataframe.index.name is not None else ''

    # aggregate data and reorder columns
    gdata = agg_data(dataframe, variables, groups, None, fill_groups=False)
    gdata = gdata[[
        c for c in ['x', 'group', 'facet_x', 'facet_y'] if c in gdata.columns
    ]]

    # start plotting
    g = EZPlot(gdata)

    # determine order and create a categorical type
    colors = ez_colors(g.n_groups('group'))

    # set groups: a single fixed-colour density when ungrouped, otherwise one
    # density per group with matched colour/fill
    if group is None:
        g += p9.geom_density(p9.aes(x="x"),
                             stat=p9.stats.stat_density(**stat_kwargs),
                             colour=ez_colors(1)[0],
                             fill=ez_colors(1)[0],
                             **POSITION_KWARGS[position])
    else:
        g += p9.geom_density(p9.aes(x="x",
                                    group="factor(group)",
                                    colour="factor(group)",
                                    fill="factor(group)"),
                             stat=p9.stats.stat_density(**stat_kwargs),
                             **POSITION_KWARGS[position])
    g += p9.scale_fill_manual(values=colors, reverse=False)
    g += p9.scale_color_manual(values=colors, reverse=False)

    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab('Density')

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'],
                                               size=base_size))

    if sort_groups:
        g += p9.guides(fill=p9.guide_legend(reverse=True))

    return g
def area_plot(df, x, y, group=None, facet_x=None, facet_y=None, aggfun='sum', fill=False, sort_groups=True, base_size=10, figure_size=(6, 3)):
    '''
    Aggregates data in df and plots as a stacked area chart.

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str
      quoted expression to be plotted on the x axis
    y : str
      quoted expression to be plotted on the y axis
    group : str
      quoted expression to be used as group (ie color)
    facet_x : str
      quoted expression to be used as facet
    facet_y : str
      quoted expression to be used as facet
    aggfun : str or fun
      function to be used for aggregating (eg sum, mean, median ...)
    fill : bool
      plot shares for each group instead of absolute values
    sort_groups : bool
      sort groups by the sum of their value (otherwise alphabetical order is used)
    base_size : int
      base size for theme_ez
    figure_size : tuple of int
      figure size

    Returns
    -------
    g : EZPlot
      EZplot object
    '''

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'],
                          [x, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)
    names['y'], variables['y'] = unname(y)

    # fix special cases
    if x == '.index':
        groups['x'] = '.index'
        names['x'] = dataframe.index.name if dataframe.index.name is not None else ''

    # aggregate data and reorder columns
    gdata = agg_data(dataframe, variables, groups, aggfun, fill_groups=True)
    gdata['y'].fillna(0, inplace=True)
    gdata = gdata[[
        c for c in ['x', 'y', 'group', 'facet_x', 'facet_y']
        if c in gdata.columns
    ]]

    if fill:
        # normalize each x (and facet) slice to shares that sum to 1;
        # EPSILON guards against division by zero for all-zero slices
        groups_to_normalize = [
            c for c in ['x', 'facet_x', 'facet_y'] if c in gdata.columns
        ]
        total_values = gdata \
            .groupby(groups_to_normalize)['y'] \
            .sum() \
            .reset_index() \
            .rename(columns={'y': 'tot_y'})
        gdata = pd.merge(gdata, total_values, on=groups_to_normalize)
        gdata['y'] = gdata['y'] / (gdata['tot_y'] + EPSILON)
        gdata.drop('tot_y', axis=1, inplace=True)
        ylabeller = percent_labels
    else:
        ylabeller = ez_labels

    # get plot object
    g = EZPlot(gdata)

    # determine order and create a categorical type
    if sort_groups:
        sort_data_groups(g)

    # get colors (flipped so stacking order matches the legend)
    colors = np.flip(ez_colors(g.n_groups('group')))

    # set groups
    if group is None:
        g += p9.geom_area(p9.aes(x="x", y="y"),
                          colour=None,
                          fill=ez_colors(1)[0],
                          na_rm=True)
    else:
        g += p9.geom_area(p9.aes(x="x", y="y",
                                 group="factor(group)",
                                 fill="factor(group)"),
                          colour=None,
                          na_rm=True)
        g += p9.scale_fill_manual(values=colors)

    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_timestamp('x'):
        g += p9.scale_x_datetime()
    elif g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale; headroom is only added for absolute values, not shares
    g += p9.scale_y_continuous(labels=ylabeller,
                               expand=[0, 0, 0.1 * (not fill) + 0.03, 0])

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab(names['y'])

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'],
                                               size=base_size))

    if sort_groups:
        g += p9.guides(fill=p9.guide_legend(reverse=True),
                       color=p9.guide_legend(reverse=True))

    return g
# Label and style the UMAP scatter (fig is created in an earlier cell)
fig += pn.labs(x='UMAP 1', y='UMAP 2',
               title='Gene expression data in gene space')
fig += pn.theme_bw()
fig += pn.theme(
    legend_title_align="center",
    plot_background=pn.element_rect(fill='white'),
    legend_key=pn.element_rect(fill='white', colour='white'),
    legend_title=pn.element_text(family='sans-serif', size=15),
    legend_text=pn.element_text(family='sans-serif', size=12),
    plot_title=pn.element_text(family='sans-serif', size=15),
    axis_text=pn.element_text(family='sans-serif', size=12),
    axis_title=pn.element_text(family='sans-serif', size=15)
)
fig += pn.scale_color_manual(['#bdbdbd', 'red', 'blue'])
# points are translucent; force opaque legend markers
fig += pn.guides(colour=pn.guide_legend(override_aes={'alpha': 1}))

print(fig)

# ## PCA in latent space

# In[21]:

# Locate the trained VAE encoder/decoder architecture and weight files;
# assumes exactly one match per pattern in vae_model_dir
model_encoder_filename = glob.glob(
    os.path.join(vae_model_dir, "*_encoder_model.h5"))[0]

weights_encoder_filename = glob.glob(
    os.path.join(vae_model_dir, "*_encoder_weights.h5"))[0]

model_decoder_filename = glob.glob(
    os.path.join(vae_model_dir, "*_decoder_model.h5"))[0]

weights_decoder_filename = glob.glob(
    os.path.join(vae_model_dir, "*_decoder_weights.h5"))[0]
], axis='columns') df['feature_set'] = model cv_results_df = cv_results_df.append(df) cv_results_summary = (cv_results_df.groupby( ['classify__alpha', 'feature_set'])['mean_test_score'].max().reset_index()) # In[17]: (gg.ggplot( cv_results_summary, gg.aes(x='classify__alpha', y='mean_test_score', color='feature_set')) + gg.geom_jitter(size=4, alpha=0.8, height=0, width=0.05) + gg.scale_x_log10() + gg.labs(x='Regularization strength multiplier (log alpha)', y='CV AUROC') + gg.guides(fill=gg.guide_legend(title="Feature Set")) + gg.aes(ymin=min([0.5, cv_results_summary['mean_test_score'].min()]), ymax=1) + theme_cognoma()) # ## Use optimal hyperparameters to output ROC curve # In[18]: y_pred_dict = { model: { 'train': pipeline.decision_function(X_train), 'test': pipeline.decision_function(X_test) } for model, pipeline in cv_pipelines.items() }
def hist_plot(df, x, y=None, group=None, facet_x=None, facet_y=None, w='1', bins=21, bin_width=None, position='stack', normalize=False, sort_groups=True, base_size=10, figure_size=(6, 3)):
    '''
    Plot a 1-d or 2-d histogram

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str
      quoted expression to be plotted on the x axis
    y : str
      quoted expression to be plotted on the y axis. If this is specified the histogram will be 2-d.
    group : str
      quoted expression to be used as group (ie color)
    facet_x : str
      quoted expression to be used as facet
    facet_y : str
      quoted expression to be used as facet
    w : str
      quoted expression representing histogram weights (default is 1)
    bins : int or tuple
      number of bins to be used
    bin_width : float or tuple
      bin width to be used
    position : str
      if groups are present, choose between `stack`, `overlay` or `dodge`
    normalize : bool
      normalize histogram counts
    sort_groups : bool
      sort groups by the sum of their value (otherwise alphabetical order is used)
    base_size : int
      base size for theme_ez
    figure_size : tuple of int
      figure size

    Returns
    -------
    g : EZPlot
      EZplot object
    '''

    if position not in ['overlay', 'stack', 'dodge']:
        log.error("position not recognized")
        raise NotImplementedError("position not recognized")
    # NOTE(review): "bin_with" in the messages below looks like a typo for
    # "bin_width"; left untouched here since these are runtime strings.
    if (bins is None) and (bin_width is None):
        log.error("Either bins or bin_with should be defined")
        raise ValueError("Either bins or bin_with should be defined")
    if (bins is not None) and (bin_width is not None):
        log.error("Only one between bins or bin_with should be defined")
        raise ValueError("Only one between bins or bin_with should be defined")
    if (y is not None) and (group is not None):
        log.error("y and group cannot be requested at the same time")
        raise ValueError("y and group cannot be requested at the same time")

    # normalize bins/bin_width to (x, y) pairs
    if y is None:
        bins = (bins, bins)
        bin_width = (bin_width, bin_width)
    else:
        if type(bins) not in [tuple, list]:
            bins = (bins, bins)
        if type(bin_width) not in [tuple, list]:
            bin_width = (bin_width, bin_width)

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'y', 'group', 'facet_x', 'facet_y'],
                          [x, y, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)
    names['w'], variables['w'] = unname(w)

    # set column names and evaluate expressions
    tmp_df = agg_data(dataframe, variables, groups, None, fill_groups=False)

    # redefine groups and variables; remove and store (eventual) names
    new_groups = {
        c: c
        for c in tmp_df.columns
        if c in ['x', 'y', 'group', 'facet_x', 'facet_y']
    }
    non_xy_groups = [g for g in new_groups.keys() if g not in ['x', 'y']]
    new_variables = {'w': 'w'}

    # bin data (if necessary); object dtype means categorical, no binning
    if tmp_df['x'].dtypes != np.dtype('O'):
        tmp_df['x'], bins_x, bin_width_x = bin_data(tmp_df['x'], bins[0],
                                                    bin_width[0])
    else:
        bin_width_x = 1
    if y is not None:
        if tmp_df['y'].dtypes != np.dtype('O'):
            tmp_df['y'], bins_y, bin_width_y = bin_data(tmp_df['y'], bins[1],
                                                        bin_width[1])
        else:
            bin_width_y = 1
    else:
        bin_width_y = 1

    # aggregate data and reorder columns
    gdata = agg_data(tmp_df, new_variables, new_groups, 'sum',
                     fill_groups=True)
    gdata.fillna(0, inplace=True)
    gdata = gdata[[
        c for c in ['x', 'y', 'w', 'group', 'facet_x', 'facet_y']
        if c in gdata.columns
    ]]

    # normalize counts to a density (divide by total weight x bin area)
    if normalize:
        if len(non_xy_groups) == 0:
            gdata['w'] = gdata['w'] / (gdata['w'].sum() * bin_width_x *
                                       bin_width_y)
        else:
            gdata['w'] = gdata.groupby(non_xy_groups)['w'].apply(
                lambda x: x / (x.sum() * bin_width_x * bin_width_y))

    # start plotting
    g = EZPlot(gdata)

    # determine order and create a categorical type
    if (group is not None) and sort_groups:
        if g.column_is_categorical('x'):
            g.sort_group('x', 'w', ascending=False)
        g.sort_group('group', 'w')
        g.sort_group('facet_x', 'w', ascending=False)
        g.sort_group('facet_y', 'w', ascending=False)
        # NOTE(review): `groups` (the dict built above) is always non-empty
        # here, so this branch always flips — possibly `if sort_groups:` was
        # intended. Left as-is; verify against upstream ezplot.
        if groups:
            colors = np.flip(ez_colors(g.n_groups('group')))
    elif (group is not None):
        colors = ez_colors(g.n_groups('group'))

    if y is None:
        # 1-d histogram drawn as bars

        # set groups
        if group is None:
            g += p9.geom_bar(p9.aes(x="x", y="w"),
                             stat='identity',
                             colour=None,
                             fill=ez_colors(1)[0])
        else:
            g += p9.geom_bar(p9.aes(x="x", y="w",
                                    group="factor(group)",
                                    fill="factor(group)"),
                             colour=None,
                             stat='identity',
                             **POSITION_KWARGS[position])
            g += p9.scale_fill_manual(values=colors)

        # set facets
        if facet_x is not None and facet_y is None:
            g += p9.facet_wrap('~facet_x')
        if facet_x is not None and facet_y is not None:
            g += p9.facet_grid('facet_y~facet_x')

        # set x scale
        if g.column_is_categorical('x'):
            g += p9.scale_x_discrete()
        else:
            g += p9.scale_x_continuous(labels=ez_labels)

        # set y scale
        g += p9.scale_y_continuous(labels=ez_labels)

        # set axis labels
        g += \
            p9.xlab(names['x']) + \
            p9.ylab('Counts')

        # set theme
        g += theme_ez(figure_size=figure_size,
                      base_size=base_size,
                      legend_title=p9.element_text(text=names['group'],
                                                   size=base_size))

        if sort_groups:
            g += p9.guides(fill=p9.guide_legend(reverse=True))

    else:
        # 2-d histogram drawn as a heat-map of tiles

        g += p9.geom_tile(p9.aes(x="x", y="y", fill='w'),
                          stat='identity',
                          colour=None)

        # set facets
        if facet_x is not None and facet_y is None:
            g += p9.facet_wrap('~facet_x')
        if facet_x is not None and facet_y is not None:
            g += p9.facet_grid('facet_y~facet_x')

        # set x scale
        if g.column_is_categorical('x'):
            g += p9.scale_x_discrete()
        else:
            g += p9.scale_x_continuous(labels=ez_labels)

        # set y scale
        if g.column_is_categorical('y'):
            g += p9.scale_y_discrete()
        else:
            g += p9.scale_y_continuous(labels=ez_labels)

        # set axis labels
        g += \
            p9.xlab(names['x']) + \
            p9.ylab(names['y'])

        # set theme
        g += theme_ez(figure_size=figure_size,
                      base_size=base_size,
                      legend_title=p9.element_text(text='Counts',
                                                   size=base_size))

    return g
    x="median expression of PAO1-only genes (TPM)",
    y="median expression of PA14-only genes (TPM)",
    title="TPM of accessory genes in binned PAO1 compendium",
)
fig1 += pn.theme_bw()
fig1 += pn.theme(
    legend_title_align="center",
    plot_background=pn.element_rect(fill="white"),
    legend_key=pn.element_rect(fill="white", colour="white"),
    legend_title=pn.element_text(family="sans-serif", size=15),
    legend_text=pn.element_text(family="sans-serif", size=12),
    plot_title=pn.element_text(family="sans-serif", size=15),
    axis_text=pn.element_text(family="sans-serif", size=10),
    axis_title=pn.element_text(family="sans-serif", size=12),
)
# points are translucent; force opaque legend markers
fig1 += pn.guides(colour=pn.guide_legend(override_aes={"alpha": 1}))

print(fig1)

# +
# Plot accessory gene expression in PA14 compendium
fig2 = pn.ggplot(
    pao1_pa14_acc_pa14_compendium_label,
    pn.aes(x="median acc expression_pao1", y="median acc expression_pa14"),
)
fig2 += pn.geom_point(pn.aes(color="Strain type_pa14"), alpha=0.4)
fig2 += pn.labs(
    x="median expression of PAO1-only genes (TPM)",
    y="median expression of PA14-only genes (TPM)",
    title="TPM of accessory genes in binned PA14 compendium",
)
def variable_histogram(df, x, group=None, facet_y=None, w='1', bins=21, bin_width=None, position='stack', normalize=False, base_size=10, figure_size=(6, 3)):
    '''
    Plot a 1-d histogram

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str or list
      quoted expressions to be plotted on the x axis
    group : str
      quoted expression to be used as group (ie color)
    facet_y : str
      quoted expression to be used as facet
    w : str
      quoted expression representing histogram weights (default is 1)
    bins : int or tuple
      number of bins to be used
    bin_width : float or tuple
      bin width to be used
    position : str
      if groups are present, choose between `stack`, `overlay` or `dodge`
    normalize : bool
      normalize histogram counts
    base_size : int
      base size for theme_ez
    figure_size : tuple of int
      figure size

    Returns
    -------
    g : EZPlot
      EZplot object
    '''

    # TODO: performance improvement
    # TODO: add support for categorical variables in x

    if position not in ['overlay', 'stack', 'dodge']:
        log.error("position not recognized")
        raise NotImplementedError("position not recognized")
    # NOTE(review): "bin_with" in the messages below looks like a typo for
    # "bin_width"; left untouched since these are runtime strings.
    if (bins is None) and (bin_width is None):
        log.error("Either bins or bin_with should be defined")
        raise ValueError("Either bins or bin_with should be defined")
    if (bins is not None) and (bin_width is not None):
        log.error("Only one between bins or bin_with should be defined")
        raise ValueError(
            "Only one between bins or bin_with should be defined")

    # a single expression is treated as a one-element list
    if isinstance(x, str):
        x = [x]

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['group', 'facet_y'], [group, facet_y]):
        names[label], groups[label] = unname(var)

    # each x expression becomes its own internal column x_0, x_1, ...
    xs = []
    for i, var in enumerate(x):
        xs.append('x_{}'.format(i))
        names['x_{}'.format(i)], groups['x_{}'.format(i)] = unname(var)

    names['w'], variables['w'] = unname(w)

    # set column names and evaluate expressions
    tmp_df = agg_data(dataframe, variables, groups, None, fill_groups=False)

    # redefine groups and variables; remove and store (eventual) names
    new_groups = {
        c: c
        for c in tmp_df.columns if c in ['group', 'facet_y'] + xs
    }
    non_x_groups = [g for g in new_groups.keys() if g not in xs]

    # bin data (if necessary); object dtype means categorical, no binning
    # NOTE(review): the loop variable `x` below shadows the `x` parameter.
    bins_x = {}
    bin_width_x = {}
    for x in xs:
        if tmp_df[x].dtypes != np.dtype('O'):
            tmp_df[x], bins_x[x], bin_width_x[x] = bin_data(
                tmp_df[x], bins, bin_width)
        else:
            bin_width_x[x] = 1

    # aggregate data and reorder columns; one dataframe per x expression,
    # each tagged with a facet_x label carrying the expression's name
    df_ls = []
    for x in xs:
        # aggregate data
        groups = {g: g for g in non_x_groups}
        groups[x] = x
        single_df = agg_data(tmp_df, variables, groups, 'sum',
                             fill_groups=True)
        single_df.fillna(0, inplace=True)
        single_df['facet_x'] = names[x]
        single_df.rename(columns={x: 'x'}, inplace=True)

        # normalize counts to a density within each x expression
        if normalize:
            if len(non_x_groups) == 0:
                single_df['w'] = single_df['w'] / (single_df['w'].sum() *
                                                   bin_width_x[x])
            else:
                single_df['w'] = single_df.groupby(non_x_groups)['w'].apply(
                    lambda z: z / (z.sum() * bin_width_x[x]))

        df_ls.append(single_df)

    gdata = pd.concat(df_ls)
    gdata = gdata[[
        c for c in ['x', 'w', 'group', 'facet_x', 'facet_y']
        if c in gdata.columns
    ]]

    # start plotting
    g = EZPlot(gdata)

    # set groups: one bar layer per x expression so each facet gets its data
    for single_df in df_ls:
        if group is None:
            g += p9.geom_bar(p9.aes(x="x", y="w"),
                             data=single_df,
                             stat='identity',
                             colour=None,
                             fill=ez_colors(1)[0])
        else:
            g += p9.geom_bar(p9.aes(x="x", y="w",
                                    group="factor(group)",
                                    fill="factor(group)"),
                             data=single_df,
                             colour=None,
                             stat='identity',
                             **POSITION_KWARGS[position])
    g += p9.scale_fill_manual(values=ez_colors(g.n_groups('group')))

    # set facets; x scales are free because each facet is a different variable
    if facet_y is None:
        g += p9.facet_wrap('~facet_x', scales='free')
    else:
        g += p9.facet_grid('facet_y~facet_x', scales='free')

    # set x scale
    g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab('Value') + \
        p9.ylab('Counts')

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'],
                                               size=base_size))

    g += p9.guides(fill=p9.guide_legend(reverse=True))

    return g
# Colours for the pre-/post-calibration curves.
# Fix: `pd.np` was deprecated in pandas 0.25 and removed in 1.0; to_hex
# accepts an RGBA tuple of floats in [0, 1] directly, so normalize the
# 8-bit channel values inline (identical hex output, alpha ignored by
# default as before).
color_map = {
    "before": mcolors.to_hex((178 / 255, 223 / 255, 138 / 255, 1.0)),
    "after": mcolors.to_hex((31 / 255, 120 / 255, 180 / 255, 1.0)),
}

# In[14]:

# Reliability diagram: predicted vs actual probability, before and after
# calibration, one panel per relation; the dashed identity line marks
# perfect calibration.
g = (
    p9.ggplot(calibration_df,
              p9.aes(x="predicted", y="actual", color="model_calibration")) +
    p9.geom_point() +
    p9.geom_path() +
    p9.geom_abline(p9.aes(slope=1, intercept=0),
                   linetype='dashed',
                   color='black') +
    p9.scale_color_manual(values={
        "before": color_map["before"],
        "after": color_map["after"]
    }) +
    p9.facet_wrap("relation") +
    p9.labs(x="Predicted", y="Actual") +
    p9.guides(color=p9.guide_legend(title="Model Calibration")) +
    p9.theme_bw()
)
print(g)
g.save(filename="../model_calibration.png", dpi=300)
        rel
    })

edges_df = pd.DataFrame.from_records(datarows)
edges_df

# In[11]:

import math

# Bar chart of edge counts per relation type, split by whether the edge
# already exists in Hetionet; bars are annotated with count and recall.
g = (p9.ggplot(edges_df,
               p9.aes(x="relation", y="edges", fill="in_hetionet")) +
     p9.geom_col(position="dodge") +
     p9.scale_fill_manual(values={
         "Existing": color_map["Existing"],
         "Novel": color_map["Novel"]
     }) +
     # label each bar "count\n(recall%)"; novel edges have NaN recall and
     # get the bare count
     p9.geom_text(p9.aes(label=(
         edges_df.apply(
             lambda x: f"{x['edges']}\n({x['recall']*100:.0f}%)"
             if not math.isnan(x['recall']) else f"{x['edges']}",
             axis=1))),
         position=p9.position_dodge(width=0.9),
         size=9,
         va="bottom") +
     p9.scale_y_log10() +
     p9.labs(y="# of Edges",
             x="Relation Type",
             title="Reconstructing Edges in Hetionet") +
     p9.guides(fill=p9.guide_legend(title="In Hetionet?")) +
     p9.theme(
         axis_text_y=p9.element_blank(),
         axis_ticks_major=p9.element_blank(),
         rect=p9.element_blank(),
     ))
print(g)
g.save(filename="../edges_added.png", dpi=300)
# Plot fig = ggplot(input_data_UMAPencoded_df, aes(x='1', y='2')) fig += geom_point(aes(color='dataset'), alpha=0.2) fig += labs(x ='UMAP 1', y = 'UMAP 2', title = 'UMAP of normalized compendium') fig += theme_bw() fig += theme( legend_title_align = "center", plot_background=element_rect(fill='white'), legend_key=element_rect(fill='white', colour='white'), legend_title=element_text(family='sans-serif', size=15), legend_text=element_text(family='sans-serif', size=12), plot_title=element_text(family='sans-serif', size=15), axis_text=element_text(family='sans-serif', size=12), axis_title=element_text(family='sans-serif', size=15) ) fig += guides(colour=guide_legend(override_aes={'alpha': 1})) fig += scale_color_manual(['#ff6666', '#add8e6']) print(fig) # **Observations:** # * There looks to be a good amount of variance in the compendium overall. # * Using a split of 25% seems to get a similar distribution of data between training and validation sets. # * Remember, the dataset is in 17K dimensional space, which will make the small clusters difficult to represent during training # # Overall, having so many features in our dataset, points to the need for more samples to represent the structure in the compendium. For now, we are limited by memory to only select a subset of recount2, but in a future iteration perhaps this will be updated.
# Best CV score per (alpha, feature set) combination
cv_results_summary = (cv_results_df
                      .groupby(['classify__alpha',
                                'feature_set'])['mean_test_score']
                      .max()
                      .reset_index())

# In[17]:

(gg.ggplot(cv_results_summary,
           gg.aes(x='classify__alpha',
                  y='mean_test_score',
                  color='feature_set')) +
 gg.geom_jitter(size=4, alpha=0.8, height=0, width=0.05) +
 gg.scale_x_log10() +
 gg.labs(x='Regularization strength multiplier (log alpha)',
         y='CV AUROC') +
 # Fix: the points are mapped to `color`, not `fill`, so the legend-title
 # override must target the color guide to take effect.
 gg.guides(color=gg.guide_legend(title="Feature Set")) +
 gg.aes(ymin=min([0.5, cv_results_summary['mean_test_score'].min()]),
        ymax=1) +
 theme_cognoma()
 )

# ## Use optimal hyperparameters to output ROC curve

# In[18]:

# Decision-function scores on train/test splits for each fitted pipeline
y_pred_dict = {
    model: {
        'train': pipeline.decision_function(X_train),
        'test': pipeline.decision_function(X_test)
    }
    for model, pipeline in cv_pipelines.items()
}