def cell_cycle_phase_barplot(adata, palette='Set2'):
    """Plot how many cells fall in each phase of the cell cycle.

    See also: cell_cycle_phase_pieplot for the matplotlib pie chart

    Parameters
    -----------
    adata: AnnData
        The AnnData object being used for the analysis. Must be previously
        evaluated by `tl.annotate_cell_cycle`.
    palette:
        Palette forwarded to `scale_fill_brewer` (qualitative type).

    Returns
    -----------
    A plotnine horizontal barplot with the total counts of cells in each
    phase of the cell cycle.
    """
    phase_order = ['G1 post-mitotic', 'G1 pre-replication', 'S/G2/M']

    # Work on a copy so that `adata.obs` itself is never modified.
    obs = adata.obs.copy()
    # An explicit category list fixes the order of the bars.
    obs['cell_cycle_phase'] = pd.Categorical(
        obs['cell_cycle_phase'], categories=phase_order)

    # After coord_flip the counts run horizontally, so only the x grid
    # lines are kept (and thickened); the y grid lines are removed.
    grid_theme = theme(
        panel_grid_major_y=element_blank(),
        panel_grid_minor_y=element_blank(),
        panel_grid_major_x=element_line(size=1.5),
        panel_grid_minor_x=element_line(size=1.5),
    )

    base = ggplot(obs, aes('cell_cycle_phase', fill='cell_cycle_phase'))
    return (
        base
        + geom_bar()
        + coord_flip()
        + guides(fill=False)
        + labs(y='', x='Cell cycle phase')
        + theme_light()
        + grid_theme
        + scale_fill_brewer(type='qual', palette=palette)
    )
def comparison_plot(self, df: pd.DataFrame, xmin=None, xmax=None,
                    bw="normal_reference", **kwargs):
    """Overlay density curves of the second column of ``df``, one per group.

    :param df: data frame whose first column is used as the fill (grouping)
        variable and whose second column is plotted on the x axis
    :param xmin: lower x limit, forwarded to ``self._scale_x``
    :param xmax: upper x limit, forwarded to ``self._scale_x``
    :param bw: bandwidth argument forwarded to ``geom_density``
    :param kwargs: accepted but not used by this method
    :return: ggplot graphics object
    """
    group_column = df.columns[0]
    value_column = df.columns[1]

    canvas = ggplot(df, aes(value_column, fill=group_column))
    fill_scale = scale_fill_brewer(type="qual", palette="Pastel1")
    densities = geom_density(bw=bw, alpha=0.8)
    heading = ggtitle(self.plot_title)
    x_scale = self._scale_x(xmin, xmax)

    return canvas + fill_scale + densities + heading + x_scale + ergo_theme
def all_stack(fold=BUZZER_DEV_FOLD):
    """Save a stacked-area plot of buzzing frequency, faceted by buzzer model.

    The tables returned by ``stack`` for the RNN, MLP and Threshold buzzers
    are concatenated, then drawn as one area chart per model with
    'Position' on x, 'Frequency' on y and 'Buzzing' as the fill.

    :param fold: fold name; forwarded to ``stack`` and used in the output
        file name
    :return: None. The figure is written to
        'output/buzzer/<fold>_stack.pdf'.
    """
    # Local import: the block needs pandas by name for `concat`.
    import pandas as pd

    frames = [
        stack('output/buzzer/RNNBuzzer', 'RNN', fold),
        stack('output/buzzer/MLPBuzzer', 'MLP', fold),
        stack('output/buzzer/ThresholdBuzzer', 'Threshold', fold),
    ]
    # DataFrame.append was removed in pandas 2.0; concat with
    # ignore_index=True yields the same stacked table.
    df = pd.concat(frames, ignore_index=True)

    # An ordered category list fixes the left-to-right order of the facets.
    model_type = CategoricalDtype(categories=['Threshold', 'MLP', 'RNN'])
    df['Model'] = df['Model'].astype(model_type)

    p = (ggplot(df)
         + geom_area(aes(x='Position', y='Frequency', fill='Buzzing'))
         + facet_grid('~ Model')
         + theme_fs()
         + theme(aspect_ratio=1)
         + scale_fill_brewer(type='div', palette=7))
    p.save('output/buzzer/{}_stack.pdf'.format(fold))
def comparison_plot(  # type: ignore
        self, df: pd.DataFrame, xmin=None, xmax=None, bins: int = 50,
        **kwargs):
    """Draw one histogram panel per group for the second column of ``df``.

    :param df: data frame whose first column is the grouping variable (used
        for both fill and faceting) and whose second column is plotted on
        the x axis
    :param xmin: lower x limit, forwarded to ``self._scale_x``
    :param xmax: upper x limit, forwarded to ``self._scale_x``
    :param bins: number of histogram bins
    :param kwargs: accepted but not used by this method
    :return: ggplot graphics object
    """
    group_column = df.columns[0]
    value_column = df.columns[1]

    canvas = ggplot(df, aes(value_column, fill=group_column))
    fill_scale = scale_fill_brewer(type="qual", palette="Pastel1")
    bars = geom_histogram(position="identity", alpha=0.9, bins=bins)
    x_scale = self._scale_x(xmin, xmax)
    # One panel per group, stacked vertically; the fill legend would only
    # repeat the panel titles, so it is switched off.
    panels = facet_wrap(group_column, ncol=1)
    tilted_ticks = theme(axis_text_x=element_text(rotation=45, hjust=1))

    return (canvas + fill_scale + bars + x_scale + panels
            + guides(fill=False) + ergo_theme + tilted_ticks)
def plot_preprocessing_boxplot_bymodel(dataframe, models_labels,
                                       metrics_labels, groups_labels,
                                       figure_size=(14, 4)):
    """
    Build a grid of dodged boxplots with one facet row per model.

    The plot reads the 'variable' (x), 'value' (y), 'group' (fill) and
    'model' (facet rows) columns of `dataframe`. The three `*_labels`
    arguments are lookups, indexed by the raw value, that give the text
    shown for each metric, group and model. `figure_size` is forwarded to
    the plot theme.
    """
    def metric_names(keys):
        return [metrics_labels[key] for key in keys]

    def group_names(keys):
        return [groups_labels[key] for key in keys]

    def two_decimals(values):
        return ['{:.2f}'.format(value) for value in values]

    def model_name(key):
        return f'{models_labels[key]}'

    canvas = p9.ggplot(dataframe,
                       p9.aes(x='variable', y='value', fill='group'))
    boxes = p9.geom_boxplot(position='dodge')

    # Axis scales; the y axis is pinned to [-0.25, 1].
    x_scale = p9.scale_x_discrete(name='Metric', labels=metric_names)
    y_scale = p9.scale_y_continuous(
        name='Value',
        expand=(0, 0.05),
        limits=[-0.25, 1],
        labels=two_decimals)

    # Colour-blind friendly palette for the groups.
    fill_scale = p9.scale_fill_brewer(
        name='Group', labels=group_names, type='qual', palette='Set2')

    # One row per model, with the row strips renamed.
    grid = p9.facet_grid(
        'model ~ .', scales='free_y',
        labeller=p9.labeller(rows=model_name))

    look = p9.theme(
        # No axis titles.
        axis_title_x=p9.element_blank(),
        axis_title_y=p9.element_blank(),
        # Small tick labels on both axes.
        axis_text_x=p9.element_text(size=7),
        axis_text_y=p9.element_text(size=7),
        # Legend on top, untitled, with a reduced margin.
        legend_title=p9.element_blank(),
        legend_position='top',
        legend_box_margin=2,
        figure_size=figure_size,
    )

    return canvas + boxes + x_scale + y_scale + fill_scale + grid + look
def all_stack(fold=BUZZER_DEV_FOLD):
    """Plot buzzing frequency by position for every buzzer and save it.

    Calls ``stack`` once per buzzer output directory (RNN, MLP, Threshold),
    joins the three resulting tables and renders a stacked-area chart
    ("Position" vs "Frequency", filled by "Buzzing") with one facet column
    per model.

    :param fold: fold name; passed to ``stack`` and embedded in the output
        file name
    :return: None. The plot is saved as "output/buzzer/<fold>_stack.pdf".
    """
    # Function-scope import so this block brings `pd` into scope itself.
    import pandas as pd

    sources = (
        ("output/buzzer/RNNBuzzer", "RNN"),
        ("output/buzzer/MLPBuzzer", "MLP"),
        ("output/buzzer/ThresholdBuzzer", "Threshold"),
    )
    # pandas 2.0 removed DataFrame.append; pd.concat is the replacement and
    # produces the same row order and a fresh 0..n-1 index.
    df = pd.concat(
        [stack(directory, model, fold) for directory, model in sources],
        ignore_index=True,
    )

    # The category order controls the order of the facet columns.
    model_type = CategoricalDtype(categories=["Threshold", "MLP", "RNN"])
    df["Model"] = df["Model"].astype(model_type)

    p = (
        ggplot(df)
        + geom_area(aes(x="Position", y="Frequency", fill="Buzzing"))
        + facet_grid("~ Model")
        + theme_fs()
        + theme(aspect_ratio=1)
        + scale_fill_brewer(type="div", palette=7)
    )
    p.save("output/buzzer/{}_stack.pdf".format(fold))
def _make_plots(df_plt, out_file_base, y='AUC', facet_grid='', h_line=''):
    """Draw boxplots of one metric per resolution and save them as a PNG.

    Parameters
    ----------
    df_plt : data frame with a 'resolution' column, the column named by
        `y`, and optionally a 'sparsity_l1' column. When 'sparsity_l1' is
        present a 'Sparsity' column is added to `df_plt` in place.
    out_file_base : prefix of the output file name.
    y : name of the column plotted on the y axis.
    facet_grid : if not '', name of the variable used for facet rows.
    h_line : if not '', y position of a dash-dotted horizontal line.

    The figure is written to '<out_file_base>-resolution__<y>.png', with
    any '-' in `y` replaced by '_'.
    """
    n_resolutions = len(np.unique(df_plt['resolution']))
    n_sparsities = 0
    if 'sparsity_l1' in df_plt.columns:
        df_plt['Sparsity'] = df_plt['sparsity_l1']
        n_sparsities = len(np.unique(df_plt['Sparsity']))

    # Only split the boxes by sparsity when there is more than one level.
    if n_sparsities > 1:
        base_mapping = plt9.aes(fill='Sparsity', x='resolution', y=y)
        points = plt9.geom_jitter(
            plt9.aes(color='Sparsity'), alpha=0.25, width=0.2)
    else:
        base_mapping = plt9.aes(x='resolution', y=y)
        points = plt9.geom_jitter(alpha=0.25, width=0.2)

    if y == 'f1-score':
        y_label = 'F1 score'
    elif y in ['AUC', 'MCC']:
        y_label = y
    else:
        y_label = y.capitalize().replace('_', ' ')

    # Collect the plot components in the order they are added.
    pieces = [
        # Outliers are hidden because the jittered points already show them.
        plt9.geom_boxplot(alpha=0.8, outlier_alpha=0),
        points,
        plt9.theme_bw(base_size=12),
    ]
    if facet_grid != '':
        pieces.append(plt9.facet_grid('{} ~ .'.format(facet_grid)))
    pieces.append(plt9.labs(x='Resolution', y=y_label, title=''))
    pieces.append(
        plt9.theme(axis_text_x=plt9.element_text(angle=-45, hjust=0)))
    if 0 < n_sparsities < 9:
        pieces.append(plt9.scale_fill_brewer(palette='Dark2', type='qual'))
    if h_line != '':
        pieces.append(
            plt9.geom_hline(plt9.aes(yintercept=h_line), linetype='dashdot'))

    gplt = plt9.ggplot(df_plt, base_mapping)
    for piece in pieces:
        gplt = gplt + piece

    out_file = '{}-resolution__{}.png'.format(
        out_file_base, y.replace('-', '_'))
    # The width grows with the number of resolution and sparsity levels.
    gplt.save(
        out_file,
        dpi=300,
        width=4 * ((n_resolutions + n_sparsities) / 4),
        height=5,
        limitsize=False)
def protobowl(fold=BUZZER_DEV_FOLD):
    """Plot outcome counts per possibility for each buzzer and save a PDF.

    For each of the RNN, MLP and Threshold buzzers this loads the pickled
    results from "output/buzzer/<Buzzer>/<fold>_protobowl.pkl", counts the
    rows per ("Possibility", "Outcome") pair and tags the counts with the
    model name. The combined table is drawn as horizontal stacked bars, one
    facet row per model.

    :param fold: fold name used in the input and output file names
    :return: None. The figure is written to
        "output/buzzer/<fold>_protobowl.pdf".
    """
    sources = (
        ("RNNBuzzer", "RNN"),
        ("MLPBuzzer", "MLP"),
        ("ThresholdBuzzer", "Threshold"),
    )

    per_model = []
    for buzzer_dir, model_name in sources:
        pkl_path = "output/buzzer/{}/{}_protobowl.pkl".format(buzzer_dir, fold)
        # NOTE(review): pickle.load can execute arbitrary code; this is only
        # safe if these files are produced by this project's own runs.
        # The context manager closes the file handle once it has been read.
        with open(pkl_path, "rb") as pkl_file:
            results = pickle.load(pkl_file)
        counts = (
            results.groupby(["Possibility", "Outcome"])
            .size()
            .reset_index()
            .rename(columns={0: "Count"})
        )
        counts["Model"] = model_name
        per_model.append(counts)

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    df = pd.concat(per_model, ignore_index=True)

    # Category order fixes the stacking order of outcomes and the order of
    # the facet rows.
    outcome_type = CategoricalDtype(categories=[15, 10, 5, 0, -5, -10, -15])
    df["Outcome"] = df["Outcome"].astype(outcome_type)
    model_type = CategoricalDtype(categories=["Threshold", "MLP", "RNN"])
    df["Model"] = df["Model"].astype(model_type)

    p = (
        ggplot(df)
        + geom_col(aes(x="Possibility", y="Count", fill="Outcome"), width=0.7)
        + facet_grid("Model ~")
        + coord_flip()
        + theme_fs()
        + theme(aspect_ratio=0.17)
        + scale_fill_brewer(type="div", palette=7)
    )
    p.save("output/buzzer/{}_protobowl.pdf".format(fold))
def all_stack(fold=BUZZER_DEV_FOLD):
    """Render and save the stacked buzzing-frequency plot for all buzzers.

    One table per buzzer (RNN, MLP, Threshold) is obtained from ``stack``.
    The tables are concatenated and drawn as an area chart of 'Frequency'
    over 'Position', filled by 'Buzzing', with one facet per 'Model'.

    :param fold: fold name; handed to ``stack`` and used to build the
        output file name
    :return: None. The plot is saved to 'output/buzzer/<fold>_stack.pdf'.
    """
    # Imported here so the block does not rely on a module-level `pd`.
    import pandas as pd

    df_rnn = stack('output/buzzer/RNNBuzzer', 'RNN', fold)
    df_mlp = stack('output/buzzer/MLPBuzzer', 'MLP', fold)
    df_thr = stack('output/buzzer/ThresholdBuzzer', 'Threshold', fold)

    # DataFrame.append no longer exists in pandas >= 2.0, so the three
    # tables are stacked with a single concat call instead.
    df = pd.concat([df_rnn, df_mlp, df_thr], ignore_index=True)

    # Facets follow the category order given here.
    model_type = CategoricalDtype(
        categories=['Threshold', 'MLP', 'RNN'])
    df['Model'] = df['Model'].astype(model_type)

    p = (
        ggplot(df)
        + geom_area(aes(x='Position', y='Frequency', fill='Buzzing'))
        + facet_grid('~ Model')
        + theme_fs()
        + theme(
            aspect_ratio=1,
        )
        + scale_fill_brewer(type='div', palette=7)
    )
    p.save('output/buzzer/{}_stack.pdf'.format(fold))
def protobowl(fold=BUZZER_DEV_FOLD):
    """Save a bar chart of outcome counts per possibility for each buzzer.

    Loads 'output/buzzer/<Buzzer>/<fold>_protobowl.pkl' for the RNN, MLP
    and Threshold buzzers, counts rows for every ('Possibility', 'Outcome')
    combination, labels each count table with its model and plots the
    combined table as flipped stacked bars faceted by model.

    :param fold: fold name used in the input and output file names
    :return: None. The figure is written to
        'output/buzzer/<fold>_protobowl.pdf'.
    """

    def load_counts(buzzer_dir, model_name):
        """Return the per-(Possibility, Outcome) counts for one buzzer."""
        pkl_path = 'output/buzzer/{}/{}_protobowl.pkl'.format(
            buzzer_dir, fold)
        # NOTE(review): unpickling runs arbitrary code; only safe for files
        # written by this project itself.
        # `with` guarantees the file is closed after loading.
        with open(pkl_path, 'rb') as pkl_file:
            results = pickle.load(pkl_file)
        counts = results.groupby(['Possibility', 'Outcome']).size()
        counts = counts.reset_index().rename(columns={0: 'Count'})
        counts['Model'] = model_name
        return counts

    # DataFrame.append was removed in pandas 2.0; use concat instead.
    df = pd.concat(
        [
            load_counts('RNNBuzzer', 'RNN'),
            load_counts('MLPBuzzer', 'MLP'),
            load_counts('ThresholdBuzzer', 'Threshold'),
        ],
        ignore_index=True)

    # Explicit category orders fix the outcome stacking and facet order.
    outcome_type = CategoricalDtype(categories=[15, 10, 5, 0, -5, -10, -15])
    df['Outcome'] = df['Outcome'].astype(outcome_type)
    model_type = CategoricalDtype(
        categories=['Threshold', 'MLP', 'RNN'])
    df['Model'] = df['Model'].astype(model_type)

    p = (
        ggplot(df)
        + geom_col(aes(x='Possibility', y='Count', fill='Outcome'),
                   width=0.7)
        + facet_grid('Model ~')
        + coord_flip()
        + theme_fs()
        + theme(aspect_ratio=0.17)
        + scale_fill_brewer(type='div', palette=7)
    )
    p.save('output/buzzer/{}_protobowl.pdf'.format(fold))
def _make_p9_plot(self):
    '''
    Build a ggplot2-style stacked barplot of the cutting patterns.

    Reads `self.plot_df` and draws one stacked bar per 'Pattern', with
    segment heights taken from 'Length', fill colour from 'Length Cat'
    (legend title 'Cut Type') and text labels from 'Annotate'.

    Returns
    -------
    g : plotnine ggplot.
    '''
    mapping = p9.aes(x='Pattern', y='Length', fill='Length Cat')
    canvas = p9.ggplot(mapping=mapping, data=self.plot_df)

    bars = p9.geom_bar(position='stack', stat='identity', color='black')
    colours = p9.scale_fill_brewer(type='qual', palette=2, name='Cut Type')
    # Labels are stacked the same way as the bars.
    annotations = p9.geom_text(
        mapping=p9.aes(y='Length', label='Annotate'), position='stack')
    heading = p9.ggtitle('Pattern Cuts')

    g = canvas + bars + colours + annotations + heading
    return g
def protobowl(fold=BUZZER_DEV_FOLD):
    """Plot how often each outcome occurs per possibility, for each buzzer.

    The pickled results of the RNN, MLP and Threshold buzzers are read from
    'output/buzzer/<Buzzer>/<fold>_protobowl.pkl', reduced to row counts
    per ('Possibility', 'Outcome') pair, tagged with the model name and
    combined. The result is drawn as flipped stacked bars with one facet
    row per model.

    :param fold: fold name used in the input and output file names
    :return: None. The figure is saved to
        'output/buzzer/<fold>_protobowl.pdf'.
    """
    tables = []
    for buzzer_dir, model_name in (('RNNBuzzer', 'RNN'),
                                   ('MLPBuzzer', 'MLP'),
                                   ('ThresholdBuzzer', 'Threshold')):
        pkl_path = 'output/buzzer/{}/{}_protobowl.pkl'.format(
            buzzer_dir, fold)
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # keep this restricted to files generated by this project.
        # Opening through `with` closes the handle after loading.
        with open(pkl_path, 'rb') as pkl_file:
            results = pickle.load(pkl_file)
        grouped = results.groupby(['Possibility', 'Outcome'])
        counts = grouped.size().reset_index().rename(columns={0: 'Count'})
        counts['Model'] = model_name
        tables.append(counts)

    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    df = pd.concat(tables, ignore_index=True)

    # Category order determines outcome stacking and facet row order.
    outcome_type = CategoricalDtype(categories=[15, 10, 5, 0, -5, -10, -15])
    df['Outcome'] = df['Outcome'].astype(outcome_type)
    model_type = CategoricalDtype(categories=['Threshold', 'MLP', 'RNN'])
    df['Model'] = df['Model'].astype(model_type)

    p = (ggplot(df)
         + geom_col(aes(x='Possibility', y='Count', fill='Outcome'),
                    width=0.7)
         + facet_grid('Model ~')
         + coord_flip()
         + theme_fs()
         + theme(aspect_ratio=0.17)
         + scale_fill_brewer(type='div', palette=7))
    p.save('output/buzzer/{}_protobowl.pdf'.format(fold))
"output/comparison_stats/corpora_kl_divergence.tsv", sep="\t") kl_divergence_df.head() g = (p9.ggplot( kl_divergence_df.replace({ "biorxiv_vs_pmc": "bioRxiv-PMC", "biorxiv_vs_nytac": "bioRxiv-NYTAC", "pmc_vs_nytac": "PMC-NYTAC", }).rename(index=str, columns={"comparison": "Comparison"})) + p9.aes( x="factor(num_terms)", y="KL_divergence", fill="Comparison", color="Comparison", group="Comparison", ) + p9.geom_point(size=2) + p9.geom_line(linetype="dashed") + p9.scale_fill_brewer(type="qual", palette="Paired", direction=-1) + p9.scale_color_brewer( type="qual", palette="Paired", direction=-1, ) + p9.labs( x="Number of terms evaluated", y="Kullback–Leibler Divergence", ) + p9.theme_seaborn( context="paper", style="ticks", font_scale=1.8, ) + p9.theme(figure_size=(11, 8.5), text=p9.element_text(family="Arial"))) g.save("output/svg_files/corpora_kl_divergence.svg") g.save("output/figures/corpora_kl_divergence.png", dpi=500) print(g)
def plot_alt_benefit(plot_df, title='Benefit by Alternative', which='both',
                     sensitivity=False, legend=True):
    '''Builds a stacked bar chart of the alternative benefits

    @ param plot_df: The df containing benefits for each alt by the criteria
        and total benefit. The plot reads the 'Alternative', 'Benefit',
        'Criterion', 'type' and 'print_value' columns (and
        'Criterion Weight' when sensitivity is set).
    @ param title: The title for the graph
    @ param which: which parts to plot. Acceptable values are
        'total' for just total value,
        'criteria' for just criteria level stacked bars,
        'both' for total and criteria. The graphs will be faceted by type
        in this case
    @ param sensitivity: when true, the graph is also faceted by
        'Criterion Weight'
    @ param legend: when false, the fill legend is hidden

    Returns the ggplot graph to be displayed elsewhere, or None (after
    printing a message) when which is not an accepted value'''
    if which == 'total':
        plot_df = plot_df.loc[plot_df['type'] == 'Total Value']
    elif which == 'criteria':
        plot_df = plot_df.loc[plot_df['type'] == 'Weighted Criterion Value']
    elif which != 'both':
        # Unknown selector: report it and return None, as callers expect.
        print(
            which,
            'is not an approved value for which.\n Enter "total", "criteria", or "both"'
        )
        return None
    facet_by_type = which == 'both'

    base = p9.ggplot(
        plot_df, p9.aes(x='Alternative', y='Benefit', fill='Criterion'))
    if legend:
        g = (
            base
            # Stacked bars.
            + p9.geom_col(stat='identity',
                          position=p9.position_stack(vjust=.5))
            # Palette meant for qualitative scales.
            + p9.scale_fill_brewer(type='qual', palette='Paired')
        )
    else:
        g = (
            base
            # show_legend is a layer parameter, not an aesthetic, so it is
            # passed to the geom directly rather than inside aes().
            + p9.geom_col(stat='identity',
                          position=p9.position_stack(vjust=.5),
                          show_legend=False)
            + p9.scale_fill_brewer(type='qual', palette='Paired',
                                   guide=False)
            # 'none' is the documented value for hiding the legend.
            + p9.theme(legend_position='none')
        )

    g = (
        g
        # Weighted value printed in the middle of each bar segment.
        + p9.geom_text(p9.aes(label='print_value'),
                       position=p9.position_stack(vjust=.5),
                       size=6,
                       hjust='center')
        + p9.ggtitle(title)
        # Rotate the x axis labels.
        + p9.theme(axis_text_x=p9.element_text(rotation=45, hjust=1))
    )

    # Add the facets if required.
    if sensitivity:
        if facet_by_type:
            return g + p9.facet_grid('type~Criterion Weight')
        return g + p9.facet_grid('Criterion Weight~')
    if facet_by_type:
        return g + p9.facet_grid('~type')
    return g
def generate_map(data, region, value_field, iso_field='iso',
                 scale_params=None, plot_na_dots=False, tolerance=None,
                 plot_size=8, out_region_color='#f0f0f0',
                 na_color='#aaaaaa', line_color='#666666', projection=None):
    """
    Build a map plot of per-country values for the chosen region.

    :param pandas.DataFrame data: Data to be plotted.
    :param str region: Region to center the map around. Countries outside
        the chosen region will be obscured.
    :param str value_field: Column of *data* with the values to be plotted.
    :param str iso_field: Column of *data* with the ISO3 codes for each
        country.
    :param dict scale_params: Dictionary of parameters to be passed to the
        ggplot corresponding color scale (continuous or discrete).
    :param bool plot_na_dots: Whether to plot the dots for small countries
        if said country doesn't have data available.
    :param int tolerance: Coordinate tolerance for polygon simplification,
        a higher number will result in simpler polygons and faster
        rendering (see DEFAULT_TOLERANCES).
    :param int plot_size: Size of the plot, which determines the relative
        sizes of the elements within.
    :param str out_region_color: Hex color of the countries that are out of
        the specified region.
    :param str na_color: Hex color of the countries with no data available.
    :param str line_color: Color of the country borders.
    :param str projection: Kind of map projection to be used in the map.
        Defaults to 'epsg4326' for Oceania (XOX), which needs it to enable
        wrapping, and to 'robinson' for every other region.
    :raises ValueError: if *projection* or *region* is not available.
    :returns: dictionary with the plotnine plot under ``'plot'`` and the
        width/height ratio of the region under ``'ratio'``
    :rtype: dict
    """
    if projection is None:
        projection = 'epsg4326' if region == 'XOX' else 'robinson'
    if projection not in PROJECTION_DICT:
        raise ValueError('Projection "{}" not valid'.format(projection))

    if scale_params is None:
        scale_params = {}

    if region not in REGION_BOUNDS[projection]:
        raise ValueError(
            '"region" not available. Valid regions are: {}'.format(
                ', '.join(REGION_BOUNDS[projection])))

    if tolerance is None:
        tolerance = DEFAULT_TOLERANCES[projection][region]

    shapefile = os.path.join(os.path.dirname(__file__),
                             'data/world-countries.shp')
    countries = GeoDataFrame.from_file(shapefile)

    # To plot Oceania we need the original EPSG:4326 to wrap around the 180º
    # longitude. In other cases transform to the desired projection.
    if region == 'XOX':
        countries.crs['lon_wrap'] = '180'  # Wrap around longitude 180º
        is_oceania = countries['continent'] == 'XOX'
        countries[is_oceania] = countries[is_oceania].to_crs(countries.crs)
        centroids = countries[is_oceania].apply(
            lambda row: row['geometry'].centroid, axis=1)
        countries.loc[is_oceania, 'lon'] = [c.x for c in centroids]
        countries.loc[is_oceania, 'lat'] = [c.y for c in centroids]
    else:
        if projection != 'epsg4326':
            countries = countries.to_crs(PROJECTION_DICT[projection])
        # NOTE(review): the centroid recomputation is assumed to run for
        # every non-Oceania region, not only when reprojecting -- confirm
        # against the original nesting.
        centroids = countries.apply(
            lambda row: row['geometry'].centroid, axis=1)
        countries['lon'] = [c.x for c in centroids]
        countries['lat'] = [c.y for c in centroids]

    countries['geometry'] = countries['geometry'].simplify(tolerance)

    # Plot limits and aspect ratio of the region in the chosen projection.
    upper_left, lower_right = REGION_BOUNDS[projection][region]
    limits_x = [upper_left[0], lower_right[0]]
    limits_y = [lower_right[1], upper_left[1]]
    ratio = (limits_x[1] - limits_x[0]) / (limits_y[1] - limits_y[0])

    plot_data = pd.merge(countries, data, how='left',
                         left_on='iso', right_on=iso_field)

    # Countries whose area is below a fraction of the region's area (taken
    # from the EPSG:4326 bounds) are additionally drawn as dots.
    map_bounds = REGION_BOUNDS['epsg4326'][region]
    map_area = ((map_bounds[1][0] - map_bounds[0][0])
                * (map_bounds[0][1] - map_bounds[1][1]))
    plot_data['plot_dot'] = plot_data['pol_area'] < DOT_THRESHOLD * map_area

    missing = pd.isnull(plot_data[value_field])
    if not plot_na_dots:
        plot_data['plot_dot'] &= ~missing

    # Split the countries into: in region with data, in region without
    # data, and out of region. 'XWX' keeps every country in the region.
    if region != 'XWX':
        same_continent = plot_data['continent'] == region
        in_region = (~missing) & same_continent
        in_region_missing = missing & same_continent
        out_region = plot_data['continent'] != region
    else:
        in_region = ~missing
        in_region_missing = missing
        out_region = np.repeat(False, len(plot_data))

    # Object columns are treated as discrete values, anything else as
    # continuous.
    is_discrete = plot_data[value_field].dtype == 'object'
    if is_discrete:
        fill_scale = scale_fill_brewer(**scale_params, drop=False)
    else:
        fill_scale = scale_fill_gradient(**scale_params)

    plot_data_values = plot_data[in_region]
    plot_data_missing = plot_data[in_region_missing]
    plot_data_out_region = plot_data[out_region]

    dots_region = plot_data_values[plot_data_values['plot_dot']]
    dots_region_missing = plot_data_missing[plot_data_missing['plot_dot']]
    dots_out_region = plot_data_out_region[plot_data_out_region['plot_dot']]

    plt = (
        ggplot()
        + geom_map(plot_data_values, aes(fill=value_field),
                   color=line_color, size=0.3)
        + geom_map(plot_data_missing, aes(color='plot_dot'),
                   fill=na_color, size=0.3)
        + geom_map(plot_data_out_region, fill=out_region_color,
                   color=line_color, size=0.3)
        + geom_point(dots_region, aes(x='lon', y='lat', fill=value_field),
                     size=3, stroke=.1, color=line_color)
        + geom_point(dots_region_missing, aes(x='lon', y='lat'),
                     fill=na_color, size=3, stroke=.1, color=line_color)
        + geom_point(dots_out_region, aes(x='lon', y='lat'),
                     fill=out_region_color, size=3, stroke=.1,
                     color=line_color)
        + scale_x_continuous(breaks=[], limits=limits_x)
        + scale_y_continuous(breaks=[], limits=limits_y)
        + theme(
            figure_size=(plot_size * ratio, plot_size),
            panel_background=element_rect(fill='white', color='black'),
            legend_background=element_rect(
                fill="white", color='black', size=.5),
            legend_box_just='left')
        + xlab('')
        + ylab('')
    )

    # The fill scale is only added when there is at least one value.
    if len(plot_data_values.index) > 0:
        plt += fill_scale
    # NOTE(review): the colour scale and the guide override below are
    # assumed to sit outside the `if` above -- confirm against the original
    # nesting.
    plt += scale_color_manual(name=' ',
                              values=[line_color],
                              breaks=[False],
                              labels=['No data available'])
    if is_discrete:
        plt += guides(fill=guide_legend(override_aes={'shape': None}))

    return {
        'plot': plt,
        'ratio': ratio,
    }
def show_prediction(
    self,
    samples,
    percent_kept: float = 0.95,
    side_cut_from: str = "both",
    show_community: bool = False,
    num_samples: int = 1000,
    bins: int = 50,
):
    """Plot prediction on the true question scale from samples or a
    submission object. Optionally compare prediction against a sample from
    the distribution of community predictions

    :param samples: samples from a distribution answering the prediction
        question (true scale) or a prediction object
    :param percent_kept: percentage of sample distribution to keep
    :param side_cut_from: which side to cut tails from, either
        'both','lower', or 'upper'
    :param show_community: boolean indicating whether comparison to
        community predictions should be made
    :param num_samples: number of samples from the community. When raw
        samples are given, it is replaced by the number of samples.
    :param bins: The number of bins in the histogram, the more bins, the
        more 'fine grained' the graph. Fewer bins results in more
        aggregation
    :raises ValueError: if samples is neither a submission object, a list,
        a numpy array nor a pandas series
    :return: ggplot graphics object
    """
    if isinstance(samples, SubmissionMixtureParams):
        prediction = samples
        prediction_normed_samples = pd.Series([
            logistic.sample_mixture(prediction)
            for _ in range(0, num_samples)
        ])
    else:
        if isinstance(samples, list):
            samples = pd.Series(samples)
        # isinstance (rather than an exact type match) also accepts
        # subclasses of Series and ndarray.
        if not isinstance(samples, (pd.Series, np.ndarray)):
            raise ValueError(
                "Samples should be a list, numpy arrray or pandas series")
        num_samples = samples.shape[0]
        prediction_normed_samples = self.normalize_samples(samples)

    title_name = (
        f"Q: {self.name}" if self.name else "\n".join(
            textwrap.wrap(self.data["title"], 60))  # type: ignore
    )
    tilted_ticks = theme(axis_text_x=element_text(rotation=45, hjust=1))

    if show_community:
        df = pd.DataFrame(
            data={
                "community": [  # type: ignore
                    self.sample_normalized_community()
                    for _ in range(0, num_samples)
                ],
                "prediction": prediction_normed_samples,  # type: ignore
            })
        # Get the domain for the graph given the percentage of the
        # distribution kept, then move limits and samples to the true scale.
        (_xmin, _xmax) = self.get_central_quantiles(
            df, percent_kept=percent_kept, side_cut_from=side_cut_from)
        _xmin, _xmax = self.denormalize_samples([_xmin, _xmax])
        df["prediction"] = self.denormalize_samples(df["prediction"])
        df["community"] = self.denormalize_samples(df["community"])
        df = pd.melt(df, var_name="sources",
                     value_name="samples")  # type: ignore
        return (
            ggplot(df, aes("samples", fill="sources"))
            + scale_fill_brewer(type="qual", palette="Pastel1")
            # `bins` is honoured here as well, matching the
            # prediction-only branch below.
            + geom_histogram(position="identity", alpha=0.9, bins=bins)
            + scale_x_datetime(limits=(_xmin, _xmax))
            + facet_wrap("sources", ncol=1)
            + labs(
                x="Prediction",
                y="Counts",
                title=title_name,
            )
            + guides(fill=False)
            + ergo_theme
            + tilted_ticks
        )

    (_xmin, _xmax) = self.get_central_quantiles(
        prediction_normed_samples,
        percent_kept=percent_kept,
        side_cut_from=side_cut_from,
    )
    _xmin, _xmax = self.denormalize_samples([_xmin, _xmax])
    df = pd.DataFrame(data={
        "prediction": self.denormalize_samples(prediction_normed_samples)
    })
    return (
        ggplot(df, aes("prediction"))
        + geom_histogram(fill="#b3cde3", bins=bins)
        + scale_x_datetime(limits=(_xmin, _xmax))
        + labs(x="Prediction", y="Counts", title=title_name)
        + ergo_theme
        + tilted_ticks
    )
def show_prediction(
    self,
    samples,
    percent_kept: float = 0.95,
    side_cut_from: str = "both",
    show_community: bool = False,
    num_samples: int = 1000,
):
    """Plot prediction on the true question scale from samples or a
    submission object. Optionally compare prediction against a sample from
    the distribution of community predictions

    :param samples: samples from a distribution answering the prediction
        question (true scale) or a prediction object
    :param percent_kept: percentage of sample distribution to keep
    :param side_cut_from: which side to cut tails from, either
        'both','lower', or 'upper'
    :param show_community: boolean indicating whether comparison to
        community predictions should be made
    :param num_samples: number of samples from the community. When raw
        samples are given, it is replaced by the number of samples.
    :raises ValueError: if samples is neither a submission object, a list,
        a numpy array nor a pandas series
    :return: ggplot graphics object
    """
    if isinstance(samples, SubmissionMixtureParams):
        prediction = samples
        prediction_normed_samples = pd.Series([
            logistic.sample_mixture(prediction)
            for _ in range(0, num_samples)
        ])
        prediction_true_scale_samples = self.denormalize_samples(
            prediction_normed_samples)
    else:
        if isinstance(samples, list):
            samples = pd.Series(samples)
        # isinstance (rather than an exact type match) also accepts
        # subclasses of Series and ndarray.
        if not isinstance(samples, (pd.Series, np.ndarray)):
            raise ValueError(
                "Samples should be a list, numpy arrray or pandas series")
        num_samples = samples.shape[0]
        prediction_true_scale_samples = samples

    title_name = (
        f"Q: {self.name}" if self.name else "\n".join(
            textwrap.wrap(self.data["title"], 60))  # type: ignore
    )
    tilted_ticks = theme(axis_text_x=element_text(rotation=45, hjust=1))

    if show_community:
        df = pd.DataFrame(
            data={
                "community": [  # type: ignore
                    self.sample_community() for _ in range(0, num_samples)
                ],
                "prediction": prediction_true_scale_samples,
            })
        # Get the domain for the graph given the percentage of the
        # distribution kept.
        (_xmin, _xmax) = self.get_central_quantiles(
            df, percent_kept=percent_kept, side_cut_from=side_cut_from)
        df = pd.melt(df, var_name="sources",
                     value_name="samples")  # type: ignore
        return (
            ggplot(df, aes("samples", fill="sources"))
            + scale_fill_brewer(type="qual", palette="Pastel1")
            + geom_density(alpha=0.8)
            + xlim(_xmin, _xmax)
            + self._scale_x()
            + labs(x="Prediction", y="Density", title=title_name)
            + ergo_theme
            + tilted_ticks
        )

    df = pd.DataFrame(data={"prediction": prediction_true_scale_samples})
    # Get the domain for the graph given the percentage of the distribution
    # kept.
    (_xmin, _xmax) = self.get_central_quantiles(
        df, percent_kept=percent_kept, side_cut_from=side_cut_from)
    return (
        ggplot(df, aes("prediction"))
        # A single filled density layer. No fill aesthetic is mapped in
        # this branch, so the brewer fill scale and the second, unfilled
        # density layer copied from the community branch were redundant.
        + geom_density(fill="#b3cde3", alpha=0.8)
        + xlim(_xmin, _xmax)
        + self._scale_x()
        + labs(x="Prediction", y="Density", title=title_name)
        + ergo_theme
        + tilted_ticks
    )
def make_likert_chart_multi_year(
    survey_data,
    topic,
    labels,
    facet_by=[],
    five_is_high=False,
    exclude_new_contributors=False,
):
    """Make an offset stacked barchart showing the proportion of respondents
    at each rank or value for all columns in the topic. Each column in the
    topic is a facet, with the years displayed along the x-axis.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes
            Survey
        topic (str): String that all questions of interest start with
        labels (list): List of strings to use as labels, corresponding to
            the numerical values given by the respondents.
        facet_by (list, optional): List of columns used for grouping. A "."
            entry is treated as a facet placeholder, not as a column, and
            may be present or absent.
        five_is_high (bool, optional): Defaults to False. If True, five is
            considered the highest value in a ranking, otherwise it is
            taken as the lowest value.
        exclude_new_contributors (bool, optional): Defaults to False.
            Forwarded to ``get_multi_year_data_subset``. If True, do not
            include any responses from contributors with less than one
            year of experience.

    Returns:
        (plotnine.ggplot): Offset stacked barchart plot object which can be
            displayed in a notebook or saved out to a file
    """
    # Work on a copy: the caller's list (and the shared default) is never
    # mutated by this function.
    facet_by = copy(facet_by)
    og_cols = [x for x in survey_data.columns if x.startswith(topic)]

    topic_data_long = get_multi_year_data_subset(
        survey_data, topic, facet_by, exclude_new_contributors
    )

    # When five is the lowest rank, ratings are negated so they sort the
    # other way round; the mid point is negated to match.
    if not five_is_high:
        topic_data_long = topic_data_long.assign(
            rating=topic_data_long.rating * -1.0
        )
    mid_point = 3 if five_is_high else -3
    top_scores, bottom_scores = split_for_likert(topic_data_long, mid_point)

    # "." is not a data column, so it is dropped before grouping. Filtering
    # it out (instead of calling list.remove) also works when "." is absent,
    # which previously raised ValueError for non-empty facet lists.
    facet_cols = [col for col in facet_by if col != "."]
    group_cols = facet_cols + ["year"]
    totals = topic_data_long.groupby(group_cols).count().reset_index()

    def _as_proportion(scores, sign):
        """Scale the counts in `scores` by the per-group totals."""
        merged = scores.merge(totals, on=group_cols).rename(
            columns={"rating_x": "rating", "level_0_x": "level_0"}
        )
        # The group total is divided by the number of topic columns before
        # being used as the denominator.
        return merged.assign(
            level_1=merged.level_1_x * sign
            / (merged.level_1_y / len(og_cols))
        )

    top_scores = _as_proportion(top_scores, 1)
    # Bottom scores are negated so their bars extend below zero.
    bottom_scores = _as_proportion(bottom_scores, -1)

    vp = (
        p9.ggplot(
            topic_data_long,
            p9.aes(x="factor(year)", fill="factor(rating)",
                   color="factor(rating)"),
        )
        + p9.geom_col(
            data=top_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=True,
            size=0.25,
            position=p9.position_stack(reverse=True),
        )
        + p9.geom_col(
            data=bottom_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=True,
            size=0.25,
            position=p9.position_stack(),
        )
        + p9.geom_hline(yintercept=0, color="white")
    )

    if five_is_high:
        vp = (
            vp
            + p9.scale_color_brewer(
                "div", "RdBu", limits=[1, 2, 3, 4, 5], labels=labels
            )
            + p9.scale_fill_brewer(
                "div", "RdBu", limits=[1, 2, 3, 4, 5], labels=labels
            )
            + p9.theme(
                axis_text_x=p9.element_text(angle=45, ha="right"),
                strip_text_y=p9.element_text(angle=0, ha="left"),
            )
        )
    else:
        vp = (
            vp
            + p9.scale_color_brewer(
                "div", "RdBu", limits=[-5, -4, -3, -2, -1], labels=labels
            )
            + p9.scale_fill_brewer(
                "div", "RdBu", limits=[-5, -4, -3, -2, -1], labels=labels
            )
            + p9.theme(strip_text_y=p9.element_text(angle=0, ha="left"))
        )

    # Facet by the grouping columns and the question column; with no
    # grouping columns the "." placeholder takes their place.
    facet_spec = (facet_cols if facet_cols else ["."]) + ["level_0"]
    vp = (
        vp
        + p9.facet_grid(
            facet_spec,
            # Strip the topic prefix from the facet labels and wrap them.
            labeller=lambda x: "\n".join(
                wrap(
                    x.replace(topic, "")
                    .replace("_", " ")
                    .replace("/", "/ ")
                    .strip(),
                    15,
                )
            ),
        )
        + p9.theme(
            strip_text_x=p9.element_text(wrap=True, ma="left"),
            panel_spacing_x=0.1,
        )
    )
    return vp
def make_likert_chart(
    survey_data,
    topic,
    labels,
    facet_by=None,
    max_value=5,
    max_is_high=False,
    wrap_facets=True,
    sort_x=False,
):
    """Make an offset stacked barchart showing the number of respondents
    at each rank or value for all columns in the topic. Each column in the
    original data is a tick on the x-axis.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        topic (str): String that all questions of interest start with
        labels (list): List of strings to use as labels, corresponding
            to the numerical values given by the respondents.
        facet_by (list, optional): List of columns used for grouping.
            Defaults to None (no faceting). A "." entry is left out of the
            grouping columns and only passed through to ``facet_grid``.
            The caller's list is not modified by this function itself.
        max_value (int, optional): Defaults to 5. The maximum value a
            respondent can assign.
        max_is_high (bool, optional): Defaults to False. If True, the
            max_value is considered the highest value in a ranking,
            otherwise it is taken as the lowest value.
        wrap_facets (bool, optional): Defaults to True. If True, the facet
            labels are wrapped
        sort_x (bool, optional): Defaults to False. If True, the x-axis is
            sorted by the mean value for each column in the original data

    Returns:
        (plotnine.ggplot): Offset stacked barchart plot object which
            can be displayed in a notebook or saved out to a file
    """
    # A fresh list per call avoids the shared-mutable-default pitfall.
    facet_by = [] if facet_by is None else facet_by

    mid_point = math.ceil(max_value / 2)
    og_cols = [x for x in survey_data.columns if x.startswith(topic)]
    show_legend = True

    topic_data_long = get_single_year_data_subset(survey_data, topic, facet_by)

    if not max_is_high:
        # Flip the sign so the "best" rating still ends up on top of the
        # zero line when low numbers mean a high rank.
        topic_data_long = topic_data_long.assign(rating=topic_data_long.rating * -1.0)
        mid_point = -1 * mid_point

    top_scores, bottom_scores = split_for_likert(topic_data_long, mid_point)

    # Work on copies so the caller's list is never touched. ``facet_spec``
    # is what facet_grid receives; ``group_cols`` is the same list without
    # the "." placeholder, which is not a real column and cannot be
    # grouped on. The placeholder is moved to the end of ``facet_spec``,
    # matching the order the previous in-place remove/append produced.
    facet_spec = list(facet_by)
    group_cols = list(facet_spec)
    if "." in group_cols:
        group_cols.remove(".")
        facet_spec = group_cols + ["."]

    if facet_spec:
        # Convert raw counts into proportions within each facet group.
        # NOTE(review): presumably every respondent contributes one row per
        # topic column, so rows-per-group / len(og_cols) is the number of
        # respondents in the group -- confirm against
        # get_single_year_data_subset.
        group_counts = topic_data_long.groupby(group_cols).count().reset_index()
        renames = {"rating_x": "rating", "level_0_x": "level_0"}

        top_scores = top_scores.merge(group_counts, on=group_cols).rename(
            columns=renames
        )
        top_scores = top_scores.assign(
            level_1=top_scores.level_1_x / (top_scores.level_1_y / len(og_cols))
        )

        bottom_scores = bottom_scores.merge(group_counts, on=group_cols).rename(
            columns=renames
        )
        bottom_scores = bottom_scores.assign(
            level_1=bottom_scores.level_1_x
            * -1
            / (bottom_scores.level_1_y / len(og_cols))
        )
    else:
        # No grouping: keep raw counts, mirrored below the zero line.
        bottom_scores = bottom_scores.assign(level_1=bottom_scores.level_1 * -1)

    if sort_x:
        # Only the mean rating is needed. Averaging every column would
        # raise on non-numeric facet columns with pandas >= 2.0.
        x_sort_order = (
            topic_data_long.groupby("level_0")["rating"]
            .mean()
            .sort_values()
            .index.tolist()
        )
        x_sort_order.reverse()
    else:
        x_sort_order = topic_data_long["level_0"].unique().tolist()

    vp = (
        p9.ggplot(
            topic_data_long,
            p9.aes(x="level_0", fill="factor(rating)", color="factor(rating)"),
        )
        + p9.geom_col(
            data=top_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(reverse=True),
        )
        + p9.geom_col(
            data=bottom_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(),
        )
        + p9.geom_hline(yintercept=0, color="white")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=x_sort_order,
            # Drop the topic prefix and keep at most two wrapped lines.
            labels=[
                "\n".join(
                    textwrap.wrap(x.replace(topic, "").replace("_", " "), width=35)[0:2]
                )
                for x in x_sort_order
            ],
        )
    )

    if max_is_high:
        limits = list(range(1, max_value + 1))
        vp = (
            vp
            + p9.scale_color_brewer("div", "RdBu", limits=limits, labels=labels)
            + p9.scale_fill_brewer("div", "RdBu", limits=limits, labels=labels)
        )
    else:
        # Ratings were negated above, so the limits run -1, -2, ..., -max_value.
        limits = list(reversed(range(-max_value, 0)))
        vp = (
            vp
            + reverse_scale_fill_brewer("div", "RdBu", limits=limits, labels=labels)
            + reverse_scale_color_brewer("div", "RdBu", limits=limits, labels=labels)
        )

    if facet_spec:
        if wrap_facets:
            vp = (
                vp
                + p9.facet_grid(
                    facet_spec, labeller=lambda x: "\n".join(wrap(x, 15))
                )
                + p9.theme(
                    strip_text_x=p9.element_text(
                        wrap=True, va="bottom", margin={"b": -0.5}
                    )
                )
            )
        else:
            vp = vp + p9.facet_grid(facet_spec, space="free", labeller=lambda x: x)

    return vp
def make_single_likert_chart(survey_data, column, facet, labels, five_is_high=False):
    """Make an offset stacked barchart showing the number of respondents
    at each rank or value for a single column in the original data. Each
    facet is shown as a tick on the x-axis.

    Args:
        survey_data (pandas.DataFrame): Raw data read in from Kubernetes Survey
        column (str): Name of the single survey column to plot
        facet (str): Column used for grouping
        labels (list): List of strings to use as labels, corresponding
            to the numerical values given by the respondents.
        five_is_high (bool, optional): Defaults to False. If True,
            5 is considered the highest value in a ranking, otherwise
            it is taken as the lowest value.

    Returns:
        (plotnine.ggplot): Offset stacked barchart plot object which
            can be displayed in a notebook or saved out to a file
    """
    mid_point = 3
    show_legend = True

    topic_data_long = make_long(survey_data[[column, facet]], facet)

    if not five_is_high:
        # Negate ratings (and the midpoint with them) so that rank 1 ends
        # up on the positive side of the zero line.
        topic_data_long = topic_data_long.assign(rating=topic_data_long.rating * -1.0)
        mid_point *= -1

    # Group on every column except the two that are being counted.
    group_cols = topic_data_long.columns.tolist()
    group_cols.remove("level_1")
    group_cols.remove("level_0")

    # Number of rows per facet value; used to turn counts into proportions.
    facet_totals = topic_data_long.groupby(facet).count().reset_index()

    def count_with_split_midpoint(rows):
        """Count rows per group, giving the midpoint rating half its count."""
        counts = rows.groupby(group_cols).count().reset_index()
        # Cast first: writing halved (float) values into the integer count
        # column relies on silent upcasting, deprecated since pandas 2.1.
        counts["level_1"] = counts["level_1"].astype(float)
        is_mid = counts["rating"] == mid_point
        counts.loc[is_mid, "level_1"] = counts.loc[is_mid, "level_1"] / 2.0
        return counts

    # The midpoint rating is included in both halves (>= and <=); each half
    # carries half of its count so the neutral bar straddles the zero line.
    top_rows = topic_data_long[topic_data_long["rating"] >= mid_point]
    bottom_rows = topic_data_long[topic_data_long["rating"] <= mid_point]

    top_scores = (
        count_with_split_midpoint(top_rows)
        .sort_index(ascending=False)
        .merge(facet_totals, on=facet)
    )
    top_scores = top_scores.assign(level_1=top_scores.level_1_x / top_scores.level_1_y)

    bottom_scores = count_with_split_midpoint(bottom_rows).merge(
        facet_totals, on=facet
    )
    bottom_scores = bottom_scores.assign(
        level_1=bottom_scores.level_1_x * -1 / bottom_scores.level_1_y
    )

    facet_values = topic_data_long[facet].unique().tolist()

    vp = (
        p9.ggplot(
            topic_data_long,
            # "rating_x" is the rating column of the merged score frames
            # that the two geom_col layers below actually draw from.
            p9.aes(x=facet, fill="factor(rating_x)", color="factor(rating_x)"),
        )
        + p9.geom_col(
            data=top_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
            position=p9.position_stack(reverse=True),
        )
        + p9.geom_col(
            data=bottom_scores,
            mapping=p9.aes(y="level_1"),
            show_legend=show_legend,
            size=0.25,
        )
        + p9.geom_hline(yintercept=0, color="white")
        + p9.theme(
            axis_text_x=p9.element_text(angle=45, ha="right"),
            strip_text_y=p9.element_text(angle=0, ha="left"),
        )
        + p9.scale_x_discrete(
            limits=facet_values,
            labels=[value.replace("_", " ") for value in facet_values],
        )
    )

    wrapped_labels = ["\n".join(wrap(label, 15)) for label in labels]

    if five_is_high:
        limits = [1, 2, 3, 4, 5]
        vp = (
            vp
            + p9.scale_color_brewer(
                "div", "RdBu", limits=limits, labels=wrapped_labels
            )
            + p9.scale_fill_brewer("div", "RdBu", limits=limits, labels=wrapped_labels)
        )
    else:
        limits = [-1, -2, -3, -4, -5]
        vp = (
            vp
            + reverse_scale_fill_brewer(
                "div", "RdBu", limits=limits, labels=wrapped_labels
            )
            + reverse_scale_color_brewer(
                "div", "RdBu", limits=limits, labels=wrapped_labels
            )
        )

    return vp
def main():
    """Run CLI.

    Reads a per-experiment cell filtering table (tab-separated) and writes
    four PNG bar plots next to the chosen output basename:

    * ``<base>-n_cells_before_after.png``
    * ``<base>-fraction_before_after.png``
    * ``<base>-n_cells_excluded.png``
    * ``<base>-fraction_cells_excluded.png``

    If the cell counts before and after filtering are identical for every
    experiment, a message is printed and no plot is written.

    Returns:
        An empty tuple when returning early because nothing was filtered,
        otherwise None.
    """
    # NOTE(review): the description below looks copied from another script;
    # this one only plots filter statistics. Left untouched because it is
    # user-facing help text -- confirm before rewording.
    parser = argparse.ArgumentParser(
        description="""
            Filter and merge 10x data. Save to AnnData object.
            """
    )

    parser.add_argument(
        '-v', '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__)
    )

    parser.add_argument(
        '--tsv_file',
        action='store',
        dest='tsv',
        required=True,
        help='cell_filtered_per_experiment tsv file.'
    )

    parser.add_argument(
        '-of', '--output_file',
        action='store',
        dest='of',
        default='',
        help='Basename of output png file. Will have .png appended. '
             '(default: %(default)s)'
    )

    options = parser.parse_args()

    # Get basename of the output file.
    out_file_base = options.of
    if out_file_base == '':
        # Remove the '.tsv' / '.tsv.gz' extension as a real suffix.
        # str.rstrip('tsv.gz') strips any trailing run of those characters
        # and therefore also eats the end of the file stem
        # (e.g. 'cells_stats.tsv.gz' -> 'cells_sta').
        out_file_base = os.path.basename(options.tsv)
        for suffix in ('.gz', '.tsv'):
            if out_file_base.endswith(suffix):
                out_file_base = out_file_base[:-len(suffix)]

    # Load the data.
    df = pd.read_csv(options.tsv, sep='\t')

    # Get the total number of input cells per sample.
    df_before_filters = df[df.filter_type.isin(['before_filters'])]
    df_before_filters = df_before_filters.set_index('experiment_id')

    def n_cells_before(frame):
        """Return the pre-filter cell count aligned to each row of frame."""
        return df_before_filters.loc[
            frame.experiment_id, 'n_cells_left_in_adata'
        ].values

    # Check if any difference between before and after filters. If not,
    # return early.
    df_after_filters = df[df.filter_type.isin(['after_filters'])]
    filt = df_after_filters.n_cells_left_in_adata == n_cells_before(
        df_after_filters
    )
    if all(filt):
        print("No difference detected before and after filters. No plots.")
        return ()

    # Set some plotting parameters.
    plt_height = 16

    def save_exclusion_plot(data, y, y_label, name):
        """Save a flipped, dodged bar plot of column y per experiment."""
        gplt = plt9.ggplot(
            data,
            plt9.aes(x='experiment_id', y=y, fill='filter_type')
        )
        gplt = gplt + plt9.theme_bw()
        gplt = gplt + plt9.geom_bar(stat='identity', position='dodge')
        # The brewer scale is only applied when there are fewer than
        # nine filter types.
        if data.filter_type.nunique() < 9:
            gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
        gplt = gplt + plt9.labs(title='', y=y_label, x='', fill='Filter')
        # NOTE: legend_position bug
        # https://github.com/has2k1/plotnine/issues/245
        gplt = gplt + plt9.theme(
            legend_position=(.5, .05),
            legend_direction='vertical'
        )
        gplt = gplt + plt9.coord_flip()
        gplt.save(
            '{}-{}.png'.format(out_file_base, name),
            dpi=300,
            width=4,
            height=plt_height
        )

    # Plot the number of cells before and after all filters across
    # experiments.
    df_plt = df[df.filter_type.isin(['before_filters', 'after_filters'])]
    gplt = plt9.ggplot(
        df_plt,
        plt9.aes(
            x='experiment_id',
            y='n_cells_left_in_adata',
            fill='filter_type'
        )
    )
    gplt = gplt + plt9.theme_bw()
    gplt = gplt + plt9.geom_bar(stat='identity', position='dodge')
    gplt = gplt + plt9.scale_y_continuous(
        trans='log10',
        labels=comma_labels,
        minor_breaks=0
    )
    gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
    gplt = gplt + plt9.labs(title='', y='Number of cells', x='', fill='')
    # NOTE: legend_position bug https://github.com/has2k1/plotnine/issues/245
    gplt = gplt + plt9.theme(
        legend_position=(.5, .05),
        legend_direction='horizontal',
        legend_title=plt9.element_blank()
    )
    gplt = gplt + plt9.coord_flip()
    gplt.save(
        '{}-n_cells_before_after.png'.format(out_file_base),
        dpi=300,
        width=4,
        height=plt_height
    )

    # Plot the final fraction of cells filtered per experiment.
    df_plt = df_after_filters.copy()
    # Invert the numbers, so instead of the number of cells that pass, get
    # the number of cells that fail at each filter.
    n_before = n_cells_before(df_plt)
    df_plt['n_cells_left_in_adata'] = n_before - df_plt.n_cells_left_in_adata
    # Now calculate the fraction removed.
    df_plt['fraction_cells'] = df_plt.n_cells_left_in_adata / n_before
    save_exclusion_plot(
        df_plt,
        'fraction_cells',
        'Fraction of total cells excluded',
        'fraction_before_after'
    )

    # Plot the number of cells falling into each filter across experiments.
    # NOTE: cells can fall into multiple filters.
    # Remove the rows that we do not want. Copy so that the column
    # assignments below act on an independent frame, not a slice of df.
    df_plt = df[~df.filter_type.isin(['before_filters', 'after_filters'])]
    df_plt = df_plt[~df_plt.filter_type.str.contains('after_filter')].copy()
    # Invert the numbers, so instead of the number of cells that pass, get
    # the number of cells that fail at each filter.
    n_before = n_cells_before(df_plt)
    df_plt['n_cells_left_in_adata'] = n_before - df_plt.n_cells_left_in_adata
    save_exclusion_plot(
        df_plt,
        'n_cells_left_in_adata',
        'Number of cells excluded',
        'n_cells_excluded'
    )

    # Plot the ratio of the total number of cells removed in each filter
    # across experiments.
    # NOTE: cells can fall into multiple filters.
    df_plt['fraction_cells'] = df_plt.n_cells_left_in_adata / n_before
    save_exclusion_plot(
        df_plt,
        'fraction_cells',
        'Fraction of total cells excluded',
        'fraction_cells_excluded'
    )
"scala": "Scala", "C": "C", "sas": "SAS" } skills_summary_lang = skills_summary_df[skills_summary_df.attribute.isin( languages)] skills_summary_lang = skills_summary_lang.replace(to_replace=lang_clean) skills_summary_lang = sort_df(skills_summary_lang, var_col="attribute") lang_plot = ( p9.ggplot(skills_summary_lang, p9.aes('attribute', 'value', fill='type', show_legend=False)) + p9.geom_col() + p9.coord_flip() + p9.scale_y_continuous(expand=[0, 0]) + p9.labs(y="Frequency", x="Language", fill="") + p9.scale_fill_brewer(palette="Blues") + p9.facet_wrap('~type')) lang_plot.save(filename='figs/lang_plot.png', height=5, width=5, units='in', dpi=1000) lang_plot #Software programs = ["tableau", "docker", "bigquery", "jira", "spark", "hadoop"] prog_clean = { "tableau": "Tableau", "docker": "Docker", "bigquery": "Google BigQuery", "jira": "Jira",
def main():
    """Run CLI.

    Computes LISI for each requested categorical metadata column on every
    supplied reduced-dimension table, writes the combined per-cell scores to
    ``<base>.tsv.gz`` and, for each metadata column, saves a violin plot
    (``<base>-<column>-violin.png``) and an ECDF plot
    (``<base>-<column>-ecdf.pdf``) comparing the reduced-dimension inputs.

    Exits through ``parser.error`` when the number of reduced-dimension
    files and labels differ.
    """
    parser = argparse.ArgumentParser(
        description="""
            Calcualte and compare LISI across a series of reduced dims and
            categorical variables.
            """
    )

    parser.add_argument(
        '-v', '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__)
    )

    parser.add_argument(
        '-rf', '--reduced_dims_tsv',
        action='store',
        dest='reduced_dims',
        required=True,
        help='List of tab-delimited files of reduced dimensions (e.g., PCs) '
             'for each cell. First column is cell_barcode. List should be '
             'split by "::" (e.g. file1.tsv.gz::file2.tsv.gz).'
    )

    parser.add_argument(
        '-lbl', '--reduced_dims_tsv_labels',
        action='store',
        dest='reduced_dims_labels',
        required=True,
        help='String of labels for each reduced_dims_tsv file. List should be '
             'split by "::".'
    )

    parser.add_argument(
        '-mf', '--metadata_tsv',
        action='store',
        dest='metadata_tsv',
        required=True,
        help='Tab-delimited file of metadata for each cell. First column '
             'is cell_barcode.'
    )

    parser.add_argument(
        '-mv', '--metadata_columns',
        action='store',
        dest='metadata_columns',
        default='experiment_id',
        help='Comma separated string of categorical variables to calculate '
             'LISI with. '
             '(default: %(default)s)'
    )

    parser.add_argument(
        '-p', '--perplexity',
        action='store',
        dest='perplexity',
        default=30.0,
        type=float,
        help='Perplexity. '
             '(default: %(default)s)'
    )

    parser.add_argument(
        '-of', '--output_file',
        action='store',
        dest='of',
        default='',
        help='Basename of output files, assuming output in current working '
             'directory. '
             '(default: <metadata_tsv>-lisi)'
    )

    options = parser.parse_args()

    # Validate the paired inputs up front. An assert would be stripped
    # under ``python -O`` and is not meant for user input checks.
    files = options.reduced_dims.split('::')
    labels = options.reduced_dims_labels.split('::')
    if len(files) != len(labels):
        parser.error('ERROR: check files and labels input')

    # Get the out file base.
    out_file_base = options.of
    if out_file_base == '':
        # Remove the '.tsv' / '.tsv.gz' extension as a real suffix.
        # str.rstrip('tsv.gz') strips any trailing run of those characters
        # and therefore also eats the end of the file stem
        # (e.g. 'cells.tsv.gz' -> 'cell').
        meta_base = os.path.basename(options.metadata_tsv)
        for suffix in ('.gz', '.tsv'):
            if meta_base.endswith(suffix):
                meta_base = meta_base[:-len(suffix)]
        out_file_base = '{}-lisi'.format(meta_base)

    # Get the columns to use; read each of them as a pandas categorical.
    lisi_columns = options.metadata_columns.split(',')
    lisi_columns_dtype = {col: 'category' for col in lisi_columns}

    # Load the metadata file.
    df_meta = pd.read_csv(
        options.metadata_tsv,
        sep='\t',
        index_col='cell_barcode',
        dtype=lisi_columns_dtype
    )

    # Make a dict of theoretical maximum LISI value for each label: the
    # number of categories of that column.
    lisi_limit = {
        col: len(df_meta[col].cat.categories) for col in lisi_columns
    }

    # Load each set of reduced dims and score it.
    list_lisi = []
    for file_path, label in zip(files, labels):
        df_reduced_dims = pd.read_csv(
            file_path,
            sep='\t',
            index_col='cell_barcode'
        )

        # Run lisi and save results to dataframe. Rows are re-ordered to
        # the metadata index so scores line up with df_meta. The
        # user-supplied perplexity is forwarded; previously the option was
        # parsed but never used.
        # NOTE(review): assumes `hm` is harmonypy, whose compute_lisi
        # accepts a `perplexity` keyword -- confirm against the import.
        _df_lisi = pd.DataFrame(
            hm.compute_lisi(
                df_reduced_dims.loc[df_meta.index, :],
                df_meta[lisi_columns],
                lisi_columns,
                perplexity=options.perplexity
            ),
            columns=lisi_columns
        )
        _df_lisi['file'] = file_path
        _df_lisi['label'] = label
        _df_lisi['cell_barcode'] = df_meta.index
        list_lisi.append(_df_lisi)

    # Make one long dataframe.
    df_lisi = pd.concat(list_lisi)

    # Make cell_barcode (added last above) the first column.
    cols = list(df_lisi.columns)
    cols = [cols[-1]] + cols[:-1]

    # Save the results.
    df_lisi[cols].to_csv(
        '{}.tsv.gz'.format(out_file_base),
        sep='\t',
        index=False,
        quoting=csv.QUOTE_NONNUMERIC,
        na_rep='',
        compression='gzip'
    )

    # Compare the lisi distributions.
    n_labels = len(labels)
    # The brewer scales are only applied when there are fewer than nine
    # labels.
    use_brewer = n_labels != 0 and n_labels < 9
    for lisi_column in lisi_columns:
        # Make violin plot with an overlaid narrow boxplot.
        gplt = plt9.ggplot(
            df_lisi,
            plt9.aes(fill='label', x='label', y=lisi_column)
        )
        gplt = gplt + plt9.theme_bw(base_size=12)
        gplt = gplt + plt9.geom_violin(alpha=0.9)
        gplt = gplt + plt9.geom_boxplot(
            group='label',
            position=plt9.position_dodge(width=.9),
            width=.1,
            fill='white',
            outlier_alpha=0  # Do not know how to totally remove outliers.
        )
        # Add a line at the theoretical maximum.
        gplt = gplt + plt9.geom_hline(
            plt9.aes(yintercept=lisi_limit[lisi_column])
        )
        gplt = gplt + plt9.labs(x='Reduced dimensions', y='LISI', title='')
        gplt = gplt + plt9.theme(
            axis_text_x=plt9.element_text(angle=-45, hjust=0)
        )
        gplt = gplt + plt9.theme(legend_position='none')
        if use_brewer:
            gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
        gplt.save(
            '{}-{}-violin.png'.format(out_file_base, lisi_column),
            dpi=300,
            width=4 * (n_labels / 4),
            height=10,
            limitsize=False
        )

        # Make ecdf.
        gplt = plt9.ggplot(
            df_lisi,
            plt9.aes(x=lisi_column, color='label')
        )
        gplt = gplt + plt9.theme_bw(base_size=12)
        gplt = gplt + plt9.stat_ecdf(alpha=0.8)
        gplt = gplt + plt9.labs(
            x='LISI',
            y='Cumulative density',
            title=''
        )
        if use_brewer:
            gplt = gplt + plt9.scale_color_brewer(palette='Dark2', type='qual')
        gplt.save(
            '{}-{}-ecdf.pdf'.format(out_file_base, lisi_column),
            dpi=300,
            width=10,
            height=4,
            limitsize=False
        )