def count_efms():
    """Print EFM counts per oxygen phenotype and export the Pareto tables.

    The rate tables of the 'standard' and 'anaerobic' datasets are stacked
    and de-duplicated. Each remaining row is then classified by three
    boolean criteria:

    * produces biomass  -- the ``D.R_BIOMASS`` rate exceeds 1e-8
    * requires oxygen   -- any ``D.R_OXYGEN_DEPENDENT`` rate is non-zero
    * oxygen sensitive  -- any ``D.R_OXYGEN_SENSITIVE`` rate exceeds 1e-8
      in absolute value

    The counts of several combinations of these criteria are printed.
    Finally, ``standard.pkl`` and ``anaerobic.pkl`` in ``D.TEMP_DIR`` are
    re-saved as CSV files with the same base name.
    """
    # Only the first returned element (the rates table) is used. Indexing
    # instead of tuple-unpacking keeps this function working regardless of
    # how many companion tables the loader returns.
    rates1_df = get_concatenated_raw_data('standard')[0]
    rates2_df = get_concatenated_raw_data('anaerobic')[0]
    rates_df = pd.concat([rates1_df, rates2_df]).drop_duplicates()

    produces_biomass = rates_df[D.R_BIOMASS] > 1e-8
    requires_oxygen = (rates_df[D.R_OXYGEN_DEPENDENT] != 0).any(axis=1)
    sensitive_oxygen = (
        rates_df[D.R_OXYGEN_SENSITIVE].abs() > 1e-8).any(axis=1)

    print('produces biomass = %d' % produces_biomass.sum())
    print('requires oxygen = %d' %
          (produces_biomass & requires_oxygen & ~sensitive_oxygen).sum())
    print('sensitive to oxygen = %d' %
          (produces_biomass & ~requires_oxygen & sensitive_oxygen).sum())
    print('aerobic = %d' % (produces_biomass & ~sensitive_oxygen).sum())
    print('anaerobic = %d' % (produces_biomass & ~requires_oxygen).sum())
    print('infeasible = %d' %
          (produces_biomass & requires_oxygen & sensitive_oxygen).sum())
    print('facultative = %d' %
          (produces_biomass & ~requires_oxygen & ~sensitive_oxygen).sum())

    # write the data table of the main Pareto figure to a csv file:
    for name in ('standard', 'anaerobic'):
        data = pd.read_pickle(os.path.join(D.TEMP_DIR, name + '.pkl'))
        data.to_csv(os.path.join(D.TEMP_DIR, name + '.csv'))
def __init__(self, figure_data):
    """Pre-compute the pairwise yield and epistasis matrices.

    Args:
        figure_data: mapping whose ``'standard'`` entry can be indexed by
            ``D.YIELD_L`` to obtain the per-EFM yield values.

    Attributes set:
        active_df: boolean DataFrame, True where a rate is exactly zero.
        reactions: column labels sorted by their embedded number, then name.
        xticklabels / yticklabels: display names of the reactions, in
            forward and reversed order.
        yd_double: for each reaction pair, the highest yield among the EFMs
            in which both reactions have zero flux, relative to the overall
            maximum yield (0 when no such EFM exists).
        yd_epistasis: ``f_epi(yd_double, r1, r2)`` for each reaction pair.
        mask1: True strictly above the diagonal.
        mask2: True on and above the diagonal.
    """
    rates_df, _, _, _ = get_concatenated_raw_data('standard')

    # NOTE: despite the attribute name, this is True where a reaction has
    # exactly zero flux in an EFM (i.e. where it is *not* used). The pair
    # loop below relies on that to select EFMs that use neither reaction.
    self.active_df = (rates_df == 0)

    # Sort by the first number following an 'r'/'R' in the label and break
    # ties alphabetically. A raw string avoids the invalid '\d' escape.
    self.reactions = sorted(
        self.active_df.columns,
        key=lambda s: (int(re.findall(r'[rR]+(\d+)', s)[0]), s))

    yield_df = figure_data['standard'][D.YIELD_L]

    self.xticklabels = list(map(D.GET_REACTION_NAME, self.reactions))
    self.yticklabels = list(reversed(self.xticklabels))

    self.yd_double = pd.DataFrame(index=self.reactions,
                                  columns=self.reactions, dtype=float)
    self.yd_epistasis = pd.DataFrame(index=self.reactions,
                                     columns=self.reactions, dtype=float)

    max_yield = yield_df.max()
    for r1 in self.reactions:
        zero_r1 = self.active_df[r1]
        for r2 in self.reactions:
            # EFMs in which both r1 and r2 carry zero flux
            inds = self.active_df.index[zero_r1 & self.active_df[r2]]
            self.yd_double.loc[r1, r2] = yield_df[inds].max() / max_yield

    # pairs without any matching EFM produce NaN; treat them as zero yield
    self.yd_double.fillna(0, inplace=True)

    for r1 in self.reactions:
        for r2 in self.reactions:
            self.yd_epistasis.loc[r1, r2] = f_epi(self.yd_double, r1, r2)

    # The builtin bool replaces the np.bool alias, which no longer exists
    # in current NumPy releases.
    self.mask1 = np.zeros_like(self.yd_double, dtype=bool)
    self.mask1[np.triu_indices_from(self.mask1, 1)] = True

    self.mask2 = np.zeros_like(self.yd_epistasis, dtype=bool)
    self.mask2[np.triu_indices_from(self.mask2, 0)] = True
def plot_correlations(self):
    """Plot how closely each EFM matches the measured fluxes.

    Two Pareto scatter plots (yield vs. growth rate) are drawn side by
    side. The left one is coloured by the correlation between each EFM's
    rate vector and the reference rate vector labelled 9999; the right one
    is coloured by a log-likelihood computed from the squared,
    stdev-scaled differences to that same reference vector. Every EFM
    listed in ``D.efm_dict`` is annotated on both panels, and the figure is
    saved as ``Fig_flux_correlation.pdf`` in ``D.OUTPUT_DIR``.
    """
    # Label of the reference flux vector inside the rates table.
    # NOTE(review): presumably the row holding the measured
    # ("experimental") fluxes -- confirm against the data loader.
    EXP_EFM = 9999
    CORR_FLUX_L = 'correlation with exp fluxes'
    LOG_LIKELIHOOD_L = 'log likelihood of flow'

    # remove the exchange reactions (xchg_*) and normalise reaction IDs
    not_exchange = self.fluxes_df.reaction_id.str.find('xchg') != 0
    exp_flux_df = self.fluxes_df.loc[not_exchange, :].copy()
    exp_flux_df.reaction_id = exp_flux_df.reaction_id.apply(
        D.FIX_REACTION_ID)

    fig0, axs0 = plt.subplots(1, 2, figsize=(15, 7))

    rates_df, _, _, _ = get_concatenated_raw_data('standard')

    figure_data = D.get_figure_data()
    data = figure_data['standard']

    data[CORR_FLUX_L] = rates_df.transpose().corr().loc[EXP_EFM]

    # calculate the likelihood of each EFM according to the measured flux
    # distribution
    data[LOG_LIKELIHOOD_L] = 0.0  # float, so the per-row floats fit the dtype
    joined_rates = rates_df.T
    # NOTE(review): this assignment aligns on the index of exp_flux_df;
    # rows without a match become NaN and are filled with 0 below.
    joined_rates['std'] = exp_flux_df[D.MEAS_STDEV_L]
    # add a constant baseline of 1.0 to every stdev (the original comment
    # describes this as "a baseline stdev of 10%")
    joined_rates['std'] = joined_rates['std'].fillna(0) + 1.0

    measured = joined_rates[EXP_EFM]
    stdev = joined_rates['std']
    for efm in data.index:
        x = (joined_rates[efm] - measured) / stdev
        data.loc[efm, LOG_LIKELIHOOD_L] = -(x ** 2).sum() / 2

    # remove oxygen-sensitive EFMs
    data.loc[data[D.STRICTLY_ANAEROBIC_L], D.GROWTH_RATE_L] = 0

    cmap = D.pareto_cmap(0.88)
    D.plot_basic_pareto(data, axs0[0], x=D.YIELD_L, y=D.GROWTH_RATE_L,
                        c=CORR_FLUX_L, cmap=cmap, vmin=0, vmax=1,
                        linewidth=0, s=20)
    D.plot_basic_pareto(data, axs0[1], x=D.YIELD_L, y=D.GROWTH_RATE_L,
                        c=LOG_LIKELIHOOD_L, cmap=cmap, linewidth=0, s=20,
                        vmin=-100000, vmax=0)

    arrowprops = dict(facecolor='black', shrink=0.05, width=2, headwidth=4)
    for ax in axs0:
        for efm, efm_info in D.efm_dict.items():
            xy = np.array(
                data.loc[efm, [D.YIELD_L, D.GROWTH_RATE_L]].tolist())
            xytext = xy + np.array((-1, 0.025))
            # The label is passed positionally: the keyword for it was
            # renamed from ``s`` to ``text`` across matplotlib releases.
            ax.annotate(efm_info['label'], xy=xy, xycoords='data',
                        xytext=xytext, arrowprops=arrowprops)
        ax.set_xlim(-1e-3, 1.1 * data[D.YIELD_L].max())
        ax.set_ylim(-1e-3, 1.15 * data[D.GROWTH_RATE_L].max())

    axs0[0].set_title('distance from measured fluxes (correlation)')
    axs0[1].set_title('distance from measured fluxes (likelihood)')
    fig0.tight_layout()
    fig0.savefig(os.path.join(D.OUTPUT_DIR, 'Fig_flux_correlation.pdf'))
# NOTE(review): the statements up to and including the ``return`` are the tail
# of a definition whose header lies outside this view (presumably
# read_stoichiometry(), which the script below calls -- confirm). Their
# original nesting cannot be recovered from here and must be restored from the
# full file.
raise Exception('cannot balance reaction "%s": %s' % (rxn, r.write_formula()))
r = Reaction(row.to_dict())
# transpose S to comply with the standard orientation for constraint-based flux models
return stoich_df.transpose()

# Script entry point: compute and print the MDF result of every EFM in the
# 'standard' dataset.
if __name__ == '__main__':
    S = read_stoichiometry()
    # Wrap the result of applying Reaction(...) over S in a one-column frame
    # named 'Reaction' (apply is called with its default axis).
    rxn_df = S.apply(lambda r: Reaction(r.to_dict())).to_frame()
    rxn_df.columns = ['Reaction']
    # dG0_prime() is evaluated once per Reaction object
    rxn_df['dG0_prime'] = list(
        rxn_df['Reaction'].apply(lambda r: r.dG0_prime()))
    rates_df, _, _, _ = get_concatenated_raw_data('standard')
    # keep only reactions that are in S (i.e., not external)
    rates_df = rates_df[S.columns]
    #%%
    # default lower/upper bounds handed to every Pathway below
    BOUNDS = Bounds(default_lb=1e-6, default_ub=1e-2)
    # maps each EFM label to the object returned by Pathway.calc_mdf()
    mdf_data_dict = {}
    for efm, row in rates_df.iterrows():
        # restrict the pathway to the reactions with non-zero flux in this EFM
        fluxes = row[row != 0]
        pathway = Pathway(rxn_df.loc[fluxes.index, 'Reaction'],
                          fluxes.values,
                          rxn_df.loc[fluxes.index, 'dG0_prime'],
                          bounds=BOUNDS)
        mdf_data_dict[efm] = pathway.calc_mdf()
        print(efm, mdf_data_dict[efm].mdf)
def plot_tsne_figure(figure_data, figsize=(15, 13)):
    """Draw a 3x3 grid of t-SNE embeddings of the EFM rate vectors.

    The rate table of the 'standard' dataset is embedded in two dimensions
    with ``TSNE``. Panel "a" shows the plain embedding with the EFMs from
    ``D.efm_dict`` annotated; the other eight panels show the same
    embedding coloured by one column of ``figure_data['standard']`` each.

    Args:
        figure_data: mapping whose ``'standard'`` entry is a DataFrame that
            can be joined to the embedding on its index and that contains
            the colour columns used below.
        figsize: size passed to ``plt.subplots``.

    Returns:
        The matplotlib figure holding all nine panels.
    """
    data = figure_data['standard']

    # the rates of all EFMs are taken from the 'standard' dataset
    rates_df, _, _, _ = get_concatenated_raw_data('standard')
    # ``.values`` replaces DataFrame.as_matrix(), which current pandas no
    # longer provides
    X = rates_df.values
    model = TSNE(n_components=2)
    np.set_printoptions(suppress=True)
    X_new = model.fit_transform(X)

    # Build the embedding frame directly from the array so that both
    # columns are numeric instead of being filled into an empty frame.
    rates_df_new = pd.DataFrame(X_new, index=rates_df.index,
                                columns=['t-SNE dim 1', 't-SNE dim 2'])
    data = rates_df_new.join(data)

    #%%
    fig, axs = plt.subplots(3, 3, figsize=figsize, sharex=True, sharey=True)
    axs = list(axs.flat)
    for i, ax in enumerate(axs):
        # panel letters: a, b, c, ...
        ax.annotate(chr(ord('a') + i), xy=(0.04, 0.98),
                    xycoords='axes fraction', ha='left', va='top',
                    size=20)

    xdata = rates_df_new.iloc[:, 0]
    ydata = rates_df_new.iloc[:, 1]
    axs[0].scatter(xdata, ydata, s=15, c=(0.2, 0.2, 0.7), alpha=0.3)

    arrowprops = dict(facecolor='black', shrink=0.05, width=2, headwidth=4)
    for efm, efm_info in D.efm_dict.items():
        xy = (xdata.loc[efm], ydata.loc[efm])
        # The label is passed positionally: the keyword for it was renamed
        # from ``s`` to ``text`` across matplotlib releases.
        axs[0].annotate(efm_info['label'], xy=xy, xycoords='data',
                        xytext=(30, 5), textcoords='offset points',
                        arrowprops=arrowprops, ha='left', va='bottom')

    # one entry per coloured panel: colour column and panel title
    plot_parameters = [
        {'c': D.YIELD_L, 'title': 'biomass yield'},
        {'c': D.GROWTH_RATE_L, 'title': 'growth rate'},
        {'c': D.OXYGEN_L, 'title': 'oxygen uptake'},
        {'c': D.ACE_L, 'title': 'acetate secretion'},
        {'c': D.NH3_L, 'title': 'ammonia uptake'},
        {'c': D.SUCCINATE_L, 'title': 'succinate secretion'},
        {'c': D.ED_L, 'title': 'ED pathway'},
        {'c': D.PPP_L, 'title': 'pentose phosphate pathway'},
    ]

    # axs[0] holds the plain scatter, so the coloured panels start at 1
    for i, d in enumerate(plot_parameters):
        d['ax'] = axs[i + 1]
        D.plot_basic_pareto(data, x=rates_df_new.columns[0],
                            y=rates_df_new.columns[1], c=d['c'],
                            ax=d['ax'], cmap='copper_r', linewidth=0.2,
                            s=10)
        d['ax'].set_title(d['title'])

    fig.tight_layout()
    return fig
# Pareto plot of the 'standard' dataset, saved as Fig_web4.pdf
data = figure_data['standard']

# oxygen-sensitive EFMs are taken out by forcing their growth rate to zero
data.loc[data[D.STRICTLY_ANAEROBIC_L], D.GROWTH_RATE_L] = 0

D.plot_basic_pareto(
    data,
    ax2c,
    x=D.YIELD_L,
    y=D.GROWTH_RATE_L,
    efm_dict=D.efm_dict,
    facecolors=D.PARETO_NEUTRAL_COLOR,
    edgecolors='none',
)
# leave some head-room beyond the largest yield and growth rate
ax2c.set_xlim(-1e-3, 1.1*data[D.YIELD_L].max())
ax2c.set_ylim(-1e-3, 1.15*data[D.GROWTH_RATE_L].max())
ax2c.set_title('glucose = 100 mM, O$_2$ = 3.7 mM')
fig2c.tight_layout()
fig2c.savefig(os.path.join(D.OUTPUT_DIR, 'Fig_web4.pdf'))

# %% histogram of all different EFM growth rates in a specific condition
fig5 = plt.figure(figsize=(5, 5))
ax5 = fig5.add_subplot(1, 1, 1)

# the pie-chart helper hands back the EFM that the rest of this cell uses
efm = allocation_pie_chart(
    ax5, D.STD_CONC['glucoseExt'], D.STD_CONC['oxygen'])

rates_df, full_df = get_concatenated_raw_data('sweep_glucose')

# rows of the sweep that belong to the selected EFM
df = full_df[full_df['efm'] == efm]
v_BM = D.BIOMASS_MW * D.SECONDS_IN_HOUR * rates_df.at[efm, D.R_BIOMASS]

# reshape into a table indexed by the second column of full_df (the glucose
# concentration), with one column per reaction holding its 'E_i' cost
absol = df.pivot(index=full_df.columns[1],
                 columns='reaction',
                 values='E_i')

fig5.savefig(os.path.join(D.OUTPUT_DIR, 'Fig_web3.pdf'))
# SI Figure 6: t-SNE embedding of the EFMs
figS6 = plot_tsne_figure(figure_data)
D.savefig(figS6, 'S6')

# %% SI Figure 7
# make bar plots for each reaction, counting how many EFMs it participates
figS7, axS7 = plt.subplots(2, 2, figsize=(15, 12))

for i, ax in enumerate(axS7.flat):
    # panel letters: a, b, c, d
    ax.annotate(chr(ord('a') + i), xy=(0.04, 0.98),
                xycoords='axes fraction', ha='left', va='top',
                size=20)

# stack the rate tables of both datasets and drop repeated rows
rates1_df, _, _, _ = get_concatenated_raw_data('standard')
rates2_df, _, _, _ = get_concatenated_raw_data('anaerobic')
rates_df = pd.concat([rates1_df, rates2_df]).drop_duplicates()

# percentage of rows in which each reaction's |rate| exceeds 1e-8
reaction_counts = 100 * (rates_df.abs() > 1e-8).sum(0) / rates_df.shape[0]

plt.subplots_adjust(hspace=0.3)
reaction_counts.sort_values(inplace=True)
reaction_counts.plot(kind='bar', ax=axS7[0, 0],
                     color=D.BAR_COLOR, linewidth=0)
axS7[0, 0].set_ylim(0, 100)
# Raw string: yields the same backslash-percent text as before without
# relying on the invalid '\%' escape sequence.
axS7[0, 0].set_ylabel(r'\% of EFMs using this reaction')
# Materialise the labels as a list; a bare map() is a one-shot iterator on
# Python 3.
axS7[0, 0].set_xticklabels(
    list(map(D.GET_REACTION_NAME, reaction_counts.index)))