Exemplo n.º 1
0
def count_efms():
    """Print EFM counts per oxygen phenotype and export the Pareto tables.

    The rate tables of the 'standard' and 'anaerobic' data sets are
    concatenated (duplicate rows dropped) and every row is classified by
    three boolean tests:

    * produces biomass: rate of ``D.R_BIOMASS`` is above 1e-8
    * requires oxygen: any of ``D.R_OXYGEN_DEPENDENT`` has a non-zero rate
    * sensitive to oxygen: any of ``D.R_OXYGEN_SENSITIVE`` has an absolute
      rate above 1e-8

    The counts of several combinations of these tests are printed.
    Afterwards ``standard.pkl`` and ``anaerobic.pkl`` in ``D.TEMP_DIR`` are
    re-written next to themselves as CSV files.

    Returns:
        None. Results are printed and written to disk.
    """
    # Only the first returned table (the rates) is needed. Indexing instead
    # of tuple-unpacking keeps this working regardless of how many tables
    # the loader returns.
    rates1_df = get_concatenated_raw_data('standard')[0]
    rates2_df = get_concatenated_raw_data('anaerobic')[0]
    rates_df = pd.concat([rates1_df, rates2_df]).drop_duplicates()

    produces_biomass = rates_df[D.R_BIOMASS] > 1e-8
    # NOTE(review): this test uses an exact "!= 0" while the two others use
    # a 1e-8 tolerance; left as is because changing it would alter counts.
    requires_oxygen = (rates_df[D.R_OXYGEN_DEPENDENT] != 0).any(axis=1)
    sensitive_oxygen = (
        rates_df[D.R_OXYGEN_SENSITIVE].abs() > 1e-8).any(axis=1)

    print('produces biomass = %d' % produces_biomass.sum())
    print('requires oxygen = %d' %
          (produces_biomass & requires_oxygen & ~sensitive_oxygen).sum())
    print('sensitive to oxygen = %d' %
          (produces_biomass & ~requires_oxygen & sensitive_oxygen).sum())

    print('aerobic = %d' % (produces_biomass & ~sensitive_oxygen).sum())
    print('anaerobic = %d' % (produces_biomass & ~requires_oxygen).sum())
    print('infeasible = %d' %
          (produces_biomass & requires_oxygen & sensitive_oxygen).sum())
    print('facultative = %d' %
          (produces_biomass & ~requires_oxygen & ~sensitive_oxygen).sum())

    # write the data tables of the main Pareto figure to csv files
    for name in ('standard', 'anaerobic'):
        data = pd.read_pickle(os.path.join(D.TEMP_DIR, name + '.pkl'))
        data.to_csv(os.path.join(D.TEMP_DIR, name + '.csv'))
    def __init__(self, figure_data):
        """Precompute pairwise yield and epistasis matrices over reactions.

        Args:
            figure_data: mapping whose ``'standard'`` entry, indexed by
                ``D.YIELD_L``, gives the per-EFM yield values (presumably a
                DataFrame indexed by EFM; confirm against the caller).

        Attributes set:
            active_df: boolean frame, True where the rate is exactly zero.
            reactions: reaction ids sorted by their numeric part, then name.
            xticklabels / yticklabels: display names in forward / reverse
                order.
            yd_double: for each reaction pair, the best yield among EFMs in
                which both reactions have zero rate, divided by the overall
                best yield (0 where no such EFM exists).
            yd_epistasis: ``f_epi`` evaluated on ``yd_double`` per pair.
            mask1 / mask2: boolean upper-triangle masks, excluding and
                including the diagonal respectively.
        """
        rates_df, _, _, _ = get_concatenated_raw_data('standard')
        # NOTE: despite its name, this frame is True where a reaction carries
        # *zero* flux in an EFM (i.e. the reaction is not used). The pair
        # selection below relies on that meaning.
        self.active_df = (rates_df == 0)
        # raw string: '\d' in a plain literal is an invalid escape sequence
        self.reactions = sorted(
            self.active_df.columns,
            key=lambda s: (int(re.findall(r'[rR]+(\d+)', s)[0]), s))

        yield_df = figure_data['standard'][D.YIELD_L]
        self.xticklabels = list(map(D.GET_REACTION_NAME, self.reactions))
        self.yticklabels = list(reversed(self.xticklabels))
        self.yd_double = pd.DataFrame(index=self.reactions,
                                      columns=self.reactions, dtype=float)
        self.yd_epistasis = pd.DataFrame(index=self.reactions,
                                         columns=self.reactions, dtype=float)
        max_yield = yield_df.max()
        for r1 in self.reactions:
            for r2 in self.reactions:
                # EFMs in which neither r1 nor r2 carries flux
                inds = self.active_df[self.active_df[r1] & self.active_df[r2]].index
                self.yd_double.loc[r1, r2] = yield_df[inds].max() / max_yield
        # an empty selection yields NaN above; treat it as zero yield
        self.yd_double.fillna(0, inplace=True)

        for r1 in self.reactions:
            for r2 in self.reactions:
                self.yd_epistasis.loc[r1, r2] = f_epi(self.yd_double, r1, r2)

        # the np.bool alias was removed from NumPy; the builtin is equivalent
        self.mask1 = np.zeros_like(self.yd_double, dtype=bool)
        self.mask1[np.triu_indices_from(self.mask1, 1)] = True
        self.mask2 = np.zeros_like(self.yd_epistasis, dtype=bool)
        self.mask2[np.triu_indices_from(self.mask2, 0)] = True
    def plot_correlations(self):
        """Plot how closely each EFM matches the measured flux distribution.

        Two yield-vs-growth-rate scatter plots of the 'standard' figure data
        are drawn side by side. The left one is coloured by the correlation
        of each EFM's rates with row ``9999`` of the rates table, the right
        one by the score ``-sum(x**2) / 2`` where ``x`` is the per-reaction
        difference to row ``9999`` divided by a per-reaction stdev.
        Row ``9999`` is presumably the measured ("experimental") flux vector;
        confirm against the data loader.

        The figure is saved as ``Fig_flux_correlation.pdf`` in
        ``D.OUTPUT_DIR``. Note that the table returned by
        ``D.get_figure_data()['standard']`` is modified in place (two columns
        added, growth rate of strictly anaerobic EFMs set to 0).
        """
        # drop the exchange reactions (ids starting with 'xchg'); copy so the
        # id fix-up below does not write into a slice of self.fluxes_df
        keep = self.fluxes_df.reaction_id.str.find('xchg') != 0
        exp_flux_df = self.fluxes_df.loc[keep, :].copy()
        exp_flux_df['reaction_id'] = exp_flux_df.reaction_id.apply(
            D.FIX_REACTION_ID)

        fig0, axs0 = plt.subplots(1, 2, figsize=(15, 7))
        rates_df, _, _, _ = get_concatenated_raw_data('standard')

        CORR_FLUX_L = 'correlation with exp fluxes'
        LOG_LIKELIHOOD_L = 'log likelihood of flow'

        figure_data = D.get_figure_data()
        data = figure_data['standard']

        data[CORR_FLUX_L] = rates_df.transpose().corr().loc[9999]
        # score of each EFM against the measured flux distribution;
        # initialised as float so the per-row float writes keep the dtype
        data[LOG_LIKELIHOOD_L] = 0.0

        joined_rates = rates_df.T
        joined_rates['std'] = exp_flux_df[D.MEAS_STDEV_L]
        # missing stdevs become 0, then a constant 1.0 is added to all
        # NOTE(review): the original comment called this "a baseline stdev of
        # 10%"; whether 1.0 means 10% depends on units not visible here.
        joined_rates['std'] = joined_rates['std'].fillna(0) + 1.0
        for efm in data.index:
            x = (joined_rates[efm] - joined_rates[9999]) / joined_rates['std']
            data.loc[efm, LOG_LIKELIHOOD_L] = -(x**2).sum() / 2

        data.loc[data[D.STRICTLY_ANAEROBIC_L],
                 D.GROWTH_RATE_L] = 0  # remove oxygen-sensitive EFMs
        cmap = D.pareto_cmap(0.88)
        D.plot_basic_pareto(data,
                            axs0[0],
                            x=D.YIELD_L,
                            y=D.GROWTH_RATE_L,
                            c=CORR_FLUX_L,
                            cmap=cmap,
                            vmin=0,
                            vmax=1,
                            linewidth=0,
                            s=20)
        D.plot_basic_pareto(data,
                            axs0[1],
                            x=D.YIELD_L,
                            y=D.GROWTH_RATE_L,
                            c=LOG_LIKELIHOOD_L,
                            cmap=cmap,
                            linewidth=0,
                            s=20,
                            vmin=-100000,
                            vmax=0)

        for ax in axs0:
            for efm, info in D.efm_dict.items():
                xy = np.array(data.loc[efm,
                                       [D.YIELD_L, D.GROWTH_RATE_L]].tolist())
                xytext = xy + np.array((-1, 0.025))
                # the label is passed positionally: the keyword was renamed
                # from ``s`` to ``text`` in matplotlib, positional works in
                # both old and new versions
                ax.annotate(info['label'],
                            xy=xy,
                            xycoords='data',
                            xytext=xytext,
                            arrowprops=dict(facecolor='black',
                                            shrink=0.05,
                                            width=2,
                                            headwidth=4))
            ax.set_xlim(-1e-3, 1.1 * data[D.YIELD_L].max())
            ax.set_ylim(-1e-3, 1.15 * data[D.GROWTH_RATE_L].max())
        axs0[0].set_title('distance from measured fluxes (correlation)')
        axs0[1].set_title('distance from measured fluxes (likelihood)')
        fig0.tight_layout()

        fig0.savefig(os.path.join(D.OUTPUT_DIR, 'Fig_flux_correlation.pdf'))
                raise Exception('cannot balance reaction "%s": %s' %
                                (rxn, r.write_formula()))
            r = Reaction(row.to_dict())

    # transpose S to comply with the standard orientation for constraint-based flux models
    return stoich_df.transpose()


if __name__ == '__main__':
    # Script entry point: compute and print the MDF result of every EFM in
    # the 'standard' data set.
    S = read_stoichiometry()

    # one Reaction object per column of S, plus its dG0_prime() value
    rxn_df = S.apply(lambda col: Reaction(col.to_dict())).to_frame()
    rxn_df.columns = ['Reaction']
    rxn_df['dG0_prime'] = [rxn.dG0_prime() for rxn in rxn_df['Reaction']]

    rates_df, _, _, _ = get_concatenated_raw_data('standard')

    # keep only reactions that are in S (i.e., not external)
    rates_df = rates_df[S.columns]

    #%%
    BOUNDS = Bounds(default_lb=1e-6, default_ub=1e-2)
    mdf_data_dict = {}
    for efm, row in rates_df.iterrows():
        # restrict the pathway to the reactions with a non-zero rate
        fluxes = row[row != 0]
        used = fluxes.index
        pathway = Pathway(rxn_df.loc[used, 'Reaction'],
                          fluxes.values,
                          rxn_df.loc[used, 'dG0_prime'],
                          bounds=BOUNDS)
        mdf_result = pathway.calc_mdf()
        mdf_data_dict[efm] = mdf_result
        print(efm, mdf_result.mdf)
Exemplo n.º 5
0
def plot_tsne_figure(figure_data, figsize=(15, 13)):
    """Draw a 3x3 grid of t-SNE embeddings of the 'standard' EFM rates.

    The rate table of the 'standard' data set is embedded in two dimensions
    with ``TSNE``. Panel (a) shows the plain embedding with the EFMs listed
    in ``D.efm_dict`` annotated; the remaining eight panels show the same
    embedding coloured by one column of ``figure_data['standard']`` each.

    Args:
        figure_data: mapping whose ``'standard'`` entry is a table that can
            be joined onto the embedding by index.
        figsize: size passed to ``plt.subplots``.

    Returns:
        The matplotlib figure.

    Side effects:
        Calls ``np.set_printoptions(suppress=True)``, which changes NumPy's
        global print settings.
    """
    data = figure_data['standard']

    rates_df, _, _, _ = get_concatenated_raw_data('standard')
    # DataFrame.as_matrix() no longer exists in current pandas; .values is
    # available in old and new versions alike
    X = rates_df.values

    model = TSNE(n_components=2)
    np.set_printoptions(suppress=True)
    X_new = model.fit_transform(X)

    # build the embedding table directly from the array so both columns get
    # a numeric dtype
    rates_df_new = pd.DataFrame(X_new[:, :2],
                                index=rates_df.index,
                                columns=['t-SNE dim 1', 't-SNE dim 2'])
    data = rates_df_new.join(data)

    #%%
    fig, axs = plt.subplots(3, 3, figsize=figsize, sharex=True, sharey=True)
    axs = list(axs.flat)
    # letter each panel a, b, c, ...
    for i, ax in enumerate(axs):
        ax.annotate(chr(ord('a') + i),
                    xy=(0.04, 0.98),
                    xycoords='axes fraction',
                    ha='left',
                    va='top',
                    size=20)

    xdata = rates_df_new.iloc[:, 0]
    ydata = rates_df_new.iloc[:, 1]
    axs[0].scatter(xdata, ydata, s=15, c=(0.2, 0.2, 0.7), alpha=0.3)
    for efm, info in D.efm_dict.items():
        # the label is passed positionally: the keyword was renamed from
        # ``s`` to ``text`` in matplotlib, positional works in both
        axs[0].annotate(info['label'],
                        xy=(xdata.loc[efm], ydata.loc[efm]),
                        xycoords='data',
                        xytext=(30, 5),
                        textcoords='offset points',
                        arrowprops=dict(facecolor='black',
                                        shrink=0.05,
                                        width=2,
                                        headwidth=4),
                        ha='left',
                        va='bottom')

    # (colour column, panel title) for panels b..i
    plot_parameters = [
        (D.YIELD_L, 'biomass yield'),
        (D.GROWTH_RATE_L, 'growth rate'),
        (D.OXYGEN_L, 'oxygen uptake'),
        (D.ACE_L, 'acetate secretion'),
        (D.NH3_L, 'ammonia uptake'),
        (D.SUCCINATE_L, 'succinate secretion'),
        (D.ED_L, 'ED pathway'),
        (D.PPP_L, 'pentose phosphate pathway'),
    ]

    for ax, (color_column, title) in zip(axs[1:], plot_parameters):
        D.plot_basic_pareto(data,
                            x=rates_df_new.columns[0],
                            y=rates_df_new.columns[1],
                            c=color_column,
                            ax=ax,
                            cmap='copper_r',
                            linewidth=0.2,
                            s=10)
        ax.set_title(title)
    fig.tight_layout()
    return fig
Exemplo n.º 6
0
# Pareto plot of the 'standard' data set, saved as Fig_web4.pdf.
# NOTE(review): figure_data, ax2c and fig2c are not defined in the visible
# lines; they are expected to exist before this point.
data = figure_data['standard']
# remove oxygen-sensitive EFMs: their growth rate is set to 0.
# This writes into the table held by figure_data['standard'] (no copy is made).
data.loc[data[D.STRICTLY_ANAEROBIC_L], D.GROWTH_RATE_L] = 0
D.plot_basic_pareto(data, ax2c, x=D.YIELD_L, y=D.GROWTH_RATE_L,
                    efm_dict=D.efm_dict,
                    facecolors=D.PARETO_NEUTRAL_COLOR, edgecolors='none')
# axis limits: start just below zero, leave 10% / 15% headroom above the maxima
ax2c.set_xlim(-1e-3, 1.1*data[D.YIELD_L].max())
ax2c.set_ylim(-1e-3, 1.15*data[D.GROWTH_RATE_L].max())
ax2c.set_title('glucose = 100 mM, O$_2$ = 3.7 mM')
fig2c.tight_layout()

fig2c.savefig(os.path.join(D.OUTPUT_DIR, 'Fig_web4.pdf'))

# %% histogram of all different EFM growth rates in a specific condition
# NOTE(review): the visible code below draws via allocation_pie_chart, not a
# histogram; the cell title may be stale.
fig5 = plt.figure(figsize=(5, 5))
ax5 = fig5.add_subplot(1, 1, 1)

# draw onto ax5 for the standard glucose / oxygen concentrations; the return
# value is used below as an EFM identifier
efm = allocation_pie_chart(ax5, D.STD_CONC['glucoseExt'],
                           D.STD_CONC['oxygen'])
# for the 'sweep_glucose' data set the loader result is unpacked into two tables
rates_df, full_df = get_concatenated_raw_data('sweep_glucose')

# rows of the full table that belong to the selected EFM
# NOTE(review): df and v_BM are not used in the visible lines.
df = full_df[full_df['efm'] == efm]
v_BM = D.BIOMASS_MW * D.SECONDS_IN_HOUR * rates_df.at[efm, D.R_BIOMASS]

# make a new DataFrame where the index is the second column of full_df
# (presumably the glucose concentration - confirm), the columns are the
# reactions and the values are the 'E_i' entries (costs).
# NOTE(review): absol is not used in the visible lines.
absol = full_df[full_df['efm'] == efm].pivot(index=full_df.columns[1],
                                             columns='reaction',
                                             values='E_i')
fig5.savefig(os.path.join(D.OUTPUT_DIR, 'Fig_web3.pdf'))
    figS6 = plot_tsne_figure(figure_data)
    D.savefig(figS6, 'S6')

    # %% SI Figure 7
    # make bar plots for each reaction, counting how many EFMs it participates
    figS7, axS7 = plt.subplots(2, 2, figsize=(15, 12))

    for i, ax in enumerate(axS7.flat):
        ax.annotate(chr(ord('a') + i),
                    xy=(0.04, 0.98),
                    xycoords='axes fraction',
                    ha='left',
                    va='top',
                    size=20)

    rates1_df, _, _, _ = get_concatenated_raw_data('standard')
    rates2_df, _, _, _ = get_concatenated_raw_data('anaerobic')
    rates_df = pd.concat([rates1_df, rates2_df]).drop_duplicates()

    reaction_counts = 100 * (rates_df.abs() > 1e-8).sum(0) / rates_df.shape[0]

    plt.subplots_adjust(hspace=0.3)
    reaction_counts.sort_values(inplace=True)
    reaction_counts.plot(kind='bar',
                         ax=axS7[0, 0],
                         color=D.BAR_COLOR,
                         linewidth=0)
    axS7[0, 0].set_ylim(0, 100)
    axS7[0, 0].set_ylabel('\% of EFMs using this reaction')
    axS7[0, 0].set_xticklabels(map(D.GET_REACTION_NAME, reaction_counts.index))