def add_thermodynamics(cursor):
    """Rebuild the ``yeast_inchi2thermo`` table from group-contribution estimates.

    The table (and its index on ``inchi``) is dropped and re-created, then one
    row ``(inchi, charge, nH, dG0_f)`` is inserted for every entry of the
    pseudoisomer map computed for each distinct, non-NULL InChI found in
    ``yeast_species2inchi``.

    InChIs that raise IOError, GroupMissingTrainDataError or
    GroupDecompositionError are skipped and reported on stderr.

    Args:
        cursor: database cursor used both for reading the InChIs and for
            writing the results.
    """
    from groups import GroupMissingTrainDataError, GroupDecompositionError

    gc = GroupContribution(sqlite_name="gibbs.sqlite", html_name="pathologic")
    gc.init()

    cursor.execute("DROP TABLE IF EXISTS yeast_inchi2thermo")
    cursor.execute("CREATE TABLE yeast_inchi2thermo "
                   "(inchi TEXT, charge INT, nH INT, dG0_f REAL)")
    cursor.execute("DROP INDEX IF EXISTS yeast_inchi2thermo_idx")
    cursor.execute("CREATE INDEX yeast_inchi2thermo_idx "
                   "ON yeast_inchi2thermo (inchi);")

    # Collect all InChIs up front: the same cursor is reused for the INSERT
    # statements in the loop below.
    inchi_list = [str(row[0]) for row in cursor.execute(
        "SELECT distinct(inchi) "
        "FROM yeast_species2inchi WHERE inchi IS NOT NULL")]

    for inchi in inchi_list:
        try:
            mol = Molecule.FromInChI(inchi)
            pmap = gc.Mol2PseudoisomerMap(mol)
            for ((z, nH), dG0) in pmap.iteritems():
                cursor.execute(
                    "INSERT INTO yeast_inchi2thermo VALUES(?,?,?,?)",
                    [inchi, z, nH, dG0])
        except (IOError, GroupMissingTrainDataError, GroupDecompositionError):
            # Name the offending InChI and terminate the line so that
            # consecutive failures do not run together on stderr.
            sys.stderr.write(
                "Cannot convert the following InChI to a pybel Molecule: %s\n"
                % inchi)
def add_thermodynamics(cursor):
    """Rebuild the ``yeast_inchi2thermo`` table from group-contribution estimates.

    The table (and its index on ``inchi``) is dropped and re-created, then one
    row ``(inchi, charge, nH, dG0_f)`` is inserted for every entry of the
    pseudoisomer map computed for each distinct, non-NULL InChI found in
    ``yeast_species2inchi``.

    InChIs that raise IOError, GroupMissingTrainDataError or
    GroupDecompositionError are skipped and reported on stderr.

    Args:
        cursor: database cursor used both for reading the InChIs and for
            writing the results.
    """
    from groups import GroupMissingTrainDataError, GroupDecompositionError

    gc = GroupContribution(sqlite_name="gibbs.sqlite", html_name="pathologic")
    gc.init()

    cursor.execute("DROP TABLE IF EXISTS yeast_inchi2thermo")
    cursor.execute("CREATE TABLE yeast_inchi2thermo "
                   "(inchi TEXT, charge INT, nH INT, dG0_f REAL)")
    cursor.execute("DROP INDEX IF EXISTS yeast_inchi2thermo_idx")
    cursor.execute("CREATE INDEX yeast_inchi2thermo_idx "
                   "ON yeast_inchi2thermo (inchi);")

    # Collect all InChIs up front: the same cursor is reused for the INSERT
    # statements in the loop below.
    inchi_list = [str(row[0]) for row in cursor.execute(
        "SELECT distinct(inchi) "
        "FROM yeast_species2inchi WHERE inchi IS NOT NULL")]

    for inchi in inchi_list:
        try:
            mol = Molecule.FromInChI(inchi)
            pmap = gc.Mol2PseudoisomerMap(mol)
            for ((z, nH), dG0) in pmap.iteritems():
                cursor.execute(
                    "INSERT INTO yeast_inchi2thermo VALUES(?,?,?,?)",
                    [inchi, z, nH, dG0])
        except (IOError, GroupMissingTrainDataError, GroupDecompositionError):
            # Name the offending InChI and terminate the line so that
            # consecutive failures do not run together on stderr.
            sys.stderr.write(
                "Cannot convert the following InChI to a pybel Molecule: %s\n"
                % inchi)
def main():
    """Print the reversibility of every reaction of user-selected KEGG modules.

    Command-line options supply the median concentration, pH, pMg and ionic
    strength; the temperature is ``default_T``. Module ids are obtained from
    GetModuleIdInput() in an endless loop; for each reaction of the chosen
    module the reversibility (scaled by the reaction's flux) is printed, or
    the reaction dG when CalculateReversability returns None. In report mode
    the concentration used for each compound is printed as well.
    """
    options, _ = flags.MakeOpts().parse_args(sys.argv)
    c_mid = options.c_mid
    pH = options.ph
    pMg = options.pmg
    I = options.i_s
    T = default_T

    db = SqliteDatabase("../res/gibbs.sqlite")
    kegg = Kegg.getInstance()
    G = GroupContribution(db)
    G.init()

    print("Parameters: T=%f K, pH=%.2g, pMg=%.2g, "
          "I=%.2gM, Median concentration=%.2gM" % (T, pH, pMg, I, c_mid))

    cmap = {}
    if not options.ignore_cofactors:
        if options.full_metabolites:
            print("Fixing concentrations of all known metabolites")
            cmap = reversibility.GetFullConcentrationMap(G)
        else:
            print("Fixing concentrations of co-factors")
            cmap = reversibility.GetConcentrationMap(kegg)
    else:
        print("Not fixing concentrations of co-factors")

    if options.report_mode:
        print("Output used metabolites concentrations")

    while True:
        mid = GetModuleIdInput()
        rid_flux_list = kegg.mid2rid_map[mid]
        for rid, flux in rid_flux_list:
            try:
                reaction = kegg.rid2reaction(rid)
                print("Reaction Name %s" % (reaction.name,))
                print("\tKegg Id %s" % (reaction.rid,))
                print("\tEC %s" % (reaction.ec_list,))
                rev = reversibility.CalculateReversability(
                    reaction.sparse, G, pH=pH, I=I, pMg=pMg, T=T,
                    concentration_map=cmap)
                # Identity test: "== None" would invoke the result's own
                # __eq__ instead of checking for a missing value.
                if rev is None:
                    dG = G.estimate_dG_reaction(
                        reaction.sparse, pH=pH, pMg=pMg, I=I, T=T,
                        c0=c_mid, media="glucose")
                    print("\tReversibility: No free compounds, dG = %.2g"
                          % dG)
                else:
                    corrected_reversibility = flux * rev
                    print("\tReversibility %.2g" % corrected_reversibility)

                if options.report_mode:
                    for cid, s in reaction.sparse.iteritems():
                        if cid in cmap:
                            print("(%d C%05d) %s\t: %.2g"
                                  % (s, cid, kegg.cid2name(cid), cmap[cid]))
                        else:
                            print("(%d C%05d) %s\t: Free concentration"
                                  % (s, cid, kegg.cid2name(cid)))
            except Exception as e:
                # Best effort per reaction, but say why it failed instead of
                # silently discarding the error.
                print("\tCouldn't calculate irreversibility: %s" % (e,))
def test_single_modules(mids):
    """Run the thermodynamic pathway analysis on each of the given modules.

    For every module id an ``<h2>`` heading is written to
    ``../res/thermodynamic_module_analysis.html``, followed by whatever
    thermodynamic_pathway_analysis() emits for that module.

    Args:
        mids: iterable of integer KEGG module ids.
    """
    from pygibbs.groups import GroupContribution

    database = SqliteDatabase('../res/gibbs.sqlite')
    report = HtmlWriter("../res/thermodynamic_module_analysis.html")
    estimator = GroupContribution(database, report)
    estimator.init()

    for module_id in mids:
        report.write("<h2>M%05d</h2>\n" % module_id)
        module = estimator.kegg.get_module(module_id)
        stoichiometry, reaction_ids, module_fluxes, compound_ids = module
        thermodynamic_pathway_analysis(stoichiometry, reaction_ids,
                                       module_fluxes, compound_ids,
                                       estimator, report)
def test_all_modules():
    """Write pCr and MTDF for every KEGG module to ``../res/feasibility.csv``.

    Each module is evaluated on a grid of pH (5..9) and ionic strength
    (0.0..0.4) at T = 300. Compounds without a formation energy get NaN (the
    reason is reported once per compound on stderr). When the linear program
    has no solution, the corresponding column is written as None.
    """
    from pygibbs.groups import GroupContribution

    gc = GroupContribution(sqlite_name="gibbs.sqlite", html_name="dG0_test")
    gc.init()
    c_range = (1e-6, 1e-2)
    c_mid = 1e-3
    T = 300
    # CIDs that should be mapped to other CIDs because they are unspecific
    # (like NTP => ATP)
    map_cid = {201: 2, 454: 8}
    cids_with_missing_dG_f = set()

    # The context manager guarantees the CSV is flushed and closed even if a
    # module raises an unexpected exception half-way through.
    with open("../res/feasibility.csv", "w") as f:
        csv_output = csv.writer(f)
        csv_output.writerow(("MID", "module name", "pH", "I", "T", "pCr", "MTDF"))
        for mid in sorted(gc.kegg().mid2rid_map.keys()):
            module_name = gc.kegg().mid2name_map[mid]
            try:
                S, _rids, _fluxes, cids = gc.kegg().get_module(mid)
            except KeggMissingModuleException as e:
                sys.stderr.write("WARNING: " + str(e) + "\n")
                continue
            _Nr, Nc = S.shape
            for pH in [5, 6, 7, 8, 9]:
                for I in [0.0, 0.1, 0.2, 0.3, 0.4]:
                    dG0_f = pylab.zeros((Nc, 1))
                    for c in range(Nc):
                        cid = map_cid.get(cids[c], cids[c])
                        try:
                            pmap = gc.cid2PseudoisomerMap(cid)
                            dG0_f[c] = gc.pmap_to_dG0(pmap, pH, I, T)
                        except MissingCompoundFormationEnergy as e:
                            # Report each problematic compound only once.
                            if cid not in cids_with_missing_dG_f:
                                sys.stderr.write(
                                    "Setting the dG0_f of C%05d to NaN because: %s\n"
                                    % (cid, str(e)))
                                cids_with_missing_dG_f.add(cid)
                            dG0_f[c] = pylab.nan

                    bounds = [gc.kegg().cid2bounds.get(bound_cid, (None, None))
                              for bound_cid in cids]

                    try:
                        _dG_f, _concentrations, pCr = find_pCr(
                            S, dG0_f, c_mid=c_mid, ratio=3.0, bounds=bounds)
                    except LinProgNoSolutionException:
                        sys.stderr.write(
                            "M%05d: Pathway is theoretically infeasible\n" % mid)
                        pCr = None

                    try:
                        _dG_f, _concentrations, MTDF = find_mtdf(
                            S, dG0_f, c_range=c_range, bounds=bounds)
                    except LinProgNoSolutionException:
                        sys.stderr.write(
                            "M%05d: Pathway is theoretically infeasible\n" % mid)
                        MTDF = None

                    csv_output.writerow([mid, module_name, pH, I, T, pCr, MTDF])
def LoadAllEstimators():
    """Build the dictionary of available thermodynamic estimators.

    Trains the NIST regression first if the ``prc_pseudoisomers`` table is
    missing from the gibbs database. Table-backed estimators ('alberty',
    'PRC') and the group-contribution estimators ('BGC', 'PGC') are added
    only when their tables exist; 'hatzi_gc', 'UGC' and 'C1' are always
    added, and 'merged'/'merged_C1' only when 'PGC' is available.
    Concentration bounds are loaded into every estimator before returning.

    Returns:
        dict mapping estimator key to estimator object.

    Raises:
        KeyError: if the 'alberty' table is missing, since 'C1' (and
            'merged') are built on top of the 'alberty' estimator.
    """
    db_public = SqliteDatabase('../data/public_data.sqlite')
    db_gibbs = SqliteDatabase('../res/gibbs.sqlite')

    if not db_gibbs.DoesTableExist('prc_pseudoisomers'):
        nist_regression = NistRegression(db_gibbs)
        nist_regression.Train()

    tables = {
        'alberty': (db_public, 'alberty_pseudoisomers', 'Alberty'),
        'PRC': (db_gibbs, 'prc_pseudoisomers', 'our method (PRC)'),
    }

    estimators = {}
    for key, (db, table_name, thermo_name) in tables.iteritems():
        if db.DoesTableExist(table_name):
            estimators[key] = PsuedoisomerTableThermodynamics.FromDatabase(
                db, table_name, name=thermo_name)
        else:
            logging.warning('The table %s does not exist in %s',
                            table_name, str(db))

    estimators['hatzi_gc'] = Hatzi(use_pKa=False)
    #estimators['hatzi_gc_pka'] = Hatzi(use_pKa=True)

    # The group-contribution estimators are constructed on db_gibbs, so their
    # tables must be looked up there too. The loop variable `db` left over
    # from above depends on dict iteration order and may be db_public.
    if db_gibbs.DoesTableExist('bgc_pseudoisomers'):
        estimators['BGC'] = GroupContribution(db=db_gibbs, transformed=True)
        estimators['BGC'].init()
        estimators['BGC'].name = 'our method (BGC)'

    if db_gibbs.DoesTableExist('pgc_pseudoisomers'):
        estimators['PGC'] = GroupContribution(db=db_gibbs, transformed=False)
        estimators['PGC'].init()
        estimators['PGC'].name = 'our method (PGC)'

    estimators['UGC'] = UnifiedGroupContribution(db=db_gibbs)
    estimators['UGC'].init()
    estimators['UGC'].name = 'our method (UGC)'

    estimators['C1'] = ReactionThermodynamics.FromCsv(
        '../data/thermodynamics/c1_reaction_thermodynamics.csv',
        estimators['alberty'])

    if 'PGC' in estimators:
        estimators['merged'] = BinaryThermodynamics(estimators['alberty'],
                                                    estimators['PGC'])
        estimators['merged_C1'] = BinaryThermodynamics(estimators['C1'],
                                                       estimators['PGC'])

    for thermo in estimators.values():
        thermo.load_bounds('../data/thermodynamics/concentration_bounds.csv')

    return estimators
def try_kegg_api():
    """Plot the reversibility distribution of the E. coli reactions in KEGG.

    Reaction ids of all 'eco' pathways are fetched through the KEGG SOAP
    service and written (one per line, first occurrence only) to
    ``../res/eco_rids.txt``. The reversibility of each reaction is then
    computed; reactions raising MissingCompoundFormationEnergy are counted
    as misses. Finally the hit/miss counts and the median are printed and a
    CDF of the values is shown.
    """
    db = SqliteDatabase('../res/gibbs.sqlite')
    html_writer = HtmlWriter('../res/dG0_test.html')
    G = GroupContribution(db, html_writer=html_writer)
    G.init()

    wsdl = 'http://soap.genome.jp/KEGG.wsdl'
    serv = WSDL.Proxy(wsdl)

    rids = set()
    # Context manager: the id file is closed even if a remote call fails.
    with open('../res/eco_rids.txt', 'w') as rid_file:
        for x in serv.list_pathways('eco'):
            pathway_id = x['entry_id']
            for reaction_id in serv.get_reactions_by_pathway(pathway_id):
                # Drop the 4-character prefix and keep the numeric id.
                rid = int(reaction_id[4:])
                if rid not in rids:
                    rids.add(rid)
                    rid_file.write('%d\n' % rid)

    c_mid = 1e-3
    pH, pMg, I, T = (7.0, 3.0, 0.1, 298.15)

    rid2reversibility = {}
    misses = 0
    for rid in sorted(rids):
        try:
            reaction = G.kegg.rid2reaction(rid)
            r = CalculateReversability(reaction, G, c_mid, pH, pMg, I, T)
            rid2reversibility[rid] = r
        except thermodynamics.MissingCompoundFormationEnergy:
            misses += 1
            continue

    # Both values must be passed as one tuple; "fmt % a, b" applies % to the
    # first value only and fails with "not enough arguments".
    print('hits = %d, misses = %d' % (len(rid2reversibility), misses))
    median = pylab.median(rid2reversibility.values())
    print('median = %.1f' % median)

    pylab.figure()
    pylab.hold(True)
    plotting.cdf(rid2reversibility.values(), 'all reactions', 'r',
                 show_median=True)
    pylab.show()
def main():
    """Export formation and reaction energies for all of KEGG to CSV files.

    Writes one row per pseudoisomer of every compound to
    ``args.compounds_out_filename`` and one row per reaction to
    ``args.reactions_out_filename``. When a value cannot be computed, the
    error message is written in place of the number.
    """
    estimators = LoadAllEstimators()
    args, _ = MakeOpts(estimators).parse_args(sys.argv)

    # Make sure we have all the data.
    db = SqliteDatabase("../res/gibbs.sqlite")
    G = GroupContribution(db=db, html_writer=NullHtmlWriter(),
                          transformed=args.transformed)
    G.init()

    print("Exporting KEGG compounds to %s" % args.compounds_out_filename)
    # Open through a context manager so the rows are flushed and the handle
    # is released as soon as the export is done.
    with open(args.compounds_out_filename, "w") as compounds_file:
        csv_writer = csv.writer(compounds_file)
        csv_writer.writerow(["KEGG ID", "nH", "CHARGE", "nMg", "dG0_f"])
        for cid in sorted(G.get_all_cids()):
            try:
                for nH, z, nMg, dG0 in G.cid2PseudoisomerMap(cid).ToMatrix():
                    csv_writer.writerow(
                        ["C%05d" % cid, nH, z, nMg, "%.1f" % dG0])
            except MissingCompoundFormationEnergy as e:
                csv_writer.writerow(["C%05d" % cid, None, None, None, str(e)])

    print("Exporting KEGG reactions to %s" % args.reactions_out_filename)
    with open(args.reactions_out_filename, "w") as reactions_file:
        csv_writer = csv.writer(reactions_file)
        csv_writer.writerow(
            ["KEGG ID",
             "dG'0_r (pH=%.1f, I=%.2f, pMg=%.1f, T=%.1f)"
             % (args.ph, args.i_s, args.pmg, args.temp)])
        for rid in sorted(G.kegg.get_all_rids()):
            reaction = G.kegg.rid2reaction(rid)
            try:
                reaction.Balance(balance_water=True)
                dG0_r = reaction.PredictReactionEnergy(
                    G, pH=args.ph, pMg=args.pmg, I=args.i_s, T=args.temp)
                csv_writer.writerow(["R%05d" % rid, "%.1f" % dG0_r])
            except (KeggParseException,
                    MissingCompoundFormationEnergy,
                    KeggReactionNotBalancedException,
                    MissingReactionEnergy,
                    KeyError,
                    OpenBabelError) as e:
                csv_writer.writerow(["R%05d" % rid, str(e)])
def LoadAllEstimators():
    """Build the dictionary of available thermodynamic estimators.

    Trains the NIST regression first if the ``prc_pseudoisomers`` table is
    missing from the gibbs database. Table-backed estimators ('alberty',
    'PRC') and the group-contribution estimators ('BGC', 'PGC') are added
    only when their tables exist; 'hatzi_gc', 'UGC' and 'C1' are always
    added, and 'merged'/'merged_C1' only when 'PGC' is available.
    Concentration bounds are loaded into every estimator before returning.

    Returns:
        dict mapping estimator key to estimator object.

    Raises:
        KeyError: if the 'alberty' table is missing, since 'C1' (and
            'merged') are built on top of the 'alberty' estimator.
    """
    db_public = SqliteDatabase('../data/public_data.sqlite')
    db_gibbs = SqliteDatabase('../res/gibbs.sqlite')

    if not db_gibbs.DoesTableExist('prc_pseudoisomers'):
        nist_regression = NistRegression(db_gibbs)
        nist_regression.Train()

    tables = {
        'alberty': (db_public, 'alberty_pseudoisomers', 'Alberty'),
        'PRC': (db_gibbs, 'prc_pseudoisomers', 'our method (PRC)'),
    }

    estimators = {}
    for key, (db, table_name, thermo_name) in tables.iteritems():
        if db.DoesTableExist(table_name):
            estimators[key] = PsuedoisomerTableThermodynamics.FromDatabase(
                db, table_name, name=thermo_name)
        else:
            logging.warning('The table %s does not exist in %s',
                            table_name, str(db))

    estimators['hatzi_gc'] = Hatzi(use_pKa=False)
    #estimators['hatzi_gc_pka'] = Hatzi(use_pKa=True)

    # The group-contribution estimators are constructed on db_gibbs, so their
    # tables must be looked up there too. The loop variable `db` left over
    # from above depends on dict iteration order and may be db_public.
    if db_gibbs.DoesTableExist('bgc_pseudoisomers'):
        estimators['BGC'] = GroupContribution(db=db_gibbs, transformed=True)
        estimators['BGC'].init()
        estimators['BGC'].name = 'our method (BGC)'

    if db_gibbs.DoesTableExist('pgc_pseudoisomers'):
        estimators['PGC'] = GroupContribution(db=db_gibbs, transformed=False)
        estimators['PGC'].init()
        estimators['PGC'].name = 'our method (PGC)'

    estimators['UGC'] = UnifiedGroupContribution(db=db_gibbs)
    estimators['UGC'].init()
    estimators['UGC'].name = 'our method (UGC)'

    estimators['C1'] = ReactionThermodynamics.FromCsv(
        '../data/thermodynamics/c1_reaction_thermodynamics.csv',
        estimators['alberty'])

    if 'PGC' in estimators:
        estimators['merged'] = BinaryThermodynamics(estimators['alberty'],
                                                    estimators['PGC'])
        estimators['merged_C1'] = BinaryThermodynamics(estimators['C1'],
                                                       estimators['PGC'])

    for thermo in estimators.values():
        thermo.load_bounds('../data/thermodynamics/concentration_bounds.csv')

    return estimators
def main():
    """Export formation and reaction energies for all of KEGG to CSV files.

    Writes one row per pseudoisomer of every compound to
    ``args.compounds_out_filename`` and one row per reaction to
    ``args.reactions_out_filename``. When a value cannot be computed, the
    error message is written in place of the number.
    """
    estimators = LoadAllEstimators()
    args, _ = MakeOpts(estimators).parse_args(sys.argv)

    # Make sure we have all the data.
    db = SqliteDatabase('../res/gibbs.sqlite')
    G = GroupContribution(db=db, html_writer=NullHtmlWriter(),
                          transformed=args.transformed)
    G.init()

    print('Exporting KEGG compounds to %s' % args.compounds_out_filename)
    # Open through a context manager so the rows are flushed and the handle
    # is released as soon as the export is done.
    with open(args.compounds_out_filename, 'w') as compounds_file:
        csv_writer = csv.writer(compounds_file)
        csv_writer.writerow(["KEGG ID", "nH", "CHARGE", "nMg", "dG0_f"])
        for cid in sorted(G.get_all_cids()):
            try:
                for nH, z, nMg, dG0 in G.cid2PseudoisomerMap(cid).ToMatrix():
                    csv_writer.writerow(
                        ["C%05d" % cid, nH, z, nMg, "%.1f" % dG0])
            except MissingCompoundFormationEnergy as e:
                csv_writer.writerow(["C%05d" % cid, None, None, None, str(e)])

    print('Exporting KEGG reactions to %s' % args.reactions_out_filename)
    with open(args.reactions_out_filename, 'w') as reactions_file:
        csv_writer = csv.writer(reactions_file)
        csv_writer.writerow(
            ["KEGG ID",
             "dG'0_r (pH=%.1f, I=%.2f, pMg=%.1f, T=%.1f)"
             % (args.ph, args.i_s, args.pmg, args.temp)])
        for rid in sorted(G.kegg.get_all_rids()):
            reaction = G.kegg.rid2reaction(rid)
            try:
                reaction.Balance(balance_water=True)
                dG0_r = reaction.PredictReactionEnergy(
                    G, pH=args.ph, pMg=args.pmg, I=args.i_s, T=args.temp)
                csv_writer.writerow(["R%05d" % rid, "%.1f" % dG0_r])
            except (KeggParseException,
                    MissingCompoundFormationEnergy,
                    KeggReactionNotBalancedException,
                    MissingReactionEnergy,
                    KeyError,
                    OpenBabelError) as e:
                csv_writer.writerow(["R%05d" % rid, str(e)])
def main():
    """Print the reversibility of every reaction of user-selected KEGG modules.

    Command-line options supply the median concentration, pH, pMg and ionic
    strength; the temperature is ``default_T``. Module ids are obtained from
    GetModuleIdInput() in an endless loop; for each reaction of the chosen
    module the reversibility (scaled by the reaction's flux) is printed, or
    the reaction dG when CalculateReversability returns None. In report mode
    the concentration used for each compound is printed as well.
    """
    options, _ = flags.MakeOpts().parse_args(sys.argv)
    c_mid = options.c_mid
    pH = options.ph
    pMg = options.pmg
    I = options.i_s
    T = default_T

    db = SqliteDatabase('../res/gibbs.sqlite')
    kegg = Kegg.getInstance()
    G = GroupContribution(db)
    G.init()

    print('Parameters: T=%f K, pH=%.2g, pMg=%.2g, '
          'I=%.2gM, Median concentration=%.2gM' % (T, pH, pMg, I, c_mid))

    cmap = {}
    if not options.ignore_cofactors:
        if options.full_metabolites:
            print('Fixing concentrations of all known metabolites')
            cmap = reversibility.GetFullConcentrationMap(G)
        else:
            print('Fixing concentrations of co-factors')
            cmap = reversibility.GetConcentrationMap(kegg)
    else:
        print('Not fixing concentrations of co-factors')

    if options.report_mode:
        print('Output used metabolites concentrations')

    while True:
        mid = GetModuleIdInput()
        rid_flux_list = kegg.mid2rid_map[mid]
        for rid, flux in rid_flux_list:
            try:
                reaction = kegg.rid2reaction(rid)
                print('Reaction Name %s' % (reaction.name,))
                print('\tKegg Id %s' % (reaction.rid,))
                print('\tEC %s' % (reaction.ec_list,))
                rev = reversibility.CalculateReversability(
                    reaction.sparse, G, pH=pH, I=I, pMg=pMg, T=T,
                    concentration_map=cmap)
                # Identity test: "== None" would invoke the result's own
                # __eq__ instead of checking for a missing value.
                if rev is None:
                    dG = G.estimate_dG_reaction(
                        reaction.sparse, pH=pH, pMg=pMg, I=I, T=T,
                        c0=c_mid, media='glucose')
                    print('\tReversibility: No free compounds, dG = %.2g'
                          % dG)
                else:
                    corrected_reversibility = flux * rev
                    print('\tReversibility %.2g' % corrected_reversibility)

                if options.report_mode:
                    for cid, s in reaction.sparse.iteritems():
                        if cid in cmap:
                            print('(%d C%05d) %s\t: %.2g'
                                  % (s, cid, kegg.cid2name(cid), cmap[cid]))
                        else:
                            print('(%d C%05d) %s\t: Free concentration'
                                  % (s, cid, kegg.cid2name(cid)))
            except Exception as e:
                # Best effort per reaction, but say why it failed instead of
                # silently discarding the error.
                print('\tCouldn\'t calculate irreversibility: %s' % (e,))
def test_all_modules():
    """Write pCr and MTDF for every KEGG module to ``../res/feasibility.csv``.

    Each module is evaluated on a grid of pH (5..9) and ionic strength
    (0.0..0.4) at T = 300. Compounds without a formation energy get NaN (the
    reason is reported once per compound on stderr). When the linear program
    has no solution, the corresponding column is written as None.
    """
    from pygibbs.groups import GroupContribution

    gc = GroupContribution(sqlite_name="gibbs.sqlite", html_name="dG0_test")
    gc.init()
    c_range = (1e-6, 1e-2)
    c_mid = 1e-3
    T = 300
    # CIDs that should be mapped to other CIDs because they are unspecific
    # (like NTP => ATP)
    map_cid = {201: 2, 454: 8}
    cids_with_missing_dG_f = set()

    # The context manager guarantees the CSV is flushed and closed even if a
    # module raises an unexpected exception half-way through.
    with open("../res/feasibility.csv", "w") as f:
        csv_output = csv.writer(f)
        csv_output.writerow(("MID", "module name", "pH", "I", "T", "pCr", "MTDF"))
        for mid in sorted(gc.kegg().mid2rid_map.keys()):
            module_name = gc.kegg().mid2name_map[mid]
            try:
                S, _rids, _fluxes, cids = gc.kegg().get_module(mid)
            except KeggMissingModuleException as e:
                sys.stderr.write("WARNING: " + str(e) + "\n")
                continue
            _Nr, Nc = S.shape
            for pH in [5, 6, 7, 8, 9]:
                for I in [0.0, 0.1, 0.2, 0.3, 0.4]:
                    dG0_f = pylab.zeros((Nc, 1))
                    for c in range(Nc):
                        cid = map_cid.get(cids[c], cids[c])
                        try:
                            pmap = gc.cid2PseudoisomerMap(cid)
                            dG0_f[c] = gc.pmap_to_dG0(pmap, pH, I, T)
                        except MissingCompoundFormationEnergy as e:
                            # Report each problematic compound only once.
                            if cid not in cids_with_missing_dG_f:
                                sys.stderr.write(
                                    "Setting the dG0_f of C%05d to NaN because: %s\n"
                                    % (cid, str(e)))
                                cids_with_missing_dG_f.add(cid)
                            dG0_f[c] = pylab.nan

                    bounds = [gc.kegg().cid2bounds.get(bound_cid, (None, None))
                              for bound_cid in cids]

                    try:
                        _dG_f, _concentrations, pCr = find_pCr(
                            S, dG0_f, c_mid=c_mid, ratio=3.0, bounds=bounds)
                    except LinProgNoSolutionException:
                        sys.stderr.write(
                            "M%05d: Pathway is theoretically infeasible\n" % mid)
                        pCr = None

                    try:
                        _dG_f, _concentrations, MTDF = find_mtdf(
                            S, dG0_f, c_range=c_range, bounds=bounds)
                    except LinProgNoSolutionException:
                        sys.stderr.write(
                            "M%05d: Pathway is theoretically infeasible\n" % mid)
                        MTDF = None

                    csv_output.writerow([mid, module_name, pH, I, T, pCr, MTDF])
def CalculateThermo():
    """Estimate formation energies for molecules given on the command line.

    The molecule(s) come from the ``smiles`` option, the ``inchi`` option or
    a CSV file with an ``ID`` column plus an ``InChI`` or ``smiles`` column.
    For each molecule the group decomposition is evaluated and either the
    transformed energy (``options.biochemical``) or one row per pseudoisomer
    is collected. Results are written as CSV to
    ``options.csv_output_filename`` or, if that is None, to stdout.

    Molecules raising GroupDecompositionError or GroupMissingTrainDataError
    produce a row whose ``error`` column describes the problem.

    Raises:
        Exception: if the input CSV has neither an ``InChI`` nor a
            ``smiles`` column, or a molecule string cannot be converted.
    """
    parser = MakeOpts()
    options, _ = parser.parse_args(sys.argv)
    pH, I, pMg, T = options.pH, options.I, options.pMg, options.T

    db = SqliteDatabase('../res/gibbs.sqlite')
    G = GroupContribution(db=db)
    G.init()
    ignore_protonations = False

    list_of_mols = []
    if options.smiles:
        list_of_mols.append({'id': options.smiles, 'mol': options.smiles,
                             'format': 'smiles'})
    elif options.inchi:
        list_of_mols.append({'id': options.inchi, 'mol': options.inchi,
                             'format': 'inchi'})
    elif options.csv_input_filename:
        # Close the input file deterministically once all rows are read.
        with open(options.csv_input_filename, 'r') as csv_input:
            for row in csv.DictReader(csv_input):
                if "InChI" in row:
                    list_of_mols.append({'id': row["ID"], 'mol': row["InChI"],
                                         'format': 'inchi'})
                elif "smiles" in row:
                    list_of_mols.append({'id': row["ID"], 'mol': row["smiles"],
                                         'format': 'smiles'})
                else:
                    raise Exception("There must be one molecular ID column: InChI or smiles")
    else:
        parser.error("must use either -s or -c option")

    if options.biochemical:
        print("Calculating biochemical formation energies for %s compounds"
              " at pH = %.1f, I = %.2f, pMg = %.1f, T = %.2f"
              % (len(list_of_mols), pH, I, pMg, T))
    else:
        print("Calculating chemical formation energies for %s compounds"
              % len(list_of_mols))

    rowdicts = []
    for mol_dict in list_of_mols:
        mol_id = mol_dict['id']
        diss_table = Molecule._GetDissociationTable(mol_dict['mol'],
                                                    fmt=mol_dict['format'])
        # Reset per molecule so the error handler below can never report the
        # group vector of a previous molecule (or hit an unbound name).
        groupvec = None
        try:
            mol = diss_table.GetMostAbundantMol(pH, I, pMg, T) or \
                  diss_table.GetAnyMol()
            if mol is None:
                raise Exception("Cannot convert input string to Molecule: "
                                + mol_dict['mol'])

            decomposition = G.Mol2Decomposition(
                mol, ignore_protonations=ignore_protonations)
            groupvec = decomposition.AsVector()
            dG0 = G.groupvec2val(groupvec)
            nH = decomposition.Hydrogens()
            nMg = decomposition.Magnesiums()
            diss_table.SetFormationEnergyByNumHydrogens(dG0, nH, nMg)
            pmap = diss_table.GetPseudoisomerMap()

            if options.biochemical:
                dG0_prime = pmap.Transform(pH, pMg, I, T)
                rowdicts.append({'ID': mol_id, 'pH': pH, 'I': I, 'pMg': pMg,
                                 'dG0\'': "%.1f" % dG0_prime,
                                 'groupvec': str(groupvec)})
            else:
                for p_nH, p_z, p_nMg, p_dG0 in pmap.ToMatrix():
                    rowdicts.append({'ID': mol_id, 'nH': p_nH, 'charge': p_z,
                                     'nMg': p_nMg, 'dG0': "%.1f" % p_dG0,
                                     'groupvec': str(groupvec)})
        except GroupDecompositionError:
            rowdicts.append({'ID': mol_id, 'error': "cannot decompose"})
        except GroupMissingTrainDataError:
            error_row = {'ID': mol_id, 'error': "missing training data"}
            if groupvec is not None:
                error_row['groupvec'] = str(groupvec)
            rowdicts.append(error_row)

    if options.csv_output_filename is not None:
        out_fp = open(options.csv_output_filename, 'w')
        print("writing results to %s ... " % options.csv_output_filename)
    else:
        out_fp = sys.stdout

    try:
        if options.biochemical:
            titles = ['ID', 'error', 'pH', 'I', 'pMg', 'dG0\'', 'groupvec']
        else:
            titles = ['ID', 'error', 'nH', 'nMg', 'charge', 'dG0', 'groupvec']
        csv_writer = csv.DictWriter(out_fp, titles)
        csv_writer.writeheader()
        csv_writer.writerows(rowdicts)
    finally:
        # Only close what this function opened; stdout stays usable.
        if out_fp is not sys.stdout:
            out_fp.close()
def CalculateThermo():
    """Estimate formation energies for molecules given on the command line.

    The molecule(s) come from the ``smiles`` option, the ``inchi`` option or
    a CSV file with an ``ID`` column plus an ``InChI`` or ``smiles`` column.
    For each molecule the group decomposition is evaluated and either the
    transformed energy (``options.biochemical``) or one row per pseudoisomer
    is collected. Results are written as CSV to
    ``options.csv_output_filename`` or, if that is None, to stdout.

    Molecules raising GroupDecompositionError or GroupMissingTrainDataError
    produce a row whose ``error`` column describes the problem.

    Raises:
        Exception: if the input CSV has neither an ``InChI`` nor a
            ``smiles`` column, or a molecule string cannot be converted.
    """
    parser = MakeOpts()
    options, _ = parser.parse_args(sys.argv)
    pH, I, pMg, T = options.pH, options.I, options.pMg, options.T

    db = SqliteDatabase('../res/gibbs.sqlite')
    G = GroupContribution(db=db)
    G.init()
    ignore_protonations = False

    list_of_mols = []
    if options.smiles:
        list_of_mols.append({'id': options.smiles, 'mol': options.smiles,
                             'format': 'smiles'})
    elif options.inchi:
        list_of_mols.append({'id': options.inchi, 'mol': options.inchi,
                             'format': 'inchi'})
    elif options.csv_input_filename:
        # Close the input file deterministically once all rows are read.
        with open(options.csv_input_filename, 'r') as csv_input:
            for row in csv.DictReader(csv_input):
                if "InChI" in row:
                    list_of_mols.append({'id': row["ID"], 'mol': row["InChI"],
                                         'format': 'inchi'})
                elif "smiles" in row:
                    list_of_mols.append({'id': row["ID"], 'mol': row["smiles"],
                                         'format': 'smiles'})
                else:
                    raise Exception("There must be one molecular ID column: InChI or smiles")
    else:
        parser.error("must use either -s or -c option")

    if options.biochemical:
        print("Calculating biochemical formation energies for %s compounds"
              " at pH = %.1f, I = %.2f, pMg = %.1f, T = %.2f"
              % (len(list_of_mols), pH, I, pMg, T))
    else:
        print("Calculating chemical formation energies for %s compounds"
              % len(list_of_mols))

    rowdicts = []
    for mol_dict in list_of_mols:
        mol_id = mol_dict['id']
        diss_table = Molecule._GetDissociationTable(mol_dict['mol'],
                                                    fmt=mol_dict['format'])
        # Reset per molecule so the error handler below can never report the
        # group vector of a previous molecule (or hit an unbound name).
        groupvec = None
        try:
            mol = diss_table.GetMostAbundantMol(pH, I, pMg, T) or \
                  diss_table.GetAnyMol()
            if mol is None:
                raise Exception("Cannot convert input string to Molecule: "
                                + mol_dict['mol'])

            decomposition = G.Mol2Decomposition(
                mol, ignore_protonations=ignore_protonations)
            groupvec = decomposition.AsVector()
            dG0 = G.groupvec2val(groupvec)
            nH = decomposition.Hydrogens()
            nMg = decomposition.Magnesiums()
            diss_table.SetFormationEnergyByNumHydrogens(dG0, nH, nMg)
            pmap = diss_table.GetPseudoisomerMap()

            if options.biochemical:
                dG0_prime = pmap.Transform(pH, pMg, I, T)
                rowdicts.append({'ID': mol_id, 'pH': pH, 'I': I, 'pMg': pMg,
                                 'dG0\'': "%.1f" % dG0_prime,
                                 'groupvec': str(groupvec)})
            else:
                for p_nH, p_z, p_nMg, p_dG0 in pmap.ToMatrix():
                    rowdicts.append({'ID': mol_id, 'nH': p_nH, 'charge': p_z,
                                     'nMg': p_nMg, 'dG0': "%.1f" % p_dG0,
                                     'groupvec': str(groupvec)})
        except GroupDecompositionError:
            rowdicts.append({'ID': mol_id, 'error': "cannot decompose"})
        except GroupMissingTrainDataError:
            error_row = {'ID': mol_id, 'error': "missing training data"}
            if groupvec is not None:
                error_row['groupvec'] = str(groupvec)
            rowdicts.append(error_row)

    if options.csv_output_filename is not None:
        out_fp = open(options.csv_output_filename, 'w')
        print("writing results to %s ... " % options.csv_output_filename)
    else:
        out_fp = sys.stdout

    try:
        if options.biochemical:
            titles = ['ID', 'error', 'pH', 'I', 'pMg', 'dG0\'', 'groupvec']
        else:
            titles = ['ID', 'error', 'nH', 'nMg', 'charge', 'dG0', 'groupvec']
        csv_writer = csv.DictWriter(out_fp, titles)
        csv_writer.writeheader()
        csv_writer.writerows(rowdicts)
    finally:
        # Only close what this function opened; stdout stays usable.
        if out_fp is not sys.stdout:
            out_fp.close()
def main():
    """Compare formation-energy estimators against NIST and write an HTML report.

    Only the first branch of the ``if True`` / ``elif False`` chain below is
    executed; the other branches are disabled experiments kept in place.
    The report goes to ``../res/nist/report.html`` and is closed at the end.
    """
    db = database.SqliteDatabase('../res/gibbs.sqlite')
    html_writer = HtmlWriter("../res/nist/report.html")
    gc = GroupContribution(db)
    gc.override_gc_with_measurements = True
    gc.init()
    grad = GradientAscent(gc)
    nist = Nist(db, html_writer, gc.kegg())
    nist.FromDatabase()
    alberty = Alberty()
    hatzi = Hatzi()

    training_msg = "Training %d compounds using %d reactions: "

    if True:
        grad.load_nist_data(nist, alberty, skip_missing_reactions=False,
                            T_range=(298, 314))
        # The NIST data is loaded once (above) and reused, without
        # reloading, for every estimator that is verified.
        for title, estimator in (("Alberty", alberty),
                                 ("Hatzimanikatis", hatzi),
                                 ("Milo", gc)):
            grad.verify_results(title, estimator, html_writer)
    elif False:
        # Run the gradient ascent algorithm, where the starting point is the
        # same file used for training the GC algorithm
        grad.load_dG0_data("../data/thermodynamics/dG0.csv")
        # load the data for the anchors (i.e. compounds whose dG0 should not
        # be changed - usually their value will be 0).
        grad.anchors = grad.load_dG0_data(
            "../data/thermodynamics/nist_anchors.csv")
        grad.load_nist_data(nist, grad, skip_missing_reactions=True)
        print(training_msg
              % (len(grad.cid2pmap_dict.keys()), len(grad.data)))
        grad.hill_climb(max_i=20000)
        grad.save_energies(grad.gc.comm, "gradient_cid2prm")
        grad.verify_results("gradient1")
    elif False:
        # Run the gradient ascent algorithm, where the starting point is
        # Alberty's table from (Mathematica 2006)
        grad.load_nist_data(nist, alberty, skip_missing_reactions=True)
        print(training_msg
              % (len(grad.cid2pmap_dict.keys()), len(grad.data)))
        grad.cid2pmap_dict = alberty.cid2pmap_dict
        grad.hill_climb(max_i=20000)
        grad.save_energies(grad.gc.comm, "gradient_cid2prm")
        grad.verify_results("gradient2")
    elif False:
        # Run the gradient ascent algorithm, where the starting point is
        # Alberty's table from (Mathematica 2006)
        # Use DETERMINISTIC gradient ascent
        grad.load_nist_data(nist, alberty, skip_missing_reactions=True,
                            T_range=(24 + 273.15, 40 + 273.15))
        print(training_msg
              % (len(grad.cid2pmap_dict.keys()), len(grad.data)))
        grad.cid2pmap_dict = alberty.cid2pmap_dict
        grad.deterministic_hill_climb(max_i=200)
        grad.save_energies(grad.gc.comm, "gradient_cid2prm")
        grad.verify_results("gradient_deterministic")
    elif False:
        # Run the gradient ascent algorithm, where the starting point
        # arbitrary (predict all of the NIST compounds)
        grad = GradientAscent(gc)
        grad.load_nist_data(nist, skip_missing_reactions=False)
        print(training_msg
              % (len(grad.cid2pmap_dict.keys()), len(grad.data)))
        grad.hill_climb(max_i=20000)
        grad.save_energies(grad.gc.comm, "gradient_cid2prm")
        grad.verify_results("gradient3")
    elif False:
        # Use Alberty's table from (Mathematica 2006) to calculate the dG0 of
        # all possible reactions in KEGG
        grad = GradientAscent(gc)
        grad.cid2pmap_dict = alberty.cid2pmap_dict
        pH, I, T = 7, 0, 300
        counter = 0
        for rid in grad.kegg.get_all_rids():
            sparse_reaction = grad.kegg.rid2sparse_reaction(rid)
            try:
                dG0 = grad.reaction_to_dG0(sparse_reaction, pH, I, T)
            except MissingCompoundFormationEnergy:
                # Reactions with unknown formation energies are not counted.
                continue
            print("R%05d: dG0_r = %.2f [kJ/mol]" % (rid, dG0))
            counter += 1
        print("Managed to calculate the dG0 of %d reactions" % counter)
    elif False:
        util._mkdir("../res/nist/fig")
        csv_writer = csv.writer(open("../res/nist/pseudoisomers.csv", "w"))

        # Every compound that takes part in at least one NIST reaction.
        cid_set = set()
        for row in nist.data:
            cid_set.update(row['sparse'].keys())

        html_writer.write("<table border=1>\n")
        for cid in sorted(cid_set):
            html_writer.write(" <tr><td>C%05d</td><td>%s</td><td>"
                              % (cid, grad.kegg.cid2name(cid)))
            try:
                mol = grad.kegg.cid2mol(cid)
                img_fname = '../res/nist/fig/C%05d.png' % cid
                html_writer.embed_img(img_fname, "C%05d" % cid)
                mol.draw(show=False, filename=img_fname)
            except (AssertionError, KeggParseException) as e:
                html_writer.write("WARNING: cannot draw C%05d - %s"
                                  % (cid, str(e)))
            html_writer.write("</td><td>")

            if cid not in alberty.cid2pmap_dict:
                nH = grad.kegg.cid2num_hydrogens(cid)
                z = grad.kegg.cid2charge(cid)
                html_writer.write("unknown pseudoisomers<br>")
                html_writer.write("(nH=%d, z=%d)" % (nH, z))
                csv_writer.writerow((cid, nH, z))
            else:
                for (nH, z) in alberty.cid2pmap_dict[cid].keys():
                    html_writer.write("(nH=%d, z=%d)<br>" % (nH, z))
                    csv_writer.writerow((cid, nH, z))
            html_writer.write("</td></tr>\n")
        html_writer.write("</table>\n")

    html_writer.close()
def main():
    """Compare formation-energy estimators against NIST and write an HTML report.

    Only the first branch of the ``if True`` / ``elif False`` chain below is
    executed; the other branches are disabled experiments kept in place.
    The report goes to ``../res/nist/report.html`` and is closed at the end.
    """
    db = database.SqliteDatabase('../res/gibbs.sqlite')
    html_writer = HtmlWriter("../res/nist/report.html")
    gc = GroupContribution(db)
    gc.override_gc_with_measurements = True
    gc.init()
    grad = GradientAscent(gc)
    nist = Nist(db, html_writer, gc.kegg())
    nist.FromDatabase()
    alberty = Alberty()
    hatzi = Hatzi()

    training_msg = "Training %d compounds using %d reactions: "

    if True:
        grad.load_nist_data(nist, alberty, skip_missing_reactions=False,
                            T_range=(298, 314))
        # The NIST data is loaded once (above) and reused, without
        # reloading, for every estimator that is verified.
        for title, estimator in (("Alberty", alberty),
                                 ("Hatzimanikatis", hatzi),
                                 ("Milo", gc)):
            grad.verify_results(title, estimator, html_writer)
    elif False:
        # Run the gradient ascent algorithm, where the starting point is the
        # same file used for training the GC algorithm
        grad.load_dG0_data("../data/thermodynamics/dG0.csv")
        # load the data for the anchors (i.e. compounds whose dG0 should not
        # be changed - usually their value will be 0).
        grad.anchors = grad.load_dG0_data(
            "../data/thermodynamics/nist_anchors.csv")
        grad.load_nist_data(nist, grad, skip_missing_reactions=True)
        print(training_msg
              % (len(grad.cid2pmap_dict.keys()), len(grad.data)))
        grad.hill_climb(max_i=20000)
        grad.save_energies(grad.gc.comm, "gradient_cid2prm")
        grad.verify_results("gradient1")
    elif False:
        # Run the gradient ascent algorithm, where the starting point is
        # Alberty's table from (Mathematica 2006)
        grad.load_nist_data(nist, alberty, skip_missing_reactions=True)
        print(training_msg
              % (len(grad.cid2pmap_dict.keys()), len(grad.data)))
        grad.cid2pmap_dict = alberty.cid2pmap_dict
        grad.hill_climb(max_i=20000)
        grad.save_energies(grad.gc.comm, "gradient_cid2prm")
        grad.verify_results("gradient2")
    elif False:
        # Run the gradient ascent algorithm, where the starting point is
        # Alberty's table from (Mathematica 2006)
        # Use DETERMINISTIC gradient ascent
        grad.load_nist_data(nist, alberty, skip_missing_reactions=True,
                            T_range=(24 + 273.15, 40 + 273.15))
        print(training_msg
              % (len(grad.cid2pmap_dict.keys()), len(grad.data)))
        grad.cid2pmap_dict = alberty.cid2pmap_dict
        grad.deterministic_hill_climb(max_i=200)
        grad.save_energies(grad.gc.comm, "gradient_cid2prm")
        grad.verify_results("gradient_deterministic")
    elif False:
        # Run the gradient ascent algorithm, where the starting point
        # arbitrary (predict all of the NIST compounds)
        grad = GradientAscent(gc)
        grad.load_nist_data(nist, skip_missing_reactions=False)
        print(training_msg
              % (len(grad.cid2pmap_dict.keys()), len(grad.data)))
        grad.hill_climb(max_i=20000)
        grad.save_energies(grad.gc.comm, "gradient_cid2prm")
        grad.verify_results("gradient3")
    elif False:
        # Use Alberty's table from (Mathematica 2006) to calculate the dG0 of
        # all possible reactions in KEGG
        grad = GradientAscent(gc)
        grad.cid2pmap_dict = alberty.cid2pmap_dict
        pH, I, T = 7, 0, 300
        counter = 0
        for rid in grad.kegg.get_all_rids():
            sparse_reaction = grad.kegg.rid2sparse_reaction(rid)
            try:
                dG0 = grad.reaction_to_dG0(sparse_reaction, pH, I, T)
            except MissingCompoundFormationEnergy:
                # Reactions with unknown formation energies are not counted.
                continue
            print("R%05d: dG0_r = %.2f [kJ/mol]" % (rid, dG0))
            counter += 1
        print("Managed to calculate the dG0 of %d reactions" % counter)
    elif False:
        util._mkdir("../res/nist/fig")
        csv_writer = csv.writer(open("../res/nist/pseudoisomers.csv", "w"))

        # Every compound that takes part in at least one NIST reaction.
        cid_set = set()
        for row in nist.data:
            cid_set.update(row['sparse'].keys())

        html_writer.write("<table border=1>\n")
        for cid in sorted(cid_set):
            html_writer.write(" <tr><td>C%05d</td><td>%s</td><td>"
                              % (cid, grad.kegg.cid2name(cid)))
            try:
                mol = grad.kegg.cid2mol(cid)
                img_fname = '../res/nist/fig/C%05d.png' % cid
                html_writer.embed_img(img_fname, "C%05d" % cid)
                mol.draw(show=False, filename=img_fname)
            except (AssertionError, KeggParseException) as e:
                html_writer.write("WARNING: cannot draw C%05d - %s"
                                  % (cid, str(e)))
            html_writer.write("</td><td>")

            if cid not in alberty.cid2pmap_dict:
                nH = grad.kegg.cid2num_hydrogens(cid)
                z = grad.kegg.cid2charge(cid)
                html_writer.write("unknown pseudoisomers<br>")
                html_writer.write("(nH=%d, z=%d)" % (nH, z))
                csv_writer.writerow((cid, nH, z))
            else:
                for (nH, z) in alberty.cid2pmap_dict[cid].keys():
                    html_writer.write("(nH=%d, z=%d)<br>" % (nH, z))
                    csv_writer.writerow((cid, nH, z))
            html_writer.write("</td></tr>\n")
        html_writer.write("</table>\n")

    html_writer.close()
v[0, i] = S[cids.index(cid), j] if (abs(v * var_P_N) > 1e-10).any(): dG0_r_prime[0, j] = np.nan return dG0_r_prime if __name__ == "__main__": from pygibbs.groups import GroupContribution db_public = SqliteDatabase('../data/public_data.sqlite') db_gibbs = SqliteDatabase('../res/gibbs.sqlite') alberty = PsuedoisomerTableThermodynamics.FromDatabase(\ db_public, 'alberty_pseudoisomers', name='alberty') pgc = GroupContribution(db=db_gibbs, transformed=False) pgc.init() pgc.name = "PGC" merged = BinaryThermodynamics(alberty, pgc) S = np.matrix([[-1, 1, 0, 0, 0, 0, 0, 0, 0], [ 0, -1, -1, 1, 0, 0, -1, 1, 1], [ 0, 0, 0, -1, 1, 1, 0, 0, 0], [ 0, -1, -1, 0, 1, 1, -1, 1, 1]]).T cids = [311, 158, 10, 566, 24, 36, 2, 8, 9] print alberty.GetTransformedFormationEnergies(cids) print alberty.GetTransfromedReactionEnergies(S, cids) print pgc.GetTransfromedReactionEnergies(S, cids) dG0_r_primes = merged.GetTransfromedReactionEnergies(S, cids)