def __init__(self, sparse, arrow='<=>', rid=None):
    """
        Create a reaction from a sparse stoichiometry dictionary.

        Arguments:
            sparse - a dictionary whose values are the stoichiometric
                     coefficients (keyed by compound ID)
            arrow  - the string used as the 'arrow' of the reaction
                     (default: '<=>')
            rid    - an optional reaction ID

        Raises a ValueError if any coefficient is neither an int nor a
        float. Entries whose coefficient is falsy (e.g. 0) are not stored.
    """
    for coeff in sparse.values():
        if not isinstance(coeff, (float, int)):
            raise ValueError(
                'All values in KeggReaction must be integers or floats')

    # keep only the compounds that actually take part in the reaction
    self.sparse = {cid: coeff for cid, coeff in sparse.items() if coeff}
    self.arrow = arrow
    self.rid = rid
    self.ccache = CompoundCacher()
def main(fname, pH, I, T):
    """
        Print the chemical and the transformed formation energy of every
        compound listed in a tab-separated file.

        Arguments:
            fname - path of a tab-separated file; the first column must
                    contain a compound ID followed by '_10' (e.g.
                    'C00002_10') and the second column the dG0 value
            pH, I, T - passed as-is to the compound's transform_neutral()

        For each row a line 'compound_id<TAB>dG0<TAB>dG0_prime' is printed.
        The compound cache is dumped once all the rows have been processed.
    """
    ccache = CompoundCacher()
    # the 'with' block guarantees that the file is closed, even when a row
    # cannot be parsed
    with open(fname, 'r') as fd:
        for row in csv.reader(fd, delimiter='\t'):
            compound_id = re.findall('(C[0-9]+)_10', row[0])[0]
            dG0 = float(row[1])
            comp = ccache.get_compound(compound_id)
            dG0_prime = dG0 + comp.transform_neutral(pH, I, T)
            print('%s\t%f\t%f' % (compound_id, dG0, dG0_prime))
    ccache.dump()
def get_ddG0(rxn_dict, pH, I, novel_mets, T=298.15):
    """
        Sum the transform_pH7() contribution of every compound in a reaction.

        Arguments:
            rxn_dict   - a dictionary mapping compound IDs to stoichiometric
                         coefficients
            pH, I      - passed as-is to each compound's transform_pH7()
            novel_mets - an optional dictionary (or None) of compound objects
                         keyed by compound ID; a compound found here is used
                         instead of the one from the compound cache
            T          - passed as-is to transform_pH7() (default: 298.15,
                         the value that used to be hard-coded)

        Returns the sum over all compounds of
        (coefficient * transform_pH7(pH, I, T)).
    """
    ccache = CompoundCacher()
    ddG0_forward = 0
    for compound_id, coeff in rxn_dict.items():
        if novel_mets is not None and compound_id in novel_mets:
            comp = novel_mets[compound_id]
        else:
            comp = ccache.get_compound(compound_id)
        ddG0_forward += coeff * comp.transform_pH7(pH, I, T)
    return ddG0_forward
def __init__(self, S, cids, rids=None):
    """
        Store a stoichiometric matrix together with its row/column labels.

        Arguments:
            S    - the stoichiometric matrix; must have one row per entry
                   of 'cids' (and one column per entry of 'rids', if given)
            cids - the list of compound IDs labelling the rows of S
            rids - an optional list of reaction IDs labelling the columns

        If 'C00080' (H+) is one of the compounds, its row is removed from S
        and its ID is removed from 'cids'. Note that the list object that
        was passed in is the one being modified.
    """
    self.S = S
    self.cids = cids
    self.rids = rids
    assert len(self.cids) == self.S.shape[0]
    if self.rids is not None:
        assert len(self.rids) == self.S.shape[1]
    self.ccache = CompoundCacher()

    # drop the H+ row (and its label) if the model contains it
    proton_cid = 'C00080'
    if proton_cid in self.cids:
        row = self.cids.index(proton_cid)
        rows_above = self.S[:row, :]
        rows_below = self.S[row + 1:, :]
        self.S = np.vstack((rows_above, rows_below))
        del self.cids[row]
def __init__(self):
    """
        Load all the thermodynamic training observations and arrange them
        as a stoichiometric matrix plus one array per measured quantity.

        After construction:
            S         - matrix with one row per compound (sorted IDs) and
                        one column per observation
            cids      - the sorted compound IDs (rows of S)
            dG0_prime, T, I, pH, pMg, weight - one numpy value per
                        observation
            reference, description - one list entry per observation

        Finally the observations flagged with 'balance' are passed to
        balance_reactions() and reverse_transform() is called.
    """
    self.ccache = CompoundCacher()
    thermo_params, self.cids_that_dont_decompose = \
        TrainingData.get_all_thermo_params()

    cids = set()
    for d in thermo_params:
        cids.update(d['reaction'].keys())
    cids = sorted(cids)

    # Build the compound -> row lookup once; calling cids.index() for every
    # single coefficient is a linear search each time.
    row_of_cid = {cid: i for i, cid in enumerate(cids)}

    # convert the list of reactions in sparse notation into a full
    # stoichiometric matrix, where the rows (compounds) are according to the
    # CID list 'cids'.
    self.S = np.zeros((len(cids), len(thermo_params)))
    for k, d in enumerate(thermo_params):
        for cid, coeff in d['reaction'].items():
            self.S[row_of_cid[cid], k] = coeff

    self.cids = cids

    self.dG0_prime = np.array([d['dG\'0'] for d in thermo_params])
    self.T = np.array([d['T'] for d in thermo_params])
    self.I = np.array([d['I'] for d in thermo_params])
    self.pH = np.array([d['pH'] for d in thermo_params])
    self.pMg = np.array([d['pMg'] for d in thermo_params])
    self.weight = np.array([d['weight'] for d in thermo_params])
    self.reference = [d['reference'] for d in thermo_params]
    self.description = [d['description'] for d in thermo_params]

    rxn_inds_to_balance = [i for i, d in enumerate(thermo_params)
                           if d['balance']]
    self.balance_reactions(rxn_inds_to_balance)
    self.reverse_transform()
def __init__(self, S, cids, rids=None):
    """
        Store a stoichiometric matrix together with its row/column labels.

        Arguments:
            S    - the stoichiometric matrix; must have one row per entry
                   of 'cids' (and one column per entry of 'rids', if given)
            cids - the compound IDs labelling the rows of S
            rids - optional reaction IDs labelling the columns of S
    """
    self.S, self.cids, self.rids = S, cids, rids

    # the labels must agree with the dimensions of the matrix
    assert len(self.cids) == self.S.shape[0]
    if self.rids is not None:
        assert len(self.rids) == self.S.shape[1]

    self.ccache = CompoundCacher.getInstance()
def __init__(self, sparse, arrow='<=>', rid=None):
    """
        Create a reaction from a sparse stoichiometry dictionary.

        Arguments:
            sparse - a dictionary whose values are the stoichiometric
                     coefficients (keyed by compound ID)
            arrow  - the string used as the 'arrow' of the reaction
                     (default: '<=>')
            rid    - an optional reaction ID

        Raises a ValueError if any coefficient is neither an int nor a
        float. Entries whose coefficient is falsy (e.g. 0) are not stored.
    """
    for coeff in sparse.values():
        if not isinstance(coeff, (float, int)):
            raise ValueError(
                'All values in KeggReaction must be integers or floats')

    # keep only the compounds that actually take part in the reaction
    self.sparse = {cid: coeff for cid, coeff in sparse.items() if coeff}
    self.arrow = arrow
    self.rid = rid
    self.ccache = CompoundCacher()
def main(fname, pH, I, T):
    """
        Print the chemical and the transformed formation energy of every
        compound listed in a tab-separated file.

        Arguments:
            fname - path of a tab-separated file; the first column must
                    contain 'C' + digits + '_10' (e.g. 'C00002_10') and the
                    second column the dG0 value
            pH, I, T - passed as-is to the compound's transform_neutral()

        The digits of the ID are converted to an integer and used to fetch
        the compound. For each row a line 'Cxxxxx<TAB>dG0<TAB>dG0_prime' is
        printed. The compound cache is dumped once all the rows have been
        processed.
    """
    ccache = CompoundCacher.getInstance()
    # the 'with' block guarantees that the file is closed, even when a row
    # cannot be parsed
    with open(fname, 'r') as fd:
        for row in csv.reader(fd, delimiter='\t'):
            cid = int(re.findall('C([0-9]+)_10', row[0])[0])
            dG0 = float(row[1])
            comp = ccache.get_kegg_compound(cid)
            dG0_prime = dG0 + comp.transform_neutral(pH, I, T)
            print('C%05d\t%f\t%f' % (cid, dG0, dG0_prime))
    ccache.dump()
def __init__(self, training_data):
    """
        Set up the group-decomposition machinery and keep a reference to
        the training data.

        The group definitions are read from GROUP_CSV (untransformed) and
        are used to create the InChI-to-group-vector converter and the list
        of group names.

        Arguments:
            training_data - stored as-is in self.training_data
    """
    self.ccache = CompoundCacher.getInstance()

    groups_data = GroupsData.FromGroupsFile(GROUP_CSV, transformed=False)
    self.groups_data = groups_data
    self.inchi2gv = InChI2GroupVector(groups_data)
    self.group_names = groups_data.GetGroupNames()

    self.training_data = training_data
def __init__(self, S, cids, rids=None):
    """
        Store a stoichiometric matrix together with its row/column labels.

        Arguments:
            S    - the stoichiometric matrix; must have one row per entry
                   of 'cids' (and one column per entry of 'rids', if given)
            cids - the list of compound IDs labelling the rows of S
            rids - an optional list of reaction IDs labelling the columns

        If 'C00080' (H+) is one of the compounds, its row is removed from S
        and its ID is removed from 'cids'. Note that the list object that
        was passed in is the one being modified.
    """
    self.S = S
    self.cids = cids
    self.rids = rids
    assert len(self.cids) == self.S.shape[0]
    if self.rids is not None:
        assert len(self.rids) == self.S.shape[1]
    self.ccache = CompoundCacher()

    # drop the H+ row (and its label) if the model contains it
    proton_cid = 'C00080'
    if proton_cid in self.cids:
        row = self.cids.index(proton_cid)
        rows_above = self.S[:row, :]
        rows_below = self.S[row + 1:, :]
        self.S = np.vstack((rows_above, rows_below))
        del self.cids[row]
def __init__(self):
    """
        Read the TECRDB, formation and redox training files (located
        relative to the directory of this module) and arrange all the
        observations as a stoichiometric matrix plus one array per measured
        quantity.

        After construction:
            S         - matrix with one row per compound (sorted IDs) and
                        one column per observation
            cids      - the sorted compound IDs (rows of S)
            cids_that_dont_decompose - as returned by read_formations()
            dG0_prime, T, I, pH, pMg, weight - one numpy value per
                        observation

        Finally the observations flagged with 'balance' are passed to
        balance_reactions() and reverse_transform() is called.
    """
    self.ccache = CompoundCacher.getInstance()

    base_path = os.path.split(os.path.realpath(__file__))[0]

    fname, weight = TrainingData.FNAME_DICT['TECRDB']
    fname = os.path.join(base_path, fname)
    tecrdb_params = TrainingData.read_tecrdb(fname, weight)

    fname, weight = TrainingData.FNAME_DICT['FORMATION']
    fname = os.path.join(base_path, fname)
    formation_params, cids_that_dont_decompose = \
        TrainingData.read_formations(fname, weight)

    fname, weight = TrainingData.FNAME_DICT['REDOX']
    fname = os.path.join(base_path, fname)
    redox_params = TrainingData.read_redox(fname, weight)

    thermo_params = tecrdb_params + formation_params + redox_params

    cids = set()
    for d in thermo_params:
        cids.update(d['reaction'].keys())
    cids = sorted(cids)

    # Build the compound -> row lookup once; calling cids.index() for every
    # single coefficient is a linear search each time.
    row_of_cid = {cid: i for i, cid in enumerate(cids)}

    # convert the list of reactions in sparse notation into a full
    # stoichiometric matrix, where the rows (compounds) are according to the
    # CID list 'cids'.
    self.S = np.zeros((len(cids), len(thermo_params)))
    for k, d in enumerate(thermo_params):
        for cid, coeff in d['reaction'].items():
            self.S[row_of_cid[cid], k] = coeff

    self.cids = cids
    self.cids_that_dont_decompose = cids_that_dont_decompose

    self.dG0_prime = np.array([d['dG\'0'] for d in thermo_params])
    self.T = np.array([d['T'] for d in thermo_params])
    self.I = np.array([d['I'] for d in thermo_params])
    self.pH = np.array([d['pH'] for d in thermo_params])
    self.pMg = np.array([d['pMg'] for d in thermo_params])
    self.weight = np.array([d['weight'] for d in thermo_params])

    rxn_inds_to_balance = [i for i, d in enumerate(thermo_params)
                           if d['balance']]
    self.balance_reactions(rxn_inds_to_balance)
    self.reverse_transform()
def __init__(self):
    """
        Read the TECRDB, formation and redox training data and arrange all
        the observations as a stoichiometric matrix plus one array per
        measured quantity.

        After construction:
            S         - matrix with one row per compound (sorted IDs) and
                        one column per observation
            cids      - the sorted compound IDs (rows of S)
            cids_that_dont_decompose - as returned by read_formations()
            dG0_prime, T, I, pH, pMg, weight - one numpy value per
                        observation

        Finally the observations flagged with 'balance' are passed to
        balance_reactions() and reverse_transform() is called.

        Raises an Exception if one of the files listed in
        TrainingData.FNAME_DICT does not exist.
    """
    self.ccache = CompoundCacher.getInstance()

    # verify that the files exist
    for fname, _ in TrainingData.FNAME_DICT.values():
        if not os.path.exists(fname):
            raise Exception('file not found: ' + fname)

    tecrdb_params = TrainingData.read_tecrdb()
    formation_params, cids_that_dont_decompose = \
        TrainingData.read_formations()
    redox_params = TrainingData.read_redox()

    thermo_params = tecrdb_params + formation_params + redox_params

    cids = set()
    for d in thermo_params:
        cids.update(d['reaction'].keys())
    cids = sorted(cids)

    # Build the compound -> row lookup once; calling cids.index() for every
    # single coefficient is a linear search each time.
    row_of_cid = {cid: i for i, cid in enumerate(cids)}

    # convert the list of reactions in sparse notation into a full
    # stoichiometric matrix, where the rows (compounds) are according to the
    # CID list 'cids'.
    self.S = np.zeros((len(cids), len(thermo_params)))
    for k, d in enumerate(thermo_params):
        for cid, coeff in d['reaction'].items():
            self.S[row_of_cid[cid], k] = coeff

    self.cids = cids
    self.cids_that_dont_decompose = cids_that_dont_decompose

    self.dG0_prime = np.array([d['dG\'0'] for d in thermo_params])
    self.T = np.array([d['T'] for d in thermo_params])
    self.I = np.array([d['I'] for d in thermo_params])
    self.pH = np.array([d['pH'] for d in thermo_params])
    self.pMg = np.array([d['pMg'] for d in thermo_params])
    self.weight = np.array([d['weight'] for d in thermo_params])

    rxn_inds_to_balance = [i for i, d in enumerate(thermo_params)
                           if d['balance']]
    self.balance_reactions(rxn_inds_to_balance)
    self.reverse_transform()
def __init__(self):
    """
        Read the TECRDB, formation and redox training data and arrange all
        the observations as a stoichiometric matrix plus one array per
        measured quantity.

        After construction:
            S         - matrix with one row per compound (sorted IDs) and
                        one column per observation
            cids      - the sorted compound IDs (rows of S)
            cids_that_dont_decompose - as returned by read_formations()
            dG0_prime, T, I, pH, pMg, weight - one numpy value per
                        observation

        Finally the observations flagged with 'balance' are passed to
        balance_reactions() and reverse_transform() is called.

        Raises an Exception if one of the files listed in
        TrainingData.FNAME_DICT does not exist.
    """
    self.ccache = CompoundCacher.getInstance()

    # verify that the files exist
    for fname, _ in TrainingData.FNAME_DICT.values():
        if not os.path.exists(fname):
            raise Exception('file not found: ' + fname)

    tecrdb_params = TrainingData.read_tecrdb()
    formation_params, cids_that_dont_decompose = \
        TrainingData.read_formations()
    redox_params = TrainingData.read_redox()

    thermo_params = tecrdb_params + formation_params + redox_params

    cids = set()
    for d in thermo_params:
        cids.update(d['reaction'].keys())
    cids = sorted(cids)

    # Build the compound -> row lookup once; calling cids.index() for every
    # single coefficient is a linear search each time.
    row_of_cid = {cid: i for i, cid in enumerate(cids)}

    # convert the list of reactions in sparse notation into a full
    # stoichiometric matrix, where the rows (compounds) are according to the
    # CID list 'cids'.
    self.S = np.zeros((len(cids), len(thermo_params)))
    for k, d in enumerate(thermo_params):
        for cid, coeff in d['reaction'].items():
            self.S[row_of_cid[cid], k] = coeff

    self.cids = cids
    self.cids_that_dont_decompose = cids_that_dont_decompose

    self.dG0_prime = np.array([d['dG\'0'] for d in thermo_params])
    self.T = np.array([d['T'] for d in thermo_params])
    self.I = np.array([d['I'] for d in thermo_params])
    self.pH = np.array([d['pH'] for d in thermo_params])
    self.pMg = np.array([d['pMg'] for d in thermo_params])
    self.weight = np.array([d['weight'] for d in thermo_params])

    rxn_inds_to_balance = [i for i, d in enumerate(thermo_params)
                           if d['balance']]
    self.balance_reactions(rxn_inds_to_balance)
    self.reverse_transform()
def __init__(self, training_data):
    """
        Set up the group-decomposition machinery and copy the training
        observations into this object.

        Arguments:
            training_data - an object providing the attributes 'cids', 'S',
                            'dG0' and 'weight'

        'dG0' and 'weight' are stored as transposed numpy matrices
        (train_b and train_w). The attributes train_G, train_S_joined,
        model_S_joined and params start out as None.
    """
    self.ccache = CompoundCacher.getInstance()

    groups_data = init_groups_data()
    self.groups_data = groups_data
    self.inchi2gv = InChI2GroupVector(groups_data)
    self.group_names = groups_data.GetGroupNames()

    self.train_cids = training_data.cids
    self.train_S = training_data.S
    self.train_b = np.matrix(training_data.dG0).T
    self.train_w = np.matrix(training_data.weight).T

    # placeholders, not computed here
    self.train_G = self.train_S_joined = None
    self.model_S_joined = self.params = None
def is_balanced(self):
    """
        Check whether the elements are conserved in this reaction.

        The element matrix of the reaction's compounds is multiplied by the
        stoichiometric coefficients; any non-zero entry means that the
        corresponding element is not conserved.

        Returns True if the reaction is balanced, or if the balance cannot
        be tested because the product contains NaN values. Returns False
        (after logging the imbalance of each element at debug level)
        otherwise.
    """
    compound_ids = list(self.keys())
    stoich = np.array([self.sparse[c] for c in compound_ids], ndmin=2).T
    element_names, Ematrix = \
        CompoundCacher.getInstance().get_kegg_ematrix(compound_ids)
    conserved = Ematrix.T * stoich

    if np.any(np.isnan(conserved), 0):
        logging.debug('cannot test reaction balancing because of unspecific '
                      'compound formulas: %s' % self.write_formula())
        return True

    if not np.any(conserved != 0, 0):
        return True

    logging.debug('unbalanced reaction: %s' % self.write_formula())
    for j in np.where(conserved[:, 0])[0].flat:
        surplus = conserved[j, 0]
        logging.debug('there are %d more %s atoms on the right-hand side' %
                      (surplus, element_names[j]))
    return False
def is_balanced(self):
    """
        Check whether the elements are conserved in this reaction.

        The element matrix of the reaction's compounds is multiplied by the
        stoichiometric coefficients; any non-zero entry means that the
        corresponding element is not conserved.

        Returns True if the reaction is balanced, or if the balance cannot
        be tested because the product contains NaN values. Returns False
        (after logging the imbalance of each element at debug level)
        otherwise.
    """
    compound_ids = list(self.keys())
    stoich = np.array([self.sparse[c] for c in compound_ids], ndmin=2).T
    element_names, Ematrix = \
        CompoundCacher.getInstance().get_kegg_ematrix(compound_ids)
    conserved = Ematrix.T * stoich

    if np.any(np.isnan(conserved), 0):
        logging.debug('cannot test reaction balancing because of unspecific '
                      'compound formulas: %s' % self.write_formula())
        return True

    if not np.any(conserved != 0, 0):
        return True

    logging.debug('unbalanced reaction: %s' % self.write_formula())
    for j in np.where(conserved[:, 0])[0].flat:
        surplus = conserved[j, 0]
        logging.debug('there are %d more %s atoms on the right-hand side' %
                      (surplus, element_names[j]))
    return False
def __init__(self, training_data=None):
    """
        Copy the training observations into this object and set up the
        group-decomposition machinery.

        Arguments:
            training_data - an object providing the attributes 'cids', 'S',
                            'dG0' and 'weight'; when None, a new
                            TrainingData() is created

        Note that model_S_joined and train_S_joined initially refer to the
        very same matrix object. train_G and params start out as None.
    """
    training_data = TrainingData() if training_data is None else training_data

    # two independent copies of the compound ID list
    self.train_cids = list(training_data.cids)
    self.cids_joined = list(training_data.cids)

    self.train_S = training_data.S
    joined = np.matrix(self.train_S)
    self.model_S_joined = self.train_S_joined = joined

    self.train_b = np.matrix(training_data.dG0).T
    self.train_w = np.matrix(training_data.weight).T
    self.train_G = self.params = None

    self.ccache = CompoundCacher()
    groups_data = inchi2gv.init_groups_data()
    self.groups_data = groups_data
    self.decomposer = inchi2gv.InChIDecomposer(groups_data)
    self.group_names = groups_data.GetGroupNames()

    self.Nc = len(self.cids_joined)
    self.Ng = len(self.group_names)
import sys
sys.path.append("../python")
import inchi2gv
from compound_cacher import CompoundCacher
from molecule import Molecule

ccache = CompoundCacher("../cache/compounds.json")
groups_data = inchi2gv.init_groups_data()
group_list = groups_data.GetGroupNames()
group_names = groups_data.GetGroupNames()
decomposer = inchi2gv.InChIDecomposer(groups_data)

# Decompose ATP into groups and print every group that occurs in it,
# together with the number of occurrences.
ATP_inchi = ccache.get_compound("C00002").inchi
group_def = decomposer.inchi_to_groupvec(ATP_inchi)
for j, group_name in enumerate(group_names):
    if group_def[j] == 0:
        continue
    print("%s %s" % (group_name, " x %d" % group_def[j]))

patterns = ["c~[O;+0]", "c~[O;+1]", "c~[n;+1]~c", "c~[n;+0]~c", "c~[n;-1]~c"]

# For each of the two test compounds print a separator line, the compound
# ID and the SMILES string obtained from its InChI.
for cid in ["C00255", "C01007"]:
    comp = ccache.get_compound(cid)
    print("%s %s" % ("-" * 50, "\n%s" % cid))
    inchi = comp.inchi
    mol = Molecule.FromInChI(inchi)
    print(mol.ToSmiles())
import sys
sys.path.append('../python')
import inchi2gv
from compound_cacher import CompoundCacher
from molecule import Molecule

ccache = CompoundCacher('../cache/compounds.json')
groups_data = inchi2gv.init_groups_data()
group_list = groups_data.GetGroupNames()
group_names = groups_data.GetGroupNames()
decomposer = inchi2gv.InChIDecomposer(groups_data)

# Decompose ATP into groups and print every group that occurs in it,
# together with the number of occurrences.
ATP_inchi = ccache.get_compound('C00002').inchi
group_def = decomposer.inchi_to_groupvec(ATP_inchi)
for j, group_name in enumerate(group_names):
    if group_def[j] == 0:
        continue
    print('%s %s' % (group_name, ' x %d' % group_def[j]))

patterns = ['c~[O;+0]', 'c~[O;+1]', 'c~[n;+1]~c', 'c~[n;+0]~c', 'c~[n;-1]~c']

# For each of the two test compounds print a separator line, the compound
# ID and the SMILES string obtained from its InChI.
for cid in ['C00255', 'C01007']:
    comp = ccache.get_compound(cid)
    print('%s %s' % ("-" * 50, '\n%s' % cid))
    inchi = comp.inchi
    mol = Molecule.FromInChI(inchi)
    print(mol.ToSmiles())
import sys
import logging
sys.path.append('../python')
from compound import Compound
from inchi2gv import init_groups_data, InChI2GroupVector, GroupDecompositionError
from compound_cacher import CompoundCacher
from molecule import Molecule

ccache = CompoundCacher.getInstance('../cache/compounds.json')
groups_data = init_groups_data()
group_list = groups_data.GetGroupNames()
inchi2gv_converter = InChI2GroupVector(groups_data)

patterns = ['c~[O;+0]', 'c~[O;+1]', 'c~[n;+1]~c', 'c~[n;+0]~c', 'c~[n;-1]~c']

# For each test compound print a separator line with its ID, its SMILES
# string and the matches of one SMARTS pattern, and then try to decompose
# it into groups. A failed decomposition is reported on stderr together
# with its debug table.
for cid in [255, 1007]:
    comp = ccache.get_kegg_compound(cid)
    print('%s %s' % ("-"*50, '\nC%05d' % cid))
    inchi = comp.inchi
    mol = Molecule.FromInChI(inchi)
    print(mol.ToSmiles())
    print(mol.FindSmarts("c~[n;+1]~c"))

    try:
        groupvec = inchi2gv_converter.InChI2GroupVector(inchi)
        sys.stdout.write(str(groupvec) + '\n')
    except GroupDecompositionError as e:
        sys.stderr.write(str(e) + '\n')
        sys.stderr.write(e.GetDebugTable())
class KeggModel(object):
    """
        A stoichiometric model whose compounds are labelled by KEGG IDs.

        Attributes:
            S      - the stoichiometric matrix (one row per compound, one
                     column per reaction)
            cids   - the compound IDs, in the same order as the rows of S
            rids   - the reaction IDs, in the same order as the columns of
                     S (or None)
            ccache - the CompoundCacher used for looking up compound data
    """

    def __del__(self):
        # __init__ may have failed before 'ccache' was assigned (e.g. on a
        # dimension mismatch), so the attribute cannot be taken for granted
        ccache = getattr(self, 'ccache', None)
        if ccache is not None:
            ccache.dump()

    def __init__(self, S, cids, rids=None):
        """
            Arguments:
                S    - the stoichiometric matrix; must have one row per
                       entry of 'cids' (and one column per entry of 'rids',
                       if given)
                cids - the list of compound IDs labelling the rows of S
                rids - an optional list of reaction IDs

            If 'C00080' (H+) is one of the compounds, its row is removed
            from S and its ID is removed from 'cids' (the list object that
            was passed in is the one being modified).
        """
        self.S = S
        self.cids = cids
        self.rids = rids
        assert len(self.cids) == self.S.shape[0]
        if self.rids is not None:
            assert len(self.rids) == self.S.shape[1]
        self.ccache = CompoundCacher()

        # remove H+ from the stoichiometric matrix if it exists
        if 'C00080' in self.cids:
            i = self.cids.index('C00080')
            self.S = np.vstack((self.S[:i, :], self.S[i + 1:, :]))
            self.cids.pop(i)

    @staticmethod
    def from_file(fname, arrow='<=>', format='kegg', has_reaction_ids=False):
        """
            reads a file containing reactions in KEGG format

            Arguments:
                fname            - the filename to read
                arrow            - the string used as the 'arrow' in each
                                   reaction (default: '<=>')
                format           - the text file format provided
                                   ('kegg', 'tsv' or 'csv')
                has_reaction_ids - a boolean flag indicating if there is a
                                   column of reaction IDs (separated from
                                   the reaction with whitespaces)

            Return a KeggModel

            Raises a ValueError if 'format' is not one of the supported
            formats.
        """
        if format not in ('kegg', 'tsv', 'csv'):
            raise ValueError("unsupported file format %r (expected 'kegg', "
                             "'tsv' or 'csv')" % (format,))

        # the 'with' block closes the file even if parsing fails
        with open(fname, 'r') as fd:
            if format == 'kegg':
                return KeggModel.from_formulas(fd.readlines(), arrow,
                                               has_reaction_ids)
            delimiter = '\t' if format == 'tsv' else ','
            return KeggModel.from_csv(fd,
                                      has_reaction_ids=has_reaction_ids,
                                      delimiter=delimiter)

    @staticmethod
    def from_csv(fd, has_reaction_ids=True, delimiter=None):
        """
            reads a stoichiometric matrix from an open delimited text file

            Each row holds a compound ID followed by the coefficients of
            that compound in every reaction. If has_reaction_ids is True,
            the first row is a header whose cells (except the first one)
            are the reaction IDs.

            Arguments:
                fd               - an open file (or any iterable of lines)
                has_reaction_ids - whether the first row is a header
                delimiter        - the column separator; None means ','

            Return a KeggModel
        """
        # csv.reader() does not accept None as a delimiter
        if delimiter is None:
            delimiter = ','
        csv_reader = csv.reader(fd, delimiter=delimiter)
        rids = next(csv_reader)[1:] if has_reaction_ids else None

        S = []
        cids = []
        for row in csv_reader:
            if not row:
                # blank line
                continue
            cids.append(row[0])
            S.append([float(x) for x in row[1:]])
        return KeggModel(np.array(S), cids, rids)

    @staticmethod
    def from_kegg_reactions(kegg_reactions, has_reaction_ids=False):
        """
            builds a model from a list of KeggReaction objects

            Arguments:
                kegg_reactions   - a list of reactions in sparse notation
                has_reaction_ids - whether to take the reaction IDs from
                                   the 'rid' attribute of each reaction

            Return a KeggModel whose compounds are sorted by ID
        """
        rids = [r.rid for r in kegg_reactions] if has_reaction_ids else None

        cids = set()
        for reaction in kegg_reactions:
            cids.update(reaction.keys())

        # convert the list of reactions in sparse notation into a full
        # stoichiometric matrix, where the rows (compounds) are according
        # to the CID list 'cids'.
        cids = sorted(cids)
        S = np.matrix(np.zeros((len(cids), len(kegg_reactions))))
        for i, reaction in enumerate(kegg_reactions):
            S[:, i] = np.matrix(reaction.dense(cids))

        logging.debug(
            'Successfully loaded %d reactions (involving %d unique compounds)' %
            (S.shape[1], S.shape[0]))
        return KeggModel(S, cids, rids)

    @staticmethod
    def from_formulas(reaction_strings, arrow='<=>', has_reaction_ids=False,
                      raise_exception=False):
        """
            parses a list of reactions in KEGG format

            Arguments:
                reaction_strings - a list of reactions in KEGG format
                arrow            - the string used as the 'arrow' in each
                                   reaction (default: '<=>')
                has_reaction_ids - a boolean flag indicating if there is a
                                   column of reaction IDs (separated from
                                   the reaction with whitespaces)
                raise_exception  - if True, a ValueError raised while
                                   processing the reactions is propagated;
                                   otherwise it is logged and None is
                                   returned

            A reaction that cannot be parsed or that is not balanced is
            replaced by an empty reaction, so that the number of reactions
            in the model stays the same.

            Return a KeggModel (or None, see 'raise_exception')
        """
        try:
            reactions = []
            not_balanced_count = 0
            for line in reaction_strings:
                rid = None
                if has_reaction_ids:
                    tokens = re.findall(r'(\w+)\s+(.*)', line.strip())[0]
                    rid = tokens[0]
                    line = tokens[1]
                try:
                    reaction = KeggReaction.parse_formula(line, arrow, rid)
                except KeggParseException as e:
                    logging.warning(str(e))
                    reaction = KeggReaction({})
                if not reaction.is_balanced(fix_water=True,
                                            raise_exception=raise_exception):
                    not_balanced_count += 1
                    logging.warning('Model contains an unbalanced reaction: ' +
                                    line)
                    reaction = KeggReaction({})
                reactions.append(reaction)
                logging.debug('Adding reaction: ' + reaction.write_formula())

            if not_balanced_count > 0:
                warning_str = '%d out of the %d reactions are not chemically balanced' % \
                              (not_balanced_count, len(reaction_strings))
                logging.debug(warning_str)
            return KeggModel.from_kegg_reactions(reactions, has_reaction_ids)

        except ValueError as e:
            if raise_exception:
                # a bare 'raise' keeps the original traceback
                raise
            logging.debug(str(e))
            return None

    def add_thermo(self, cc):
        """
            estimates the dG0 of all the model reactions

            Every column of S is converted to a KeggReaction and the list
            is passed to cc.get_dG0_r_multi(); the two returned values are
            stored in self.dG0 and self.cov_dG0.

            Arguments:
                cc - an object providing get_dG0_r_multi(reactions)
        """
        Nc, Nr = self.S.shape
        reactions = []
        for j in range(Nr):
            sparse = {self.cids[i]: self.S[i, j]
                      for i in range(Nc) if self.S[i, j] != 0}
            reactions.append(KeggReaction(sparse))

        self.dG0, self.cov_dG0 = cc.get_dG0_r_multi(reactions)

    def get_transformed_dG0(self, pH, I, T):
        """
            returns the estimated dG0_prime, the standard deviation of
            each estimate (i.e. a measure for the uncertainty) and a matrix
            computed from the SVD of the covariance (U * sqrt(s) * V).

            add_thermo() must be called first, since it sets the dG0 and
            cov_dG0 attributes used here.
        """
        dG0_prime = self.dG0 + self._get_transform_ddG0(pH=pH, I=I, T=T)
        dG0_std = np.matrix(np.sqrt(np.diag(self.cov_dG0))).T
        U, s, V = np.linalg.svd(self.cov_dG0, full_matrices=True)
        sqrt_Sigma = np.matrix(U) * np.matrix(np.diag(s**0.5)) * np.matrix(V)
        return dG0_prime, dG0_std, sqrt_Sigma

    def _get_transform_ddG0(self, pH, I, T):
        """
            needed in order to calculate the transformed Gibbs energies of
            the model reactions.

            Returns:
                an array (whose length is self.S.shape[1]) with the
                differences between DrG0_prime and DrG0. Therefore, one
                must add this array to the chemical Gibbs energies of
                reaction (DrG0) to get the transformed values
        """
        ddG0_compounds = np.matrix(np.zeros((self.S.shape[0], 1)))
        for i, cid in enumerate(self.cids):
            comp = self.ccache.get_compound(cid)
            ddG0_compounds[i, 0] = comp.transform_pH7(pH, I, T)

        ddG0_forward = np.dot(self.S.T, ddG0_compounds)
        return ddG0_forward

    def check_S_balance(self, fix_water=False):
        """
            sets the coefficients of every unbalanced reaction to 0

            Arguments:
                fix_water - if True, oxygen imbalances are first fixed by
                            adding H2O ('C00001') to the reactions (a row
                            for H2O is appended to S if it is missing)

            Returns self, so that calls can be chained.
        """
        elements, Ematrix = self.ccache.get_element_matrix(self.cids)
        conserved = Ematrix.T * self.S

        if fix_water:
            # This part only looks for imbalanced oxygen and uses extra
            # H2O molecules (on either side of the reaction equation) to
            # balance them. Keep in mind that also the e- balance is
            # affected by the water (and hydrogen is not counted at all).
            if 'C00001' not in self.cids:
                self.S = np.vstack([self.S, np.zeros((1, self.S.shape[1]))])
                self.cids.append('C00001')
                elements, Ematrix = self.ccache.get_element_matrix(self.cids)

            i_h2o = self.cids.index('C00001')
            add_water = -conserved[elements.index('O'), :]
            self.S[i_h2o, :] += add_water
            conserved += Ematrix[i_h2o, :].T * add_water

        rxnFil = np.any(conserved, axis=0)
        unbalanced_ind = np.nonzero(rxnFil)[1]
        # test the number of indices explicitly; comparing an array to []
        # does not tell whether the array is empty
        if unbalanced_ind.size > 0:
            logging.warning('There are (%d) unbalanced reactions in S. '
                            'Setting their coefficients to 0.' %
                            len(unbalanced_ind.flat))
            if self.rids is not None:
                logging.warning(
                    'These are the unbalanced reactions: ' +
                    ', '.join([self.rids[i] for i in unbalanced_ind.flat]))

            self.S[:, unbalanced_ind] = 0
        return self

    def write_reaction_by_index(self, r):
        """
            returns the formula (as written by KeggReaction.write_formula)
            of the reaction in column 'r' of S
        """
        sparse = {cid: self.S[i, r]
                  for i, cid in enumerate(self.cids) if self.S[i, r] != 0}
        if self.rids is not None:
            reaction = KeggReaction(sparse, rid=self.rids[r])
        else:
            reaction = KeggReaction(sparse)
        return reaction.write_formula()

    def get_unidirectional_S(self):
        """
            splits S into its negative and positive parts

            Returns:
                (S_minus, S_plus) - two copies of S, where all the positive
                (respectively negative) coefficients are replaced by 0
        """
        S_plus = np.copy(self.S)
        S_minus = np.copy(self.S)
        S_plus[self.S < 0] = 0
        S_minus[self.S > 0] = 0
        return S_minus, S_plus
# Test that numpy can be imported and its version is rather new try: import numpy if StrictVersion(numpy.__version__) < StrictVersion('1.6.2'): sys.stderr.write('WARNING: your NumPy version is lower than 1.6.2 ' 'and might not work properly. Please upgrade to ' 'a newer version.\n') except ImportError: sys.stderr.write('NumPy is not installed. Please go to http://www.numpy.org ' 'and follow the installation instructions.\n') err_num += 1 try: from compound_cacher import CompoundCacher ccache = CompoundCacher() atp_comp = ccache.get_compound('C00002') assert(smiles_ATP_pH7 == atp_comp.smiles_pH7) except AssertionError: sys.stderr.write('Internal Error: the SMILES string for ATP is wrong.\n') err_num += 1 except Exception as e: sys.stderr.write('Error using Compound Cacher: ' + str(e)) err_num += 1 # Test inchi2gv.py try: import inchi2gv groups_data = inchi2gv.init_groups_data() decomposer = inchi2gv.InChIDecomposer(groups_data) groupvec1 = decomposer.inchi_to_groupvec(inchi_ATP)
'phase': 'aqueous', 'dG0_f': np.round(dG0_f, 2), 'nH': nH, 'z': z, 'nMg': 0 } yield d if __name__ == '__main__': import sys, json logger = logging.getLogger('') logger.setLevel(logging.DEBUG) from compound_cacher import CompoundCacher, CompoundEncoder from molecule import Molecule, OpenBabelError ccache = CompoundCacher(cache_fname=None) for compound_id in ['C00087', 'C00282', 'C00237']: comp = Compound.from_kegg(compound_id) try: mol = Molecule.FromInChI(str(comp.inchi)) sys.stderr.write( '%s : formula = %s, nE = %s' % (str(comp.inchi), mol.GetFormula(), mol.GetNumElectrons())) except OpenBabelError: pass ccache.add(comp) sys.stderr.write( '\ncompound id = %s, nH = %s, z = %s, pKa = %s, bag = %s\n\n\n' % (compound_id, str(comp.nHs), str(comp.zs), str( comp.pKas), str(comp.atom_bag)))
class KeggModel(object):
    """
        A stoichiometric model whose compounds are labelled by KEGG IDs.

        Attributes:
            S      - the stoichiometric matrix (one row per compound, one
                     column per reaction)
            cids   - the compound IDs, in the same order as the rows of S
            rids   - the reaction IDs, in the same order as the columns of
                     S (or None)
            ccache - the CompoundCacher used for looking up compound data
    """

    def __del__(self):
        # __init__ may have failed before 'ccache' was assigned (e.g. on a
        # dimension mismatch), so the attribute cannot be taken for granted
        ccache = getattr(self, 'ccache', None)
        if ccache is not None:
            ccache.dump()

    def __init__(self, S, cids, rids=None):
        """
            Arguments:
                S    - the stoichiometric matrix; must have one row per
                       entry of 'cids' (and one column per entry of 'rids',
                       if given)
                cids - the list of compound IDs labelling the rows of S
                rids - an optional list of reaction IDs

            If 'C00080' (H+) is one of the compounds, its row is removed
            from S and its ID is removed from 'cids' (the list object that
            was passed in is the one being modified).
        """
        self.S = S
        self.cids = cids
        self.rids = rids
        assert len(self.cids) == self.S.shape[0]
        if self.rids is not None:
            assert len(self.rids) == self.S.shape[1]
        self.ccache = CompoundCacher()

        # remove H+ from the stoichiometric matrix if it exists
        if 'C00080' in self.cids:
            i = self.cids.index('C00080')
            self.S = np.vstack((self.S[:i, :], self.S[i + 1:, :]))
            self.cids.pop(i)

    @staticmethod
    def from_file(fname, arrow='<=>', format='kegg', has_reaction_ids=False):
        """
            reads a file containing reactions in KEGG format

            Arguments:
                fname            - the filename to read
                arrow            - the string used as the 'arrow' in each
                                   reaction (default: '<=>')
                format           - the text file format provided
                                   ('kegg', 'tsv' or 'csv')
                has_reaction_ids - a boolean flag indicating if there is a
                                   column of reaction IDs (separated from
                                   the reaction with whitespaces)

            Return a KeggModel

            Raises a ValueError if 'format' is not one of the supported
            formats.
        """
        if format not in ('kegg', 'tsv', 'csv'):
            raise ValueError("unsupported file format %r (expected 'kegg', "
                             "'tsv' or 'csv')" % (format,))

        # the 'with' block closes the file even if parsing fails
        with open(fname, 'r') as fd:
            if format == 'kegg':
                return KeggModel.from_formulas(fd.readlines(), arrow,
                                               has_reaction_ids)
            delimiter = '\t' if format == 'tsv' else ','
            return KeggModel.from_csv(fd,
                                      has_reaction_ids=has_reaction_ids,
                                      delimiter=delimiter)

    @staticmethod
    def from_csv(fd, has_reaction_ids=True, delimiter=None):
        """
            reads a stoichiometric matrix from an open delimited text file

            Each row holds a compound ID followed by the coefficients of
            that compound in every reaction. If has_reaction_ids is True,
            the first row is a header whose cells (except the first one)
            are the reaction IDs.

            Arguments:
                fd               - an open file (or any iterable of lines)
                has_reaction_ids - whether the first row is a header
                delimiter        - the column separator; None means ','

            Return a KeggModel
        """
        # csv.reader() does not accept None as a delimiter
        if delimiter is None:
            delimiter = ','
        csv_reader = csv.reader(fd, delimiter=delimiter)
        rids = next(csv_reader)[1:] if has_reaction_ids else None

        S = []
        cids = []
        for row in csv_reader:
            if not row:
                # blank line
                continue
            cids.append(row[0])
            S.append([float(x) for x in row[1:]])
        return KeggModel(np.array(S), cids, rids)

    @staticmethod
    def from_kegg_reactions(kegg_reactions, has_reaction_ids=False):
        """
            builds a model from a list of KeggReaction objects

            Arguments:
                kegg_reactions   - a list of reactions in sparse notation
                has_reaction_ids - whether to take the reaction IDs from
                                   the 'rid' attribute of each reaction

            Return a KeggModel whose compounds are sorted by ID
        """
        rids = [r.rid for r in kegg_reactions] if has_reaction_ids else None

        cids = set()
        for reaction in kegg_reactions:
            cids.update(reaction.keys())

        # convert the list of reactions in sparse notation into a full
        # stoichiometric matrix, where the rows (compounds) are according
        # to the CID list 'cids'.
        cids = sorted(cids)
        S = np.matrix(np.zeros((len(cids), len(kegg_reactions))))
        for i, reaction in enumerate(kegg_reactions):
            S[:, i] = np.matrix(reaction.dense(cids))

        logging.debug('Successfully loaded %d reactions (involving %d unique compounds)' %
                      (S.shape[1], S.shape[0]))
        return KeggModel(S, cids, rids)

    @staticmethod
    def from_formulas(reaction_strings, arrow='<=>', has_reaction_ids=False,
                      raise_exception=False):
        """
            parses a list of reactions in KEGG format

            Arguments:
                reaction_strings - a list of reactions in KEGG format
                arrow            - the string used as the 'arrow' in each
                                   reaction (default: '<=>')
                has_reaction_ids - a boolean flag indicating if there is a
                                   column of reaction IDs (separated from
                                   the reaction with whitespaces)
                raise_exception  - if True, a ValueError raised while
                                   processing the reactions is propagated;
                                   otherwise it is logged and None is
                                   returned

            A reaction that cannot be parsed or that is not balanced is
            replaced by an empty reaction, so that the number of reactions
            in the model stays the same.

            Return a KeggModel (or None, see 'raise_exception')
        """
        try:
            reactions = []
            not_balanced_count = 0
            for line in reaction_strings:
                rid = None
                if has_reaction_ids:
                    tokens = re.findall(r'(\w+)\s+(.*)', line.strip())[0]
                    rid = tokens[0]
                    line = tokens[1]
                try:
                    reaction = KeggReaction.parse_formula(line, arrow, rid)
                except KeggParseException as e:
                    logging.warning(str(e))
                    reaction = KeggReaction({})
                if not reaction.is_balanced(fix_water=True,
                                            raise_exception=raise_exception):
                    not_balanced_count += 1
                    logging.warning('Model contains an unbalanced reaction: ' +
                                    line)
                    reaction = KeggReaction({})
                reactions.append(reaction)
                logging.debug('Adding reaction: ' + reaction.write_formula())

            if not_balanced_count > 0:
                warning_str = '%d out of the %d reactions are not chemically balanced' % \
                              (not_balanced_count, len(reaction_strings))
                logging.debug(warning_str)
            return KeggModel.from_kegg_reactions(reactions, has_reaction_ids)

        except ValueError as e:
            if raise_exception:
                # a bare 'raise' keeps the original traceback
                raise
            logging.debug(str(e))
            return None

    def add_thermo(self, cc):
        """
            estimates the dG0 of all the model reactions

            Every column of S is converted to a KeggReaction and the list
            is passed to cc.get_dG0_r_multi(); the two returned values are
            stored in self.dG0 and self.cov_dG0.

            Arguments:
                cc - an object providing get_dG0_r_multi(reactions)
        """
        Nc, Nr = self.S.shape
        reactions = []
        for j in range(Nr):
            sparse = {self.cids[i]: self.S[i, j]
                      for i in range(Nc) if self.S[i, j] != 0}
            reactions.append(KeggReaction(sparse))

        self.dG0, self.cov_dG0 = cc.get_dG0_r_multi(reactions)

    def get_transformed_dG0(self, pH, I, T):
        """
            returns the estimated dG0_prime, the standard deviation of
            each estimate (i.e. a measure for the uncertainty) and a matrix
            computed from the SVD of the covariance (U * sqrt(s) * V).

            add_thermo() must be called first, since it sets the dG0 and
            cov_dG0 attributes used here.
        """
        dG0_prime = self.dG0 + self._get_transform_ddG0(pH=pH, I=I, T=T)
        dG0_std = np.matrix(np.sqrt(np.diag(self.cov_dG0))).T
        U, s, V = np.linalg.svd(self.cov_dG0, full_matrices=True)
        sqrt_Sigma = np.matrix(U) * np.matrix(np.diag(s**0.5)) * np.matrix(V)
        return dG0_prime, dG0_std, sqrt_Sigma

    def _get_transform_ddG0(self, pH, I, T):
        """
            needed in order to calculate the transformed Gibbs energies of
            the model reactions.

            Returns:
                an array (whose length is self.S.shape[1]) with the
                differences between DrG0_prime and DrG0. Therefore, one
                must add this array to the chemical Gibbs energies of
                reaction (DrG0) to get the transformed values
        """
        ddG0_compounds = np.matrix(np.zeros((self.S.shape[0], 1)))
        for i, cid in enumerate(self.cids):
            comp = self.ccache.get_compound(cid)
            ddG0_compounds[i, 0] = comp.transform_pH7(pH, I, T)

        ddG0_forward = np.dot(self.S.T, ddG0_compounds)
        return ddG0_forward

    def check_S_balance(self):
        """
            sets the coefficients of every unbalanced reaction to 0

            A reaction is unbalanced if the product of the element matrix
            and its column of S has any non-zero entry.

            Returns self, so that calls can be chained.
        """
        elements, Ematrix = self.ccache.get_element_matrix(self.cids)
        conserved = Ematrix.T * self.S
        rxnFil = np.any(conserved, axis=0)
        unbalanced_ind = np.nonzero(rxnFil)[1]
        # test the number of indices explicitly; comparing an array to []
        # does not tell whether the array is empty
        if unbalanced_ind.size > 0:
            logging.warning('There are (%d) unbalanced reactions in S. '
                            'Setting their coefficients to 0.' %
                            len(unbalanced_ind.flat))
            if self.rids is not None:
                logging.warning('These are the unbalanced reactions: ' +
                                ', '.join([self.rids[i] for i in unbalanced_ind.flat]))

            self.S[:, unbalanced_ind] = 0
        return self

    def write_reaction_by_index(self, r):
        """
            returns the formula (as written by KeggReaction.write_formula)
            of the reaction in column 'r' of S
        """
        sparse = {cid: self.S[i, r]
                  for i, cid in enumerate(self.cids) if self.S[i, r] != 0}
        if self.rids is not None:
            reaction = KeggReaction(sparse, rid=self.rids[r])
        else:
            reaction = KeggReaction(sparse)
        return reaction.write_formula()

    def get_unidirectional_S(self):
        """
            splits S into its negative and positive parts

            Returns:
                (S_minus, S_plus) - two copies of S, where all the positive
                (respectively negative) coefficients are replaced by 0
        """
        S_plus = np.copy(self.S)
        S_minus = np.copy(self.S)
        S_plus[self.S < 0] = 0
        S_minus[self.S > 0] = 0
        return S_minus, S_plus
def __init__(self, S, cids):
    """
        Store a stoichiometric matrix together with its row labels.

        Arguments:
            S    - the stoichiometric matrix; must have one row per entry
                   of 'cids'
            cids - the compound IDs labelling the rows of S
    """
    self.S, self.cids = S, cids

    # the labels must agree with the number of rows in the matrix
    assert len(self.cids) == self.S.shape[0]

    self.ccache = CompoundCacher.getInstance()
# -*- coding: utf-8 -*-
"""
Rebuild the compound JSON and refresh the cache entry of a single compound.

Usage: pass the compound ID as the first command-line argument.

Created on Thu Aug 7 21:00:31 2014

@author: eladn
"""
import sys
from compound_cacher import CompoundCacher

# fail with a usage message rather than a bare IndexError
if len(sys.argv) < 2:
    sys.stderr.write('usage: %s COMPOUND_ID\n' % sys.argv[0])
    sys.exit(1)

compound_id = sys.argv[1]

CompoundCacher.RebuildCompoundJSON()
ccache = CompoundCacher()
ccache.remove(compound_id)
# NOTE(review): fetching the compound right after removing it presumably
# makes the cache re-create the entry - confirm against CompoundCacher
comp = ccache.get_compound(compound_id)
ccache.dump()
class ComponentContribution(object):
    """
        Estimates Gibbs energies by combining two weighted linear regressions
        that are both fitted in train(): one on the stoichiometric matrix
        (the "reactant layer", RC) and one on the group incidence matrix
        (the "group layer", GC).

        All results of the fit are kept in the dictionary 'self.params',
        which is None until train() has run or a cache file was loaded.
    """

    def __init__(self, training_data=None):
        """
            Arguments:
                training_data - an object providing the attributes 'cids',
                                'S', 'dG0' and 'weight'. If None, a new
                                TrainingData() is created.
        """
        if training_data is None:
            training_data = TrainingData()

        # two independent copies: 'cids_joined' grows when a model is added
        # (see estimate_kegg_model) while 'train_cids' never changes
        self.train_cids = list(training_data.cids)
        self.cids_joined = list(training_data.cids)

        self.train_S = training_data.S
        self.model_S_joined = np.matrix(self.train_S)
        self.train_S_joined = self.model_S_joined

        self.train_b = np.matrix(training_data.dG0).T
        self.train_w = np.matrix(training_data.weight).T
        self.train_G = None
        self.params = None

        self.ccache = CompoundCacher()
        self.groups_data = inchi2gv.init_groups_data()
        self.decomposer = inchi2gv.InChIDecomposer(self.groups_data)
        self.group_names = self.groups_data.GetGroupNames()

        self.Nc = len(self.cids_joined)
        self.Ng = len(self.group_names)

    @staticmethod
    def init():
        """
            Returns a ComponentContribution object, loading the parameters
            from CC_CACHE_FNAME if that file exists. Otherwise the model is
            trained from the raw data and the result is written to
            CC_CACHE_FNAME.
        """
        if os.path.exists(CC_CACHE_FNAME):
            logging.debug('Loading component-contributions from cache')
            return ComponentContribution.from_matfile(CC_CACHE_FNAME)
        else:
            logging.debug('Calculating the component-contributions from raw data')
            cc = ComponentContribution()
            cc.save_matfile(CC_CACHE_FNAME)
            return cc

    def save_matfile(self, file_name):
        """
            Writes 'self.params' to 'file_name' using savemat
            (training first if it was not done yet).
        """
        if self.params is None:
            self.train()

        savemat(file_name, self.params, oned_as='row')

    @staticmethod
    def from_matfile(file_name, training_data=None):
        """
            Creates a ComponentContribution object whose 'params' are read
            from 'file_name' using loadmat, instead of being trained.
        """
        cc = ComponentContribution(training_data=training_data)
        cc.params = loadmat(file_name)
        return cc

    def get_major_ms_dG0_f(self, compound_id):
        """
            Returns the chemical formation energy of the major MS at pH 7.
            If the compound is part of the training set, returns the value
            that was calculated during training. Otherwise, we use pure
            group contribution (if possible) on the groups of the major MS.

            Returns NaN if the compound cannot be decomposed into groups.
            Raises ValueError if compound_id is None.
        """
        if compound_id is None:
            raise ValueError('given compound ID is None')
        if self.params is None:
            self.train()

        if compound_id in self.cids_joined:
            i = self.cids_joined.index(compound_id)
            return self.params['dG0_cc'][i, 0]
        else:
            # Decompose the compound and calculate the 'formation energy'
            # using the group contributions.
            # Note that the length of the group contribution vector we get
            # from CC is longer than the number of groups in "groups_data"
            # since we artificially added fictive groups to represent all the
            # non-decomposable compounds. Therefore, we truncate the
            # dG0_gc vector since here we only use GC for compounds which
            # are not in cids_joined anyway.
            comp = self.ccache.get_compound(compound_id)
            try:
                group_vec = self.decomposer.smiles_to_groupvec(comp.smiles_pH7)
                g = np.matrix(group_vec.ToArray())
                dG0_gc = self.params['dG0_gc'][0:self.Ng, :]
                return float(np.dot(g, dG0_gc))
            except inchi2gv.GroupDecompositionError:
                return np.nan

    def _decompose_reaction(self, reaction):
        """
            Splits a reaction into the part that is covered by the compounds
            in 'cids_joined' and the part that must be described by groups.

            Arguments:
                reaction - an object with an iteritems() method yielding
                           (compound ID, coefficient) pairs

            Returns:
                (x, g) where x is an (Nc x 1) matrix of the coefficients of
                the known compounds, and g is a column matrix with one entry
                per column of params['G'] holding the summed group vectors
                of all other compounds (weighted by their coefficients).

            A GroupDecompositionError raised by the decomposer is not caught
            here.
        """
        if self.params is None:
            self.train()

        cids = list(self.params['cids'])
        G = self.params['G']

        # calculate the reaction stoichiometric vector and the group incidence
        # vector (x and g)
        x = np.matrix(np.zeros((self.Nc, 1)))
        x_prime = []
        G_prime = []

        for compound_id, coeff in reaction.iteritems():
            if compound_id in self.cids_joined:
                i = cids.index(compound_id)
                x[i, 0] = coeff
            else:
                # This compound is unknown to the model, so it can only be
                # represented by the groups of its major MS at pH 7.
                x_prime.append(coeff)
                comp = self.ccache.get_compound(compound_id)
                group_vec = self.decomposer.smiles_to_groupvec(comp.smiles_pH7)
                G_prime.append(group_vec.ToArray())

        if x_prime != []:
            g = np.matrix(x_prime) * np.vstack(G_prime)
        else:
            g = np.matrix(np.zeros((1, 1)))

        # The G matrix has extra columns (the fictive groups of the
        # non-decomposable compounds), so the in-place resize pads g with
        # zeros up to that length and turns it into a column.
        g.resize((G.shape[1], 1))

        return x, g

    def get_dG0_r(self, reaction, include_analysis=False):
        """
            Arguments:
                reaction         - a KeggReaction object
                include_analysis - if True, also return the contribution of
                                   each training observation

            Returns:
                (dG0, sigma) - the CC estimation for this reaction's
                untransformed dG0 (i.e. using the major MS at pH 7 for each
                of the reactants) and the square root of its estimated
                variance. If include_analysis is True, a third value is
                returned: a list of dictionaries (keys: 'index', 'w_rc',
                'w_gc', 'reaction', 'count'), sorted by decreasing absolute
                weight.

                If one of the compounds cannot be decomposed into groups,
                the estimate is 0 with an uncertainty of 1e5 (and an empty
                analysis).
        """
        try:
            x, g = self._decompose_reaction(reaction)
        except inchi2gv.GroupDecompositionError:
            if not include_analysis:
                return 0, 1e5
            else:
                return 0, 1e5, []

        v_r = np.matrix(self.params['preprocess_v_r'])
        v_g = np.matrix(self.params['preprocess_v_g'])
        C1 = np.matrix(self.params['preprocess_C1'])
        C2 = np.matrix(self.params['preprocess_C2'])
        C3 = np.matrix(self.params['preprocess_C3'])

        dG0_cc = float(x.T * v_r + g.T * v_g)
        s_cc_sqr = float(x.T * C1 * x + 2 * x.T * C2 * g + g.T * C3 * g)

        if not include_analysis:
            return dG0_cc, np.sqrt(s_cc_sqr)
        else:
            # Analyse the contribution of each training observation to this
            # reaction's dG0 estimate.
            G1 = np.matrix(self.params['preprocess_G1'])
            G2 = np.matrix(self.params['preprocess_G2'])
            G3 = np.matrix(self.params['preprocess_G3'])
            S = np.matrix(self.params['preprocess_S'])
            S_count = np.matrix(self.params['preprocess_S_count'])
            cids = self.params['cids']

            # dG0_cc = (x*G1 + x*G2 + g*G3)*b
            weights_rc = (x.T * G1).round(5)
            weights_gc = (x.T * G2 + g.T * G3).round(5)
            weights = weights_rc + weights_gc

            orders = sorted(range(weights.shape[1]),
                            key=lambda j: abs(weights[0, j]), reverse=True)

            analysis = []
            for j in orders:
                if abs(weights[0, j]) < 1e-5:
                    continue
                r = KeggReaction({cids[i]: S[i, j]
                                  for i in range(S.shape[0])
                                  if S[i, j] != 0})
                analysis.append({'index': j,
                                 'w_rc': weights_rc[0, j],
                                 'w_gc': weights_gc[0, j],
                                 'reaction': r,
                                 'count': int(S_count[0, j])})

            return dG0_cc, np.sqrt(s_cc_sqr), analysis

    def get_dG0_r_multi(self, reactions):
        """
            Arguments:
                reactions - an iterable of KeggReaction objects

            Returns:
                (dG0, U) - a column matrix with the CC estimation of the
                untransformed dG0 of each reaction (i.e. using the major MS
                at pH 7 for each of the reactants), and the matrix
                U = X'*C1*X + X'*C2*G + G'*C2'*X + G'*C3*G computed from the
                uncertainty matrices stored by train().

                NOTE: a reaction that cannot be decomposed into groups is
                replaced by an all-zero reaction, i.e. its estimate and its
                row/column in U are 0 (unlike get_dG0_r, which reports an
                uncertainty of 1e5 in that case).
        """
        # make sure 'params' exists even if 'reactions' is empty or the
        # first decomposition fails before training was triggered
        if self.params is None:
            self.train()

        X = []
        G = []
        for reaction in reactions:
            try:
                x, g = self._decompose_reaction(reaction)
            except inchi2gv.GroupDecompositionError:
                x = np.zeros((self.Nc, 1))
                g = np.zeros((self.params['G'].shape[1], 1))
            X.append(list(x.flat))
            G.append(list(g.flat))
        X = np.matrix(X).T
        G = np.matrix(G).T

        v_r = np.matrix(self.params['preprocess_v_r'])
        v_g = np.matrix(self.params['preprocess_v_g'])
        C1 = np.matrix(self.params['preprocess_C1'])
        C2 = np.matrix(self.params['preprocess_C2'])
        C3 = np.matrix(self.params['preprocess_C3'])

        dG0_cc = X.T * v_r + G.T * v_g
        U = X.T * C1 * X + X.T * C2 * G + G.T * C2.T * X + G.T * C3 * G
        return dG0_cc, U

    def get_compound_json(self, compound_id):
        """
            adds the component-contribution estimation to the JSON

            Returns a dictionary describing the compound. 'CID' and
            'num_electrons' are always present; 'compound_index',
            'group_vector', 'pmap', 'error', 'InChI', 'mass' and 'formula'
            are added only when the corresponding data is available.

            Raises ValueError if compound_id is None.
        """
        if compound_id is None:
            raise ValueError('given compound ID is None')
        if self.params is None:
            self.train()

        d = {'CID': compound_id}
        comp = self.ccache.get_compound(compound_id)
        gv = None

        if compound_id in self.cids_joined:
            i = self.cids_joined.index(compound_id)
            gv = self.params['G'][i, :]
            major_ms_dG0_f = self.params['dG0_cc'][i, 0]
            d['compound_index'] = i
        elif comp.smiles_pH7 is not None:
            # decompose the compounds in the training_data and add to G
            try:
                group_def = self.decomposer.smiles_to_groupvec(comp.smiles_pH7)
                gv = np.matrix(group_def.ToArray())
                # we need to truncate the dG0_gc matrix from all the group
                # dimensions that correspond to non-decomposable compounds
                # from the training set
                dG0_gc = self.params['dG0_gc'][0:self.Ng, :]
                major_ms_dG0_f = float(np.dot(gv, dG0_gc))
            except inchi2gv.GroupDecompositionError:
                d['error'] = ('We cannot estimate the formation energy of this compound '
                              'because its structure is too small or too complex to '
                              'decompose to groups')
                major_ms_dG0_f = np.nan
        else:
            d['error'] = ('We cannot estimate the formation energy of this compound '
                          'because it has no defined structure')
            major_ms_dG0_f = np.nan

        if gv is not None:
            # a real list of (index, value) pairs; a lazy filter object
            # (which is what filter() returns on Python 3) cannot be
            # serialized or iterated more than once
            d['group_vector'] = [(j, v) for j, v in enumerate(gv.flat)
                                 if v != 0]

        if not np.isnan(major_ms_dG0_f):
            d['pmap'] = {'source': 'Component Contribution (2013)',
                         'species': list(comp.get_species(major_ms_dG0_f,
                                                          default_T))}

        d['num_electrons'] = comp.atom_bag.get('e-', 0)

        if comp.inchi is not None:
            d['InChI'] = comp.inchi
            try:
                mol = Molecule.FromInChI(str(comp.inchi))
                d['mass'] = mol.GetExactMass()
                d['formula'] = mol.GetFormula()
            except OpenBabelError:
                if compound_id == 'C00282':  # an exception for hydrogen
                    d['mass'] = 2.0157
                    d['formula'] = 'H2'
                else:
                    d['mass'] = 0
                    d['formula'] = ''

        return d

    def estimate_kegg_model(self, model_S, model_cids):
        """
            Trains the model on the union of the training compounds and the
            compounds of the given model, and estimates the model reactions.

            Arguments:
                model_S    - the stoichiometric matrix of the model
                             (rows correspond to 'model_cids')
                model_cids - the list of compound IDs of the model

            Returns:
                (model_dG0, model_cov_dG0, MSE_kerG)
        """
        # standardize the CID list of the training data and the model
        # and create new (larger) matrices for each one.
        # Only IDs that are not yet in 'cids_joined' are appended (each one
        # once), so calling this method again does not create duplicates.
        known_cids = set(self.cids_joined)
        cids_new = []
        for cid in model_cids:
            if cid not in known_cids:
                known_cids.add(cid)
                cids_new.append(cid)

        self.cids_joined += cids_new
        self.Nc = len(self.cids_joined)
        self.model_S_joined = ComponentContribution._zero_pad_S(
            model_S, model_cids, self.cids_joined)
        self.train_S_joined = ComponentContribution._zero_pad_S(
            self.train_S, self.train_cids, self.cids_joined)

        self.train()

        dG0_cc = self.params['dG0_cc']
        cov_dG0 = self.params['cov_dG0']
        MSE_kerG = self.params['MSE_kerG']

        model_dG0 = self.model_S_joined.T * dG0_cc
        model_cov_dG0 = self.model_S_joined.T * cov_dG0 * self.model_S_joined

        return model_dG0, model_cov_dG0, MSE_kerG

    def create_group_incidence_matrix(self):
        """
            Initialize G matrix, and then use the python script "inchi2gv.py"
            to decompose each of the compounds that has an InChI and save the
            decomposition as a row in the G matrix.

            Returns:
                a matrix with Nc rows. The first Ng columns are the groups,
                followed by one extra column for every compound that could
                not be decomposed.
        """
        G = np.zeros((self.Nc, self.Ng))
        cpd_inds_without_gv = []

        # decompose the compounds in the training_data and add to G
        for i, compound_id in enumerate(self.cids_joined):
            smiles_pH7 = self.ccache.get_compound(compound_id).smiles_pH7
            try:
                group_def = self.decomposer.smiles_to_groupvec(smiles_pH7)
                for j in range(len(self.group_names)):
                    G[i, j] = group_def[j]
            except inchi2gv.GroupDecompositionError:
                # for compounds that have no InChI or are not decomposable
                # add a unique 1 in a new column
                cpd_inds_without_gv.append(i)

        N_non_decomposable = len(cpd_inds_without_gv)
        add_G = np.zeros((self.Nc, N_non_decomposable))
        for j, i in enumerate(cpd_inds_without_gv):
            add_G[i, j] = 1
        return np.matrix(np.hstack([G, add_G]))

    def train(self):
        """
            Estimate standard Gibbs energies of formation

            Rebuilds the group incidence matrix, runs both regressions and
            stores every intermediate and final result in 'self.params'.
        """
        self.train_G = self.create_group_incidence_matrix()

        S = self.train_S_joined
        G = self.train_G
        b = self.train_b
        w = self.train_w

        m, n = S.shape
        assert G.shape[0] == m
        assert b.shape == (n, 1)
        assert w.shape == (n, 1)

        # Apply weighing
        W = np.diag(w.flat)
        GS = G.T * S

        # Linear regression for the reactant layer (aka RC)
        inv_S, r_rc, P_R_rc, P_N_rc = ComponentContribution._invert_project(S * W)

        # Linear regression for the group layer (aka GC)
        inv_GS, r_gc, P_R_gc, P_N_gc = ComponentContribution._invert_project(GS * W)

        # calculate the group contributions
        dG0_gc = inv_GS.T * W * b

        # Calculate the contributions in the stoichiometric space
        dG0_rc = inv_S.T * W * b
        dG0_cc = P_R_rc * dG0_rc + P_N_rc * G * dG0_gc

        # Calculate the residual error (weighted squared error divided by
        # N - rank)
        e_rc = (S.T * dG0_rc - b)
        MSE_rc = float((e_rc.T * W * e_rc) / (n - r_rc))

        e_gc = (GS.T * dG0_gc - b)
        MSE_gc = float((e_gc.T * W * e_gc) / (n - r_gc))

        # Calculate the MSE of GC residuals for all reactions in ker(G).
        # This will help later to give an estimate of the uncertainty for such
        # reactions, which otherwise would have a 0 uncertainty in the GC method.
        kerG_inds = list(np.where(np.all(GS == 0, 0))[1].flat)

        e_kerG = e_gc[kerG_inds]
        MSE_kerG = float((e_kerG.T * e_kerG) / len(kerG_inds))

        MSE_inf = 1e10

        # Calculate the uncertainty covariance matrices
        inv_SWS, _, _, _ = ComponentContribution._invert_project(S * W * S.T)
        inv_GSWGS, _, _, _ = ComponentContribution._invert_project(GS * W * GS.T)

        V_rc = P_R_rc * inv_SWS * P_R_rc
        V_gc = P_N_rc * G * inv_GSWGS * G.T * P_N_rc
        V_inf = P_N_rc * G * P_N_gc * G.T * P_N_rc

        # Calculate the total of the contributions and covariances
        cov_dG0 = V_rc * MSE_rc + V_gc * MSE_gc + V_inf * MSE_inf

        # preprocessing matrices (for calculating the contribution of each
        # observation)
        G1 = P_R_rc * inv_S.T * W
        G2 = P_N_rc * G * inv_GS.T * W
        G3 = inv_GS.T * W

        S_uniq, P_col = ComponentContribution._col_uniq(S)
        S_counter = np.sum(P_col, 0)
        preprocess_G1 = G1 * P_col
        preprocess_G2 = G2 * P_col
        preprocess_G3 = G3 * P_col

        # preprocessing matrices (for quick calculation of uncertainty)
        preprocess_C1 = cov_dG0
        preprocess_C2 = MSE_gc * P_N_rc * G * inv_GSWGS + MSE_inf * G * P_N_gc
        preprocess_C3 = MSE_gc * inv_GSWGS + MSE_inf * P_N_gc

        # Put all the calculated data in 'params' for the sake of debugging
        self.params = {'b': self.train_b,
                       'train_S': self.train_S_joined,
                       'model_S': self.model_S_joined,
                       'train_cids': self.train_cids,
                       'cids': self.cids_joined,
                       'w': self.train_w,
                       'G': self.train_G,
                       'dG0_rc': dG0_rc,
                       'dG0_gc': dG0_gc,
                       'dG0_cc': dG0_cc,
                       'cov_dG0': cov_dG0,
                       'V_rc': V_rc,
                       'V_gc': V_gc,
                       'V_inf': V_inf,
                       'MSE_rc': MSE_rc,
                       'MSE_gc': MSE_gc,
                       'MSE_kerG': MSE_kerG,
                       'MSE_inf': MSE_inf,
                       'P_R_rc': P_R_rc,
                       'P_R_gc': P_R_gc,
                       'P_N_rc': P_N_rc,
                       'P_N_gc': P_N_gc,
                       'inv_S': inv_S,
                       'inv_GS': inv_GS,
                       'inv_SWS': inv_SWS,
                       'inv_GSWGS': inv_GSWGS,
                       'preprocess_v_r': dG0_cc,
                       'preprocess_v_g': dG0_gc,
                       'G1': G1,
                       'G2': G2,
                       'G3': G3,
                       'preprocess_G1': preprocess_G1,
                       'preprocess_G2': preprocess_G2,
                       'preprocess_G3': preprocess_G3,
                       'preprocess_S': S_uniq,
                       'preprocess_S_count': S_counter,
                       'preprocess_C1': preprocess_C1,
                       'preprocess_C2': preprocess_C2,
                       'preprocess_C3': preprocess_C3}

    @staticmethod
    def _zero_pad_S(S, cids_orig, cids_joined):
        """
            takes a stoichiometric matrix with a given list of IDs 'cids' and
            adds 0-rows so that the list of IDs will be 'cids_joined'

            Returns a new matrix with len(cids_joined) rows and the same
            number of columns as S. Raises an Exception if 'cids_orig'
            contains an ID which is not in 'cids_joined'.
        """
        if not set(cids_orig).issubset(cids_joined):
            raise Exception('The full list is missing some IDs in "cids"')

        # map each ID to its first position in 'cids_joined' (this is what
        # list.index() returns, without scanning the list for every row)
        row_of = {}
        for row, cid in enumerate(cids_joined):
            row_of.setdefault(cid, row)

        full_S = np.zeros((len(cids_joined), S.shape[1]))
        for i, cid in enumerate(cids_orig):
            full_S[row_of[cid], :] = S[i, :]

        return np.matrix(full_S)

    @staticmethod
    def _invert_project(A, eps=1e-10):
        """
            Uses the decomposition returned by LINALG.svd(A) to calculate an
            inverse of A, its rank and two projection matrices.

            Arguments:
                A   - the matrix to decompose
                eps - entries of the decomposition's middle factor which are
                      not larger than this value are not counted in the rank

            Returns:
                (inv_A, r, P_R, P_N) where P_R is built from the first r
                columns of U and P_N from the remaining ones.
        """
        # NOTE(review): the exact shapes/types returned by LINALG.svd are
        # defined outside this class - confirm before changing this code
        U, S, V = LINALG.svd(A)
        inv_A = V * np.linalg.pinv(S) * U.T

        r = (S > eps).sum()
        P_R = U[:, :r] * U[:, :r].T
        P_N = U[:, r:] * U[:, r:].T

        return inv_A, r, P_R, P_N

    @staticmethod
    def _row_uniq(A):
        """
            A procedure usually performed before linear regression (i.e. solving Ax = y).
            If the matrix A contains repeating rows, it is advisable to combine
            all of them to one row, and the observed value corresponding to that
            row will be the average of the original observations.

            Input:
                A - a 2D NumPy array

            Returns:
                A_unique, P_row

                where A_unique has the same number of columns as A, but with
                unique rows (sorted in descending order).
                P_row is a matrix that can be used to map the original rows
                to the ones in A_unique (all values in P_row are 0 or 1).
                Its shape is (number of unique rows) x (number of rows in A).
        """
        # convert the rows of A into tuples so we can compare them
        A_tuples = [tuple(A[i, :].flat) for i in range(A.shape[0])]
        A_unique = sorted(set(A_tuples), reverse=True)

        # position of every unique row, so that each original row is located
        # with one dictionary lookup instead of a scan over A_unique
        unique_index = {tup: i for i, tup in enumerate(A_unique)}

        # create the projection matrix that maps the rows in A to rows in
        # A_unique
        P_col = np.matrix(np.zeros((len(A_unique), len(A_tuples))))

        for j, tup in enumerate(A_tuples):
            # mark the unique row which corresponds to this original row
            P_col[unique_index[tup], j] = 1

        return np.matrix(A_unique), P_col

    @staticmethod
    def _col_uniq(A):
        """
            The same as _row_uniq, but for the columns of A (both returned
            matrices are transposed accordingly).
        """
        A_unique, P_col = ComponentContribution._row_uniq(A.T)
        return A_unique.T, P_col.T
class TrainingData(object):
    """
        Collects the measurements listed in FNAME_DICT into a stoichiometric
        matrix (self.S, compounds x reactions) and matching per-reaction
        arrays (dG0_prime, T, I, pH, pMg, weight, reference, description).
        After construction, 'self.dG0' holds the values obtained by
        subtracting the reverse transform from 'self.dG0_prime'.
    """

    # a dictionary of the filenames of the training data and the relative
    # weight of each one
    FNAME_DICT = {'TECRDB': ('../data/TECRDB.tsv', 1.0),
                  'FORMATION': ('../data/formation_energies_transformed.tsv', 1.0),
                  'REDOX': ('../data/redox.tsv', 1.0)}

    def __del__(self):
        # __init__ may have failed before 'ccache' was assigned, in which
        # case there is nothing to dump
        ccache = getattr(self, 'ccache', None)
        if ccache is not None:
            ccache.dump()

    def __init__(self):
        self.ccache = CompoundCacher()

        thermo_params, self.cids_that_dont_decompose = \
            TrainingData.get_all_thermo_params()

        cids = set()
        for d in thermo_params:
            cids.update(d['reaction'].keys())
        cids = sorted(cids)

        # convert the list of reactions in sparse notation into a full
        # stoichiometric matrix, where the rows (compounds) are according to the
        # CID list 'cids'.
        row_of = {cid: i for i, cid in enumerate(cids)}
        self.S = np.zeros((len(cids), len(thermo_params)))
        for k, d in enumerate(thermo_params):
            for cid, coeff in d['reaction'].iteritems():
                self.S[row_of[cid], k] = coeff

        self.cids = cids

        self.dG0_prime = np.array([d['dG\'0'] for d in thermo_params])
        self.T = np.array([d['T'] for d in thermo_params])
        self.I = np.array([d['I'] for d in thermo_params])
        self.pH = np.array([d['pH'] for d in thermo_params])
        self.pMg = np.array([d['pMg'] for d in thermo_params])
        self.weight = np.array([d['weight'] for d in thermo_params])
        self.reference = [d['reference'] for d in thermo_params]
        self.description = [d['description'] for d in thermo_params]

        rxn_inds_to_balance = [i for i, d in enumerate(thermo_params)
                               if d['balance']]

        self.balance_reactions(rxn_inds_to_balance)

        self.reverse_transform()

    def savemat(self, fname):
        """ Writes the per-reaction arrays and the CID list to 'fname' """
        d = {'dG0_prime': self.dG0_prime,
             'dG0': self.dG0,
             'T': self.T,
             'I': self.I,
             'pH': self.pH,
             'pMg': self.pMg,
             'weight': self.weight,
             'cids': self.cids}
        # inside a method, the name 'savemat' refers to the module-level
        # function, not to this method
        savemat(fname, d, oned_as='row')

    def savecsv(self, fname):
        """ Writes one CSV row per reaction (formula and conditions) """
        # the 'with' block guarantees that the file is flushed and closed
        with open(fname, 'w') as csv_file:
            csv_output = csv.writer(csv_file)
            csv_output.writerow(['reaction', 'T', 'I', 'pH', 'reference',
                                 'dG0', 'dG0_prime'])
            for j in range(self.S.shape[1]):
                sparse = {self.cids[i]: self.S[i, j]
                          for i in range(self.S.shape[0])}
                r_string = KeggReaction(sparse).write_formula()
                csv_output.writerow([r_string, self.T[j], self.I[j],
                                     self.pH[j], self.reference[j],
                                     self.dG0[j], self.dG0_prime[j]])

    @staticmethod
    def str2double(s):
        """
            casts a string to float, but if the string is empty return NaN
        """
        if s == '':
            return np.nan
        else:
            return float(s)

    @staticmethod
    def read_tecrdb(fname, weight):
        """
            Read the raw data of TECRDB (NIST)

            Rows which are empty or have no value for K', T or pH are
            skipped.

            Returns a list of dictionaries with the keys: reaction, dG'0, T,
            I, pH, pMg, weight, balance, reference, description.
        """
        thermo_params = []

        headers = ["URL", "REF_ID", "METHOD", "EVAL", "EC", "ENZYME NAME",
                   "REACTION IN KEGG IDS", "REACTION IN COMPOUND NAMES",
                   "K", "K'", "T", "I", "pH", "pMg"]

        with open(fname, 'r') as tsv_file:
            for row_list in csv.reader(tsv_file, delimiter='\t'):
                if row_list == []:
                    continue
                row = dict(zip(headers, row_list))
                if (row['K\''] == '') or (row['T'] == '') or (row['pH'] == ''):
                    continue

                # parse the reaction
                reaction = KeggReaction.parse_formula(
                    row['REACTION IN KEGG IDS'], arrow='=')

                # calculate dG'0
                dG0_prime = -R * TrainingData.str2double(row['T']) * \
                    np.log(TrainingData.str2double(row['K\'']))
                try:
                    thermo_params.append(
                        {'reaction': reaction,
                         'dG\'0': dG0_prime,
                         'T': TrainingData.str2double(row['T']),
                         'I': TrainingData.str2double(row['I']),
                         'pH': TrainingData.str2double(row['pH']),
                         'pMg': TrainingData.str2double(row['pMg']),
                         'weight': weight,
                         'balance': True,
                         'reference': row['REF_ID'],
                         'description': row['REACTION IN COMPOUND NAMES']})
                except ValueError:
                    raise Exception('Cannot parse row: ' + str(row))

        logging.debug('Successfully added %d reactions from TECRDB' % len(thermo_params))
        return thermo_params

    @staticmethod
    def read_formations(fname, weight):
        """
            Read the Formation Energy data

            Returns:
                (thermo_params, cids_that_dont_decompose) - a list of
                dictionaries (same keys as in read_tecrdb) for the rows that
                have a dG'0 value, and the set of CIDs whose 'decompose'
                field is 0.
        """
        thermo_params = []
        cids_that_dont_decompose = set()

        # fields are: cid, name, dG'0, pH, I, pMg, T, decompose?,
        #             compound_ref, remark
        with open(fname, 'r') as tsv_file:
            for row in csv.DictReader(tsv_file, delimiter='\t'):
                if int(row['decompose']) == 0:
                    cids_that_dont_decompose.add(row['cid'])
                if row['dG\'0'] != '':
                    rxn = KeggReaction({row['cid']: 1})
                    thermo_params.append(
                        {'reaction': rxn,
                         'dG\'0': TrainingData.str2double(row['dG\'0']),
                         'T': TrainingData.str2double(row['T']),
                         'I': TrainingData.str2double(row['I']),
                         'pH': TrainingData.str2double(row['pH']),
                         'pMg': TrainingData.str2double(row['pMg']),
                         'weight': weight,
                         'balance': False,
                         'reference': row['compound_ref'],
                         'description': row['name'] + ' formation'})

        logging.debug('Successfully added %d formation energies' % len(thermo_params))
        return thermo_params, cids_that_dont_decompose

    @staticmethod
    def read_redox(fname, weight):
        """
            Read the Reduction potential data

            Each row is converted to the reaction CID_ox -> CID_red, and its
            dG'0 is -F * E'0 * delta_e, where delta_e is the difference in
            the number of hydrogens minus the difference in charge.

            Returns a list of dictionaries (same keys as in read_tecrdb).
        """
        thermo_params = []

        # fields are: name, CID_ox, nH_ox, charge_ox, CID_red,
        #             nH_red, charge_red, E'0, pH, I, pMg, T, ref
        with open(fname, 'r') as tsv_file:
            for row in csv.DictReader(tsv_file, delimiter='\t'):
                delta_nH = TrainingData.str2double(row['nH_red']) - \
                    TrainingData.str2double(row['nH_ox'])
                delta_charge = TrainingData.str2double(row['charge_red']) - \
                    TrainingData.str2double(row['charge_ox'])
                delta_e = delta_nH - delta_charge
                dG0_prime = -F * TrainingData.str2double(row['E\'0']) * delta_e
                rxn = KeggReaction({row['CID_ox']: -1, row['CID_red']: 1})
                thermo_params.append(
                    {'reaction': rxn,
                     'dG\'0': dG0_prime,
                     'T': TrainingData.str2double(row['T']),
                     'I': TrainingData.str2double(row['I']),
                     'pH': TrainingData.str2double(row['pH']),
                     'pMg': TrainingData.str2double(row['pMg']),
                     'weight': weight,
                     'balance': False,
                     'reference': row['ref'],
                     'description': row['name'] + ' redox'})

        logging.debug('Successfully added %d redox potentials' % len(thermo_params))
        return thermo_params

    @staticmethod
    def get_all_thermo_params():
        """
            Reads the three data files listed in FNAME_DICT (their paths are
            relative to the directory of this source file).

            Returns:
                (thermo_params, cids_that_dont_decompose) where
                thermo_params is the concatenation of the TECRDB, formation
                and redox entries (in that order).
        """
        base_path = os.path.split(os.path.realpath(__file__))[0]

        fname, weight = TrainingData.FNAME_DICT['TECRDB']
        fname = os.path.join(base_path, fname)
        tecrdb_params = TrainingData.read_tecrdb(fname, weight)

        fname, weight = TrainingData.FNAME_DICT['FORMATION']
        fname = os.path.join(base_path, fname)
        formation_params, cids_that_dont_decompose = \
            TrainingData.read_formations(fname, weight)

        fname, weight = TrainingData.FNAME_DICT['REDOX']
        fname = os.path.join(base_path, fname)
        redox_params = TrainingData.read_redox(fname, weight)

        thermo_params = tecrdb_params + formation_params + redox_params
        return thermo_params, cids_that_dont_decompose

    def balance_reactions(self, rxn_inds_to_balance):
        """
            use the chemical formulas from the InChIs to verify that each and
            every reaction is balanced

            Arguments:
                rxn_inds_to_balance - the column indices (in self.S) of the
                                      reactions that should be checked

            Reactions involving a compound without a formula are never
            checked. For the others, water (C00001) is added to balance the
            oxygen atoms, and reactions that are still unbalanced are
            removed from S and from all the per-reaction attributes.
        """
        elements, Ematrix = self.ccache.get_element_matrix(self.cids)
        cpd_inds_without_formula = list(np.nonzero(np.any(np.isnan(Ematrix), 1))[0].flat)
        Ematrix[np.isnan(Ematrix)] = 0

        S_without_formula = self.S[cpd_inds_without_formula, :]
        rxn_inds_without_formula = np.nonzero(np.any(S_without_formula != 0, 0))[0]
        rxn_inds_to_balance = set(rxn_inds_to_balance).difference(rxn_inds_without_formula)

        # need to check that all elements are balanced (except H, but including e-)
        # if only O is not balanced, add water molecules
        # NOTE(review): nothing in this method excludes H explicitly;
        # presumably get_element_matrix() leaves it out - confirm
        if 'O' in elements:
            i_H2O = self.cids.index('C00001')
            j_O = elements.index('O')
            conserved = np.dot(Ematrix.T, self.S)
            for k in rxn_inds_to_balance:
                self.S[i_H2O, k] = self.S[i_H2O, k] - conserved[j_O, k]

        # recalculate conservation matrix (np.dot, exactly as above, so that
        # the result is a matrix product whatever array type Ematrix is)
        conserved = np.dot(Ematrix.T, self.S)

        rxn_inds_to_remove = [k for k in rxn_inds_to_balance
                              if np.any(conserved[:, k] != 0, 0)]

        for k in rxn_inds_to_remove:
            sprs = {}
            for i in np.nonzero(self.S[:, k])[0]:
                sprs[self.cids[i]] = self.S[i, k]
            reaction = KeggReaction(sprs)
            logging.debug('unbalanced reaction #%d: %s' %
                          (k, reaction.write_formula()))
            for j in np.where(conserved[:, k])[0].flat:
                logging.debug('there are %d more %s atoms on the right-hand side' %
                              (conserved[j, k], elements[j]))

        rxn_inds_to_keep = \
            set(range(self.S.shape[1])).difference(rxn_inds_to_remove)

        rxn_inds_to_keep = sorted(rxn_inds_to_keep)

        self.S = self.S[:, rxn_inds_to_keep]
        self.dG0_prime = self.dG0_prime[rxn_inds_to_keep]
        self.T = self.T[rxn_inds_to_keep]
        self.I = self.I[rxn_inds_to_keep]
        self.pH = self.pH[rxn_inds_to_keep]
        self.pMg = self.pMg[rxn_inds_to_keep]
        self.weight = self.weight[rxn_inds_to_keep]
        self.reference = [self.reference[i] for i in rxn_inds_to_keep]
        self.description = [self.description[i] for i in rxn_inds_to_keep]

        logging.debug('After removing %d unbalanced reactions, the stoichiometric '
                      'matrix contains: '
                      '%d compounds and %d reactions' %
                      (len(rxn_inds_to_remove), self.S.shape[0], self.S.shape[1]))

    def reverse_transform(self):
        """
            Calculate the reverse transform for all reactions in training_data.

            Missing (NaN) ionic strengths and pMg values are replaced in
            place by their defaults, and the result is stored in 'self.dG0'.
        """
        n_rxns = self.S.shape[1]
        reverse_ddG0 = np.zeros(n_rxns)
        self.I[np.isnan(self.I)] = 0.25  # default ionic strength is 0.25M
        self.pMg[np.isnan(self.pMg)] = 14  # default pMg is 14
        for i in range(n_rxns):
            for j in np.nonzero(self.S[:, i])[0]:
                cid = self.cids[j]
                if cid == 'C00080':
                    # H+ should be ignored in the Legendre transform
                    continue
                comp = self.ccache.get_compound(cid)
                ddG0 = comp.transform_pH7(self.pH[i], self.I[i], self.T[i])
                reverse_ddG0[i] = reverse_ddG0[i] + ddG0 * self.S[j, i]

        self.dG0 = self.dG0_prime - reverse_ddG0
class KeggReaction(object):
    """
        A chemical reaction stored as a sparse dictionary that maps compound
        IDs to stoichiometric coefficients (negative for the left-hand side,
        positive for the right-hand side).
    """

    def __init__(self, sparse, arrow='<=>', rid=None):
        """
            Arguments:
                sparse - a dictionary from compound IDs to coefficients.
                         Entries with a zero coefficient are dropped.
                arrow  - the string that separates the two sides when the
                         reaction is written as a formula
                rid    - an optional reaction ID

            Raises ValueError if one of the coefficients is not an int or
            a float.
        """
        for cid, coeff in sparse.items():
            if not isinstance(coeff, (float, int)):
                raise ValueError('All values in KeggReaction must be integers or floats')

        self.sparse = dict((cid, coeff) for cid, coeff in sparse.items()
                           if coeff)
        self.arrow = arrow
        self.rid = rid
        self.ccache = CompoundCacher()

    def keys(self):
        """ Returns the compound IDs that take part in the reaction """
        return self.sparse.keys()

    def iteritems(self):
        """ Returns an iterator over (compound ID, coefficient) pairs """
        return iter(self.sparse.items())

    def __str__(self):
        return self.write_formula()

    def reverse(self):
        """
            reverse the direction of the reaction by negating all stoichiometric
            coefficients
        """
        self.sparse = dict((k, -v) for (k, v) in self.sparse.items())

    @staticmethod
    def parse_reaction_formula_side(s):
        """
            Parses the side formula, e.g. '2 C00001 + C00002 + 3 C00003'

            Returns:
                A dictionary from compound IDs to their total amount on
                this side (a compound without an explicit amount counts
                as 1). The string "null" gives an empty dictionary.

            Raises KeggParseException if an amount is not a number.
        """
        if s.strip() == "null":
            return {}

        compound_bag = {}
        for member in re.split(r'\s+\+\s+', s):
            tokens = member.split(None, 1)
            if len(tokens) == 0:
                continue
            if len(tokens) == 1:
                amount = 1
                # use the token (not 'member') so that whitespace around
                # the side does not become part of the compound ID
                key = tokens[0]
            else:
                try:
                    amount = float(tokens[0])
                except ValueError:
                    raise KeggParseException(
                        "Non-specific reaction: %s" % s)
                # split() with a limit keeps trailing whitespace in the
                # last token
                key = tokens[1].strip()

            compound_bag[key] = compound_bag.get(key, 0) + amount

        return compound_bag

    @staticmethod
    def parse_formula(formula, arrow='<=>', rid=None):
        """
            Parses a two-sided formula such as: 2 C00001 => C00002 + C00003

            Arguments:
                formula - the formula string
                arrow   - the string that separates the two sides
                rid     - an optional reaction ID

            Return:
                A KeggReaction object, where the compounds on the left side
                have negative coefficients

            Raises KeggParseException if the formula does not contain
            exactly one arrow.
        """
        tokens = formula.split(arrow)
        if len(tokens) < 2:
            raise KeggParseException('Reaction does not contain the arrow sign (%s): %s'
                                     % (arrow, formula))
        if len(tokens) > 2:
            raise KeggParseException('Reaction contains more than one arrow sign (%s): %s'
                                     % (arrow, formula))

        left = tokens[0].strip()
        right = tokens[1].strip()

        sparse_reaction = {}
        for cid, count in KeggReaction.parse_reaction_formula_side(left).items():
            sparse_reaction[cid] = sparse_reaction.get(cid, 0) - count

        for cid, count in KeggReaction.parse_reaction_formula_side(right).items():
            sparse_reaction[cid] = sparse_reaction.get(cid, 0) + count

        return KeggReaction(sparse_reaction, arrow, rid=rid)

    @staticmethod
    def write_compound_and_coeff(compound_id, coeff):
        """ Returns 'ID' if the coefficient is 1, otherwise 'coeff ID' """
        if coeff == 1:
            return compound_id
        else:
            return "%g %s" % (coeff, compound_id)

    def write_formula(self):
        """String representation."""
        left = []
        right = []
        for cid, coeff in sorted(self.sparse.items()):
            if coeff < 0:
                left.append(KeggReaction.write_compound_and_coeff(cid, -coeff))
            elif coeff > 0:
                right.append(KeggReaction.write_compound_and_coeff(cid, coeff))
        return "%s %s %s" % (' + '.join(left), self.arrow, ' + '.join(right))

    def _get_reaction_atom_bag(self, raise_exception=False):
        """
            Use for checking if all elements are conserved.

            Returns:
                An atom_bag of the differences between the sides of the
                reaction. E.g. if there is one extra C on the left-hand
                side, the result will be {'C': -1}.

                If a compound is missing from the cache, or a formula is
                unspecific, the result is None (or a ValueError is raised,
                if raise_exception is True).
        """
        try:
            cids = list(self.keys())
            # a real list: np.matrix cannot be built from a lazy map object
            coeffs = np.matrix([self.sparse[cid] for cid in cids])

            cached_cids = set(map(str, self.ccache.compound_id2inchi.keys()))
            if not cached_cids.issuperset(cids):
                missing_cids = set(cids).difference(cached_cids)
                warning_str = 'The following compound IDs are not in the cache, ' + \
                              'make sure they appear in kegg_additions.tsv and ' + \
                              'then run compound_cacher.py: ' + \
                              ', '.join(sorted(missing_cids))
                raise ValueError(warning_str)

            elements, Ematrix = self.ccache.get_element_matrix(cids)
            conserved = coeffs * Ematrix

            if np.any(np.isnan(conserved), 1):
                warning_str = 'cannot test reaction balancing because of unspecific ' + \
                              'compound formulas: %s' % self.write_formula()
                raise ValueError(warning_str)

            atom_bag = {}
            if np.any(conserved != 0, 1):
                logging.debug('unbalanced reaction: %s' % self.write_formula())
                for j, c in enumerate(conserved.flat):
                    if c != 0:
                        logging.debug('there are %d more %s atoms on the right-hand side'
                                      % (c, elements[j]))
                        atom_bag[str(elements[j])] = c
            return atom_bag

        except ValueError as e:
            if raise_exception:
                raise
            else:
                logging.debug(str(e))
                return None

    def is_balanced(self, fix_water=False, raise_exception=False):
        """
            Arguments:
                fix_water       - if True and the oxygen atoms are not
                                  balanced, water (C00001) is added to the
                                  reaction (in place) before re-checking
                raise_exception - passed on to _get_reaction_atom_bag

            Returns:
                True if all elements are conserved. False if not, or if the
                balance cannot be determined.
        """
        reaction_atom_bag = self._get_reaction_atom_bag(raise_exception)

        if reaction_atom_bag is None:
            # this means some compound formulas are missing
            return False

        if fix_water and 'O' in reaction_atom_bag:
            self.sparse.setdefault('C00001', 0)
            self.sparse['C00001'] += -reaction_atom_bag['O']
            if self.sparse['C00001'] == 0:
                del self.sparse['C00001']
            reaction_atom_bag = self._get_reaction_atom_bag(raise_exception)
            if reaction_atom_bag is None:
                # the formula of the added water could not be used
                return False

        return len(reaction_atom_bag) == 0

    def is_empty(self):
        """ Returns True if the reaction has no compounds """
        return len(self.sparse) == 0

    def dense(self, cids):
        """
            Returns the coefficients as a column matrix whose rows are
            ordered according to the list 'cids' (list.index raises a
            ValueError if a compound of the reaction is not in 'cids').
        """
        s = np.matrix(np.zeros((len(cids), 1)))
        for cid, coeff in self.iteritems():
            s[cids.index(cid), 0] = coeff
        return s

    def get_transform_ddG0(self, pH, I, T):
        """
            needed in order to calculate the transformed Gibbs energies of
            reactions.

            Returns:
                The difference between DrG0_prime and DrG0 for this reaction.
                Therefore, this value must be added to the chemical Gibbs
                energy of reaction (DrG0) to get the transformed value.
        """
        ddG0_forward = 0
        for compound_id, coeff in self.iteritems():
            comp = self.ccache.get_compound(compound_id)
            ddG0_forward += coeff * comp.transform_pH7(pH, I, T)
        return ddG0_forward
def load_compound_cache():
    """Create a CompoundCacher object and return it."""
    return CompoundCacher()
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 7 21:00:31 2014

@author: eladn

Removes a single compound from the compound cache, fetches it again (which
recalculates its data), writes the cache back and prints the new entry to
stderr.

Usage: the compound ID is given as the first command-line argument.
"""
import sys

from compound_cacher import CompoundCacher

# fail with a readable message (instead of an IndexError traceback) when the
# compound ID was not given
if len(sys.argv) < 2:
    sys.stderr.write('usage: %s <compound_id>\n' % sys.argv[0])
    sys.exit(1)

compound_id = sys.argv[1]
CompoundCacher.RebuildCompoundJSON()
ccache = CompoundCacher()

sys.stderr.write('removing %s from cache ...\n' % compound_id)
ccache.remove(compound_id)

sys.stderr.write('recalculating SMILES and pKa values ...\n')
comp = ccache.get_compound(compound_id)

sys.stderr.write('writing new data to cache ...\n')
ccache.dump()

d = comp.to_json_dict()
# items() works on both Python 2 and Python 3 dictionaries
sys.stderr.write(''.join(['%20s : %s\n' % (k, v) for (k, v) in d.items()]))
class KeggReaction(object):
    """
        A chemical reaction stored as a sparse dictionary that maps compound
        IDs to stoichiometric coefficients (negative for the left-hand side,
        positive for the right-hand side).
    """

    def __init__(self, sparse, arrow='<=>', rid=None):
        """
            Arguments:
                sparse - a dictionary from compound IDs to coefficients.
                         Entries with a zero coefficient are dropped.
                arrow  - the string that separates the two sides when the
                         reaction is written as a formula
                rid    - an optional reaction ID

            Raises ValueError if one of the coefficients is not an int or
            a float.
        """
        for cid, coeff in sparse.items():
            if not isinstance(coeff, (float, int)):
                raise ValueError(
                    'All values in KeggReaction must be integers or floats')

        self.sparse = dict((cid, coeff) for cid, coeff in sparse.items()
                           if coeff)
        self.arrow = arrow
        self.rid = rid
        self.ccache = CompoundCacher()

    def keys(self):
        """ Returns the compound IDs that take part in the reaction """
        return self.sparse.keys()

    def iteritems(self):
        """ Returns an iterator over (compound ID, coefficient) pairs """
        return iter(self.sparse.items())

    def __str__(self):
        return self.write_formula()

    def reverse(self):
        """
            reverse the direction of the reaction by negating all stoichiometric
            coefficients
        """
        self.sparse = dict((k, -v) for (k, v) in self.sparse.items())

    @staticmethod
    def parse_reaction_formula_side(s):
        """
            Parses the side formula, e.g. '2 C00001 + C00002 + 3 C00003'

            Returns:
                A dictionary from compound IDs to their total amount on
                this side (a compound without an explicit amount counts
                as 1). The string "null" gives an empty dictionary.

            Raises KeggParseException if an amount is not a number.
        """
        if s.strip() == "null":
            return {}

        compound_bag = {}
        for member in re.split(r'\s+\+\s+', s):
            tokens = member.split(None, 1)
            if len(tokens) == 0:
                continue
            if len(tokens) == 1:
                amount = 1
                # use the token (not 'member') so that whitespace around
                # the side does not become part of the compound ID
                key = tokens[0]
            else:
                try:
                    amount = float(tokens[0])
                except ValueError:
                    raise KeggParseException("Non-specific reaction: %s" % s)
                # split() with a limit keeps trailing whitespace in the
                # last token
                key = tokens[1].strip()

            compound_bag[key] = compound_bag.get(key, 0) + amount

        return compound_bag

    @staticmethod
    def parse_formula(formula, arrow='<=>', rid=None):
        """
            Parses a two-sided formula such as: 2 C00001 => C00002 + C00003

            Arguments:
                formula - the formula string
                arrow   - the string that separates the two sides
                rid     - an optional reaction ID

            Return:
                A KeggReaction object, where the compounds on the left side
                have negative coefficients

            Raises KeggParseException if the formula does not contain
            exactly one arrow.
        """
        tokens = formula.split(arrow)
        if len(tokens) < 2:
            raise KeggParseException(
                'Reaction does not contain the arrow sign (%s): %s' %
                (arrow, formula))
        if len(tokens) > 2:
            raise KeggParseException(
                'Reaction contains more than one arrow sign (%s): %s' %
                (arrow, formula))

        left = tokens[0].strip()
        right = tokens[1].strip()

        sparse_reaction = {}
        for cid, count in KeggReaction.parse_reaction_formula_side(
                left).items():
            sparse_reaction[cid] = sparse_reaction.get(cid, 0) - count

        for cid, count in KeggReaction.parse_reaction_formula_side(
                right).items():
            sparse_reaction[cid] = sparse_reaction.get(cid, 0) + count

        return KeggReaction(sparse_reaction, arrow, rid=rid)

    @staticmethod
    def write_compound_and_coeff(compound_id, coeff):
        """ Returns 'ID' if the coefficient is 1, otherwise 'coeff ID' """
        if coeff == 1:
            return compound_id
        else:
            return "%g %s" % (coeff, compound_id)

    def write_formula(self):
        """String representation."""
        left = []
        right = []
        for cid, coeff in sorted(self.sparse.items()):
            if coeff < 0:
                left.append(KeggReaction.write_compound_and_coeff(cid, -coeff))
            elif coeff > 0:
                right.append(KeggReaction.write_compound_and_coeff(cid, coeff))
        return "%s %s %s" % (' + '.join(left), self.arrow, ' + '.join(right))

    def _get_reaction_atom_bag(self, raise_exception=False):
        """
            Use for checking if all elements are conserved.

            Returns:
                An atom_bag of the differences between the sides of the
                reaction. E.g. if there is one extra C on the left-hand
                side, the result will be {'C': -1}.

                If a compound is missing from the cache, or a formula is
                unspecific, the result is None (or a ValueError is raised,
                if raise_exception is True).
        """
        try:
            cids = list(self.keys())
            # a real list: np.matrix cannot be built from a lazy map object
            coeffs = np.matrix([self.sparse[cid] for cid in cids])

            cached_cids = set(map(str, self.ccache.compound_id2inchi.keys()))
            if not cached_cids.issuperset(cids):
                missing_cids = set(cids).difference(cached_cids)
                warning_str = 'The following compound IDs are not in the cache, ' + \
                              'make sure they appear in kegg_additions.tsv and ' + \
                              'then run compound_cacher.py: ' + \
                              ', '.join(sorted(missing_cids))
                raise ValueError(warning_str)

            elements, Ematrix = self.ccache.get_element_matrix(cids)
            conserved = coeffs * Ematrix

            if np.any(np.isnan(conserved), 1):
                warning_str = 'cannot test reaction balancing because of unspecific ' + \
                              'compound formulas: %s' % self.write_formula()
                raise ValueError(warning_str)

            atom_bag = {}
            if np.any(conserved != 0, 1):
                logging.debug('unbalanced reaction: %s' % self.write_formula())
                for j, c in enumerate(conserved.flat):
                    if c != 0:
                        logging.debug(
                            'there are %d more %s atoms on the right-hand side' %
                            (c, elements[j]))
                        atom_bag[str(elements[j])] = c
            return atom_bag

        except ValueError as e:
            if raise_exception:
                raise
            else:
                logging.debug(str(e))
                return None

    def is_balanced(self, fix_water=False, raise_exception=False):
        """
            Arguments:
                fix_water       - if True and the oxygen atoms are not
                                  balanced, water (C00001) is added to the
                                  reaction (in place) before re-checking
                raise_exception - passed on to _get_reaction_atom_bag

            Returns:
                True if all elements are conserved. False if not, or if the
                balance cannot be determined.
        """
        reaction_atom_bag = self._get_reaction_atom_bag(raise_exception)

        if reaction_atom_bag is None:
            # this means some compound formulas are missing
            return False

        if fix_water and 'O' in reaction_atom_bag:
            self.sparse.setdefault('C00001', 0)
            self.sparse['C00001'] += -reaction_atom_bag['O']
            if self.sparse['C00001'] == 0:
                del self.sparse['C00001']
            reaction_atom_bag = self._get_reaction_atom_bag(raise_exception)
            if reaction_atom_bag is None:
                # the formula of the added water could not be used
                return False

        return len(reaction_atom_bag) == 0

    def is_empty(self):
        """ Returns True if the reaction has no compounds """
        return len(self.sparse) == 0

    def dense(self, cids):
        """
            Returns the coefficients as a column matrix whose rows are
            ordered according to the list 'cids' (list.index raises a
            ValueError if a compound of the reaction is not in 'cids').
        """
        s = np.matrix(np.zeros((len(cids), 1)))
        for cid, coeff in self.iteritems():
            s[cids.index(cid), 0] = coeff
        return s

    def get_transform_ddG0(self, pH, I, T):
        """
            needed in order to calculate the transformed Gibbs energies of
            reactions.

            Returns:
                The difference between DrG0_prime and DrG0 for this reaction.
                Therefore, this value must be added to the chemical Gibbs
                energy of reaction (DrG0) to get the transformed value.
        """
        ddG0_forward = 0
        for compound_id, coeff in self.iteritems():
            comp = self.ccache.get_compound(compound_id)
            ddG0_forward += coeff * comp.transform_pH7(pH, I, T)
        return ddG0_forward