def _Recalculate(self):
    S, cids = self.kegg.reaction_list_to_S(self.reactions)
    known_cids = self.formations.get_all_cids()
    fix_rows = [i for i, cid in enumerate(cids) if cid in known_cids]
    var_rows = [i for i, cid in enumerate(cids) if cid not in known_cids]
    fix_cids = [cids[i] for i in fix_rows]
    var_cids = [cids[i] for i in var_rows]

    # subtract the part of the dG0 which is fixed, and leave only the part
    # which is attributed to the NaN compounds.
    fix_S = S[fix_rows, :]
    var_S = S[var_rows, :]
    var_P_C, var_P_L = LinearRegression.ColumnProjection(var_S)
    var_P_R, var_P_N = LinearRegression.RowProjection(var_S)

    # take all the known dG0_primes from self.formations
    dG0_f_prime = self.formations.GetTransformedFormationEnergies(
        fix_cids, pH=self.pH, I=self.I, T=self.T, pMg=self.pMg)

    # project the dG0_r on the column-space of var_S to eliminate inconsistencies
    # between the dG0_r and the fixed formation energies.
    # then subtract the fixed part of the dG0_r.
    var_dG0_r_prime = np.matrix(self.dG0_r_primes) * var_P_C - \
        dG0_f_prime * fix_S
    var_dG0_f_prime, _ = LinearRegression.LeastSquares(var_S, var_dG0_r_prime)

    return var_cids, var_dG0_f_prime, var_P_N

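
# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): _Recalculate() above
# leans on LinearRegression.ColumnProjection / RowProjection to split the
# observed reaction energies into a part explained by the stoichiometry of
# the "variable" compounds and an orthogonal remainder. The helper below
# shows how such orthogonal projectors can be built from an SVD; the exact
# shapes and argument conventions of the library calls are assumptions here,
# not taken from this file.
def _sketch_row_space_projectors(A, eps=1e-10):
    """Return (P_R, P_N): orthogonal projectors (both m x m) onto the row
    space of the (n x m) matrix A and onto its right null space."""
    import numpy as np
    A = np.asarray(A, dtype=float)
    _u, s, vt = np.linalg.svd(A, full_matrices=True)
    rank = int(np.sum(s > eps))
    V_r = vt[:rank, :]              # orthonormal basis of the row space
    P_R = np.dot(V_r.T, V_r)        # projects onto the row space
    P_N = np.eye(A.shape[1]) - P_R  # projects onto the null space
    return P_R, P_N

# Usage (toy numbers): any row vector b that is a combination of A's rows is
# left untouched by P_R and annihilated by P_N.
#   A = np.array([[1., -1., 0.], [0., 1., -1.]])
#   P_R, P_N = _sketch_row_space_projectors(A)
#   b = np.dot(np.array([2., 3.]), A)
#   np.allclose(np.dot(b, P_R), b) and np.allclose(np.dot(b, P_N), 0.)
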
def next(self):
    if self.dimension == len(self):
        raise StopIteration
    while True:
        self.emf_counter += 1
        g_plus, g_minus, coeffs = self.GetSolution()
        self.ExcludeSolutionVector(g_plus, g_minus,
                                   'avoid_%d_plus' % self.emf_counter)
        self.ExcludeSolutionVector(g_minus, g_plus,
                                   'avoid_%d_minus' % self.emf_counter)

        nonzero_indices = np.nonzero(g_plus > 0.5)[0].tolist() + \
                          np.nonzero(g_minus > 0.5)[0].tolist()
        self.K[self.dimension, nonzero_indices] = coeffs[nonzero_indices]

        if LinearRegression.MatrixRank(self.K) < self.dimension + 1:
            self.K[self.dimension, :] = 0
        else:
            # normalize the kernel vector so that it will have nice coefficients
            g = min(abs(coeffs[nonzero_indices]))
            self.K[self.dimension, :] /= g
            #if sum(self.K[:, self.dimension] < 0.0):
            #    self.K[:, self.dimension] *= -1.0
            v = self.K[self.dimension, :]
            self.AddLinearConstraint(v)
            self.dimension += 1
            return v

class SparseKernel(object):
    """
    Finds a sparse representation of the kernel matrix, using MILP
    to iterate the Fundamental Modes of the matrix.

    Input:  a (n x m) matrix A, whose rank is r.
    Return: a ((m-r) x m) matrix K whose rows span the kernel of A,
            i.e.: span(K) = {x | Ax = 0}
    """

    class LinearProgrammingException(Exception):
        pass

    def __init__(self, A):
        self.upper_bound = 1000
        self.eps = 1e-10
        self.dimension = 0

        try:
            self.cpl = Cplex()
        except (NameError, CplexSolverError):
            raise CplexNotInstalledError()

        self.cpl.set_problem_name('find_kernel')
        self.cpl.set_log_stream(None)
        self.cpl.set_results_stream(None)
        self.cpl.set_warning_stream(None)

        self.n_variables = A.shape[1]
        self.CreateAllVariables()

        self.constraint_counter = 0
        for r in xrange(A.shape[0]):
            self.AddLinearConstraint(A[r, :])
        self.kernel_rank = self.n_variables - LinearRegression.MatrixRank(A)

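
# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): SparseKernel uses a
# MILP solver to look for kernel vectors with as few nonzero coefficients as
# possible. For comparison, a dense (generally non-sparse) basis of the same
# kernel can be read off the SVD with numpy alone; this is only a reference
# sketch, not the class's own method.
def _sketch_dense_kernel(A, eps=1e-10):
    """Return a ((m-r) x m) matrix whose rows span {x | Ax = 0}."""
    import numpy as np
    A = np.asarray(A, dtype=float)
    _u, s, vt = np.linalg.svd(A, full_matrices=True)
    rank = int(np.sum(s > eps))
    return vt[rank:, :]

# Usage (toy numbers):
#   A = np.array([[1., -1., 0., 0.], [0., 1., -1., 0.]])
#   K = _sketch_dense_kernel(A)        # 4 - 2 = 2 kernel vectors
#   np.allclose(np.dot(A, K.T), 0.)    # -> True
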
def AnalyzeResiduals(self):
    GS = np.dot(self.G.T, self.S)

    # Write the analysis of residuals:
    # I am not sure if this analysis should be done before "uniquing"
    # the rows of S or after. The observation residual is much smaller
    # in the latter case, since intra-reaction noise is averaged.
    _P_R1, P_N1 = LinearRegression.RowProjection(self.S)
    _P_R2, P_N2 = LinearRegression.RowProjection(GS)

    r_obs = np.linalg.norm(np.dot(self.gibbs_values, P_N1))
    r_est = np.linalg.norm(np.dot(self.gibbs_values, P_N2 - P_N1))
    r_tot = np.linalg.norm(np.dot(self.gibbs_values, P_N2))

    self.html_writer.write('</br><b>Analysis of residuals:</b>\n')
    self.html_writer.insert_toggle(start_here=True)
    residual_text = ['r<sub>observation</sub> = %.2f kJ/mol' % r_obs,
                     'r<sub>estimation</sub> = %.2f kJ/mol' % r_est,
                     'r<sub>total</sub> = %.2f kJ/mol' % r_tot]
    self.html_writer.write_ul(residual_text)
    self.html_writer.div_end()

def TestGroupMatrix():
    group_filename = '../data/thermodynamics/hatzimanikatis_groups.csv'
    all_group_names = []
    sparse_matrix = []
    dG_vector = []
    line_no = 0
    for row in csv.DictReader(open(group_filename)):
        line_no += 1
        if row['est_dG'] == "None":
            continue
        dG_vector.append(float(row['est_dG']))
        sparse_groupvec = []
        if row['groups'] != "":
            for token in row['groups'].split(' | '):
                try:
                    [group_name, coeff] = token.split(' : ', 1)
                except ValueError:
                    raise Exception("cannot parse this token (line %d): %s\n"
                                    % (line_no, token))
                coeff = float(coeff)
                if group_name not in all_group_names:
                    all_group_names.append(group_name)
                group_index = all_group_names.index(group_name)
                sparse_groupvec.append((group_index, coeff))
        sparse_matrix.append(sparse_groupvec)

    full_matrix = np.zeros((len(sparse_matrix), len(all_group_names)))
    for i in range(len(sparse_matrix)):
        for j, coeff in sparse_matrix[i]:
            full_matrix[i, j] = coeff
    dG_vector = np.array(dG_vector, ndmin=2).T

    #print full_matrix.shape
    #print LinearRegression.MatrixRank(full_matrix)
    #print dG_vector.shape
    #augmented_matrix = np.hstack([full_matrix, dG_vector])
    #_U, s, _V = np.linalg.svd(augmented_matrix, full_matrices=False)
    #print sorted(s)

    contributions, _K = LinearRegression.LeastSquares(full_matrix, dG_vector)
    for i, group_name in enumerate(all_group_names):
        print "%s,%.3f" % (group_name, contributions[i, 0])

    pyplot.plot(dG_vector, dG_vector - np.dot(full_matrix, contributions), '.')
    pyplot.show()

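
# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): TestGroupMatrix()
# fits one contribution per group so that full_matrix * contributions comes
# as close as possible to dG_vector. The helper below is a minimal numpy
# equivalent of such a fit, returning both the solution and a basis of the
# directions the data cannot determine; the argument order and return shapes
# of LinearRegression.LeastSquares itself are assumptions, not taken from
# this file.
def _sketch_least_squares_with_kernel(A, b, eps=1e-10):
    """Return (x, K): x is the minimum-norm least-squares solution of
    A x ~ b (as a column vector), and the rows of K span the null space
    of A, i.e. the directions of x that do not change the fit."""
    import numpy as np
    A = np.asarray(A, dtype=float)
    b = np.asarray(b, dtype=float).reshape(-1)
    x = np.dot(np.linalg.pinv(A), b)            # minimum-norm solution
    _u, s, vt = np.linalg.svd(A, full_matrices=True)
    K = vt[int(np.sum(s > eps)):, :]            # null-space basis (rows)
    return x.reshape(-1, 1), K
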
def stoichiometric_matrix2html(html_writer, A, cids, eps=1e-10):
    """
    Print a table in HTML format.
    A is a stoichiometric matrix where each row is a reaction and each
    column is a compound, corresponding in position to the list "cids".
    """
    dict_list = []
    for i in xrange(A.shape[0]):
        sparse_reaction = dict([(cids[j], A[i, j])
                                for j in xrange(A.shape[1])
                                if abs(A[i, j]) > eps])
        r = Reaction("reaction%d" % i, sparse_reaction=sparse_reaction)
        dict_list.append({'reaction': r.to_hypertext()})
    html_writer.write_ul(['%d rows' % A.shape[0],
                          '%d columns' % A.shape[1],
                          '%d rank' % LinearRegression.MatrixRank(A)])
    html_writer.write_table(dict_list, headers=['#', 'reaction'])

def GetTransfromedReactionEnergies(self, S, cids, pH=None, I=None, pMg=None,
                                   T=None, conc=1):
    """
    Find the set of reaction Gibbs energies that are completely consistent
    with thermo[0], and also close to the energies provided by thermo[1].

    To find this solution, we project the vector of Gibbs energies obtained
    using thermo[1] onto the subspace spanned by the columns of the
    stoichiometric matrix (where some of the values are fixed according to
    thermo[0]).
    """
    # first try to use thermo[0] to estimate all reaction energies.
    # note that this calculation already adds the effect of concentrations to dG_r.
    dGc_r0 = self.thermo[0].GetTransfromedReactionEnergies(S, cids, pH=pH,
                                                           I=I, pMg=pMg,
                                                           T=T, conc=conc)
    if np.all(np.isfinite(dGc_r0)):
        return dGc_r0

    # if thermo[1] cannot estimate all reactions, just use thermo[0].
    # note that here we leave out the effect of the concentrations on dG_r,
    # because we are going to use standard formation energies from thermo[0]
    # and fill the gaps using thermo[1].
    dG0_r1 = self.thermo[1].GetTransfromedReactionEnergies(S, cids, pH=pH,
                                                           I=I, pMg=pMg, T=T)
    if np.isnan(dG0_r1).any():
        return dGc_r0

    dG0_f0 = self.thermo[0].GetTransformedFormationEnergies(cids, pH=pH, I=I,
                                                            pMg=pMg, T=T)
    finite_cols = list(np.where(np.isfinite(dG0_f0))[1].flat)
    nan_cols = list(np.where(np.isnan(dG0_f0))[1].flat)
    fixed_dG0_r = dG0_f0[:, finite_cols] * S[finite_cols, :]

    P_R, P_N = LinearRegression.RowProjection(S[nan_cols, :])
    dG0_r = dG0_r1 * P_R + fixed_dG0_r * P_N

    # now add the effect of the concentrations
    if conc != 1:
        return dG0_r + AddConcentrationsToReactionEnergies(S, cids, T, conc)
    else:
        return dG0_r

def AnalyzeTrainingSet(self, skip_formations=True):
    n_obs = self.group_matrix.shape[1]
    rowdicts = []
    fit_results = np.dot(self.group_contributions, self.group_matrix)
    residuals = fit_results - self.obs_values

    if self.transformed:
        sym = symbol_d_G0_prime
    else:
        sym = symbol_d_G0
    for i in xrange(n_obs):
        if self.obs_types[i] in [KeggObservation.TYPE_ACID_BASE,
                                 KeggObservation.TYPE_MG,
                                 KeggObservation.TYPE_REDOX]:
            continue
        if skip_formations and self.obs_types[i] == KeggObservation.TYPE_FORMATION:
            continue

        rowdict = {'Observation': self.obs_ids[i]}
        rowdict[sym + ' (obs)'] = self.obs_values[0, i]
        rowdict[sym + ' (fit)'] = fit_results[0, i]
        rowdict[sym + ' (res)'] = residuals[0, i]
        rowdict['LOO ' + sym + ' (fit)'] = np.nan
        rowdict['LOO ' + sym + ' (res)'] = np.nan
        rowdict['sortkey'] = 0
        rowdicts.append(rowdict)
        logging.info('Fit Error = %.1f' % residuals[0, i])

        # leave out the row corresponding with observation 'i'
        logging.info('Cross validation, leaving-one-out: ' + self.obs_ids[i])
        subset = range(n_obs)
        subset.pop(i)

        loo_group_contributions, loo_nullspace = LinearRegression.LeastSquares(
            self.group_matrix[:, subset], self.obs_values[:, subset])

        if loo_nullspace.shape[1] > self.group_nullspace.shape[1]:
            logging.warning('example %d is not linearly dependent '
                            'on the other examples' % i)
            continue
        rowdict['LOO ' + sym + ' (fit)'] = float(
            np.dot(loo_group_contributions, self.group_matrix[:, i]))
        rowdict['LOO ' + sym + ' (res)'] = \
            rowdict['LOO ' + sym + ' (fit)'] - self.obs_values[0, i]
        rowdict['sortkey'] = abs(rowdict['LOO ' + sym + ' (res)'])
        logging.info('LOO Error = %.1f' % rowdict['LOO ' + sym + ' (res)'])

    logging.info("writing the table of estimation errors for each compound")
    self.html_writer.write('</br><b>Cross validation table</b>')
    self.html_writer.insert_toggle(start_here=True)
    self.html_writer.write('<font size="1">\n')

    obs_vec = np.matrix([row[sym + ' (obs)'] for row in rowdicts])
    resid_vec = np.matrix([row[sym + ' (res)'] for row in rowdicts])
    rmse = rms_flat(resid_vec.flat)

    loo_resid_vec = np.matrix([row['LOO ' + sym + ' (res)'] for row in rowdicts])
    loo_rmse = rms_flat(loo_resid_vec[np.isfinite(loo_resid_vec)].flat)

    self.html_writer.write_ul(['fit RMSE = %.1f [kJ/mol]' % rmse,
                               'leave-one-out RMSE = %.1f [kJ/mol]' % loo_rmse])
    logging.info("Goodness of fit: RMSE = %.1f [kJ/mol]" % rmse)
    logging.info("Leave-one-out test: RMSE = %.1f [kJ/mol]" % loo_rmse)

    headers = ['Observation',
               sym + ' (obs)', sym + ' (fit)', sym + ' (res)',
               'LOO ' + sym + ' (fit)', 'LOO ' + sym + ' (res)']
    rowdicts.sort(key=lambda x: x['sortkey'], reverse=True)
    self.html_writer.write_table(rowdicts, headers, decimal=1)
    self.html_writer.write('</font>\n')
    self.html_writer.div_end()

    self.html_writer.write('</br><b>Cross-validation figure</b>')
    self.html_writer.insert_toggle(start_here=True)

    obs_vs_err_fig = plt.figure(figsize=[6.0, 6.0], dpi=100)
    plt.plot(obs_vec.T, resid_vec.T, '.')
    plt.xlabel('Observation')
    plt.ylabel('Estimated (PGC) Residuals')
    plt.hold(True)
    for row in rowdicts:
        if abs(row[sym + ' (res)']) > 2 * rmse:
            plt.text(row[sym + ' (obs)'], row[sym + ' (res)'],
                     row['Observation'], fontsize=4, figure=obs_vs_err_fig)
    plt.title('Observed vs. Fitted (PGC) Residuals', figure=obs_vs_err_fig)

    self.html_writer.embed_matplotlib_figure(obs_vs_err_fig)
    self.html_writer.div_end()

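
# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the leave-one-out
# loop above refits the regression once per observation and predicts the
# held-out value from the remaining ones. A bare-bones numpy version of the
# same idea is shown below; the names and shapes are hypothetical, and the
# real method additionally skips observations that are not spanned by the
# rest of the training set.
def _sketch_leave_one_out_residuals(A, y):
    """A: (n_obs x n_params) design matrix, y: (n_obs,) observed values.
    Returns the vector of leave-one-out residuals (prediction - observation)."""
    import numpy as np
    A = np.asarray(A, dtype=float)
    y = np.asarray(y, dtype=float).reshape(-1)
    n_obs = A.shape[0]
    residuals = np.zeros(n_obs)
    for i in range(n_obs):
        keep = [j for j in range(n_obs) if j != i]       # drop observation i
        x = np.dot(np.linalg.pinv(A[keep, :]), y[keep])  # refit without it
        residuals[i] = np.dot(A[i, :], x) - y[i]         # predict it back
    return residuals
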
def WriteRegressionReport(self, T=default_T, pH=default_pH):
    rowdicts = []
    for i in xrange(self.group_matrix.shape[1]):
        groupvec = GroupVector(self.groups_data, self.group_matrix[:, i])
        rowdict = {'#': i, 'ID': self.obs_ids[i]}
        rowdict[self.obs_collection.gibbs_symbol] = '%.1f' % self.obs_values[0, i]
        rowdict['Group Vector'] = str(groupvec)
        rowdicts.append(rowdict)

    self.html_writer.write('</br><b>Regression report</b>')
    self.html_writer.insert_toggle(start_here=True)
    self.html_writer.write('<font size="1">\n')
    self.html_writer.write_ul(
        ['observations: %d' % self.group_matrix.shape[1],
         'groups: %d' % self.group_matrix.shape[0],
         'rank: %d' % LinearRegression.MatrixRank(self.group_matrix)])
    self.html_writer.write_table(
        rowdicts,
        headers=['#', 'ID', 'Group Vector', self.obs_collection.gibbs_symbol])
    self.html_writer.write('</font>\n')
    self.html_writer.div_end()

    self.html_writer.write('</br><b>Group Contributions</b>\n')
    div_id = self.html_writer.insert_toggle()
    self.html_writer.div_start(div_id)
    self.html_writer.write('</br><font size="1">\n')
    rowdicts = []
    if self.transformed:
        headers = ["#", "Group Name", self.obs_collection.gibbs_symbol,
                   "acid-base", "formation", "reaction"]
    else:
        headers = ["#", "Group Name", "nH", "charge", "nMg",
                   self.obs_collection.gibbs_symbol,
                   "acid-base", "formation", "reaction"]
    group_names = self.groups_data.GetGroupNames()
    for j, dG0_gr in enumerate(self.group_contributions.flat):
        obs_lists_dict = defaultdict(list)
        for k in self.group_matrix[j, :].nonzero()[1].flat:
            obs_lists_dict[self.obs_types[k]].append(self.obs_ids[k])
        d = {"#": "%d" % j,
             "Group Name": group_names[j],
             self.obs_collection.gibbs_symbol: "%.1f" % dG0_gr}
        for k, v in obs_lists_dict.iteritems():
            d[k] = ' | '.join(v)
        if not self.transformed:
            group = self.groups_data.all_groups[j]
            d["nH"] = group.hydrogens
            d["charge"] = group.charge
            d["nMg"] = group.nMg
        rowdicts.append(d)
    self.html_writer.write_table(rowdicts, headers)
    self.html_writer.write('</font>\n')
    self.html_writer.div_end()

def Train(self):
    logging.info("Calculating the linear regression data")
    cids, S, b, anchored = self.obs_collection.GetStoichiometry()

    anchored_cols = list(np.where(anchored == 1)[1].flat)
    # now remove anchored data from S and leave only the data which will be
    # used for calculating the group contributions
    g, P_C, P_L = LinearRegression.LeastSquaresProjection(S[:, anchored_cols],
                                                          b[:, anchored_cols])
    self.anchored_cids = cids
    self.anchored_contributions = g * P_C
    self.anchored_P_L = P_L
    self.anchored_P_L[abs(self.anchored_P_L) <= self.epsilon] = 0

    b -= self.anchored_contributions * S
    S = self.anchored_P_L * S

    # set epsilon-small values to absolute 0
    S[np.where(abs(S) <= self.epsilon)] = 0

    # remove zero rows (compounds) from S
    used_cid_indices = set(np.nonzero(np.sum(abs(S), 1))[0].flat)
    for i_cid, cid in enumerate(cids):
        if self.cid2groupvec[cid] is None:
            used_cid_indices.difference_update([i_cid])
            for i_obs in np.nonzero(S[i_cid, :])[1].flat:
                logging.warning(
                    "%s is removed because C%05d has no group vector, "
                    "but is still part of the final stoichiometric matrix"
                    % (self.obs_collection.observations[i_obs].obs_id, cid))
                S[:, i_obs] = 0
    used_cid_indices = sorted(used_cid_indices)
    S = S[used_cid_indices, :]

    n_groups = len(self.groups_data.GetGroupNames())  # number of groups
    G = np.matrix(np.zeros((len(used_cid_indices), n_groups)))
    for i, i_cid in enumerate(used_cid_indices):
        G[i, :] = self.cid2groupvec[cids[i_cid]].Flatten()

    GS = G.T * S

    # 'unique' the columns of GS. For each set of columns that is united,
    # the Y-value for the new column is the average of the corresponding Y-values.
    unique_GS, col_mapping = LinearRegression.ColumnUnique(GS, remove_zero=True)
    unique_b = np.matrix(np.zeros((1, unique_GS.shape[1])))
    unique_obs_types = []
    unique_obs_ids = []
    for i, old_indices in sorted(col_mapping.iteritems()):
        unique_b[0, i] = np.mean(b[0, old_indices])
        obs_list = [self.obs_collection.observations[j] for j in old_indices]
        unique_obs_types.append(obs_list[0].obs_type)  # take the type of the first one (not perfect...)
        unique_obs_ids.append(', '.join([obs.obs_id for obs in obs_list]))

    self.group_matrix = unique_GS
    self.obs_values = unique_b
    self.obs_ids = unique_obs_ids
    self.obs_types = unique_obs_types

    logging.info("Performing linear regression")
    self.group_contributions, self.group_nullspace = \
        LinearRegression.LeastSquares(self.group_matrix, self.obs_values)

    logging.info("Storing the group contribution data in the database")
    self.SaveContributionsToDB()

def _GetContributionData(self, obs_S, obs_cids, obs_b, obs_anchored):
    assert obs_S.shape[0] == len(obs_cids)
    assert obs_S.shape[1] == obs_b.shape[1]
    assert obs_S.shape[1] == obs_anchored.shape[1]

    # (1)
    # use the anchored reactions to directly estimate the part of est_S
    # which is in their column-span, and normalize that part out from all
    # matrices.
    anchored_cols = list(obs_anchored.nonzero()[1].flat)
    if anchored_cols:
        g_anch, P_C_anch, P_L_anch = LinearRegression.LeastSquaresProjection(
            obs_S[:, anchored_cols], obs_b[:, anchored_cols])
        obs_b -= g_anch * P_C_anch * obs_S  # subtract the contribution of anchored reactions to obs_b
        obs_S = P_L_anch * obs_S            # project obs_S on the residual space
    else:
        g_anch = np.matrix(np.zeros((1, obs_S.shape[0])))
        P_C_anch = np.matrix(np.zeros((obs_S.shape[0], obs_S.shape[0])))
        P_L_anch = np.matrix(np.eye(obs_S.shape[0]))

    # (2)
    # calculate the reactant contributions from obs_S and obs_b, and use that
    # to estimate the part which is in the column-space of NIST.
    g_prc, P_C_prc, P_L_prc = LinearRegression.LeastSquaresProjection(obs_S, obs_b)

    # (3)
    # calculate the group contributions from obs_S and obs_b. Note that
    # some reactions involve compounds that don't have groupvectors, and
    # therefore are discarded from this step.
    G, has_groupvec = self._GenerateGroupMatrix(obs_cids)
    bad_compounds = list(np.where(has_groupvec == False)[0].flat)
    reactions_with_groupvec = []
    for i in xrange(obs_S.shape[1]):
        if np.all(abs(obs_S[bad_compounds, i]) < self.epsilon):
            reactions_with_groupvec.append(i)
    obs_GS = G.T * obs_S[:, reactions_with_groupvec]
    g_pgc, P_C_pgc, P_L_pgc = LinearRegression.LeastSquaresProjection(
        obs_GS, obs_b[:, reactions_with_groupvec])

    # calculate the total contributions
    result_dict = {}
    result_dict['names'] = ['anchors', 'reactants', 'groups']
    result_dict['contributions'] = [g_anch, g_prc, g_pgc * P_C_pgc * G.T]
    result_dict['group_contributions'] = g_pgc
    result_dict['column_spaces'] = [P_C_anch, P_C_prc, P_C_pgc]
    result_dict['null_spaces'] = [P_L_anch, P_L_prc, P_L_pgc]
    result_dict['projections'] = [P_C_anch,
                                  P_C_prc * P_L_anch,
                                  P_L_prc * P_L_anch]

    result_dict['total_contributions'] = np.matrix(np.zeros((1, len(obs_cids))))
    for g, S in zip(result_dict['contributions'], result_dict['projections']):
        result_dict['total_contributions'] += g * S

    # conservation laws that check if we rely on compounds that have no
    # groupvector
    P_L_bad = (P_L_prc * P_L_anch)[bad_compounds, :]

    # projection of reactions to the residual groupvector space
    G_resid = P_L_prc * P_L_anch * G

    result_dict['bad_conservations'] = P_L_bad
    result_dict['pgc_conservations'] = P_L_pgc
    result_dict['pgc_groupvectors'] = G_resid
    result_dict['conservations'] = np.vstack([P_L_bad,
                                              (G_resid * P_L_pgc).T])

    return result_dict

def LoadData(self, FromDatabase=False):
    if FromDatabase and self.db.DoesTableExist(self.STOICHIOMETRIC_TABLE_NAME):
        logging.info("Reading group matrices from database")
        self.S = self.db.LoadSparseNumpyMatrix(self.STOICHIOMETRIC_TABLE_NAME)
        self.G = self.db.LoadSparseNumpyMatrix(self.GROUP_TABLE_NAME)
        self.b = self.db.LoadNumpyMatrix(self.GIBBS_ENERGY_TABLE_NAME).T
        self.anchored = self.db.LoadNumpyMatrix(self.ANCHORED_TABLE_NAME).T
        self.has_groupvec = np.sum(self.G, 1) > 0
        self.cids = []
        for rowdict in self.db.DictReader(self.COMPOUND_TABLE_NAME):
            self.cids.append(int(rowdict['cid']))
        self.obs_ids = []
        self.obs_types = []
        self.obs_urls = []
        for rowdict in self.db.DictReader(self.UNIQUE_OBSERVATION_TABLE_NAME):
            self.obs_ids.append(rowdict['id'])
            self.obs_types.append(rowdict['type'])
            self.obs_urls.append(rowdict['url'])
    else:
        logging.info("Calculating group matrices")
        self.cids, S, b, anchored = self.obs_collection.GetStoichiometry()

        if self.CollapseReactions:
            self.S, col_mapping = LinearRegression.ColumnUnique(S)
            self.b = np.matrix(np.zeros((1, len(col_mapping)), dtype='float'))
            self.anchored = np.matrix(np.zeros((1, len(col_mapping)), dtype='int'))
            self.obs_ids = []
            self.obs_types = []
            self.obs_urls = []
            for i, col_indices in col_mapping.iteritems():
                self.b[0, i] = np.mean(b[0, col_indices])
                self.anchored[0, i] = anchored[0, col_indices].max()
                obs_list = [self.obs_collection.observations[j]
                            for j in col_indices]
                self.obs_ids.append(', '.join([obs.obs_id for obs in obs_list]))
                self.obs_types.append(', '.join(
                    set([obs.obs_type for obs in obs_list])))
                self.obs_urls.append(', '.join([obs.url for obs in obs_list]))
        else:
            self.S = S
            self.b = b
            self.anchored = anchored
            self.obs_ids = [obs.obs_id
                            for obs in self.obs_collection.observations]
            self.obs_types = [obs.obs_type
                              for obs in self.obs_collection.observations]
            self.obs_urls = [obs.url
                             for obs in self.obs_collection.observations]

        self.G, self.has_groupvec = self._GenerateGroupMatrix(self.cids)

        # save everything to the database
        self.db.SaveSparseNumpyMatrix(self.STOICHIOMETRIC_TABLE_NAME, self.S)
        self.db.SaveSparseNumpyMatrix(self.GROUP_TABLE_NAME, self.G)
        self.db.SaveNumpyMatrix(self.GIBBS_ENERGY_TABLE_NAME, self.b.T)
        self.db.SaveNumpyMatrix(self.ANCHORED_TABLE_NAME, self.anchored.T)
        self.db.CreateTable(self.COMPOUND_TABLE_NAME, 'cid INT, name TEXT')
        for cid in self.cids:
            self.db.Insert(self.COMPOUND_TABLE_NAME,
                           [cid, self.kegg.cid2name(cid)])
        self.db.CreateTable(self.UNIQUE_OBSERVATION_TABLE_NAME,
                            'row INT, id TEXT, type TEXT, url TEXT')
        for i in xrange(len(self.obs_ids)):
            self.db.Insert(self.UNIQUE_OBSERVATION_TABLE_NAME,
                           [i, self.obs_ids[i], self.obs_types[i],
                            self.obs_urls[i]])
        self.db.Commit()

def ReverseTransform(self, cid2nH_nMg=None):
    """
    Performs the reverse Legendre transform on all the data in NIST where
    it is possible, i.e. where we have pKa data.

    Arguments:
        cid2nH_nMg - a dictionary mapping each compound ID to its chosen
                     pseudoisomer (described by nH and nMg).
    """
    logging.info("Reverse transforming the NIST data")
    nist_rows = self.nist.SelectRowsFromNist()
    logging.info("Selected %d NIST rows out of %d" %
                 (len(nist_rows), len(self.nist.data)))
    data = self.GetDissociation().ReverseTransformNistRows(
        nist_rows, cid2nH_nMg=cid2nH_nMg)

    nist_rows_final = data['nist_rows']
    stoichiometric_matrix = data['S']
    cids_to_estimate = data['cids_to_estimate']

    n_cols = stoichiometric_matrix.shape[1]
    logging.info("Only %d out of %d NIST measurements can be used" %
                 (n_cols, len(nist_rows)))

    # squeeze the regression matrix by leaving only unique columns (reactions)
    unique_cols_S, col_mapping = LinearRegression.ColumnUnique(
        stoichiometric_matrix)
    logging.info("There are %d unique reactions" % len(col_mapping))
    unique_rids = set([nist_row.reaction.rid for nist_row in nist_rows
                       if nist_row.reaction.rid is not None])
    logging.info("Out of which %d have KEGG reaction IDs" % len(unique_rids))

    # for every unique column, calculate the average dG0_r of all the columns
    # that are the same reaction.

    # full_data_mat will contain these rows: dG0, dG0_tag, dG0 - E[dG0],
    # dG0_tag - E[dG0_tag], N
    # the averages are over the equivalence set of each reaction (i.e. the
    # average dG of all the rows in NIST with that same reaction).
    # 'N' is the index of the equivalence set the measurement belongs to.
    full_data_mat = np.matrix(np.zeros((5, n_cols)))
    full_data_mat[0, :] = np.matrix(data['dG0_r'])
    full_data_mat[1, :] = np.matrix(data['dG0_r_tag'])

    # unique_data_mat will contain these rows: E[dG0], E[dG0_tag],
    # std(dG0), std(dG0_tag), no. of measurements
    # there is exactly one column for each equivalence set (i.e. unique
    # reaction); the last row holds the number of times this unique reaction
    # appears in NIST.
    unique_data_mat = np.matrix(np.zeros((5, len(col_mapping))))
    unique_sparse_reactions = []
    unique_nist_row_representatives = []
    for i, col_indices in col_mapping.iteritems():
        col_vector = unique_cols_S[:, i]

        # convert this column of unique_cols_S to a sparse reaction
        sparse = {}
        for j in col_vector.nonzero()[0].flat:
            sparse[cids_to_estimate[j]] = unique_cols_S[j, i]
        reaction = Reaction(names=['NIST%03d' % i], sparse_reaction=sparse)
        unique_sparse_reactions.append(reaction)

        # keep one representative NIST row for this equivalence set
        unique_nist_row_representatives.append(nist_rows_final[col_indices[0]])

        # take the mean and std of the dG0_r of these rows
        sub_data_mat = full_data_mat[0:2, col_indices]
        unique_data_mat[0:2, i] = np.mean(sub_data_mat, 1)
        unique_data_mat[2:4, i] = np.std(sub_data_mat, 1)
        unique_data_mat[4, i] = sub_data_mat.shape[1]
        full_data_mat[4, col_indices] = i
        full_data_mat[2:4, col_indices] = sub_data_mat
        for k in col_indices:
            # subtract the mean from each row with this reaction
            full_data_mat[2:4, k] -= unique_data_mat[0:2, i]

    # write a table that lists the variances of each unique reaction
    # before and after the reverse transform
    self.WriteUniqueReactionReport(unique_sparse_reactions,
                                   unique_nist_row_representatives,
                                   unique_data_mat, full_data_mat)

    return unique_cols_S, unique_data_mat[0:1, :], cids_to_estimate

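
# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): ReverseTransform()
# collapses repeated measurements of the same reaction by uniquing the
# columns of the stoichiometric matrix and averaging the corresponding
# energies. A small numpy version of that collapse is sketched here (using
# exact equality of columns); the exact return convention of
# LinearRegression.ColumnUnique is an assumption, not taken from this file.
def _sketch_column_unique_and_average(S, b):
    """S: (n_compounds x n_measurements), b: (n_measurements,) values.
    Returns (S_unique, b_mean, groups) where identical columns of S are
    merged and b is averaged over each group of merged columns."""
    import numpy as np
    S = np.asarray(S, dtype=float)
    b = np.asarray(b, dtype=float).reshape(-1)
    groups = {}                                  # column tuple -> list of indices
    for j in range(S.shape[1]):
        groups.setdefault(tuple(S[:, j]), []).append(j)
    keys = sorted(groups.keys())
    S_unique = np.array([list(k) for k in keys]).T
    b_mean = np.array([np.mean(b[groups[k]]) for k in keys])
    return S_unique, b_mean, [groups[k] for k in keys]
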
def LinearRegression(self, S, obs_dG0_r, cids, cid2nH_nMg,
                     prior_thermodynamics=None):
    logging.info("Regression matrix is %d x %d" % (S.shape[0], S.shape[1]))

    cid2ref = dict((cid, 'PRC') for cid in cids)
    if prior_thermodynamics:
        # Normalize the contribution of compounds which have formation
        # energies given in the prior. Perform the regression only on the
        # residuals remaining after the normalization (note that the
        # stoichiometric matrix must also be trimmed).
        cid_index_prior = []
        dG0_prior = []
        for i, cid in enumerate(cids):
            nH, nMg = cid2nH_nMg[cid]
            try:
                pmap_prior = prior_thermodynamics.cid2PseudoisomerMap(cid)
            except MissingCompoundFormationEnergy:
                continue
            for p_nH, p_z, p_nMg, dG0 in pmap_prior.ToMatrix():
                if nH == p_nH and p_nMg == nMg:
                    cid_index_prior.append(i)
                    dG0_prior.append(dG0)
                    cid2ref[cid] = pmap_prior.GetRef(p_nH, p_z, p_nMg)
                    break

        S_prior = np.matrix(np.zeros((len(cids), len(cid_index_prior))))
        for j, i in enumerate(cid_index_prior):
            S_prior[i, j] = 1
        dG0_prior = np.matrix(dG0_prior)
        g, _ = LinearRegression.LeastSquares(S_prior, dG0_prior)
        P_C, P_L = LinearRegression.ColumnProjection(S_prior)
        prior_dG0_r = g * P_C * S
        new_obs_dG0_r = obs_dG0_r - prior_dG0_r
        new_S = P_L * S

        # Find all reactions in new_S which are completely zero. This means
        # that they are completely determined by the prior.
        zero_cols = (abs(new_S).sum(0) < 1e-10).nonzero()[1]
        rowdicts = []
        for j in zero_cols.flat:
            rowdict = {}
            rowdict['reaction'] = NistRegression.row2hypertext(S[:, j], cids)
            rowdict['|error|'] = abs(new_obs_dG0_r[0, j])
            rowdict['error'] = new_obs_dG0_r[0, j]
            rowdict['NIST'] = obs_dG0_r[0, j]
            rowdict['prior'] = prior_dG0_r[0, j]
            rowdicts.append(rowdict)
        rowdicts.sort(key=lambda x: x['|error|'], reverse=True)
        self.html_writer.write('</br><b>Alberty Errors</b>\n')
        self.html_writer.write_table(rowdicts,
                                     headers=['reaction', 'error',
                                              'NIST', 'prior'],
                                     decimal=1)

        est_dG0_f, _ = LinearRegression.LeastSquares(new_S, new_obs_dG0_r)
        for j, i in enumerate(cid_index_prior):
            est_dG0_f[0, i] = dG0_prior[0, j]
    else:
        est_dG0_f, _ = LinearRegression.LeastSquares(S, obs_dG0_r)

    est_dG0_r = est_dG0_f * S
    residuals = est_dG0_r - obs_dG0_r
    rmse = rms_flat(residuals.flat)
    logging.info("Regression results for reverse transformed data:")
    logging.info("N = %d, RMSE = %.1f" % (S.shape[1], rmse))
    self.html_writer.write('<p>RMSE = %.1f [kJ/mol]</p>\n' % rmse)

    rowdicts = []
    headers = ['#', 'Reaction',
               symbol_dr_G0 + ' (obs)',
               symbol_dr_G0 + ' (fit)',
               symbol_dr_G0 + ' (res)']
    for i in xrange(S.shape[1]):
        rowdict = {}
        rowdict['Reaction'] = NistRegression.row2hypertext(S[:, i], cids)
        rowdict[symbol_dr_G0 + ' (obs)'] = obs_dG0_r[0, i]
        rowdict[symbol_dr_G0 + ' (fit)'] = est_dG0_r[0, i]
        rowdict[symbol_dr_G0 + ' (res)'] = residuals[0, i]
        rowdicts.append(rowdict)
    rowdicts.sort(key=lambda x: abs(x[symbol_dr_G0 + ' (res)']), reverse=True)
    self.html_writer.write_table(rowdicts, headers, decimal=1)

    # copy the solution into the diss_tables of all the compounds,
    # and then generate their PseudoisomerMaps.
    for i, cid in enumerate(cids):
        nH, nMg = cid2nH_nMg[cid]
        diss_table = self.GetDissociation().GetDissociationTable(cid)
        z = diss_table.min_charge + (nH - diss_table.min_nH)
        diss_table.SetFormationEnergyByNumHydrogens(est_dG0_f[0, i], nH, nMg)
        pmap = diss_table.GetPseudoisomerMap(nH, nMg)
        pmap.SetRef(nH, z, nMg, cid2ref[cid])
        self.cid2pmap_dict[cid] = pmap

def main():
    kegg = Kegg.getInstance()
    prefix = '../res/prc_'

    fixed_cids = {}  # a dictionary from CID to pairs of (nH, dG0)

    # Alberty formation energies directly measured, linearly independent:
    fixed_cids[1] = (2, -237.19)     # H2O
    fixed_cids[9] = (1, -1096.1)     # HPO3(-2)
    fixed_cids[14] = (4, -79.31)     # NH4(+1)
    fixed_cids[59] = (0, -744.53)    # SO4(-2)
    fixed_cids[288] = (1, -586.77)   # HCO3(-1)

    # Alberty zeros:
    fixed_cids[3] = (26, 0.0)        # NAD(ox)
    fixed_cids[10] = (32, 0.0)       # CoA
    fixed_cids[127] = (30, 0.0)      # glutathione(ox)
    fixed_cids[376] = (28, 0.0)      # retinal(ox)

    # Directly measured values
    fixed_cids[4] = (27, 22.65)      # NAD(red) -- relative to NAD(ox)
    fixed_cids[212] = (13, -194.5)   # adenosine
    #fixed_cids[294] = (12, -409.2)  # inosine - linearly dependent on other 'anchors'

    # Alberty zeros which are not in NIST:
    #fixed_cids[524] = (0, 0.0)      # cytochrome c(ox)
    #fixed_cids[16] = (31, 0.0)      # FAD(ox)
    #fixed_cids[139] = (0, 0.0)      # ferredoxin(ox)
    #fixed_cids[61] = (19, 0.0)      # FMN(ox)
    #fixed_cids[343] = (0, 0.0)      # thioredoxin(ox)
    #fixed_cids[399] = (90, 0.0)     # ubiquinone(ox)

    public_db = SqliteDatabase("../data/public_data.sqlite")
    alberty = PsuedoisomerTableThermodynamics.FromDatabase(
        public_db, 'alberty_pseudoisomers', label=None, name='Alberty')
    alberty_cid2dG0 = {}
    alberty_cid2nH = {}
    for cid in alberty.get_all_cids():
        pmap = alberty.cid2PseudoisomerMap(cid)
        dG0, _dG0_tag, nH, _z, _nMg = pmap.GetMostAbundantPseudoisomer(
            pH=default_pH, I=default_I, pMg=default_pMg, T=default_T)
        alberty_cid2nH[cid] = nH
        alberty_cid2dG0[cid] = dG0

    if not os.path.exists(prefix + 'S.txt'):
        db = SqliteDatabase("../res/gibbs.sqlite")
        nist_regression = NistRegression(db)

        cid2nH = {}
        for cid in nist_regression.nist.GetAllCids():
            if cid in fixed_cids:
                cid2nH[cid] = fixed_cids[cid][0]
            elif cid in alberty_cid2nH:
                cid2nH[cid] = alberty_cid2nH[cid]
            else:
                tmp = nist_regression.dissociation.GetMostAbundantPseudoisomer(
                    cid, pH=default_pH, I=default_I, pMg=default_pMg,
                    T=default_T)
                if tmp is not None:
                    cid2nH[cid] = tmp[0]
                else:
                    logging.warning(
                        'The most abundant pseudoisomer of %s (C%05d) '
                        'cannot be resolved. Using nH = 0.'
                        % (kegg.cid2name(cid), cid))
                    cid2nH[cid] = 0

        #nist_regression.std_diff_threshold = 2.0  # the threshold over which to print an analysis of a reaction
        #nist_regression.nist.T_range = None  # (273.15 + 24, 273.15 + 40)
        S, dG0, cids = nist_regression.ReverseTransform(cid2nH=cid2nH)

        # export the raw data matrices to text files
        C = np.array([[cid, cid2nH.get(cid, 0)] for cid in cids])
        np.savetxt(prefix + 'CID.txt', C, fmt='%d', delimiter=',')
        np.savetxt(prefix + 'S.txt', S, fmt='%g', delimiter=',')
        np.savetxt(prefix + 'dG0.txt', dG0, fmt='%.2f', delimiter=',')
    else:
        C = np.loadtxt(prefix + 'CID.txt', delimiter=',')
        cids = [int(cid) for cid in C[:, 0]]
        cid2nH = {}
        for i, cid in enumerate(cids):
            cid2nH[cid] = int(C[i, 1])
        S = np.loadtxt(prefix + 'S.txt', delimiter=',')
        dG0 = np.loadtxt(prefix + 'dG0.txt', delimiter=',')
        dG0 = np.reshape(dG0, (dG0.shape[0], 1))

    html_writer = HtmlWriter('../res/regression_fast.html')
    html_writer.write("<h1>Pseudoisomeric Reactant Contributions</h1>\n")
    html_writer.write("<p>The stoichiometric matrix (S):")
    html_writer.insert_toggle(start_here=True)
    stoichiometric_matrix2html(html_writer, S, cids)
    html_writer.div_end()
    html_writer.write('</p>')

    index2value = {}
    # the stoichiometric matrix, extended with an elementary basis vector for
    # each of the fixed compounds
    S_extended = S
    for cid in fixed_cids.keys():
        i = cids.index(cid)
        e_i = np.zeros((1, len(cids)))
        e_i[0, i] = 1.0
        S_extended = np.vstack([S_extended, e_i])
        nH, dG0_fixed = fixed_cids[cid]
        index2value[i] = dG0_fixed

    x, _K = LinearRegression.LeastSquaresWithFixedPoints(S, dG0, index2value)
    cid2dG0 = {}
    for i, cid in enumerate(cids):
        cid2dG0[cid] = x[i]

    # Calculate the Kernel of the reduced stoichiometric matrix (after
    # removing the columns of the fixed compounds).
    cids_red = [cid for cid in cids if cid not in fixed_cids]
    index_red = [i for i in xrange(len(cids)) if i not in index2value]
    S_red = S[:, index_red]
    K_red = LinearRegression.Kernel(S_red)
    #print "Reduced Stoichiometric Matrix:"
    #print matrix2string(S_red, cids_red, kegg)
    #print '-' * 80

    # Find all CIDs that are completely determined and do not depend on any
    # free variable. In other words, all zero columns in K_red.
    dict_list = []
    determined_indices = np.where(np.sum(abs(K_red), 0) < 1e-10)[0]  # all zero-columns in K_red
    determined_cids = [cids_red[i] for i in determined_indices]
    plot_data = []
    for i, cid in enumerate(cids):
        d = {'CID': 'C%05d' % cid,
             'Compound': kegg.cid2name(cid),
             'nH': '%d' % cid2nH[cid],
             'dG0 (PRC)': '%.1f' % cid2dG0[cid]}
        if cid in alberty_cid2dG0:
            d['dG0 (Alberty)'] = '%.1f' % alberty_cid2dG0[cid]
            if cid not in fixed_cids:
                plot_data.append((alberty_cid2dG0[cid], cid2dG0[cid],
                                  kegg.cid2name(cid)))
        else:
            d['dG0 (Alberty)'] = ''

        if cid in fixed_cids:
            d['Depends on'] = 'anchored'
        elif cid in determined_cids:
            d['Depends on'] = 'fixed compounds'
        else:
            d['Depends on'] = 'kernel dimensions'

        dict_list.append(d)
    dict_list.sort(key=lambda x: (x['Depends on'], x['CID']))

    html_writer.write("<p>Formation energies determined by the linear constraints:")
    html_writer.insert_toggle(start_here=True)
    html_writer.write('<font size="1">')
    html_writer.write_table(dict_list,
                            headers=['#', 'Compound', 'CID', 'nH',
                                     'dG0 (PRC)', 'dG0 (Alberty)',
                                     'Depends on'])
    html_writer.write('</font>')
    html_writer.div_end()
    html_writer.write('</p>')

    # Plot a comparison between PRC and Alberty formation energies
    fig = plt.figure(figsize=(8, 8), dpi=80)
    plt.plot([x[0] for x in plot_data], [x[1] for x in plot_data], 'b.',
             figure=fig)
    for x, y, name in plot_data:
        plt.text(x, y, name, fontsize=6)
    plt.xlabel('Alberty $\Delta_f G^\circ$')
    plt.ylabel('PRC $\Delta_f G^\circ$')
    html_writer.write("<p>Plot comparing PRC and Alberty results:")
    html_writer.insert_toggle(start_here=True)
    html_writer.embed_matplotlib_figure(fig)
    html_writer.div_end()
    html_writer.write("</p>")

    K_sparse = SparseKernel(S_red).Solve()
    html_writer.write("<p>The sparse null-space of the reduced stoichiometric matrix:")
    html_writer.insert_toggle(start_here=True)
    stoichiometric_matrix2html(html_writer, K_sparse, cids_red)
    html_writer.div_end()
    html_writer.write("</p>")

    dict_list = []
    index2string_html = dict((i, "V<sub>%02d</sub>" % i)
                             for i in xrange(K_sparse.shape[0]))
    index2string = dict((i, "V%d" % i) for i in xrange(K_sparse.shape[0]))
    for i, cid in enumerate(cids_red):
        d = {}
        d['KEGG ID'] = '<a href="%s">C%05d</a>' % (kegg.cid2link(cid), cid)
        d['KEGG ID plain'] = 'C%05d' % cid
        d['Compound'] = kegg.cid2name(cid)
        d['nH'] = '%d' % cid2nH[cid]
        if cid in alberty_cid2dG0:
            d['dG0 (Alberty)'] = '%.1f' % alberty_cid2dG0[cid]
        else:
            d['dG0 (Alberty)'] = ''
        d['dG0 (PRC)'] = '%.1f' % cid2dG0[cid]
        d['dG0 (PRC) plain'] = '%.1f' % cid2dG0[cid]
        indic = np.where(abs(K_sparse[:, i]) > 1e-10, 1, 0).tolist()
        indic.reverse()
        d['order_key'] = indic
        if mlab.rms_flat(K_sparse[:, i]) > 1e-10:
            d['dG0 (PRC)'] += " + (" + vector2string(K_sparse[:, i],
                                                     index2string_html) + ")"
            d['dG0 (PRC) plain'] += " + (" + vector2string(K_sparse[:, i],
                                                           index2string) + ")"
        dict_list.append(d)
    dict_list.sort(key=lambda d: (d['order_key'], d['KEGG ID plain']))

    # Export the results to CSV
    csv_writer = csv.writer(open('../res/prc_results.csv', 'w'))
    csv_writer.writerow(['KEGG ID', 'Compound', 'nH',
                         'dG0 (PRC)', 'dG0 (Alberty)'])
    for d in dict_list:
        csv_writer.writerow([d['KEGG ID plain'], d['Compound'], d['nH'],
                             d['dG0 (PRC) plain'], d['dG0 (Alberty)']])

    html_writer.write("<p>All formation energies as a function of the free variables:")
    html_writer.insert_toggle(start_here=True)
    html_writer.write('<font size="1">')
    html_writer.write_table(dict_list,
                            headers=['#', 'KEGG ID', 'Compound', 'nH',
                                     'dG0 (PRC)', 'dG0 (Alberty)'])
    html_writer.write('</font>')
    html_writer.div_end()
    html_writer.write('</p>')

    fp = open('../res/prc_latex.txt', 'w')
    fp.write(latex.table2LaTeX(dict_list,
                               headers=['#', 'KEGG ID plain', 'Compound',
                                        'nH', 'dG0 (PRC) plain',
                                        'dG0 (Alberty)']))
    fp.close()

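
# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): the call to
# LinearRegression.LeastSquaresWithFixedPoints(S, dG0, index2value) in main()
# above solves S * x ~ dG0 while pinning some entries of x to given values.
# A minimal numpy version of that idea is sketched below (substitute the
# fixed values, then solve a smaller least-squares problem for the rest);
# the real routine's signature and return values are assumptions, not taken
# from this file.
def _sketch_least_squares_with_fixed_points(S, b, index2value):
    """S: (n_eqs x n_vars), b: (n_eqs,), index2value: {var index: fixed value}.
    Returns the full solution vector x with the fixed entries preserved."""
    import numpy as np
    S = np.asarray(S, dtype=float)
    b = np.asarray(b, dtype=float).reshape(-1)
    n_vars = S.shape[1]
    fixed = sorted(index2value.keys())
    free = [i for i in range(n_vars) if i not in index2value]
    # move the fixed columns to the right-hand side ...
    b_res = b - np.dot(S[:, fixed], np.array([index2value[i] for i in fixed]))
    # ... and solve a reduced least-squares problem for the free variables
    x_free = np.dot(np.linalg.pinv(S[:, free]), b_res)
    x = np.zeros(n_vars)
    x[fixed] = [index2value[i] for i in fixed]
    x[free] = x_free
    return x
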