def get_changed_bonds(rxn_smi):
    """Return the bond changes implied by an atom-mapped reaction SMILES.

    Args:
        rxn_smi: reaction SMILES of the form ``reactants>agents>products``.
            Every reactant atom that takes part in a bond is read through
            ``GetProp('molAtomMapNumber')``, so it must carry a map number.

    Returns:
        set of ``(map1, map2, order)`` tuples. ``map1``/``map2`` are the atom
        map numbers as strings (ordered by string comparison) and ``order`` is
        the bond order in the products as a float, or ``0.0`` when the bond
        was lost.
    """
    # Split once; index 0 holds the reactants and index 2 the products.
    parts = rxn_smi.split('>')
    reactants = Chem.MolFromSmiles(parts[0])
    products = Chem.MolFromSmiles(parts[2])

    # A set gives O(1) membership tests inside the reactant-bond loop.
    conserved_maps = {
        a.GetProp('molAtomMapNumber')
        for a in products.GetAtoms()
        if a.HasProp('molAtomMapNumber')
    }

    def _bond_key(bond):
        # Order the two map numbers so each bond has one canonical key.
        first, second = sorted([
            bond.GetBeginAtom().GetProp('molAtomMapNumber'),
            bond.GetEndAtom().GetProp('molAtomMapNumber'),
        ])
        return first, second

    # Reactant bonds with at least one atom that survives into the products.
    bonds_prev = {}
    for bond in reactants.GetBonds():
        key = _bond_key(bond)
        if key[0] not in conserved_maps and key[1] not in conserved_maps:
            continue
        bonds_prev[key] = bond.GetBondTypeAsDouble()

    bonds_new = {}
    for bond in products.GetBonds():
        bonds_new[_bond_key(bond)] = bond.GetBondTypeAsDouble()

    bond_changes = set()  # keep track of bond changes
    for key, old_order in bonds_prev.items():
        if key not in bonds_new:
            bond_changes.add((key[0], key[1], 0.0))  # lost bond
        elif old_order != bonds_new[key]:
            bond_changes.add((key[0], key[1], bonds_new[key]))  # changed bond
    for key, new_order in bonds_new.items():
        if key not in bonds_prev:
            bond_changes.add((key[0], key[1], new_order))  # new bond

    return bond_changes
def __init__(self, smiles=None, rdk=None, conv_enabled=False):
    """Constructor

    Keyword Arguments:
        smiles {str} -- SMILES representation of a molecule (default: {None})
        rdk {rdkit Mol} -- molecule as an RDKit object (default: {None})
        conv_enabled {bool} -- whether to set both smiles and graph
            arguments here or lazily defer until called (default: {False})

    Raises:
        ValueError -- if conv_enabled is set and neither a parseable smiles
            string nor a rdkit mol is provided
    """
    if conv_enabled:
        if isinstance(smiles, str):
            # Parsing doubles as the validity check for the SMILES string.
            rdk = Chem.MolFromSmiles(smiles)
            if rdk is None:
                # Raise explicitly: an assert would vanish under `python -O`
                # and the documented contract is ValueError.
                raise ValueError(f"Could not parse SMILES: {smiles!r}")
        elif rdk is not None:
            smiles = Chem.MolToSmiles(rdk)
        else:
            raise ValueError("Invalid arguments")
    self.smiles = smiles
    self.rdk = rdk
    self.graph = None  # should be obtained from rdk when needed
    self.synthesis_path = []  # list of Reactions
    self.begin_flag = True
def properties(mol):
    """Compute the eight descriptors needed for the QED score.

    Explicit hydrogens are stripped from ``mol`` first; the descriptors are
    returned as a ``QEDproperties`` tuple.

    Raises:
        ValueError: if ``mol`` is None.
    """
    if mol is None:
        raise ValueError('You need to provide a mol argument.')
    mol = Chem.RemoveHs(mol)

    mol_weight = rdmd._CalcMolWt(mol)
    alogp = Crippen.MolLogP(mol)

    # Total number of matches over every acceptor pattern that hits at all.
    n_acceptors = 0
    for pattern in Acceptors:
        if mol.HasSubstructMatch(pattern):
            n_acceptors += len(mol.GetSubstructMatches(pattern))

    n_donors = rdmd.CalcNumHBD(mol)
    polar_surface = MolSurf.TPSA(mol)
    n_rotatable = rdmd.CalcNumRotatableBonds(
        mol, rdmd.NumRotatableBondsOptions.Strict)

    # Rings left once the aliphatic-ring atoms are deleted from a copy.
    without_aliphatic = Chem.DeleteSubstructs(Chem.Mol(mol), AliphaticRings)
    n_aromatic = Chem.GetSSSR(without_aliphatic)

    # Each structural alert counts once, however many times it matches.
    n_alerts = 0
    for alert in StructuralAlerts:
        if mol.HasSubstructMatch(alert):
            n_alerts += 1

    qedProperties = QEDproperties(
        MW=mol_weight,
        ALOGP=alogp,
        HBA=n_acceptors,
        HBD=n_donors,
        PSA=polar_surface,
        ROTB=n_rotatable,
        AROM=n_aromatic,
        ALERTS=n_alerts,
    )
    # The replacement
    # AROM=Lipinski.NumAromaticRings(mol),
    # is not identical. The expression above tends to count more rings
    # N1C2=CC=CC=C2SC3=C1C=CC4=C3C=CC=C4
    # OC1=C(O)C=C2C(=C1)OC3=CC(=O)C(=CC3=C2C4=CC=CC=C4)O
    # CC(C)C1=CC2=C(C)C=CC2=C(C)C=C1 uses 2, should be 0 ?
    return qedProperties
def edit_mol(rmol, edits, tatoms):
    """Apply bond edits to ``rmol`` and return the resulting product SMILES.

    Args:
        rmol: atom-mapped reactant molecule (map numbers are 1-based).
        edits: iterable of ``(x, y, t, v)``; ``x``/``y`` are 0-based map
            indices, ``t`` the new bond order (``0`` deletes the bond) and
            ``v`` a score that is not used here.
        tatoms: set of 0-based map indices; only fragments containing at
            least one of them are kept.

    Returns:
        str: the kept fragments with map numbers cleared, as sorted SMILES
        joined by ``'.'`` (empty string when nothing survives).
    """
    editable = Chem.RWMol(rmol)
    for atom in editable.GetAtoms():
        atom.SetNumExplicitHs(0)

    # 0-based map index -> atom index in the reactant molecule.
    idx_of = {atom.GetAtomMapNum() - 1: atom.GetIdx()
              for atom in rmol.GetAtoms()}

    for x, y, t, _score in edits:
        begin, end = idx_of[x], idx_of[y]
        # Drop any existing bond before (optionally) adding the new order.
        if editable.GetBondBetweenAtoms(begin, end) is not None:
            editable.RemoveBond(begin, end)
        if t > 0:
            editable.AddBond(begin, end, BOND_FLOAT_TO_TYPE[t])

    # Round-trip each fragment through SMILES; unparsable ones are dropped.
    kept = []
    for frag_smiles in Chem.MolToSmiles(editable.GetMol()).split('.'):
        frag = Chem.MolFromSmiles(frag_smiles)
        if frag is None:
            continue
        frag_maps = {atom.GetAtomMapNum() - 1 for atom in frag.GetAtoms()}
        if len(frag_maps & tatoms) == 0:
            continue
        for atom in frag.GetAtoms():
            atom.SetAtomMapNum(0)
        kept.append(frag)

    return '.'.join(sorted(Chem.MolToSmiles(frag) for frag in kept))
def sanitize_smiles(smi, largest_fragment=False):
    """Standardize a SMILES string on a best-effort basis.

    Unparsable input is returned unchanged. If any standardization step
    raises, the molecule as it stood after the last successful step is
    written out.

    Args:
        smi: input SMILES string.
        largest_fragment: when true, keep only the largest fragment before
            neutralizing.

    Returns:
        str: SMILES of the (possibly partially) standardized molecule, or
        ``smi`` itself when it cannot be parsed.
    """
    parsed = Chem.MolFromSmiles(smi)
    if parsed is None:
        return smi

    cleaned = parsed
    try:
        # standardize functional group reps
        cleaned = standardizer.standardize(cleaned)
        if largest_fragment:
            # remove product counterions/salts/etc.
            cleaned = standardizer.largest_fragment(cleaned)
        # neutralize, e.g., carboxylic acids
        cleaned = standardizer.uncharge(cleaned)
    except Exception:
        # Deliberate best effort: fall through with whatever we have so far.
        pass
    return Chem.MolToSmiles(cleaned)
def get_bond_label(r, edits, max_natoms):
    """Build dense and sparse bond-change labels for one reaction.

    Args:
        r: reactant SMILES; only its atom count is used.
        edits: ``';'``-separated edits, each ``"a1-a2-bo"`` with 1-based atom
            numbers and a bond order that is a key of ``bo_to_index``.
        max_natoms: padded number of atoms per molecule in the batch.

    Returns:
        (labels, sp_labels): ``labels`` is a flat array over
        ``(i, j, bond order)`` with ``INVALID_BOND`` on the diagonal and on
        padding positions; ``sp_labels`` lists the flat indices whose label
        is 1.
    """
    n_atoms = Chem.MolFromSmiles(r).GetNumAtoms()

    # Symmetric indicator tensor of the edited (atom, atom, bond order).
    rmap = np.zeros((max_natoms, max_natoms, nbos))
    for edit in edits.split(';'):
        a1, a2, bo = edit.split('-')
        first, second = int(a1) - 1, int(a2) - 1
        lo, hi = min(first, second), max(first, second)
        order_idx = bo_to_index[float(bo)]
        rmap[lo, hi, order_idx] = 1
        rmap[hi, lo, order_idx] = 1

    n_orders = len(bo_to_index)
    labels = []
    sp_labels = []
    for i in range(max_natoms):
        for j in range(max_natoms):
            if i == j or i >= n_atoms or j >= n_atoms:
                # mask
                labels.extend([INVALID_BOND] * n_orders)
                continue
            for k in range(n_orders):
                value = rmap[i, j, k]
                labels.append(value)
                if value == 1:
                    # TODO: check if this is consistent with how TF does flattening
                    sp_labels.append(i * max_natoms * nbos + j * nbos + k)
    return np.array(labels), sp_labels
def smiles2graph(smiles, idxfunc=lambda x: x.GetIdx()):
    """Convert a SMILES string into atom/bond feature and neighbor arrays.

    Args:
        smiles: molecule to featurize.
        idxfunc: maps an RDKit atom to its row index in the output arrays.

    Returns:
        (fatoms, fbonds, atom_nb, bond_nb, num_nbs): atom features, bond
        features, per-atom neighbor atom indices, per-atom neighbor bond
        indices, and per-atom neighbor counts.

    Raises:
        ValueError: if the SMILES cannot be parsed.
        Exception: (carrying the SMILES) if ``idxfunc`` yields an index
            beyond the atom count or an atom exceeds ``max_nb`` neighbors.
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        raise ValueError("Could not parse smiles string:", smiles)

    n_atoms = mol.GetNumAtoms()
    # Keep at least one bond row so the bond array is never empty.
    n_bonds = max(mol.GetNumBonds(), 1)

    fatoms = np.zeros((n_atoms, atom_fdim))
    fbonds = np.zeros((n_bonds, bond_fdim))
    atom_nb = np.zeros((n_atoms, max_nb), dtype=np.int32)
    bond_nb = np.zeros((n_atoms, max_nb), dtype=np.int32)
    num_nbs = np.zeros((n_atoms, ), dtype=np.int32)

    for atom in mol.GetAtoms():
        row = idxfunc(atom)
        if row >= n_atoms:
            raise Exception(smiles)
        fatoms[row] = atom_features(atom)

    for bond in mol.GetBonds():
        a1 = idxfunc(bond.GetBeginAtom())
        a2 = idxfunc(bond.GetEndAtom())
        bond_idx = bond.GetIdx()
        slot1, slot2 = num_nbs[a1], num_nbs[a2]
        if slot1 == max_nb or slot2 == max_nb:
            raise Exception(smiles)
        # Register the bond in the next free neighbor slot of both atoms.
        atom_nb[a1, slot1] = a2
        atom_nb[a2, slot2] = a1
        bond_nb[a1, slot1] = bond_idx
        bond_nb[a2, slot2] = bond_idx
        num_nbs[a1] += 1
        num_nbs[a2] += 1
        fbonds[bond_idx] = bond_features(bond)

    return fatoms, fbonds, atom_nb, bond_nb, num_nbs
def get_product_smiles(rmol, edits, tatoms):
    """Return product SMILES for ``edits``, retrying once after kekulization.

    The edits are first applied to ``rmol`` as given. If that produces an
    empty string, ``rmol`` is kekulized in place and the edits are applied
    again; if kekulization fails, the empty result is returned.
    """
    first_try = edit_mol(rmol, edits, tatoms)
    if len(first_try) != 0:
        return first_try

    try:
        Chem.Kekulize(rmol)
    except Exception:
        return first_try
    return edit_mol(rmol, edits, tatoms)
def processMols(mols):
    """Print a tab-separated table of SMILES, name and score for ``mols``.

    ``None`` entries are skipped. Each molecule must carry a ``_Name``
    property.
    """
    print('smiles\tName\tsa_score')
    for mol in mols:
        if mol is None:
            continue
        score = calculateScore(mol)
        smiles = Chem.MolToSmiles(mol)
        print("\t".join((smiles, mol.GetProp('_Name'), "%3f" % score)))
def graph2mol_igraph(graph):
    """Build an RDKit molecule from a graph with atom and bond attributes.

    Vertices must expose an ``"AtomicNum"`` attribute; edges a ``"BondType"``
    attribute that is a key of ``BOND_FLOAT_TO_TYPE``.
    """
    builder = Chem.rdchem.RWMol()
    for vertex in graph.vs():
        builder.AddAtom(Chem.Atom(int(vertex["AtomicNum"])))
    for edge in graph.es():
        bond_type = BOND_FLOAT_TO_TYPE[edge["BondType"]]
        builder.AddBond(edge.source, edge.target, bond_type)
    return builder.GetMol()
def get_feature_batch(r_list):
    """Return stacked binary features for a batch of reactant SMILES.

    Every entry is padded to the atom count of the largest molecule in
    ``r_list``.
    """
    # Largest atom count in the batch (0 for an empty batch).
    max_natoms = max(
        (Chem.MolFromSmiles(r).GetNumAtoms() for r in r_list), default=0)
    return np.array([get_bin_feature(r, max_natoms) for r in r_list])
def get_bin_feature(r, max_natoms):
    '''
    Describe every ordered atom pair of the reactants ``r``: the bond between
    the two atoms (if any) and whether they sit in the same molecule. It is
    used in the global attention mechanism.

    Atoms are indexed by ``molAtomMapNumber - 1``. Returns an array of shape
    ``(max_natoms, max_natoms, binary_fdim)``; diagonal and padding entries
    are all zeros.
    '''
    fragments = r.split('.')

    # Map index -> index of the '.'-separated fragment holding that atom.
    comp = {}
    for frag_idx, frag_smiles in enumerate(fragments):
        frag_mol = Chem.MolFromSmiles(frag_smiles)
        for atom in frag_mol.GetAtoms():
            comp[atom.GetIntProp('molAtomMapNumber') - 1] = frag_idx
    n_comp = len(fragments)

    rmol = Chem.MolFromSmiles(r)
    n_atoms = rmol.GetNumAtoms()

    # Both orientations of each bond point at the same bond object.
    bond_map = {}
    for bond in rmol.GetBonds():
        begin = bond.GetBeginAtom().GetIntProp('molAtomMapNumber') - 1
        end = bond.GetEndAtom().GetIntProp('molAtomMapNumber') - 1
        bond_map[(begin, end)] = bond
        bond_map[(end, begin)] = bond

    single_flag = 1.0 if n_comp == 1 else 0.0
    multi_flag = 1.0 if n_comp > 1 else 0.0

    features = []
    for i in range(max_natoms):
        for j in range(max_natoms):
            f = np.zeros((binary_fdim, ))
            if i < n_atoms and j < n_atoms and i != j:
                if (i, j) in bond_map:
                    f[1:1 + bond_fdim] = bond_features(bond_map[(i, j)])
                else:
                    f[0] = 1.0  # no bond between the pair
                same_molecule = comp[i] == comp[j]
                f[-4] = 0.0 if same_molecule else 1.0
                f[-3] = 1.0 if same_molecule else 0.0
                f[-2] = single_flag
                f[-1] = multi_flag
            features.append(f)
    return np.vstack(features).reshape((max_natoms, max_natoms, binary_fdim))
def _test_sas(self):
    """Smoke-run a RandomExplorer that optimizes the SA score.

    Prints the score of ethane, runs the explorer for 10 steps on a small
    pool, and prints the timing, final pool and best synthesis path.
    """
    def sas_func(mol):
        return calculateSAScore(Chem.MolFromSmiles(mol.smiles))

    print(sas_func(Molecule("CC")))

    pool_smiles = ["CC", "O=C=O", "C#N", "CCN(CC)CC", "CC(=O)O",
                   "C1CCCCC1", "c1ccccc1"]
    test_pool = [Molecule(smi) for smi in pool_smiles]
    exp = RandomExplorer(sas_func, initial_pool=test_pool)

    print("Starting SA score optimization")
    started = time()
    exp.run(10)
    #check
    elapsed = time() - started
    print("Completed SA score optimization, time elapsed: %.3fs" % (elapsed))
    print(exp.pool)

    best = exp.get_best(1)[0]
    print(best.get_synthesis_path())
def get_all_batch(re_list):
    """Return features and labels for a batch of (reactants, edits) pairs.

    Args:
        re_list: iterable of ``(reactant_smiles, edits_string)`` pairs.

    Returns:
        (features, labels, sp_labels): stacked pair features, stacked dense
        labels, and a list with one sparse-label list per example. Everything
        is padded to the largest atom count in the batch.
    """
    pairs = []
    max_natoms = 0
    for r, e in re_list:
        n_atoms = Chem.MolFromSmiles(r).GetNumAtoms()
        pairs.append((r, e))
        max_natoms = max(max_natoms, n_atoms)

    features, labels, sp_labels = [], [], []
    for r, e in pairs:
        dense, sparse = get_bond_label(r, e, max_natoms)
        features.append(get_bin_feature(r, max_natoms))
        labels.append(dense)
        sp_labels.append(sparse)
    return np.array(features), np.array(labels), sp_labels
def get_graph_data_for_distance_computation(mol):
    """Collect the graph description of a molecule used by distance code.

    Args:
        mol: a Molecule, or a string that is wrapped in a Molecule first.

    Returns:
        Namespace with the hydrogen-added RDKit molecule, its adjacency
        matrix, bond list and bond types, per-atom indices / atomic numbers /
        symbols / masses, the atom count, and the per-atom neighbor and
        bond-type-count summaries.
    """
    if isinstance(mol, str):
        from mols.molecule import Molecule
        mol = Molecule(mol)

    # Work on the molecule with explicit hydrogens.
    rdk_mol = Chem.AddHs(mol.to_rdkit())
    adj_matrix = Chem.rdmolops.GetAdjacencyMatrix(rdk_mol)

    bonds = [(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
             for bond in rdk_mol.GetBonds()]
    bond_types = [rdk_mol.GetBondBetweenAtoms(begin, end).GetBondType()
                  for begin, end in bonds]

    atom_idxs = list(range(len(rdk_mol.GetAtoms())))
    atomic_numbers = []
    atomic_symbols = []
    atomic_masses = []
    for idx in atom_idxs:
        atomic_numbers.append(rdk_mol.GetAtomWithIdx(idx).GetAtomicNum())
    for idx in atom_idxs:
        atomic_symbols.append(rdk_mol.GetAtomWithIdx(idx).GetSymbol())
    for idx in atom_idxs:
        atomic_masses.append(rdk_mol.GetAtomWithIdx(idx).GetMass())
    num_atoms = len(atom_idxs)

    bonds_of_each_atom = []
    for idx in range(num_atoms):
        bonds_of_each_atom.append(
            get_neighbors_and_bond_types(idx, bonds, atomic_symbols,
                                         bond_types))
    bond_type_counts_of_each_atom = [
        get_bond_type_counts(atom_bonds) for atom_bonds in bonds_of_each_atom
    ]

    return Namespace(
        rdk_mol=rdk_mol,
        adj_matrix=adj_matrix,
        bonds=bonds,
        bond_types=bond_types,
        atom_idxs=atom_idxs,
        atomic_numbers=atomic_numbers,
        atomic_symbols=atomic_symbols,
        atomic_masses=atomic_masses,
        num_atoms=num_atoms,
        bonds_of_each_atom=bonds_of_each_atom,
        bond_type_counts_of_each_atom=bond_type_counts_of_each_atom,
    )
def to_rdkit(self):
    """Return the molecule as an RDKit object, parsing it on first use.

    The parsed molecule is cached on ``self.rdk``, so the SMILES is only
    decoded when no RDKit object is stored yet.

    Returns:
        rdkit.Mol -- molecule in RDKit format

    Raises:
        ValueError -- if the SMILES cannot be decoded into a molecule.
    """
    if self.rdk is not None:
        return self.rdk

    self.rdk = Chem.MolFromSmiles(self.smiles)
    if self.rdk is None:
        raise ValueError(f"Molecule {self.smiles} is not valid.")
    return self.rdk
def smiles2graph(rsmiles,
                 psmiles,
                 core_bonds,
                 gold_bonds,
                 cutoff=500,
                 idxfunc=lambda x: x.GetIntProp('molAtomMapNumber') - 1,
                 core_size=20,
                 kmax=5,
                 return_found=False,
                 testing=False):
    '''This is the function that takes reactants, a true product (when defined), and the candidate bonds
    to generate all of the candidate products according to some bounds on the enumeration

    Args:
        rsmiles: atom-mapped reactant SMILES.
        psmiles: product SMILES; only parsed when ``testing`` is False.
        core_bonds: candidate bond changes as ``(x, y, t, v)`` tuples, where
            ``t`` is the bond order and ``v`` the candidate score. Only the
            first ``core_size`` that differ from the reactants are used.
        gold_bonds: true bond changes as a set of ``(x, y, t)`` tuples; only
            used when ``testing`` is False.
        cutoff: maximum number of candidate configurations kept.
        idxfunc: maps an RDKit atom to its array index.
        core_size: number of candidate bonds kept for enumeration.
        kmax: largest number of simultaneous bond changes enumerated.
        return_found: also return whether the true outcome was enumerated.
        testing: skip everything that needs the product / gold bonds.

    Returns:
        ``(arrays, core_configs)`` or, with ``return_found``,
        ``(arrays, core_configs, found_true)``. ``arrays`` is the tuple
        (atom features repeated per batch entry, bond features, packed atom
        neighbors, packed bond neighbors, neighbor counts, per-candidate score
        sums) followed by ``labels`` when not testing. The first batch entry
        is the unmodified reactants.

    Raises:
        ValueError: if ``rsmiles`` (or ``psmiles`` when not testing) cannot
            be parsed.

    NOTE(review): ``labels`` is only assigned when ``testing`` is False, but
    the ``return_found`` branch always references it, so
    ``testing=True, return_found=True`` fails at the return statement.
    '''
    mol = Chem.MolFromSmiles(rsmiles)
    if not mol:
        raise ValueError("Could not parse smiles string:", rsmiles)
    if not testing:
        pmol = Chem.MolFromSmiles(psmiles)
        if not pmol:
            raise ValueError("Could not parse smiles string:", psmiles)

    n_atoms = mol.GetNumAtoms()
    # At least one bond row is always allocated.
    n_bonds = max(mol.GetNumBonds(), 1)
    fatoms = np.zeros((n_atoms, atom_fdim))
    fbonds = np.zeros((n_bonds, bond_fdim))
    atom_nb = np.zeros((n_atoms, max_nb), dtype=np.int32)
    bond_nb = np.zeros((n_atoms, max_nb), dtype=np.int32)
    num_nbs = np.zeros((n_atoms, ), dtype=np.int32)
    # "raw" neighbor tables leave out reactant bonds that a candidate may
    # delete; they are the starting point for every candidate below.
    raw_atom_nb = np.zeros((n_atoms, max_nb), dtype=np.int32)
    raw_bond_nb = np.zeros((n_atoms, max_nb), dtype=np.int32)
    raw_num_nbs = np.zeros((n_atoms, ), dtype=np.int32)
    free_vals = np.zeros((n_atoms, ))
    # NOTE(review): pfree_vals is filled below but never read in this function.
    pfree_vals = np.zeros((n_atoms, ))

    # Per-atom element flags consumed by the valence rules in check_if_valid.
    is_c2_of_pyridine = np.zeros((n_atoms, ), dtype=bool)
    is_c = np.zeros((n_atoms, ), dtype=bool)
    is_p = np.zeros((n_atoms, ), dtype=bool)
    is_s = np.zeros((n_atoms, ), dtype=bool)
    is_o = np.zeros((n_atoms, ), dtype=bool)
    is_n = np.zeros((n_atoms, ), dtype=bool)

    # gbonds = {(x,y):0 for x,y in core_bonds}

    #Feature Extraction
    for atom in mol.GetAtoms():
        idx = idxfunc(atom)
        fatoms[idx] = atom_features(atom)
        free_vals[idx] += atom.GetTotalNumHs() + abs(atom.GetFormalCharge())

        # TODO: review these rules
        # Aromatic carbon next to an aromatic nitrogen can get a carbonyl b/c stupid bookkeeping of hydroxypyridines
        if atom.GetAtomicNum() == 6:
            is_c[idx] = True
            if atom.GetIsAromatic():
                for nbr in atom.GetNeighbors():
                    if nbr.GetAtomicNum() == 7 and nbr.GetDegree() == 2:
                        is_c2_of_pyridine[idx] = True
                        break
        # Nitrogen should be allowed to become positively charged
        elif atom.GetAtomicNum() == 7:
            free_vals[idx] += 1 - atom.GetFormalCharge()
            is_n[idx] = True
        # Phosphorous can form a phosphonium
        elif atom.GetAtomicNum() == 15:
            free_vals[idx] += 1 - atom.GetFormalCharge()
            is_p[idx] = True
        elif atom.GetAtomicNum() == 8:
            is_o[idx] = True
        elif atom.GetAtomicNum() == 16:
            is_s[idx] = True

    # special information needed for valence filtering
    if not testing:
        # tatoms: indices of every atom that takes part in a product bond.
        tatoms = set()
        #Calculate free slots for each atom in product
        for bond in pmol.GetBonds():
            a1 = idxfunc(bond.GetBeginAtom())
            a2 = idxfunc(bond.GetEndAtom())
            t = bond_types.index(bond.GetBondType()) + 1
            a1, a2 = min(a1, a2), max(a1, a2)
            tatoms.add(a1)
            tatoms.add(a2)
            if (a1, a2) in core_bonds:
                # gbonds[(a1,a2)] = t
                tval = t if t < 4 else 1.5
                pfree_vals[a1] += tval
                pfree_vals[a2] += tval

    # NOTE(review): rbonds is filled below but never read in this function.
    rbonds = {}
    rbond_vals = {}  # bond orders
    ring_bonds = set()
    #Calculate free slots for each atom in reactant
    for bond in mol.GetBonds():
        idx = bond.GetIdx()
        a1 = idxfunc(bond.GetBeginAtom())
        a2 = idxfunc(bond.GetEndAtom())
        t = bond_types.index(bond.GetBondType())
        a1, a2 = min(a1, a2), max(a1, a2)
        # Index 3 of bond_types is valued as order 1.5; others as index + 1.
        tval = t + 1 if t < 3 else 1.5
        rbonds[(a1, a2)] = t + 1
        rbond_vals[(a1, a2)] = tval
        if (a1, a2) in core_bonds:
            free_vals[a1] += tval
            free_vals[a2] += tval
        if bond.IsInRing():
            ring_bonds.add((a1, a2))

    # Get all possible core configurations - NEW IN DIRECT VERSION
    from itertools import combinations
    core_configs = [
    ]  # will be list of lists of (x, y, t, v) tuples, where t is the bond order and v is CoreFinder score

    # print('rbond_vals:')
    # print(rbond_vals)

    # Filter out core bonds that exactly match reactants
    prev_len = len(core_bonds)  # only used by the commented-out print below
    core_bonds = [(x, y, t, v) for (x, y, t, v) in core_bonds
                  if ((x, y) not in rbond_vals) or (rbond_vals[(x, y)] != t)]
    # print('{}/{} core bonds kept after filtering existing bonds'.format(prev_len, len(core_bonds)))

    # Pare down to top-core_size only
    core_bonds = core_bonds[:core_size]

    # Helper function to check if a combination is connected - this helps the number of valid combinations
    # Two candidate bonds are adjacent when they share an atom.
    core_bonds_adj = np.eye(len(core_bonds), dtype=bool)
    for i in range(len(core_bonds)):
        a1, b1, t1, v1 = core_bonds[i]
        for j in range(i, len(core_bonds)):
            a2, b2, t2, v2 = core_bonds[j]
            if a1 == a2 or a1 == b2 or b1 == a2 or b1 == b2:
                core_bonds_adj[i, j] = core_bonds_adj[j, i] = True
    # print(core_bonds)
    # print('Calculated core bonds adj matrix: {}'.format(core_bonds_adj * 1.0))

    def check_if_connected(combo_i):
        '''Checks if a set of candidate edits (by indeces) are all connected'''
        if len(combo_i) == 1:
            return True  # only one change, always connected
        # Raise the sub-adjacency matrix (self-loops included) to the power
        # len - 1 and require every entry to be truthy.
        temp_adj_pow = np.linalg.matrix_power(
            core_bonds_adj[combo_i, :][:, combo_i],
            len(combo_i) - 1)
        return np.all(temp_adj_pow)

    # Helper function to check if a combiation is valid
    def check_if_valid(bond_change_combo):
        force_even_parity = np.zeros((n_atoms, ), dtype=bool)
        force_odd_parity = np.zeros((n_atoms, ), dtype=bool)
        seen = defaultdict(lambda: False)
        # Work on a copy so each combination starts from the same budget.
        free_vals_temp = free_vals.copy()
        for x, y, t, v in bond_change_combo:
            x, y = tuple(sorted([x, y]))
            if seen[(x, y)]:
                # print('already seen this bond in the list of cand changes')
                return False  # can't have two distinct bond change types in same combo
            seen[(x, y)] = True

            # TODO: review these valence rules
            # Special rules:
            # - if phosphorous or sulfur, don't count formation of =O toward valence but require odd/even
            # - if c2 carbon in a pyridine ring, let it get a =O
            tx = ty = t
            if t == 2:
                if is_o[x]:
                    if is_c2_of_pyridine[y]:
                        ty = 1.  # pretend it's just a hydroxylation for the sake of valence
                    elif is_p[y]:
                        ty = 0.  # don't count toward valence
                        force_odd_parity[
                            y] = True  # but require odd valence parity
                    elif is_s[y]:
                        ty = 0.
                        force_even_parity[y] = True
                elif is_o[y]:
                    if is_c2_of_pyridine[x]:
                        tx = 1.
                    elif is_p[x]:
                        tx = 0.
                        force_odd_parity[x] = True
                    elif is_s[x]:
                        tx = 0.
                        force_even_parity[x] = True
                elif is_n[x] and is_p[y]:
                    ty = 0.
                    force_odd_parity[y] = True
                elif is_n[y] and is_p[x]:
                    tx = 0.
                    force_odd_parity[x] = True
                elif is_p[x] and is_c[y]:
                    tx = 0.
                    force_odd_parity[x] = True
                elif is_p[y] and is_c[x]:
                    ty = 0.
                    force_odd_parity[y] = True

            # Release the existing bond's order (if any), consume the new one.
            if (x, y) in rbond_vals:
                free_vals_temp[x] += rbond_vals[(x, y)] - tx
                free_vals_temp[y] += rbond_vals[(x, y)] - ty
            else:
                free_vals_temp[x] += -tx
                free_vals_temp[y] += -ty

        # too many connections? sulfur valence not even? phosphorous valence not odd?
        if any(free_vals_temp < 0) \
                or any(aval % 2 != 0 for aval in free_vals_temp[force_even_parity]) \
                or any(aval % 2 != 1 for aval in free_vals_temp[force_odd_parity]):
            # print('invalid valence?')
            # print(free_vals_temp)
            return False
        return True

    # N choose k combinatorics
    # up to 4 bond changes at once - only 0.19% of train examples have 5 bonds changed, we can take the hit...
    core_bonds_i = range(len(core_bonds))
    for k in range(1, kmax + 1):
        for bond_change_combo_i in combinations(core_bonds_i, k):
            # Check if connected
            if not check_if_connected(bond_change_combo_i):
                # print('This combination is not connected!')
                continue
            bond_change_combo = [core_bonds[i] for i in bond_change_combo_i]
            if check_if_valid(bond_change_combo):
                core_configs.append(bond_change_combo)
    # print('Found a total of {} core configs that seem valid'.format(len(core_configs)))

    if not testing:
        random.shuffle(core_configs)
        idx = -1
        for i, cand_bonds in enumerate(core_configs):
            if set([(x, y, t) for (x, y, t, v) in cand_bonds]) == gold_bonds:
                idx = i
                break

        # If we are training and did not find the true outcome, make sure it is the first entry
        if idx == -1:
            # print('Did not find true outcome')
            found_true = False
            core_configs = [[(x, y, t, 0.0)
                             for (x, y, t) in gold_bonds]] + core_configs
        else:
            # print('Found true outcome')
            found_true = True
            core_configs[0], core_configs[idx] = core_configs[
                idx], core_configs[0]  # swap order so true is first
    else:
        found_true = False

    if not testing:
        # If it is possible to recover the true smiles from the set of bonds using the edit_mol method,
        # remove duplicates from the list by converting each candidate into a smiles string
        # note: get_product_smiles is HIGHLY imperfect, but that's not a huge deal. training tries to pick the
        # right bonds. The evaluation script has a more robust function to get product_smiles
        smiles0 = get_product_smiles(mol, core_configs[0], tatoms)
        if len(smiles0) > 0:  #
            cand_smiles = set([smiles0])
            new_core_configs = [core_configs[0]]
            for core_conf in core_configs[1:]:
                smiles = get_product_smiles(mol, core_conf, tatoms)
                # print('candidate smiles: {}'.format(smiles))
                if smiles in cand_smiles or len(smiles) == 0:
                    continue
                cand_smiles.add(smiles)
                new_core_configs.append(core_conf)
            core_configs = new_core_configs
        else:
            print('\nwarning! could not recover true smiles from gbonds: {}'.
                  format(psmiles))
            print('{} {}'.format(rsmiles, gold_bonds))

    # print('After removing duplicates, {} core configs'.format(len(core_configs)))
    core_configs = core_configs[:cutoff]

    # One batch entry per candidate plus one for the reactants.
    n_batch = len(core_configs) + 1
    if not testing:
        # The true outcome was moved to position 0 above.
        labels = np.zeros((n_batch - 1, ))
        labels[0] = 1

    # Calculate information that is the same for all candidates; do small updates based on specific changes later
    pending_reactant_neighbors = [
    ]  # reactant neighbors that *might* be over-ridden
    core_bonds_noScore = [(x, y, t) for (x, y, t, z) in core_bonds]
    for bond in mol.GetBonds():
        idx = bond.GetIdx()
        a1 = idxfunc(bond.GetBeginAtom())
        a2 = idxfunc(bond.GetEndAtom())
        a1, a2 = min(a1, a2), max(a1, a2)
        if (
                a1, a2, 0.0
        ) not in core_bonds_noScore:  # are a1 and a2 guaranteed to be neighbors?
            raw_atom_nb[a1, raw_num_nbs[a1]] = a2
            raw_atom_nb[a2, raw_num_nbs[a2]] = a1
            raw_bond_nb[a1, raw_num_nbs[a1]] = idx
            raw_bond_nb[a2, raw_num_nbs[a2]] = idx
            raw_num_nbs[a1] += 1
            raw_num_nbs[a2] += 1
        else:
            # Some candidate deletes this bond; it is re-added per candidate
            # below unless that candidate edits it.
            pending_reactant_neighbors.append(
                (a1, a2, bond.GetBondTypeAsDouble()))

        # Reactants have this bond...
        atom_nb[a1, num_nbs[a1]] = a2
        atom_nb[a2, num_nbs[a2]] = a1
        bond_nb[a1, num_nbs[a1]] = idx
        bond_nb[a2, num_nbs[a2]] = idx
        num_nbs[a1] += 1
        num_nbs[a2] += 1
        fbonds[idx] = bond_features(bond)

    # print('What is core_bonds here?: {}'.format(core_bonds))
    if not testing:
        num_newbonds = max(
            len(gold_bonds), len(core_bonds)
        ) * 2 + 1  # CC fixed in case where core_bonds isn't large enough
    else:
        num_newbonds = len(core_bonds) * 2 + 1
    # Grow the bond feature table so each candidate can append its new bonds.
    new_fbonds = np.zeros(
        (n_bonds + num_newbonds + len(pending_reactant_neighbors),
         bond_fdim))  # CC added + len(pending_reactant_neighbors)
    new_fbonds[:n_bonds, :] = fbonds
    fbonds = new_fbonds
    batch_fbonds, batch_anb, batch_bnb, batch_nbs = [fbonds], [atom_nb], [
        bond_nb
    ], [num_nbs]  # first entry is reactants
    batch_corebias = []
    # NOTE(review): this loop rebinds the name core_bonds to each candidate
    # list, and the append below mutates the lists stored in core_configs,
    # which are returned to the caller.
    for core_bonds in core_configs:
        atom_nb2 = np.copy(raw_atom_nb)
        bond_nb2 = np.copy(raw_bond_nb)
        num_nbs2 = np.copy(raw_num_nbs)
        fbonds2 = np.copy(fbonds)
        n_bonds2 = n_bonds + 1

        # Add back reactant bonds?
        core_bonds_nobo = [(x, y) for (x, y, t, v) in core_bonds]
        for (x, y, t) in pending_reactant_neighbors:
            if (x, y) not in core_bonds_nobo:
                core_bonds.append((x, y, t, 0.0))

        for x, y, t, v in core_bonds:  # add new bond features to the "default" reactant ones
            if t == 0:
                continue
            atom_nb2[x, num_nbs2[x]] = y
            atom_nb2[y, num_nbs2[y]] = x
            bond_nb2[x, num_nbs2[x]] = n_bonds2
            bond_nb2[y, num_nbs2[y]] = n_bonds2
            num_nbs2[x] += 1
            num_nbs2[y] += 1
            fbonds2[n_bonds2] = onek_encoding_unk(t, [1.0, 2.0, 3.0, 1.5, -1])
            if (x, y) in ring_bonds:
                fbonds2[n_bonds2][4] = 1
            n_bonds2 += 1

        batch_fbonds.append(fbonds2)
        batch_anb.append(atom_nb2)
        batch_bnb.append(bond_nb2)
        batch_nbs.append(num_nbs2)
        batch_corebias.append(sum([v for (x, y, t, v) in core_bonds]))
        # TODO: change atom features for each candidate? Maybe update degree at least

    if return_found:
        return (np.array([fatoms] * n_batch), np.array(batch_fbonds),
                packnb(batch_anb), packnb(batch_bnb), np.array(batch_nbs),
                np.array(batch_corebias), labels), core_configs, found_true
    if not testing:
        return (np.array([fatoms] * n_batch), np.array(batch_fbonds),
                packnb(batch_anb), packnb(batch_bnb), np.array(batch_nbs),
                np.array(batch_corebias), labels), core_configs
    return (np.array([fatoms] * n_batch), np.array(batch_fbonds),
            packnb(batch_anb), packnb(batch_bnb), np.array(batch_nbs),
            np.array(batch_corebias)), core_configs
def to_smiles(self):
    """Return the SMILES string of the molecule, deriving it on first use.

    When no SMILES is stored yet, it is generated from ``self.rdk`` and
    cached on ``self.smiles``.

    Returns:
        str -- SMILES representation of the molecule
    """
    if self.smiles is None:
        self.smiles = Chem.MolToSmiles(self.rdk)
    return self.smiles
try: rank = [] n, top1, top2, top3, top5, gfound = 0, 0, 0, 0, 0, 0 top1_sani, top2_sani, top3_sani, top5_sani, gfound_sani = 0, 0, 0, 0, 0 for line in fpred: thisrow = [] line = line.strip('\r\n |') gold = fgold.readline() rex, gedits = gold.split() r, _, p = rex.split('>') if opts.singleonly and '.' in p: continue rmol = Chem.MolFromSmiles(r) pmol = Chem.MolFromSmiles(p) thisrow.append(r) thisrow.append(p) # Save pbond information pbonds = {} for bond in pmol.GetBonds(): a1 = idxfunc(bond.GetBeginAtom()) a2 = idxfunc(bond.GetEndAtom()) t = bond_types.index(bond.GetBondType()) pbonds[(a1, a2)] = pbonds[(a2, a1)] = t + 1 for atom in pmol.GetAtoms(): atom.ClearProp('molAtomMapNumber')
def edit_mol(rmol, edits):
    """Apply predicted bond edits to ``rmol`` and return the product SMILES.

    Args:
        rmol: atom-mapped reactant molecule; every atom must carry a
            ``molAtomMapNumber`` property.
        edits: iterable of ``(x, y, t)`` where ``x``/``y`` are atom map
            numbers and ``t`` is a key of ``BOND_TYPE`` (``t > 0`` adds a
            bond of that type; any existing bond is removed first).

    Returns:
        list of str: one SMILES per product fragment that could be parsed
        back, with atom map numbers cleared.

    After the edits, formal charges and explicit hydrogen counts are patched
    element by element so that more products survive the SMILES round trip.
    """
    new_mol = Chem.RWMol(rmol)

    # Keep track of aromatic nitrogens, might cause explicit hydrogen issues
    aromatic_nitrogen_idx = set()
    aromatic_carbonyl_adj_to_aromatic_nH = {}
    aromatic_carbondeg3_adj_to_aromatic_nH0 = {}
    for a in new_mol.GetAtoms():
        if a.GetIsAromatic() and a.GetSymbol() == 'N':
            aromatic_nitrogen_idx.add(a.GetIdx())
            for nbr in a.GetNeighbors():
                # Map: aromatic C that has a double bond -> adjacent nH index.
                if a.GetNumExplicitHs() == 1 and nbr.GetSymbol(
                ) == 'C' and nbr.GetIsAromatic() and any(
                        b.GetBondTypeAsDouble() == 2 for b in nbr.GetBonds()):
                    aromatic_carbonyl_adj_to_aromatic_nH[
                        nbr.GetIdx()] = a.GetIdx()
                # Map: aromatic C with three bonds -> adjacent H-free n index.
                elif a.GetNumExplicitHs() == 0 and nbr.GetSymbol(
                ) == 'C' and nbr.GetIsAromatic() and len(nbr.GetBonds()) == 3:
                    aromatic_carbondeg3_adj_to_aromatic_nH0[
                        nbr.GetIdx()] = a.GetIdx()
        else:
            # Every atom other than an aromatic nitrogen loses its explicit Hs.
            a.SetNumExplicitHs(0)
    new_mol.UpdatePropertyCache()

    # Atom map number -> atom index.
    amap = {}
    for atom in rmol.GetAtoms():
        amap[atom.GetIntProp('molAtomMapNumber')] = atom.GetIdx()

    # Apply the edits as predicted
    for x, y, t in edits:
        bond = new_mol.GetBondBetweenAtoms(amap[x], amap[y])
        a1 = new_mol.GetAtomWithIdx(amap[x])
        a2 = new_mol.GetAtomWithIdx(amap[y])
        if bond is not None:
            new_mol.RemoveBond(amap[x], amap[y])

            # Are we losing a bond on an aromatic nitrogen?
            if bond.GetBondTypeAsDouble() == 1.0:
                if amap[x] in aromatic_nitrogen_idx:
                    if a1.GetTotalNumHs() == 0:
                        a1.SetNumExplicitHs(1)
                    elif a1.GetFormalCharge() == 1:
                        a1.SetFormalCharge(0)
                elif amap[y] in aromatic_nitrogen_idx:
                    if a2.GetTotalNumHs() == 0:
                        a2.SetNumExplicitHs(1)
                    elif a2.GetFormalCharge() == 1:
                        a2.SetFormalCharge(0)

            # Are we losing a c=O bond on an aromatic ring? If so, remove H from adjacent nH if appropriate
            if bond.GetBondTypeAsDouble() == 2.0:
                if amap[x] in aromatic_carbonyl_adj_to_aromatic_nH:
                    new_mol.GetAtomWithIdx(
                        aromatic_carbonyl_adj_to_aromatic_nH[
                            amap[x]]).SetNumExplicitHs(0)
                elif amap[y] in aromatic_carbonyl_adj_to_aromatic_nH:
                    new_mol.GetAtomWithIdx(
                        aromatic_carbonyl_adj_to_aromatic_nH[
                            amap[y]]).SetNumExplicitHs(0)

        if t > 0:
            new_mol.AddBond(amap[x], amap[y], BOND_TYPE[t])

            # Special alkylation case?
            if t == 1:
                if amap[x] in aromatic_nitrogen_idx:
                    if a1.GetTotalNumHs() == 1:
                        a1.SetNumExplicitHs(0)
                    else:
                        a1.SetFormalCharge(1)
                elif amap[y] in aromatic_nitrogen_idx:
                    if a2.GetTotalNumHs() == 1:
                        a2.SetNumExplicitHs(0)
                    else:
                        a2.SetFormalCharge(1)

            # Are we getting a c=O bond on an aromatic ring? If so, add H to adjacent nH0 if appropriate
            if t == 2:
                if amap[x] in aromatic_carbondeg3_adj_to_aromatic_nH0:
                    new_mol.GetAtomWithIdx(
                        aromatic_carbondeg3_adj_to_aromatic_nH0[
                            amap[x]]).SetNumExplicitHs(1)
                elif amap[y] in aromatic_carbondeg3_adj_to_aromatic_nH0:
                    new_mol.GetAtomWithIdx(
                        aromatic_carbondeg3_adj_to_aromatic_nH0[
                            amap[y]]).SetNumExplicitHs(1)

    # Tried:
    # bonds_to_remove.sort(key=lambda x: x[0], reverse=True)
    # for (idx, bond) in bonds_to_remove:
    # start = bond.GetBeginAtomIdx()
    # end = bond.GetEndAtomIdx()
    # new_mol.RemoveBond(start, end)
    # pred_mol = new_mol.GetMol()
    pred_mol = new_mol.GetMol()

    # Clear formal charges to make molecules valid
    # Note: because S and P (among others) can change valence, be more flexible
    for atom in pred_mol.GetAtoms():
        atom.ClearProp('molAtomMapNumber')
        if atom.GetSymbol() == 'N' and atom.GetFormalCharge(
        ) == 1:  # exclude negatively-charged azide
            bond_vals = sum(
                [bond.GetBondTypeAsDouble() for bond in atom.GetBonds()])
            if bond_vals <= 3:
                atom.SetFormalCharge(0)
        elif atom.GetSymbol() == 'N' and atom.GetFormalCharge(
        ) == -1:  # handle negatively-charged azide addition
            bond_vals = sum(
                [bond.GetBondTypeAsDouble() for bond in atom.GetBonds()])
            if bond_vals == 3 and any(
                [nbr.GetSymbol() == 'N' for nbr in atom.GetNeighbors()]):
                atom.SetFormalCharge(0)
        elif atom.GetSymbol() == 'N':
            bond_vals = sum(
                [bond.GetBondTypeAsDouble() for bond in atom.GetBonds()])
            if bond_vals == 4 and not atom.GetIsAromatic(
            ):  # and atom.IsInRingSize(5)):
                atom.SetFormalCharge(1)
        elif atom.GetSymbol() == 'C' and atom.GetFormalCharge() != 0:
            atom.SetFormalCharge(0)
        elif atom.GetSymbol() == 'O' and atom.GetFormalCharge() != 0:
            bond_vals = sum(
                [bond.GetBondTypeAsDouble()
                 for bond in atom.GetBonds()]) + atom.GetNumExplicitHs()
            if bond_vals == 2:
                atom.SetFormalCharge(0)
        elif atom.GetSymbol() in ['Cl', 'Br', 'I', 'F'
                                  ] and atom.GetFormalCharge() != 0:
            bond_vals = sum(
                [bond.GetBondTypeAsDouble() for bond in atom.GetBonds()])
            if bond_vals == 1:
                atom.SetFormalCharge(0)
        elif atom.GetSymbol() == 'S' and atom.GetFormalCharge() != 0:
            bond_vals = sum(
                [bond.GetBondTypeAsDouble() for bond in atom.GetBonds()])
            if bond_vals in [2, 4, 6]:
                atom.SetFormalCharge(0)
        elif atom.GetSymbol(
        ) == 'P':  # quartenary phosphorous should be pos. charge with 0 H
            bond_vals = [
                bond.GetBondTypeAsDouble() for bond in atom.GetBonds()
            ]
            if sum(bond_vals) == 4 and len(bond_vals) == 4:
                atom.SetFormalCharge(1)
                atom.SetNumExplicitHs(0)
            elif sum(bond_vals) == 3 and len(
                    bond_vals) == 3:  # make sure neutral
                atom.SetFormalCharge(0)
        elif atom.GetSymbol(
        ) == 'B':  # quartenary boron should be neg. charge with 0 H
            bond_vals = [
                bond.GetBondTypeAsDouble() for bond in atom.GetBonds()
            ]
            if sum(bond_vals) == 4 and len(bond_vals) == 4:
                atom.SetFormalCharge(-1)
                atom.SetNumExplicitHs(0)
        elif atom.GetSymbol() in ['Mg', 'Zn']:
            bond_vals = [
                bond.GetBondTypeAsDouble() for bond in atom.GetBonds()
            ]
            if sum(bond_vals) == 1 and len(bond_vals) == 1:
                atom.SetFormalCharge(1)
        elif atom.GetSymbol() == 'Si':
            bond_vals = [
                bond.GetBondTypeAsDouble() for bond in atom.GetBonds()
            ]
            # Only single bonds: set the explicit H count to 4 minus the
            # number of bonds (never below 0).
            if sum(bond_vals) == len(bond_vals):
                atom.SetNumExplicitHs(max(0, 4 - len(bond_vals)))

    # Bounce to/from SMILES to try to sanitize
    pred_smiles = Chem.MolToSmiles(pred_mol)  # <--- TODO: error occurs here
    pred_list = pred_smiles.split('.')
    pred_mols = [Chem.MolFromSmiles(pred_smiles) for pred_smiles in pred_list]

    for i, mol in enumerate(pred_mols):
        # Check if we failed/succeeded in previous step
        if mol is None:
            logging.debug('##### Unparseable mol: {}'.format(pred_list[i]))
            continue

        # Else, try post-sanitiztion fixes in structure
        mol = Chem.MolFromSmiles(Chem.MolToSmiles(mol))
        if mol is None:
            continue
        # NOTE(review): `mol` is not updated after a reaction succeeds, so
        # every reaction runs on the same input and a later success
        # overwrites pred_mols[i] from an earlier one.
        for rxn in clean_rxns_postsani:
            out = rxn.RunReactants((mol, ))
            if out:
                try:
                    Chem.SanitizeMol(out[0][0])
                    pred_mols[i] = Chem.MolFromSmiles(
                        Chem.MolToSmiles(out[0][0]))
                except Exception as e:
                    print(e)
                    print('Could not sanitize postsani reaction product: {}'.
                          format(Chem.MolToSmiles(out[0][0])))
                    print('Original molecule was: {}'.format(
                        Chem.MolToSmiles(mol)))

    # Fragments that never parsed are left out of the result.
    pred_smiles = [
        Chem.MolToSmiles(pred_mol) for pred_mol in pred_mols
        if pred_mol is not None
    ]
    return pred_smiles
continue s = calculateScore(m) smiles = Chem.MolToSmiles(m) print(smiles + "\t" + m.GetProp('_Name') + "\t%3f" % s) if __name__ == '__main__': import sys, time t1 = time.time() readFragmentScores("fpscores") t2 = time.time() suppl = Chem.SmilesMolSupplier(sys.argv[1]) t3 = time.time() processMols(suppl) t4 = time.time() print('Reading took %.2f seconds. Calculating took %.2f seconds' % ((t2 - t1), (t4 - t3)), file=sys.stderr) # # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met:
from collections import namedtuple
import math

from myrdkit import Chem
from myrdkit import MolSurf, Crippen
from myrdkit import rdmd

# Record with one field per molecular descriptor: MW, ALOGP, HBA, HBD, PSA,
# ROTB, AROM and ALERTS.
QEDproperties = namedtuple('QEDproperties',
                           'MW,ALOGP,HBA,HBD,PSA,ROTB,AROM,ALERTS')
# Record with fields A-F and DMAX.
# NOTE(review): presumably the parameters of a per-descriptor desirability
# function -- confirm against the code that consumes it.
ADSparameter = namedtuple('ADSparameter', 'A,B,C,D,E,F,DMAX')

# Three alternative value sets, one number per QEDproperties field.
# NOTE(review): judging by the names these are per-descriptor weights --
# confirm against the scoring function that uses them.
WEIGHT_MAX = QEDproperties(0.50, 0.25, 0.00, 0.50, 0.00, 0.50, 0.25, 1.00)
WEIGHT_MEAN = QEDproperties(0.66, 0.46, 0.05, 0.61, 0.06, 0.65, 0.48, 0.95)
WEIGHT_NONE = QEDproperties(1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00)

# SMARTS query compiled once at import time.
AliphaticRings = Chem.MolFromSmarts('[$([A;R][!a])]')

#
# SMARTS strings, one per acceptor pattern.
# NOTE(review): presumably compiled into an `Acceptors` pattern list
# elsewhere in the module -- confirm.
AcceptorSmarts = [
    '[oH0;X2]', '[OH1;X2;v2]', '[OH0;X2;v2]', '[OH0;X1;v2]', '[O-;X1]',
    '[SH0;X2;v2]', '[SH0;X1;v2]', '[S-;X1]', '[nH0;X2]', '[NH0;X1;v3]',
    '[$([N;+0;X3;v3]);!$(N[C,S]=O)]'
]
def calculateScore(m):
    """Return the synthetic-accessibility style score (``sascore``) of ``m``.

    The raw score is the sum of three terms:

    * a fragment score: the count-weighted mean of the per-fragment
      contributions stored in ``_fscores`` for every radius-2 Morgan
      environment of the molecule (unknown fragments contribute -4);
    * a complexity penalty built from size, stereo centres, spiro atoms,
      bridgeheads and the presence of rings with more than 8 atoms;
    * a symmetry correction applied when the molecule has more atoms than
      distinct fingerprint environments.

    The raw value is then mapped onto a 1..10 scale, smoothed near the top
    end and clamped to ``[1.0, 10.0]``.

    Args:
        m: RDKit molecule to score.

    Returns:
        The score as a float in ``[1.0, 10.0]``, or ``None`` when ``m`` has
        no atoms (such a molecule has no fingerprint environments, so the
        averages below would divide by zero).
    """
    # Guard first: an atom-less molecule has nothing to average over.
    if not m.GetNumAtoms():
        return None

    if _fscores is None:
        readFragmentScores()

    # --- fragment score ---------------------------------------------------
    fp = rdMolDescriptors.GetMorganFingerprint(
        m, 2)  # <- 2 is the *radius* of the circular fingerprint
    fps = fp.GetNonzeroElements()
    score1 = 0.
    nf = 0
    for bitId, v in fps.items():
        nf += v
        score1 += _fscores.get(bitId, -4) * v
    score1 /= nf

    # --- features score ---------------------------------------------------
    nAtoms = m.GetNumAtoms()
    nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
    ri = m.GetRingInfo()
    nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri)
    nMacrocycles = 0
    for x in ri.AtomRings():
        if len(x) > 8:
            nMacrocycles += 1

    sizePenalty = nAtoms**1.005 - nAtoms
    stereoPenalty = math.log10(nChiralCenters + 1)
    spiroPenalty = math.log10(nSpiro + 1)
    bridgePenalty = math.log10(nBridgeheads + 1)
    macrocyclePenalty = 0.
    # ---------------------------------------
    # This differs from the paper, which defines:
    #  macrocyclePenalty = math.log10(nMacrocycles+1)
    # This form generates better results when 2 or more macrocycles are present
    if nMacrocycles > 0:
        macrocyclePenalty = math.log10(2)

    score2 = (0. - sizePenalty - stereoPenalty - spiroPenalty -
              bridgePenalty - macrocyclePenalty)

    # correction for the fingerprint density
    # not in the original publication, added in version 1.1
    # to make highly symmetrical molecules easier to synthetise
    score3 = 0.
    if nAtoms > len(fps):
        score3 = math.log(float(nAtoms) / len(fps)) * .5

    sascore = score1 + score2 + score3

    # need to transform "raw" value into scale between 1 and 10
    # (named raw_min / raw_max so the builtins min() / max() stay usable)
    raw_min = -4.0
    raw_max = 2.5
    sascore = 11. - (sascore - raw_min + 1) / (raw_max - raw_min) * 9.
    # smooth the 10-end
    if sascore > 8.:
        sascore = 8. + math.log(sascore + 1. - 9.)
    if sascore > 10.:
        sascore = 10.0
    elif sascore < 1.:
        sascore = 1.0

    return sascore
def predict(self, react, top_cand_bonds, top_cand_scores=None, scores=True, top_n=100):
    '''Rank candidate outcomes for a reactant set and return the top ones.

    react: atom mapped reactant smiles
    top_cand_bonds: list of strings "ai-aj-bo"; the two atom numbers are
        decremented by one and ``bo`` is parsed as a float before being
        handed to ``smiles2graph``
    top_cand_scores: optional list with one score per entry of
        ``top_cand_bonds``; when empty or omitted every bond gets 0.0
    scores: when true, each outcome also carries 'score' and 'prob'
    top_n: maximum number of outcomes to return

    Returns a list of dicts with keys 'rank' (1-based) and 'smiles' (the
    value returned by ``edit_mol``), plus 'score' and 'prob' when
    ``scores`` is true.
    '''
    if not top_cand_scores:
        top_cand_scores = [0.0] * len(top_cand_bonds)

    cand_bonds = []
    for i, b in enumerate(top_cand_bonds):
        x, y, t = b.split('-')
        cand_bonds.append((int(float(x)) - 1, int(float(y)) - 1, float(t),
                           float(top_cand_scores[i])))

    # Shrink the core size until the enumerated candidate set fits.
    # ``ncore`` must be initialised here and actually passed on each retry;
    # otherwise the decrement below raises UnboundLocalError and the retry
    # could never change the result.  Stop at 1 so the loop always ends.
    ncore = core_size
    while True:
        src_tuple, conf = smiles2graph(react,
                                       None,
                                       cand_bonds,
                                       None,
                                       core_size=ncore,
                                       cutoff=MAX_NCAND,
                                       testing=True)
        if len(conf) <= MAX_NCAND or ncore <= 1:
            break
        ncore -= 1

    feed_map = dict(zip(self.src_holder, src_tuple))
    cur_scores, cur_probs, candidates = self.session.run(
        self.predict_vars, feed_dict=feed_map)

    bond_types = [
        Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,
        Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC
    ]
    # Bond order -> 1-based position in ``bond_types`` (0 means "no bond").
    bond_types_as_double = {0.0: 0, 1.0: 1, 2.0: 2, 3.0: 3, 1.5: 4}

    # Don't waste predictions on bond changes that aren't actually changes:
    # record the bonds already present, keyed by sorted atom-map numbers.
    rmol = Chem.MolFromSmiles(react)
    rbonds = {}
    for bond in rmol.GetBonds():
        a1 = bond.GetBeginAtom().GetAtomMapNum()
        a2 = bond.GetEndAtom().GetAtomMapNum()
        t = bond_types.index(bond.GetBondType()) + 1
        rbonds[(min(a1, a2), max(a1, a2))] = t

    cand_smiles = []
    cand_scores = []
    cand_probs = []
    for idx in candidates:
        cbonds = []
        # Define edits from prediction
        for x, y, t, v in conf[idx]:
            x, y = x + 1, y + 1
            if (x, y) in rbonds:
                # ``rbonds`` stores bond-type indices while ``t`` is a bond
                # order, so compare in index space; otherwise an unchanged
                # aromatic bond (index 4 vs order 1.5) looks like an edit.
                is_change = rbonds[(x, y)] != bond_types_as_double[t]
            else:
                is_change = t > 0
            if is_change:
                cbonds.append((x, y, bond_types_as_double[t]))
        cand_smiles.append(edit_mol(rmol, cbonds))
        cand_scores.append(cur_scores[idx])
        cand_probs.append(cur_probs[idx])

    outcomes = []
    for i in range(min(len(cand_smiles), top_n)):
        outcome = {
            'rank': i + 1,
            'smiles': cand_smiles[i],
        }
        if scores:
            outcome['score'] = cand_scores[i]
            outcome['prob'] = cand_probs[i]
        outcomes.append(outcome)

    return outcomes
a = np.zeros((len(arr_list), N)) for i, arr in enumerate(arr_list): for j in range(arr.shape[0]): a[i][j] = 1 return a def smiles2graph_list(smiles_list, idxfunc=lambda x: x.GetIdx()): res = list(map(lambda x: smiles2graph(x, idxfunc), smiles_list)) fatom_list, fbond_list, gatom_list, gbond_list, nb_list = zip(*res) return pack2D(fatom_list), pack2D(fbond_list), pack2D_withidx( gatom_list), pack2D_withidx(gbond_list), pack1D(nb_list), get_mask( fatom_list) m = Chem.MolFromSmiles('CC') assignProperties(m) atom = m.GetAtoms()[0] bond = m.GetBonds()[0] atom_fdim = len(atom_features(atom)) bond_fdim = len(bond_features(bond)) if __name__ == "__main__": np.set_printoptions(threshold='nan') a, b, c, d, e, f = smiles2graph_list(["c1cccnc1", 'c1nccc2n1ccc2']) print(a) print(b) print(c) print(d) print(e) print(f)
restore_path = tf.train.latest_checkpoint(opts.model_path) saver.restore(session, restore_path) sys.stderr.write('restored') sys.stderr.flush() total = 0.0 idxfunc = lambda x: x.GetIntProp('molAtomMapNumber') try: while not coord.should_stop(): total += 1 r, conf = queue.get(timeout=30) if r is None: # reached end of data set break cur_pred = session.run(pred_topk) rmol = Chem.MolFromSmiles(r) rbonds = {} for bond in rmol.GetBonds(): a1 = idxfunc(bond.GetBeginAtom()) a2 = idxfunc(bond.GetEndAtom()) t = bond_types.index(bond.GetBondType()) + 1 a1, a2 = min(a1, a2), max(a1, a2) rbonds[(a1, a2)] = t if opts.verbose: for idx in cur_pred: # record the bond changes for this candidate for x, y, t, v in conf[idx]: # convert ids to atom map numbers x, y = x + 1, y + 1 # make sure this bond change is really a _change_