def _fragment_mol(mol, fragment_filter, num_heavies=None):
    """Yield each distinct fragmentation of `mol` permitted by `fragment_filter`.

    Args:
        mol: molecule to fragment.
        fragment_filter: object providing ``get_cut_lists(mol)``; it is also
            forwarded to ``make_single_cut`` / ``make_multiple_cuts``.
        num_heavies: accepted for backward compatibility; this function does
            not use the value.

    Yields:
        Fragmentation objects, de-duplicated on ``get_unique_key()``.
    """
    cut_lists = fragment_filter.get_cut_lists(mol)
    if not cut_lists:
        return

    seen = set()

    # Identify atoms that are chiral (assigned and unassigned) in parent compound
    # 0 means not chiral, 1 means assigned, 2 means unassigned
    atom_ranks = Chem.CanonicalRankAtoms(mol, breakTies=False)
    chiral_flags = get_chiral_flags(mol, atom_ranks)

    for cut_list in cut_lists:
        if len(cut_list) == 1:
            fragmentations = make_single_cut(mol, cut_list[0], chiral_flags,
                                             fragment_filter)
        else:
            fragmentations = make_multiple_cuts(mol, cut_list, chiral_flags,
                                                fragment_filter)

        if fragmentations is None:
            # Fragmentation has been filtered out
            continue

        for fragmentation in fragmentations:
            key = fragmentation.get_unique_key()  # XXX + "012" + YYY
            if key not in seen:
                seen.add(key)
                yield fragmentation
def get_parity(Mol, Atm_idx):
    """Return the swap parity (0 or 1) of the neighbour ranks of one atom.

    The canonical ranks (ties not broken) of the atoms bonded to atom
    `Atm_idx` are collected in bond order and sorted with an exchange sort;
    the number of exchanges performed, modulo 2, is returned.

    See also http://www.dalkescientific.com/writings/diary/archive/2016/08/14/fragment_chiral_molecules.html
    """
    ranks = [int(r) for r in Chem.CanonicalRankAtoms(Mol, breakTies=False)]
    nbr_ranks = [
        ranks[bond.GetOtherAtomIdx(Atm_idx)]
        for bond in Mol.GetAtomWithIdx(Atm_idx).GetBonds()
    ]

    count = len(nbr_ranks)
    swaps = 0
    for left in range(count - 1):
        for right in range(left + 1, count):
            if nbr_ranks[left] > nbr_ranks[right]:
                nbr_ranks[left], nbr_ranks[right] = (nbr_ranks[right],
                                                     nbr_ranks[left])
                swaps += 1
    return swaps % 2
def reorder_atoms(
    mol: Chem.rdchem.Mol,
    break_ties: bool = True,
    include_chirality: bool = True,
    include_isotopes: bool = True,
) -> Optional[Chem.rdchem.Mol]:
    """Renumber the atoms of a molecule by ascending canonical rank.

    The goal is a single atom order for the same molecule, regardless of how
    it was originally written.

    Args:
        mol: a molecule.
        break_ties: Force breaking of ranked ties.
        include_chirality: Use chiral information when computing rank.
        include_isotopes: Use isotope information when computing rank.

    Returns:
        mol: the renumbered molecule; a molecule without atoms is returned
            unchanged.
    """
    if mol.GetNumAtoms() == 0:
        return mol

    ranks = list(
        Chem.CanonicalRankAtoms(
            mol,
            breakTies=break_ties,
            includeChirality=include_chirality,
            includeIsotopes=include_isotopes,
        ))

    # Stable sort: atoms sharing a rank keep their original relative order.
    atom_order = sorted(range(len(ranks)), key=ranks.__getitem__)
    return Chem.RenumberAtoms(mol, atom_order)
def check_nei_bonds(bond):
    """Return True when `bond` passes the neighbour-bond checks below.

    The molecule is read from the enclosing-scope name ``m``.

    Returns False when either end atom has exactly two other neighbours, both
    attached by single bonds and sharing a canonical rank (two identical
    substituents on one side, so no stereoisomers are possible). Otherwise
    returns True only if both end atoms have at least one other bond and all
    of those other bonds are single.
    """
    single = Chem.BondType.SINGLE
    begin, end = bond.GetBeginAtom(), bond.GetEndAtom()
    this_bond_idx = bond.GetIdx()

    def other_bonds_are_single(atom):
        return [
            b.GetBondType() == single for b in atom.GetBonds()
            if b.GetIdx() != this_bond_idx
        ]

    begin_flags = other_bonds_are_single(begin)
    end_flags = other_bonds_are_single(end)

    ranks = list(Chem.CanonicalRankAtoms(m, breakTies=False))

    def has_twin_substituents(atom, partner):
        nbrs = [
            a.GetIdx() for a in atom.GetNeighbors()
            if a.GetIdx() != partner.GetIdx()
        ]
        if len(nbrs) != 2:
            return False
        for nbr_idx in nbrs:
            link = m.GetBondBetweenAtoms(nbr_idx, atom.GetIdx())
            if not link.GetBondType() == single:
                return False
        return ranks[nbrs[0]] == ranks[nbrs[1]]

    if has_twin_substituents(begin, end):
        return False
    if has_twin_substituents(end, begin):
        return False

    # an empty flag list means a terminal atom, e.g. O in C=O
    return bool(begin_flags and end_flags and all(begin_flags)
                and all(end_flags))
def get_symmetry_groups(mol):
    """
    Computes the symmetry class (canonical rank with ties kept) of each atom
    and returns one representative atom index per class: the first atom, in
    atom order, that carries that rank.

    Parameters
    ----------
    mol : rdkit molecule object.
        Fragment from custom-made library.

    Returns
    -------
    symmetry_list : list
        List with atom indices, one per distinct symmetry class.
    """
    # Rank the atoms once; the ranking does not change while we iterate.
    ranks = list(Chem.CanonicalRankAtoms(mol, breakTies=False))

    seen_ranks = set()
    symmetry_list = []
    for atom, symmetry_rank in zip(mol.GetAtoms(), ranks):
        if symmetry_rank not in seen_ranks:
            seen_ranks.add(symmetry_rank)
            symmetry_list.append(atom.GetIdx())
    return symmetry_list
def get_symmetry_classes(molecule: off.Molecule) -> List[int]:
    """Return one symmetry class per atom, computed by a backend toolkit.

    RDKit is tried first; if an import error is raised on that path, OpenEye
    is used instead.
    """
    try:
        from rdkit import Chem

        rd_mol = molecule.to_rdkit()
        return list(Chem.CanonicalRankAtoms(rd_mol, breakTies=False))

    except (ImportError, ModuleNotFoundError):
        from openeye import oechem

        oe_mol = molecule.to_openeye()
        oechem.OEPerceiveSymmetry(oe_mol)

        class_of_atom = {}
        for oe_atom in oe_mol.GetAtoms():
            class_of_atom[oe_atom.GetIdx()] = oe_atom.GetSymmetryClass()

        return [class_of_atom[i] for i in range(molecule.n_atoms)]
def get_assm_cands(mol, atoms, inter_label, cluster, inter_size):
    """List candidate attachment positions (or position pairs) in `cluster`.

    The first candidate is always the labelled position taken from
    `inter_label`; further candidates from `cluster` are kept only when their
    canonical rank (ties kept) differs from that of the labelled position.
    """
    atoms = list(set(atoms))
    mol = get_clique_mol(mol, atoms)
    atom_map = [idxfunc(atom) for atom in mol.GetAtoms()]
    mol = set_atommap(mol)

    rank = dict(zip(atom_map, Chem.CanonicalRankAtoms(mol, breakTies=False)))
    pos, icls = zip(*inter_label)

    if inter_size == 1:
        return [pos[0]] + [x for x in cluster if rank[x] != rank[pos[0]]]

    if icls[0] == icls[1]:
        # symmetric case: compare the rank pairs ignoring orientation
        rotated = cluster[inter_size - 1:] + cluster[:inter_size - 1]
        extra = []
        for x, y in zip(cluster, rotated):
            pair_key = (rank[min(x, y)], rank[max(x, y)])
            if pair_key != (rank[min(pos)], rank[max(pos)]):
                extra.append((x, y))
        return [pos] + extra

    rotated = cluster[inter_size - 1:] + cluster[:inter_size - 1]
    extra = []
    for x, y in zip(cluster + rotated, rotated + cluster):
        if (rank[x], rank[y]) != (rank[pos[0]], rank[pos[1]]):
            extra.append((x, y))
    return [pos] + extra
def canonicalize(mol):
    """Return the canonical rank of every atom, ties broken, as a list.

    The list is indexed by atom index. For example ``[2, 3, 1, 0]`` means:
    atom 0 in `mol` is mapped to canonical order 2, atom 1 to 3, atom 2 to 1
    and atom 3 to 0.
    """
    canonical_ranks = Chem.CanonicalRankAtoms(mol, breakTies=True)
    return [rank for rank in canonical_ranks]
def canonicalize_tautomers(rank_list, mol):
    """Overwrite `rank_list` in place with ranks of the canonical tautomer.

    `mol` is passed through ``tautomer.TautomerCanonicalizer`` and the
    canonical atom ranks (ties kept) of the result replace the contents of
    `rank_list`.

    Args:
        rank_list: list owned by the caller; its contents are replaced.
        mol: molecule whose canonical tautomer is ranked.

    Returns:
        None. The result is delivered through `rank_list`.
    """
    canon = tautomer.TautomerCanonicalizer()
    mol_t = canon.canonicalize(mol)

    # Slice assignment mutates the caller's list. Rebinding the name (as in
    # ``rank_list = list()``) would only change the local variable and leave
    # the caller's list untouched.
    rank_list[:] = [
        int(rank) for rank in Chem.CanonicalRankAtoms(mol_t, breakTies=False)
    ]
def ranks(m):
    """Return ``[name, atom_number, rank]`` rows for every atom of `m`.

    ``name`` is the ``_Name`` property of `m`, ``atom_number`` is the 1-based
    atom position and ``rank`` is the canonical rank computed with ties kept
    and with chirality and isotopes ignored.
    """
    atom_ranks = Chem.CanonicalRankAtoms(m,
                                         breakTies=False,
                                         includeChirality=False,
                                         includeIsotopes=False)
    # The name is read from the argument itself, not from a module-level
    # ``mol`` that may refer to a different molecule.
    return [
        [m.GetProp("_Name"), position + 1, rank]  # 1based!
        for position, rank in enumerate(atom_ranks)
    ]
def find_identical_atoms(smi, atom_list):
    """Extend `atom_list` in place with atoms sharing a rank with its entries.

    The molecule is built from the SMILES `smi`. For every atom index present
    in `atom_list` on entry, all atoms with the same canonical rank (ties
    kept) that are not yet listed are appended. The same list object is
    returned.
    """
    rdkit_mol = Chem.MolFromSmiles(smi)
    # Snapshot of the entries present on entry; later additions are not
    # expanded again.
    requested = atom_list[:len(atom_list)]
    atom_rank = list(Chem.CanonicalRankAtoms(rdkit_mol, breakTies=False))

    for position, atom in enumerate(rdkit_mol.GetAtoms()):
        if atom.GetIdx() not in requested:
            continue
        wanted_rank = atom_rank[position]
        twins = []
        for other_idx, other_rank in enumerate(atom_rank):
            if other_rank == wanted_rank and other_idx not in atom_list:
                twins.append(int(other_idx))
        atom_list.extend(twins)

    return atom_list
def canonicalize_mol(mol, inplace=False, **kwargs):
    r"""Take a PLAMS molecule and sort its atoms based on their canonical rank.

    Atoms are ordered by descending rank. Atoms that share a rank (possible
    when e.g. ``breakTies=False`` is passed) keep their original relative
    order.

    Example:

    .. code:: python

        >>> from scm.plams import Molecule, canonicalize_mol

        # Methane
        >>> mol: Molecule = ...
        >>> print(mol)
        Atoms:
            1         H      0.640510      0.640510     -0.640510
            2         H      0.640510     -0.640510      0.640510
            3         C      0.000000      0.000000      0.000000
            4         H     -0.640510      0.640510      0.640510
            5         H     -0.640510     -0.640510     -0.640510

        >>> print(canonicalize_mol(mol))
        Atoms:
            1         C      0.000000      0.000000      0.000000
            2         H     -0.640510     -0.640510     -0.640510
            3         H     -0.640510      0.640510      0.640510
            4         H      0.640510     -0.640510      0.640510
            5         H      0.640510      0.640510     -0.640510

    :parameter mol: The to-be canonicalized molecule.
    :type mol: |Molecule|
    :parameter bool inplace: Whether to sort the atoms inplace or to return a new molecule.
    :parameter \**kwargs: Further keyword arguments for rdkit.Chem.CanonicalRankAtoms_.
    :return: Either ``None`` or a newly sorted molecule, depending on the value of ``inplace``.
    :rtype: None or |Molecule|
    :raises TypeError: if ``mol`` is not a PLAMS Molecule.

    .. _rdkit.Chem.CanonicalRankAtoms: https://www.rdkit.org/docs/source/rdkit.Chem.rdmolfiles.html#rdkit.Chem.rdmolfiles.CanonicalRankAtoms

    """
    if not isinstance(mol, Molecule):
        raise TypeError("`mol` expected a plams Molecule")

    rdmol = to_rdmol(mol)
    idx_rank = Chem.CanonicalRankAtoms(rdmol, **kwargs)

    target = mol if inplace else mol.copy()

    # Sort on the rank only. Sorting the bare (rank, atom) tuples would fall
    # back to comparing the atom objects themselves whenever two ranks tie.
    ranked = sorted(zip(idx_rank, target.atoms),
                    key=lambda pair: pair[0],
                    reverse=True)
    target.atoms = [at for _, at in ranked]

    return None if inplace else target
def remove_identical_atoms(rdkit_mol, atom_list):
    """Keep only the first entry of `atom_list` for each canonical rank.

    Ranks are computed with ties kept, so entries sharing a rank with an
    earlier entry are dropped. A new list is returned, produced through a
    NumPy array round trip.
    """
    atom_rank = list(Chem.CanonicalRankAtoms(rdkit_mol, breakTies=False))

    seen_ranks = set()
    keep_positions = []
    for position, atom_idx in enumerate(atom_list):
        rank = atom_rank[atom_idx]
        if rank in seen_ranks:
            continue
        seen_ranks.add(rank)
        keep_positions.append(position)

    return np.array(atom_list)[keep_positions].tolist()
def _sample_ordering(mol, scaffold_nodes, k, p, ms=MoleculeSpec.get_default()):
    """Sample `k` decoding routes for the molecule `mol`.

    Args:
        mol (Chem.Mol): the given molecule
        scaffold_nodes (np.ndarray): the nodes marked as scaffold
        k (int): the number of importance samples
        p (float): degree of uncertainty during route sampling, should be in (0, 1)
        ms (mol_spec.MoleculeSpec): used to look up the type of every atom

    Returns:
        route_list (np.ndarray): route_list[i][j] is the index of the atom
            reached at step j in sample i
        step_ids_list (np.ndarray): step_ids_list[i][j] is the step at which
            atom j is reached in sample i
        logp_list (np.ndarray): logp_list[i] is the log-likelihood value of
            route i
    """
    # The atom types are looked up for every atom although the values are
    # not used further down in this function.
    atom_types = [ms.get_atom_type(atom) for atom in mol.GetAtoms()]

    atom_ranks = np.array(list(Chem.CanonicalRankAtoms(mol)))

    bonds = [[bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()]
             for bond in mol.GetBonds()]

    # molecular graph: one node per atom, one edge per bond
    graph = nx.Graph()
    graph.add_nodes_from(range(len(atom_ranks)))
    graph.add_edges_from(bonds)

    routes, steps, log_probs = [], [], []
    for _ in range(k):
        step_ids, log_p = _traverse(graph=graph,
                                    atom_ranks=atom_ranks,
                                    scaffold_nodes=scaffold_nodes,
                                    p=p)
        steps.append(step_ids)
        routes.append(np.argsort(step_ids))
        log_probs.append(log_p)

    route_list = np.array(routes, dtype=np.int32)
    step_ids_list = np.array(steps, dtype=np.int32)
    logp_list = np.array(log_probs, dtype=np.float32)
    return route_list, step_ids_list, logp_list
def combine_core_env_to_rxn_smarts(core, env, keep_h=True):
    """Join a core fragment and its environment into one SMARTS string.

    Atoms carrying atom-map numbers in `core` and `env` are treated as
    attachment points: for each map number, the neighbours of the two
    attachment atoms are bonded with a single bond and the attachment atoms
    are removed. Unmapped atoms of `env` receive map numbers derived from
    their canonical rank.

    Args:
        core: core fragment, as a SMILES string or an RDKit Mol.
        env: environment, as a SMILES string or an RDKit Mol.
        keep_h: forwarded to ``mol_to_smarts``; when false, matches of
            ``patt_remove_h`` are additionally stripped from the result.

    Returns:
        The combined SMARTS string.
    """
    # Strings are parsed; Mol inputs are copied so that the atom-map edits
    # below do not modify the caller's objects.
    if isinstance(env, str):
        m_env = Chem.MolFromSmiles(env, sanitize=False)
    else:
        m_env = Chem.Mol(env)
    if isinstance(core, str):
        m_frag = Chem.MolFromSmiles(core, sanitize=False)
    else:
        m_frag = Chem.Mol(core)

    backup_atom_map = "backupAtomMap"

    # put all atom maps to atom property and remove them
    for part in (m_env, m_frag):
        for a in part.GetAtoms():
            atom_map = a.GetAtomMapNum()
            if atom_map:
                a.SetIntProp(backup_atom_map, atom_map)
                a.SetAtomMapNum(0)

    # set canonical ranks for atoms in env without maps
    m_env.UpdatePropertyCache()
    for a, rank in zip(m_env.GetAtoms(), list(Chem.CanonicalRankAtoms(m_env))):
        if not a.HasProp(backup_atom_map):
            a.SetAtomMapNum(rank + 1)  # because ranks start from 0

    m = Chem.RWMol(Chem.CombineMols(m_frag, m_env))

    links = defaultdict(list)  # pairs of atom ids to create bonds
    att_to_remove = []  # ids of att points to remove
    for a in m.GetAtoms():
        if a.HasProp(backup_atom_map):
            i = a.GetIntProp(backup_atom_map)
            links[i].append(a.GetNeighbors()[0].GetIdx())
            att_to_remove.append(a.GetIdx())

    for i, j in links.values():
        m.AddBond(i, j, Chem.BondType.SINGLE)

    # remove from the highest index down so the remaining ids stay valid
    for i in sorted(att_to_remove, reverse=True):
        m.RemoveAtom(i)

    comb_sma = mol_to_smarts(m, keep_h)
    if not keep_h:  # remove H only in mapped env part
        comb_sma = patt_remove_h.sub('', comb_sma)
    return comb_sma
def get_symmetry_class(smi):
    """Return the symmetry classes of the attachment points in `smi`.

    Attachment points are the atoms whose mass is 0. The classes are returned
    in atom order: the first star is at index zero, the second star at index
    one, etc.
    """
    m = Chem.MolFromSmiles(smi)
    symmetry_classes = Chem.CanonicalRankAtoms(m, breakTies=False)
    return [
        symmetry_class
        for atom, symmetry_class in zip(m.GetAtoms(), symmetry_classes)
        if atom.GetMass() == 0
    ]
def __init__(self, rdmol, root_atm_idx=0):
    """Set up the atom ordering of `rdmol`, starting from a root atom.

    Args:
        rdmol: RDKit molecule.
        root_atm_idx: index of the atom the ordering starts from.

    Raises:
        ValueError: if `root_atm_idx` is negative or not smaller than the
            number of atoms.
    """
    n_atoms = rdmol.GetNumAtoms()
    # Index 0 is a valid root; negative indices are rejected as well.
    if not 0 <= root_atm_idx < n_atoms:
        raise ValueError("root_atm_idx must be 0<=root_atm_idx<N_atms")

    self.rdmol = rdmol
    self.ordered_atom_list = [None] * n_atoms
    self.z = dict()
    self.N_atms = 0
    self.rank = list(Chem.CanonicalRankAtoms(rdmol, breakTies=False))
    self.n_non_deadends = 0

    self.add_atom(root_atm_idx)
    self.order_atoms(root_atm_idx)
    self.zzit()
def __init__(self, Mol, Verbose=False):
    """Store the molecule, rank its atoms and create the empty containers.

    Args:
        Mol: RDKit molecule to work on.
        Verbose: when true, a ``process_list`` attribute is created too.
    """
    self.mol = Mol

    self.canonical_rank = [
        int(rank)
        for rank in Chem.CanonicalRankAtoms(self.mol, breakTies=False)
    ]
    canonicalize_tautomers(self.canonical_rank, self.mol)

    # Holds connector instances
    self.connectors = []

    # Holds rdkit Mol instances of final capped fragments
    self.frag_list = []

    # Holds atom indices of fragments in numbering scheme of
    # original molecule.
    self.frag_list_map = []

    # Holds r/l anchor atom indices for each fragment in numbering
    # scheme of the fragment molecule
    self.ranc_list = []
    self.lanc_list = []

    # Holds r/l cap atom indices for each fragment in numbering
    # scheme of the fragment molecule
    self.rcap_list_map = []
    self.lcap_list_map = []

    # Holds corresponding connector idx for each r/l cap
    self.rcap_conn_idx = []
    self.lcap_conn_idx = []

    # Stores fragment to fragment cross couplings:
    # atom indices for each cross coupling
    self.frag2frag_atms = []
    # fragment indices for each cross coupling
    self.frag2frag_frgs = []

    self.__frag_count = 0
    self.__connector_count = 0

    self.verbose = Verbose
    if self.verbose:
        self.process_list = []
def getSymmClasses(mol):
    """Group atom indices by canonical rank (ties kept).

    Prints the ranks. If every atom has its own rank, prints a notice and
    returns an empty list; otherwise returns one list of atom indices per
    distinct rank (groups with a single member included).
    """
    ranks = list(Chem.CanonicalRankAtoms(mol, breakTies=False))
    print('ranks: ', ranks)

    rankUniVals = set(ranks)
    if len(rankUniVals) == len(ranks):
        print("no equivalents")
        return []

    members_by_rank = {}
    for atom_idx, rank in enumerate(ranks):
        members_by_rank.setdefault(rank, []).append(atom_idx)

    # Emit the groups in the iteration order of the set of distinct ranks.
    return [members_by_rank[rankVal] for rankVal in rankUniVals]
def canonical_order_atoms(molecule, h_last=True):
    """
    Canonical order atoms in RDKit molecule. Each atom in the molecule is
    given a map index that corresponds to the RDKit rank for that atom (+1).
    RDKit atom ranking ranks hydrogens first and then the heavy atoms. When
    h_last is set to True, the map indices are reordered to put hydrogens
    after the heavy atoms.

    Parameters
    ----------
    molecule: rdkit mol
    h_last: bool, optional, default is True

    Returns
    -------
    molecule: rdkit molecule with explicit hydrogens and map indices that
        correspond to the atom canonical rank

    """
    # An existing atom map is removed first because Chem.CanonicalRankAtoms
    # uses map indices in ranking.
    if has_atom_map(molecule):
        remove_atom_map(molecule)

    # Add explicit hydrogen
    molecule = Chem.AddHs(molecule)

    ranks = list(Chem.CanonicalRankAtoms(molecule, breakTies=True))
    n_hydrogens = 0
    for atom_idx, rank in enumerate(ranks):
        atom = molecule.GetAtomWithIdx(atom_idx)
        atom.SetAtomMapNum(rank + 1)
        if atom.GetAtomicNum() == 1:
            n_hydrogens += 1
    n_heavy = len(ranks) - n_hydrogens

    if not h_last:
        return molecule

    # shift the map numbers so the hydrogens come after the heavy atoms
    for atom in molecule.GetAtoms():
        current = atom.GetAtomMapNum()
        if atom.GetAtomicNum() == 1:
            atom.SetAtomMapNum(current + n_heavy)
        else:
            atom.SetAtomMapNum(current - n_hydrogens)
    return molecule
def get_graph_from_smiles(smiles):
    """Build a networkx graph plus atom/bond descriptors from a SMILES.

    Returns:
        graph: ``nx.Graph`` with one node per atom and one edge per bond
        atom_types: ``meta.atom_to_index`` of every atom
        atom_ranks: canonical rank of every atom
        bonds: ``[begin_idx, end_idx]`` for every bond
        bond_types: ``meta.bond_to_index`` of every bond
    """
    mol = Chem.MolFromSmiles(smiles)

    atom_ranks = list(Chem.CanonicalRankAtoms(mol))
    atom_types = [meta.atom_to_index(atom) for atom in mol.GetAtoms()]

    bonds, bond_types = [], []
    for bond in mol.GetBonds():
        bonds.append([bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()])
        bond_types.append(meta.bond_to_index(bond))

    graph = nx.Graph()
    graph.add_nodes_from(range(len(atom_types)))
    graph.add_edges_from(bonds)

    return graph, atom_types, atom_ranks, bonds, bond_types
def __extend_output_by_equivalent_atoms(mol, output):
    """
    Generate additional fragments which cover equivalent atoms, so that
    replacements can be made for equivalent atoms as well.

    Two atoms count as equivalent when they share a canonical rank computed
    with ties kept and with chirality and isotopes ignored.

    :param mol: molecule the fragments were taken from
    :param output: items whose element 1 is a SMILES-like string and whose
                   element 2 is an iterable of atom ids of the fragment
    :return: list of tuples (item[0], item[1], sorted tuple of equivalent
             atom ids), one per matching combination
    """
    canonical_ranks = Chem.CanonicalRankAtoms(mol,
                                              breakTies=False,
                                              includeChirality=False,
                                              includeIsotopes=False)

    atoms_by_rank = defaultdict(list)
    for atom_idx, rank in enumerate(canonical_ranks):
        atoms_by_rank[rank].append(atom_idx)

    # atom id -> ids of the other atoms with the same rank
    atom_eq = {}
    for members in atoms_by_rank.values():
        if len(members) < 2:
            continue
        for atom_idx in members:
            atom_eq[atom_idx] = [
                other for other in members if other != atom_idx
            ]

    extended_output = []
    for item in output:
        frag_atoms = item[2]
        # only fragments whose atoms all have equivalent atoms are extended
        if not all(atom_idx in atom_eq for atom_idx in frag_atoms):
            continue

        smi = patt_remove_map.sub('', item[1])
        smi = patt_remove_brackets.sub('', smi)
        pattern = Chem.MolFromSmarts(smi)
        matched_sets = [set(match) for match in mol.GetSubstructMatches(pattern)]

        for ids_matched in matched_sets:
            # enumerate all combinations of equivalent atoms
            for ids_eq in product(*(atom_eq[i] for i in frag_atoms)):
                if ids_matched == set(ids_eq):
                    extended_output.append(
                        (item[0], item[1], tuple(sorted(ids_eq))))
    return extended_output
def canonicalize_atom_order(m, reverse=True, add_hs=True):
    """Canonicalize the atom order using RDKit.

    Args:
        m (rdkit.Chem.Mol): Mol object for RDKit
        reverse (bool): sort by descending canonical rank when true,
            ascending otherwise
        add_hs (bool): add explicit hydrogens before ranking

    Returns:
        rdkit.Chem.Mol: New canonicalized RDKit mol. A molecule without atoms
        is returned without renumbering.
    """
    mH = Chem.AddHs(m) if add_hs else m

    # Nothing to reorder; also avoids indexing into an empty zip() result.
    if mH.GetNumAtoms() == 0:
        return mH

    Compute2DCoords(mH)

    ranked = sorted(
        ((rank, idx) for idx, rank in enumerate(Chem.CanonicalRankAtoms(mH))),
        reverse=reverse,
    )
    m_neworder = tuple(idx for _, idx in ranked)

    m_canon = Chem.RenumberAtoms(mH, m_neworder)
    add_atom_indices(m_canon)
    return m_canon
def convert(self):
    """
    Rebuild ``self.mol`` with its atoms in canonical order.

    Atoms are copied in canonical-rank order, bonds are re-created with
    their bond type, and the positions of conformer 0 (if present) are
    carried over.

    Returns:
        RDKit Mol object: the rebuilt (editable) molecule.
    """
    source = self.mol

    # canonical rank of every atom: old index -> new index
    old2new = Chem.CanonicalRankAtoms(source,
                                      includeChirality=True,
                                      breakTies=True)
    n_atoms = len(old2new)
    new2old = {}
    for old_idx, new_idx in enumerate(old2new):
        new2old[new_idx] = old_idx

    reordered = Chem.rdchem.RWMol(Chem.Mol())

    # atoms, in the new order
    for new_idx in range(n_atoms):
        reordered.AddAtom(source.GetAtomWithIdx(new2old[new_idx]))

    # bonds, with translated end points
    for bond in source.GetBonds():
        reordered.AddBond(
            old2new[bond.GetBeginAtomIdx()],
            old2new[bond.GetEndAtomIdx()],
            bond.GetBondType(),
        )

    # conformer (atom 3D positions), when the source has one
    try:
        old_conformer = source.GetConformer(0)
    except ValueError:
        return reordered

    new_conformer = Chem.Conformer(reordered.GetNumAtoms())
    for new_idx in range(n_atoms):
        new_conformer.SetAtomPosition(
            new_idx, old_conformer.GetAtomPosition(new2old[new_idx]))
    reordered.AddConformer(new_conformer)

    return reordered
def __standardize_smiles_with_att_points(mol, keep_stereo=False):
    """
    Return a SMILES whose atom order does not depend on the map numbers of
    the attachment points.

    to avoid different order of atoms in SMILES with different map number of attachment points

    smi = ["ClC1=C([*:1])C(=S)C([*:2])=C([*:3])N1",
           "ClC1=C([*:1])C(=S)C([*:3])=C([*:2])N1",
           "ClC1=C([*:2])C(=S)C([*:1])=C([*:3])N1",
           "ClC1=C([*:2])C(=S)C([*:3])=C([*:1])N1",
           "ClC1=C([*:3])C(=S)C([*:1])=C([*:2])N1",
           "ClC1=C([*:3])C(=S)C([*:2])=C([*:1])N1"]

    these will produce different output with RDKit MolToSmiles():
        S=c1c([*:1])c(Cl)[nH]c([*:3])c1[*:2]
        S=c1c([*:1])c(Cl)[nH]c([*:2])c1[*:3]
        S=c1c([*:1])c([*:3])[nH]c(Cl)c1[*:2]
        S=c1c([*:2])c(Cl)[nH]c([*:1])c1[*:3]
        S=c1c([*:1])c([*:2])[nH]c(Cl)c1[*:3]
        S=c1c([*:2])c([*:1])[nH]c(Cl)c1[*:3]

    output of this function
        S=c1c([*:2])c([*:3])[nH]c(Br)c1[*:1]
        S=c1c([*:3])c([*:2])[nH]c(Br)c1[*:1]
        S=c1c([*:1])c([*:3])[nH]c(Br)c1[*:2]
        S=c1c([*:3])c([*:1])[nH]c(Br)c1[*:2]
        S=c1c([*:1])c([*:2])[nH]c(Br)c1[*:3]
        S=c1c([*:2])c([*:1])[nH]c(Br)c1[*:3]

    https://sourceforge.net/p/rdkit/mailman/message/35862258/

    The atom-map numbers and the ``backupAtomMap`` property of `mol` are
    modified in place.

    :param mol: molecule, possibly with mapped attachment points
    :param keep_stereo: passed to MolToSmiles as ``isomericSmiles``
    :return: SMILES string carrying the original map numbers
    """
    # update property cache if needed
    if mol.NeedsUpdatePropertyCache():
        mol.UpdatePropertyCache()

    # store original maps and remove map numbers from mol
    backup_atom_map = "backupAtomMap"
    for a in mol.GetAtoms():
        atom_map = a.GetAtomMapNum()
        if atom_map:
            a.SetIntProp(backup_atom_map, atom_map)
            a.SetAtomMapNum(0)

    # get canonical ranks for atoms for a mol without maps
    atoms = sorted(
        zip(Chem.CanonicalRankAtoms(mol),
            (a.GetIdx() for a in mol.GetAtoms())))

    # set new atom maps based on canonical order
    rep = {}
    atom_map = 1
    for _, atom_idx in atoms:
        a = mol.GetAtomWithIdx(atom_idx)
        if a.HasProp(backup_atom_map):
            a.SetAtomMapNum(atom_map)
            rep["[*:%i]" % atom_map] = "[*:%i]" % a.GetIntProp(backup_atom_map)
            atom_map += 1

    # get SMILES and relabel with original map numbers
    s = Chem.MolToSmiles(mol, isomericSmiles=keep_stereo)

    if not rep:
        # No labels to restore. An empty alternation would match the empty
        # string at every position and the replacement lookup would fail.
        return s

    patt = re.compile("|".join(re.escape(label) for label in rep))
    return patt.sub(lambda match: rep[match.group(0)], s)
import math
import pickle

# Script fragment: reads SMILES from the first tab-separated column of the
# input file and builds rooted fragment SMILES ("shingles") around each atom.
# NOTE(review): `sys`, `Chem` and `AllChem` are not imported in the visible
# part, and the loop body appears to continue beyond it (`shingle` is
# computed but not stored here) -- confirm against the full file.
p_in = sys.argv[1]  # input path
p_out = sys.argv[2]  # output path (not used in the visible part)
db_shingles = {}
sh_count = 0
with open(p_in, 'r') as fi_in:
    #fi_in.readline() # header
    for i, line in enumerate(fi_in):
        smi = line.split('\t')[0].rstrip()
        mol = Chem.MolFromSmiles(smi)
        if mol:
            # NOTE(review): the loop variable is a canonical *rank*, yet it
            # is passed below where an atom index is expected -- presumably
            # relying on the ranks being a permutation of the atom indices;
            # confirm.
            for atm in Chem.CanonicalRankAtoms(mol):
                for N in range(1, 4):
                    bonds = AllChem.FindAtomEnvironmentOfRadiusN(mol, N, atm)
                    if not bonds:
                        # no environment at this radius: stop growing
                        break
                    # the faster method...
                    # collect the atoms touched by the environment's bonds
                    atoms = set()
                    for bond_id in bonds:
                        bond = mol.GetBondWithIdx(bond_id)
                        atoms.add(bond.GetBeginAtomIdx())
                        atoms.add(bond.GetEndAtomIdx())
                    shingle = Chem.rdmolfiles.MolFragmentToSmiles(
                        mol, list(atoms), bonds, 0, 0, False, False, atm,
                        True, False, False)
def __init__(self, inchiStr):
    """Build the molecule from an InChI string and rank its atoms.

    Args:
        inchiStr: InChI string of the molecule.
    """
    # `self` was missing from the signature although the body assigns
    # attributes on it.
    self.mol = Chem.inchi.MolFromInchi(inchiStr)
    # NOTE(review): called with default arguments; if ties are broken by
    # default, every atom gets a unique rank -- confirm that this is the
    # intended "symmetry equivalence" information.
    self.symmEquivalence = Chem.CanonicalRankAtoms(self.mol)
def get_mol(self):
    """
    Return section containing element types, fitting weight, molecule
    title, number of atoms and atom equivalencing.

    For every molecule the weight, the name and a ``2I5`` record with the
    rounded charge and the number of atoms are written, followed by one
    ``2I5`` record per atom: the atomic number and a second integer that is
    0, -1, or the 1-based index of the first atom sharing the same
    canonical rank.

    groups_frozen: Freeze charges in groups to the values in qin file,
                   typically obtained from previous resp run.
    h_equiv      : Fit charges of degenerate hydrogen atoms together
    all_equiv    : Freeze charges of all degenerate atoms together. If this
                   is activated, and h_equiv is deactivated, only heavy-atom
                   atomic centers will be fitted together.

    Returns the assembled text as a single string.
    """

    line_2I5 = fortranformat.FortranRecordWriter('2I5')
    _tmp_str = list()

    for mol_i in range(self._mol_count):
        mol = self._mol_list[mol_i]
        # header: weight, title, then charge and atom count
        _tmp_str.append(' %f\n' % self._mol_weight_list[mol_i])
        _tmp_str.append(' %s\n' % self._mol_name_list[mol_i])
        _charge = float(self._mol_charge_list[mol_i])
        _charge = round(_charge)
        _charge = int(_charge)
        _natoms = mol.GetNumAtoms()
        _tmp_str.append(line_2I5.write([_charge, _natoms]))
        _tmp_str.append('\n')

        # canonical ranks with ties kept: equal rank marks equivalent atoms
        canonical_rank = list()
        string_rank = list(Chem.CanonicalRankAtoms(mol, breakTies=False))
        for rank in string_rank:
            canonical_rank.append(int(rank))
        del string_rank
        ### This really never worked perfectly...
        canonicalize_tautomers(canonical_rank, mol)

        index_list = np.arange(_natoms)

        if mol_i not in self._intermol1:
            # every atom of this molecule is written with a 0 second field
            for atom_i in index_list:
                atom = mol.GetAtomWithIdx(int(atom_i))
                at_num = atom.GetAtomicNum()
                _tmp_str.append(line_2I5.write([at_num, 0]))
                _tmp_str.append('\n')
        else:
            for atom_i in index_list:
                atom = mol.GetAtomWithIdx(int(atom_i))
                at_num = atom.GetAtomicNum()
                # atoms on the free list are always written with 0
                if mol_i in self._free_list_mol:
                    mol_i_idx = self._free_list_mol.index(mol_i)
                    if atom_i in self._free_list[mol_i_idx]:
                        _tmp_str.append(line_2I5.write([at_num, 0]))
                        _tmp_str.append('\n')
                        continue
                placed_frozen = False
                if self.unfreeze_all:
                    _tmp_str.append(line_2I5.write([at_num, 0]))
                elif self.groups_frozen:
                    ### Check if atom itself is in group
                    for index, atom_j in enumerate(self._group_atom_list):
                        if atom_j==atom_i \
                        and self._group_mol_list[index] == mol_i:
                            if self.noh_frozen and at_num != 1:
                                _tmp_str.append(
                                    line_2I5.write([at_num, -1]))
                            elif not self.h_groups_frozen and at_num == 1:
                                if self.h_equiv:
                                    canon_eq_bool = np.isin(
                                        canonical_rank, canonical_rank[atom_i])
                                    canon_eq_int = index_list[
                                        canon_eq_bool]
                                    if atom_i == canon_eq_int[0]:
                                        _tmp_str.append(
                                            line_2I5.write([at_num, 0]))
                                    else:
                                        _tmp_str.append(
                                            line_2I5.write([
                                                at_num, canon_eq_int[0] + 1
                                            ]))
                                else:
                                    _tmp_str.append(
                                        line_2I5.write([at_num, 0]))
                            else:
                                _tmp_str.append(line_2I5.write([at_num, 0]))
                            placed_frozen = True
                            break
                # NOTE(review): the flattened source does not show whether
                # this block is nested under `elif self.groups_frozen`; it is
                # reconstructed at the level of the `if self.unfreeze_all`
                # chain -- confirm against the original file.
                if not placed_frozen:
                    if self.noh_frozen and at_num != 1:
                        _tmp_str.append(line_2I5.write([at_num, -1]))
                    elif (self.h_equiv and at_num == 1) \
                    or (self.all_equiv and not self.h_equiv and at_num != 1):
                        ### canon_eq_bool is True for all atoms that are canonically
                        ### equal to atom_i (including atom_i itself).
                        ### canon_eq_int holds atom indices of all atoms that are
                        ### canonically equal to atom_i (including atom_i itself).
                        canon_eq_bool = np.isin(canonical_rank, canonical_rank[atom_i])
                        canon_eq_int = index_list[canon_eq_bool]
                        ### This is fulfilled only when we encounter this canonical
                        ### rank (stored in canonical_rank[atom_i]) for the first
                        ### time in this molecule. It will tell resp to let that
                        ### atom center vary independently.
                        if atom_i == canon_eq_int[0]:
                            _tmp_str.append(line_2I5.write([at_num, 0]))
                        ### If current atom atom_i is equivalent to another atom
                        ### which is present in a different group than atom_i, then
                        ### we should not put equivalence constraints on these two
                        ### atoms.
                        ### If we already have encountered this canonical rank before
                        ### freeze atom_i to the atom that was our first encounter with
                        ### this canonical rank. Note, that resp expects atom counting
                        ### to start at 1, *not* 0.
                        else:
                            _tmp_str.append(
                                line_2I5.write(
                                    [at_num, canon_eq_int[0] + 1]))
                        # else:
                        #     found_in_group = False
                        #     for index2, atom_j in enumerate(self._group_atom_list):
                        #         if canon_eq_int[0]==atom_j \
                        #         and self._group_mol_list[index2]==mol_i:
                        #             for index, atom_k in enumerate(self._group_atom_list):
                        #                 if atom_i==atom_k \
                        #                 and self._group_mol_list[index]==mol_i:
                        #                     if index2 == index:
                        #                         _tmp_str.append(line_2I5.write([at_num, canon_eq_int[0]+1]))
                        #                     else:
                        #                         _tmp_str.append(line_2I5.write([at_num, 0]))
                        #                     found_in_group = True
                        #                 if found_in_group:
                        #                     break
                        #         if found_in_group:
                        #             break
                        #
                        #     if not found_in_group:
                        #         _tmp_str.append(line_2I5.write([at_num, 0]))
                    else:
                        _tmp_str.append(line_2I5.write([at_num, 0]))
                # terminate the record of this atom
                _tmp_str.append('\n')
        # NOTE(review): reconstructed as one extra blank line per molecule
        # when several molecules are present -- confirm the nesting level.
        if self._mol_count > 1:
            _tmp_str.append('\n')

    return ''.join(_tmp_str)
def make_multiple_cuts(mol, atom_pairs, chiral_flags, fragment_filter):
    """Yield the fragmentations obtained by cutting `mol` at `atom_pairs`.

    The molecule is fragmented on all given atom pairs (at least two). The
    fragment carrying as many wildcard atoms as there are cuts is the
    variable part; fragments carrying one wildcard form the constant part.
    The generator ends without yielding when a constant fragment has fewer
    non-wildcard atoms than ``fragment_filter.min_heavies_per_const_frag``
    or when a fragment has any other number of wildcards.

    For every result of ``up_enumerate`` that leads to a SMILES not seen
    before, one ``Fragmentation`` is yielded.

    Args:
        mol: molecule to cut.
        atom_pairs: the atom pairs to cut; ``len(atom_pairs) >= 2``.
        chiral_flags: chiral flags of the parent molecule, passed on to
            ``up_enumerate``.
        fragment_filter: provides ``min_heavies_per_const_frag``.
    """
    num_cuts = len(atom_pairs)
    assert num_cuts >= 2, num_cuts
    fragmented_mol, other_atom_table = fragment_on_atom_pairs(mol, atom_pairs)

    # Figure out which atoms are in the variable part and which atoms are in the constant part.
    constant_atom_indices = []
    variable_atom_indices = []
    for atom_indices in Chem.GetMolFrags(fragmented_mol):
        non_wildcard_indices = []
        for atom_index in atom_indices:
            # wildcard atoms have atomic number 0
            if fragmented_mol.GetAtomWithIdx(atom_index).GetAtomicNum() != 0:
                non_wildcard_indices.append(atom_index)

        num_wildcard_atoms = len(atom_indices) - len(non_wildcard_indices)
        if num_wildcard_atoms == 1:
            # Filter out fragmentations with too small fragments in the constant
            if len(non_wildcard_indices
                   ) < fragment_filter.min_heavies_per_const_frag:
                return
            constant_atom_indices.extend(non_wildcard_indices)
        elif num_wildcard_atoms == num_cuts:
            variable_atom_indices.extend(non_wildcard_indices)
        else:
            # Did not cut into core+rgroups
            return

    # # Filter out fragmentations with too small fragments in the constant
    # if fragment_filter.min_heavies_per_const_frag > 0:
    #     for frag in Chem.GetMolFrags(fragmented_mol, asMols=True):
    #         num_wildcards = 0
    #         for atom in frag.GetAtoms():
    #             if atom.GetAtomicNum() == 0:
    #                 num_wildcards += 1
    #         if num_wildcards == 1 and frag.GetNumHeavyAtoms() < fragment_filter.min_heavies_per_const_frag:
    #             yield None
    #             return

    # Determine the symmetry of the variable part
    fragmented_mol.UpdatePropertyCache(
        strict=False)  # XXX magic; without it I get a RuntimeError
    Chem.AssignStereochemistry(fragmented_mol, cleanIt=True, force=True)
    # "getNumImplicitHs() called without preceding call to calcImplicitValence()"
    new_atom_ranks = Chem.CanonicalRankAtoms(fragmented_mol, breakTies=False)
    # NOTE(review): the flags are computed on the parent `mol` with the ranks
    # of the fragmented molecule -- confirm this is intended.
    new_chiral_flags = get_chiral_flags(mol, new_atom_ranks)

    seen_smiles = set()
    #
    for enumeration_label, chiral_assignments in up_enumerate(
            fragmented_mol, constant_atom_indices, variable_atom_indices, chiral_flags,
            new_chiral_flags):
        if enumeration_label == EnumerationLabel.NO_ENUMERATION:
            assert chiral_assignments is None
            atom_ranks = new_atom_ranks
            ## print("reused:", list(atom_ranks))
        else:
            # apply the enumerated chiral tags, then re-rank the atoms
            for (atom_index, chiral_tag) in chiral_assignments:
                fragmented_mol.GetAtomWithIdx(atom_index).SetChiralTag(
                    chiral_tag)
            fragmented_mol.ClearComputedProps()  # XXX Do I need this?
            atom_ranks = Chem.CanonicalRankAtoms(fragmented_mol,
                                                 breakTies=False)
            ## print("computed:", list(atom_ranks))

        # Work in SMILES space so we find a canonical mapping between the
        # unlabeled canonical variable and canonical constant parts.
        smiles = cansmiles(fragmented_mol)
        #print("smiles", smiles)

        # The up-enumeration may have several ways to generate the same structure.
        # For example, flipping two "@"s to "@@"s may leave the structure unchanged.
        if smiles in seen_smiles:
            continue
        seen_smiles.add(smiles)

        # Figure out which is the variable/core structure.
        # It's the one with the most "*"s on it (must equal the number of cuts)
        frag_smiles_list = smiles.split(".")
        assert len(frag_smiles_list) == num_cuts + 1, smiles

        variable_component_index = _get_variable_index(frag_smiles_list)
        if variable_component_index is None:
            # 3 cuts but no fragment with three "*"s
            raise AssertionError(("I already checked for this", smiles))

        #print("core is at", variable_component_index)

        # Get the mapping from position in the SMILES string to atom index in the molecule
        smiles_index_to_atom_index = get_atom_order_in_smiles(fragmented_mol)

        # Determine the constant part (the rgroups)
        constant_component_indices = list(range(num_cuts + 1))
        del constant_component_indices[variable_component_index]
        constant_smiles_list = [
            frag_smiles_list[i] for i in constant_component_indices
        ]
        assert len(constant_smiles_list) == num_cuts

        # Find the connection points on the variable part
        component_atom_symbols = get_component_atom_symbols(smiles)
        variable_connection_atom_indices = []
        variable_atom_indices2 = []
        for smiles_index, smiles_symbol in component_atom_symbols[
                variable_component_index]:
            atom_index = smiles_index_to_atom_index[smiles_index]
            if "*" in smiles_symbol:
                variable_connection_atom_indices.append(atom_index)
            else:
                variable_atom_indices2.append(atom_index)

        # XXX Remove
        # cross-check: the SMILES-derived variable atoms must equal the ones
        # found from the fragment analysis above
        assert sorted(variable_atom_indices) == sorted(
            variable_atom_indices2), (sorted(variable_atom_indices),
                                      sorted(variable_atom_indices2))

        assert len(variable_connection_atom_indices) == num_cuts
        #print("variable_connection_atom_indices", variable_connection_atom_indices)
        variable_symmetry_class = get_symmetry_class(
            *(atom_ranks[atom_index]
              for atom_index in variable_connection_atom_indices))

        # Determine the symmetry of the constant part (the rgroups)
        constant_symmetry_class = get_symmetry_class(*constant_smiles_list)

        # Figure out which R-groups in the constant part correspond to the
        # attachment points in the core/variable part.
        atom_index_to_rgroup_label = {}
        # rebinding the name: the list built before the loop is not modified
        constant_atom_indices = []
        for rgroup_id, component_i in enumerate(constant_component_indices):
            rgroup_label = str(rgroup_id)
            for (smiles_index,
                 smiles_symbol) in component_atom_symbols[component_i]:
                atom_index = smiles_index_to_atom_index[smiles_index]
                atom_index_to_rgroup_label[atom_index] = rgroup_label
                if "*" not in smiles_symbol:
                    constant_atom_indices.append(atom_index)

        # one R-group label per connection atom of the variable part, looked
        # up through the atom on the other side of the cut
        attachment_order = "".join(
            atom_index_to_rgroup_label[other_atom_table[atom_index]]
            for atom_index in variable_connection_atom_indices)

        # Figure the canonical attachment order
        canonical_attachment_order = CANONICAL_ATTACHMENT_ORDER[
            variable_symmetry_class, constant_symmetry_class, attachment_order]

        # Figure out which atoms in the variable part are still chiral
        ## fragmented_chiral_flags = get_chiral_flags(fragmented_mol, atom_ranks)

        ## variable_num_chirals, variable_num_lost_chirals, variable_num_new_stereocenters = \
        ##         get_chiral_difference(variable_atom_indices2, chiral_flags, fragmented_chiral_flags)
        ## constant_num_chirals, constant_num_lost_chirals, constant_num_new_stereocenters = \
        ##         get_chiral_difference(constant_atom_indices2, chiral_flags, fragmented_chiral_flags)

        variable_smiles = frag_smiles_list[variable_component_index]
        constant_smiles = ".".join(constant_smiles_list)

        ## print("variable_smiles:", variable_smiles)
        ## print("constant_smiles:", constant_smiles)

        # Test that I can reconnect
        # (disabled: the condition below is always false)
        if 0:
            offsets = [int(c) for c in canonical_attachment_order]
            var_part = smiles_syntax.convert_wildcards_to_closures(
                variable_smiles, offsets)
            const_part = smiles_syntax.convert_wildcards_to_closures(
                constant_smiles, list(range(num_cuts)))
            smi = Chem.CanonSmiles(var_part + "." + const_part, 0)
            expected_smi = Chem.MolToSmiles(mol)
            if smi != expected_smi:
                print(" Got:", smi)
                print("Expected:", expected_smi)
            assert smi == expected_smi, (smi, expected_smi)

        ## print("Fragmentation")
        ## print(get_num_heavies_from_smiles(variable_smiles), variable_symmetry_class, variable_smiles)
        yield Fragmentation(
            num_cuts,
            enumeration_label,
            get_num_heavies_from_smiles(variable_smiles),
            variable_symmetry_class,
            variable_smiles,
            canonical_attachment_order,
            get_num_heavies_from_smiles(constant_smiles),
            constant_symmetry_class,
            constant_smiles,
            None,
        )
def make_single_cut(mol, atom_pair, chiral_flags, fragment_filter):
    """Yield the single-cut fragmentations of `mol` across `atom_pair`.

    The molecule is split into two fragments at the given atom pair. Each
    fragment is considered in turn as the constant part, with the other
    fragment as the variable part. Orientations whose constant part has
    fewer heavy atoms than `fragment_filter.min_heavies_per_const_frag`
    are skipped.

    Args:
        mol: the parent RDKit molecule.
        atom_pair: the pair of atoms to cut between; passed to
            `fragment_on_atom_pairs` as a one-element list.
        chiral_flags: per-atom chirality flags of the parent molecule
            (0 = not chiral, 1 = assigned, 2 = unassigned).
        fragment_filter: object providing `min_heavies_per_const_frag`.

    Yields:
        `Fragmentation` records with one cut. For every accepted
        orientation: one NO_ENUMERATION record, then one
        CONSTANT_UP_ENUMERATION record per up-enumerated constant SMILES,
        then one VARIABLE_UP_ENUMERATION record per up-enumerated variable
        SMILES.
    """
    fragmented_mol, _ = fragment_on_atom_pairs(mol, [atom_pair])
    frag1_indices, frag2_indices = Chem.GetMolFrags(fragmented_mol)

    # The wildcard atoms should be the last two atoms in the fragmented
    # molecule; they are excluded below when looking for stereocenters.
    num_atoms = fragmented_mol.GetNumAtoms()
    a1, a2 = num_atoms - 1, num_atoms - 2
    assert fragmented_mol.GetAtomWithIdx(a1).GetAtomicNum() == 0
    assert fragmented_mol.GetAtomWithIdx(a2).GetAtomicNum() == 0

    frag1_smiles = Chem.MolFragmentToSmiles(
        fragmented_mol, frag1_indices, isomericSmiles=True)
    frag2_smiles = Chem.MolFragmentToSmiles(
        fragmented_mol, frag2_indices, isomericSmiles=True)

    frag1_num_atoms = get_num_heavies_from_smiles(frag1_smiles)
    frag2_num_atoms = get_num_heavies_from_smiles(frag2_smiles)

    # Determine the symmetry of both parts
    # XXX magic; without it I get a RuntimeError
    fragmented_mol.UpdatePropertyCache(strict=False)

    # Need to clear chiral tags which are no longer relevant because the new
    # wildcards are symmetric. The canonical SMILES output is affected by an
    # atom's chiral tag, even if the output doesn't denote chirality for that
    # atom. I need to clear the tags to get a truly canonical output.
    # See https://sourceforge.net/p/rdkit/mailman/message/35420297/ , from Greg
    # Landrum, on 2016-10-11 05:39:12 titled "identify chiral atoms which
    # became achiral after fragmenting".
    Chem.AssignStereochemistry(fragmented_mol, cleanIt=True, force=True)

    # "getNumImplicitHs() called without preceding call to calcImplicitValence()"
    new_atom_ranks = Chem.CanonicalRankAtoms(fragmented_mol, breakTies=False)
    # NOTE(review): the flags are computed on the parent `mol` using the
    # ranks of `fragmented_mol` -- presumably intentional; confirm.
    new_chiral_flags = get_chiral_flags(mol, new_atom_ranks)

    # For each fragment, collect the SMILES produced by assigning chirality
    # to the stereocenters reported by get_new_stereocenter_indices().
    up_enumerations = []
    for frag_indices in (frag1_indices, frag2_indices):
        # Use the indices of the fragment being processed (not always the
        # first one), minus the two trailing wildcard atoms.
        frag_indices_without_wildcard = [a for a in frag_indices if a < a2]
        chiral_indices = get_new_stereocenter_indices(
            frag_indices_without_wildcard, chiral_flags, new_chiral_flags)

        up_enumeration = set()
        for chiral_assignment in chiral_enumerate(chiral_indices):
            for atom_index, chiral_tag in chiral_assignment:
                fragmented_mol.GetAtomWithIdx(atom_index).SetChiralTag(
                    chiral_tag)
            up_smiles = Chem.MolFragmentToSmiles(
                fragmented_mol, frag_indices, isomericSmiles=True)
            up_enumeration.add(up_smiles)
        up_enumerations.append(up_enumeration)

    frag1_up_enumerations, frag2_up_enumerations = up_enumerations

    # Try both orientations: first fragment 1 is the constant part and
    # fragment 2 is the variable part, then the other way around.
    orientations = (
        (frag1_num_atoms, frag1_smiles, frag1_up_enumerations,
         frag2_num_atoms, frag2_smiles, frag2_up_enumerations),
        (frag2_num_atoms, frag2_smiles, frag2_up_enumerations,
         frag1_num_atoms, frag1_smiles, frag1_up_enumerations),
    )
    for (constant_num_atoms, constant_smiles, constant_up_enumerations,
         variable_num_atoms, variable_smiles,
         variable_up_enumerations) in orientations:
        if constant_num_atoms < fragment_filter.min_heavies_per_const_frag:
            continue

        constant_smiles_with_H = replace_wildcard_with_H(constant_smiles)
        yield Fragmentation(
            1, EnumerationLabel.NO_ENUMERATION,
            variable_num_atoms, "1", variable_smiles, "0",
            constant_num_atoms, "1", constant_smiles, constant_smiles_with_H)

        # up-enumeration in the constant part
        for constant_up_smiles in constant_up_enumerations:
            yield Fragmentation(
                1, EnumerationLabel.CONSTANT_UP_ENUMERATION,
                variable_num_atoms, "1", variable_smiles, "0",
                constant_num_atoms, "1", constant_up_smiles,
                replace_wildcard_with_H(constant_up_smiles))

        # up-enumeration in the variable part; the constant part stays the
        # un-enumerated SMILES so it matches constant_smiles_with_H.
        for variable_up_smiles in variable_up_enumerations:
            yield Fragmentation(
                1, EnumerationLabel.VARIABLE_UP_ENUMERATION,
                variable_num_atoms, "1", variable_up_smiles, "0",
                constant_num_atoms, "1", constant_smiles,
                constant_smiles_with_H)