def testNumberOfHitsWithStartAtom(self):
    """
    Check the number of motif instances found by find_motif.start_motif
    when the search is anchored at a given start atom.

    For every SMILES in the table a StationaryPoint is built and
    characterized, the motif is searched from the listed start atom
    with every atom being its own equivalence class, and the number of
    hits is compared to the expected number.
    """
    # smiles -> [multiplicity, start atom, motif to search for,
    #            expected number of hits of the motif]
    cases = {
        'CCCO[O]': [2, 0, ['C', 'C', 'C', 'O', 'O'], 1],
        'C=C': [1, 0, ['X', 'X', 'X'], 2],
        'S=S': [1, 0, ['S', 'S'], 1],
    }

    for smi, (mult, start, motif, exp) in cases.items():
        st_pt = StationaryPoint(smi, 0, mult, smiles=smi)
        st_pt.characterize()

        # do not use any equivalencies: one singleton class per atom
        eqv = [[k] for k in range(st_pt.natom)]
        hits = find_motif.start_motif(motif, st_pt.natom, st_pt.bond,
                                      st_pt.atom, start, eqv)
        cal = len(hits)

        warn = 'Unexpected number of motif hits for ' + \
            '{}, expected {}, calculated {}'.format(smi, exp, cal)
        self.assertEqual(exp, cal, warn)
def rigid_along_path(self, atomi, atomj):
    """
    Tell whether the connection between two atoms is rigid.

    Directly bonded atoms: returns 1 if their bond order is larger
    than 1 or if atomi is flagged in self.cycle, 0 otherwise.

    Not directly bonded: chains of 3, 4, ..., natom - 1 atoms are
    searched with find_motif, and the first chain found that connects
    the two atoms (in either direction) is inspected. It is rigid (1)
    if one of its inner atoms is flagged in self.cycle, or is doubly
    bonded to an atom whose bond orders sum to more than 2, i.e. an
    atom with at least one other neighbor. Otherwise 0 is returned.
    0 is also returned if no connecting chain is found.
    """
    direct = self.bond[atomi][atomj]
    if direct > 0:
        # multiple bond between the two atoms, or atomi is in a cycle
        if direct > 1 or self.cycle[atomi] == 1:
            return 1
        return 0

    endpoints = ((atomi, atomj), (atomj, atomi))
    for chain_length in range(3, self.natom):
        motif = ['X'] * chain_length
        no_eqv = [[k] for k in range(self.natom)]
        paths = find_motif.start_motif(motif, self.natom, self.bond,
                                       self.atom, -1, no_eqv)
        if len(paths) == 0:
            # no chain of this length, so no longer one either
            break

        for path in paths:
            if (path[0], path[-1]) not in endpoints:
                continue
            for inner in path[1:-1]:
                if self.cycle[inner] == 1:
                    return 1
                if 2 in self.bond[inner]:
                    for neigh, order in enumerate(self.bond[inner]):
                        # doubly bonded partner with at least one
                        # other neighbor
                        if order == 2 and sum(self.bond[neigh]) > 2:
                            return 1
            # first connecting chain has no rigid inner atom
            return 0
    return 0
def testBondFilter(self):
    """
    Check find_motif.bondfilter on the C-C-C-C motif of 'CCC=CCC'.

    All instances of the motif are searched without a fixed start
    atom (start index -1) and without equivalencies. The instances for
    which bondfilter returns 0 for the bond pattern [2, 'X', 'X'] are
    counted and the count is compared to the expected value.
    """
    smi = 'CCC=CCC'
    motif = ['C', 'C', 'C', 'C']
    bondpattern = [2, 'X', 'X']
    exp = 2

    st_pt = StationaryPoint(smi, 0, 1, smiles=smi)
    st_pt.characterize()
    bond = st_pt.bond

    # do not use any equivalencies: one singleton class per atom
    eqv = [[k] for k in range(st_pt.natom)]
    hits = find_motif.start_motif(motif, st_pt.natom, bond, st_pt.atom,
                                  -1, eqv)

    # a return value of 0 marks a hit that is counted
    count = sum(1 for hit in hits
                if find_motif.bondfilter(hit, bond, bondpattern) == 0)

    warn = 'Unexpected number of motif hits for ' + \
        '{}, expected {}, calculated {}'.format(smi, exp, count)
    self.assertEqual(exp, count, warn)
def calc_chiral(self):
    """
    Calculate self.chiral, one label per atom.

    The label is 0 by default. It is set to the value returned by
    self.calc_chiral_hand (+1 or -1 according to the original
    documentation of this method) in two cases:

    1. The atom has exactly 4 neighbors and their atomid values are
       all different.
    2. The atom starts a chain of 2, 4, 6 or 8 consecutive double
       bonds (the >C=C=C< case) and the atoms attached to the two
       chain ends, excluding the chain itself, have 4 different
       atomid values. The label is then stored on the central atom of
       the chain.

    Bonds are taken as the element-wise minimum over ALL resonance
    structures in self.bonds, as resonance-delocalized portions are
    planar and do not contribute to chirality.

    Always returns 0.
    """
    self.chiral = np.zeros(self.natom)

    # Running minimum over every resonance structure. The previous
    # implementation overwrote the result in each iteration and hence
    # only used the last two structures when there were three or more.
    reduced_bond = self.bonds[0]
    for resonance_bond in self.bonds[1:]:
        reduced_bond = np.minimum(reduced_bond, resonance_bond)

    for i in range(self.natom):
        if np.count_nonzero(reduced_bond[i] > 0) == 4:
            # exactly 4 neighbors
            neighbors = [j for j in range(self.natom)
                         if reduced_bond[i][j] > 0]
            atids = [self.atomid[j] for j in neighbors]
            if len(set(atids)) == 4:
                # all are different
                positions = np.array([self.geom[j] for j in neighbors],
                                     dtype=float)
                self.chiral[i] = self.calc_chiral_hand(self.geom[i],
                                                       positions, atids)

        if np.count_nonzero(reduced_bond[i] == 2) > 0:
            # has at least one double bond
            for dlen in range(2, 9, 2):
                # up to 8, even number of double bonds in a row
                motif = ['X'] * (dlen + 1)
                bondpattern = [2] * dlen
                instances = find_motif.start_motif(motif, self.natom,
                                                   reduced_bond, self.atom,
                                                   i, self.atom_eqv)
                for instance in instances:
                    if find_motif.bondfilter(instance, reduced_bond,
                                             bondpattern) != 0:
                        continue
                    # atoms bonded to the first or last atom of the
                    # chain that are not part of the chain
                    ends = [j for j in range(self.natom)
                            if (reduced_bond[instance[0]][j] > 0
                                or reduced_bond[instance[-1]][j] > 0)
                            and j not in instance]
                    atids = [self.atomid[j] for j in ends]
                    if len(set(atids)) == 4:
                        positions = np.array([self.geom[j] for j in ends],
                                             dtype=float)
                        center = instance[dlen // 2]
                        self.chiral[center] = self.calc_chiral_hand(
                            self.geom[center], positions, atids)
    return 0
def get_chain(a1, a2, mol):
    """
    Get the shortest chain between two atoms.

    Chains of 1, 2, ..., mol.natom - 1 atoms starting at a1 are
    searched with find_motif, shortest first, without equivalencies.
    The first chain found that ends on a2 is returned. An empty list
    is returned if none of the searched chains ends on a2.
    """
    natom = mol.natom
    for length in range(1, natom):
        motif = ['X'] * length
        no_eqv = [[k] for k in range(natom)]
        chains = find_motif.start_motif(motif, natom, mol.bond, mol.atom,
                                        a1, no_eqv)
        for chain in chains:
            if chain[-1] == a2:
                return chain
    return []
def find_cycle(self):
    """
    Find all the cycles in a molecule, if any.

    Motifs ['X', 'X', ..., 'X'] of length 3 up to natom are searched,
    and a motif instance is a cycle if its first and last atom are
    bonded. The search is halted before reaching natom if a certain
    motif length does not give any hit.

    TODO: leave all the leaves of the graph out of the search, i.e.
    the atoms that only have one neighbor, as they never participate
    in a cycle.

    Results are stored on the object:
    self.cycle_chain: list of lists holding the atom indices of each
        cycle. A cycle is only added if no stored cycle has the same
        set of atoms, so for fused cycles all the possible cycles are
        kept (e.g. two fused rings lead to three cycles).
    self.cycle: one flag per atom, 1 if the atom is in a cycle and 0
        otherwise.

    Always returns 0.
    """
    self.cycle_chain = []  # list of the cycles
    self.cycle = [0 for _ in range(self.natom)]

    for cycle_size in range(3, self.natom + 1):
        motif = ['X'] * cycle_size
        no_eqv = [[k] for k in range(self.natom)]
        instances = find_motif.start_motif(motif, self.natom, self.bond,
                                           self.atom, -1, no_eqv)
        if len(instances) == 0:
            break

        for ins in instances:
            if not self.bond[ins[0]][ins[-1]]:
                # chain ends are not bonded: not a cycle
                continue
            members = sorted(ins)
            if any(sorted(cyc) == members for cyc in self.cycle_chain):
                # same atoms as a cycle that is already stored
                continue
            self.cycle_chain.append(ins)
            for at in ins:
                self.cycle[at] = 1
    return 0
def divide_atoms(ati, atj, bond, natom, atom):
    """
    Divide the atoms of a molecule in two sets which are separated by
    the bond between ati and atj.

    Returns (status, division, division2):
    - if ati and atj are not bonded: (0, [ati], [])
    - otherwise: (1, atoms collected on the side of ati, all the
      remaining atoms)

    Rings are cut in half: for every ring instance that starts on one
    of the two atoms and ends on the other, a ring atom near the
    middle of the instance is added to the forbidden atoms, next to
    atj itself. According to the original documentation this changes
    the bond length of the bond furthest away from the given bond, so
    be careful when using this method for cyclic structures!
    """
    if bond[ati, atj] == 0:
        return 0, [ati], []

    # bookkeeping lists handed to get_neighbors, which presumably
    # fills division with the atoms on the side of ati - confirm there
    visited = [ati]
    forbidden = [atj]
    division = [ati]

    # check for cycles and cut them in half
    for ring_size in range(3, natom + 1):
        motif = ['X'] * ring_size
        inst = find_motif.start_motif(motif, natom, bond, atom, -1, [])
        if len(inst) == 0:
            break
        for ins in inst:
            first, last = ins[0], ins[-1]
            if bond[first][last] > 0:
                # cycle found
                if first == ati and last == atj:
                    forbidden.append(ins[ring_size // 2])
                if first == atj and last == ati:
                    forbidden.append(ins[-ring_size // 2 - 1])

    get_neighbors(ati, visited, forbidden, division, bond, natom)
    division2 = [x for x in range(natom) if x not in division]
    return 1, division, division2
def find_cycle(self):
    """
    Find the cycles in a molecule, if any.

    Motifs ['X', 'X', ..., 'X'] of length 3 up to natom are searched,
    and a motif instance is a cycle if its first and last atom are
    bonded. The search is halted before reaching natom if a certain
    motif length does not give any hit. A cycle is only stored if no
    stored cycle has the same set of atoms.

    If more than one cycle is found, the cycles are ordered from the
    largest to the smallest (cycles of equal size keep the order in
    which they were found), and every cycle of which all the atoms
    also belong to cycles further down that order is removed. This
    drops, e.g., the outer cycle of two fused rings.

    TODO: leave all the leaves of the graph out of the search, i.e.
    the atoms that only have one neighbor, as they never participate
    in a cycle.

    Results are stored on the object:
    self.cycle_chain: list of lists holding the atom indices of each
        retained cycle.
    self.cycle: one flag per atom, 1 if the atom is in any cycle
        found (including removed ones) and 0 otherwise.

    Always returns 0.
    """
    self.cycle_chain = []  # list of the cycles
    self.cycle = [0 for _ in range(self.natom)]

    for cycle_size in range(3, self.natom + 1):
        motif = ['X'] * cycle_size
        no_eqv = [[k] for k in range(self.natom)]
        instances = find_motif.start_motif(motif, self.natom, self.bond,
                                           self.atom, -1, no_eqv)
        if len(instances) == 0:
            break
        for ins in instances:
            if not self.bond[ins[0]][ins[-1]]:
                # chain ends are not bonded: not a cycle
                continue
            members = sorted(ins)
            if any(sorted(cyc) == members for cyc in self.cycle_chain):
                # same atoms as a cycle that is already stored
                continue
            self.cycle_chain.append(ins)
            for at in ins:
                self.cycle[at] = 1

    if len(self.cycle_chain) > 1:
        # Largest ring first. The sort is stable, so each ring occurs
        # exactly once, also when several rings share a size (the
        # previous size-by-size copy duplicated such rings).
        by_size = sorted(self.cycle_chain, key=len, reverse=True)

        # Walk from the smallest ring upwards while collecting the
        # atoms seen so far: a ring is dropped if all its atoms are in
        # the rings that come after it in by_size. A new list is built
        # instead of popping from the list that is being iterated,
        # which made the previous implementation skip rings.
        later_atoms = set()
        kept = []
        for ring in reversed(by_size):
            if not later_atoms.issuperset(ring):
                kept.append(ring)
            later_atoms.update(ring)
        kept.reverse()
        self.cycle_chain = kept
    return 0