def test_compute_smiles_from_molecule_no_hs(self):
  """Checks SMILES output for a molecule parsed without any hydrogens.

  'FOC' is parsed unsanitized and the computed SMILES must be 'COF' for
  both values of `include_hs`.
  """
  molecule = Chem.MolFromSmiles('FOC', sanitize=False)
  # The include_hs=True result is expected to be identical: if there were
  # no Hs in the molecule, they will not appear in the SMILES either.
  for include_hs in (False, True):
    computed = smu_utils_lib.compute_smiles_for_molecule(
        molecule, include_hs=include_hs)
    self.assertEqual(computed, 'COF')
def test_compute_smiles_from_molecule_with_hs(self):
  """Checks SMILES output once hydrogens were added to the molecule.

  'FOC' is parsed unsanitized, only the SANITIZE_ADJUSTHS step is run, and
  `Chem.AddHs` is applied. The hydrogens must show up in the SMILES only
  when `include_hs=True`.
  """
  heavy_only = Chem.MolFromSmiles('FOC', sanitize=False)
  Chem.SanitizeMol(heavy_only, Chem.rdmolops.SanitizeFlags.SANITIZE_ADJUSTHS)
  with_hydrogens = Chem.AddHs(heavy_only)

  expectations = (
      (False, 'COF'),
      (True, '[H]C([H])([H])OF'),
  )
  for include_hs, expected_smiles in expectations:
    computed = smu_utils_lib.compute_smiles_for_molecule(
        with_hydrogens, include_hs=include_hs)
    self.assertEqual(computed, expected_smiles)
def test_compute_smiles_from_molecule_labeled_no_h(self):
  """Checks the atom-labeled SMILES when hydrogens are not written out."""
  input_smiles = '[O-][N+]([H])([H])N([H])OC([H])([H])F'
  expected_labeled = '[O-][NH2+:1][NH:2][O:3][CH2:4][F:5]'

  molecule = Chem.MolFromSmiles(input_smiles, sanitize=False)
  # Guard against a parse failure before asserting on the output.
  self.assertIsNotNone(molecule)

  labeled = smu_utils_lib.compute_smiles_for_molecule(
      molecule, include_hs=False, labeled_atoms=True)
  self.assertEqual(expected_labeled, labeled)
def test_compute_smiles_from_molecule_labeled_with_h(self):
  """Checks the atom-labeled SMILES when hydrogens are written out."""
  input_smiles = '[O-][N+]([H])([H])N([H])OC([H])([H])F'
  expected_labeled = (
      '[O-][N+:1]([H:2])([H:3])[N:4]([H:5])[O:6][C:7]([H:8])([H:9])[F:10]')

  molecule = Chem.MolFromSmiles(input_smiles, sanitize=False)
  # Guard against a parse failure before asserting on the output.
  self.assertIsNotNone(molecule)

  labeled = smu_utils_lib.compute_smiles_for_molecule(
      molecule, include_hs=True, labeled_atoms=True)
  self.assertEqual(expected_labeled, labeled)
def test_compute_smiles_from_molecule_special_case(self):
  """Checks the SMILES computed for the 'C12=C3C4=C1C4=C23' special case."""
  special_input = 'C12=C3C4=C1C4=C23'
  molecule = Chem.MolFromSmiles(special_input, sanitize=False)

  # Sanity check that this input really is the special case: RDKit's own
  # kekulized SMILES hands back exactly what was put in, even though that
  # is not the string we want.
  rdkit_smiles = Chem.MolToSmiles(molecule, kekuleSmiles=True)
  self.assertEqual(special_input, rdkit_smiles)

  computed = smu_utils_lib.compute_smiles_for_molecule(
      molecule, include_hs=False)
  self.assertEqual(computed, 'C12=C3C1=C1C2=C31')
def find_by_smiles(self, smiles):
  """Looks up the conformers stored for a SMILES string.

  The input is parsed unsanitized and converted with
  `smu_utils_lib.compute_smiles_for_molecule(..., include_hs=False)`; the
  resulting string is matched against the SMILES table to obtain a bond
  topology id, which is then resolved via `find_by_bond_topology_id`.

  Args:
    smiles: string

  Returns:
    An empty list when the SMILES table has no matching row; otherwise
    whatever `find_by_bond_topology_id` returns for the matched id
    (iterable of dataset_pb2.Conformer).
  """
  parsed = Chem.MolFromSmiles(smiles, sanitize=False)
  lookup_key = smu_utils_lib.compute_smiles_for_molecule(
      parsed, include_hs=False)

  cursor = self._conn.cursor()
  query = f'SELECT btid FROM {_SMILES_TABLE_NAME} WHERE smiles = ?'
  cursor.execute(query, (lookup_key,))
  rows = cursor.fetchall()

  if rows:
    # The smiles column has a unique index, so exactly one row holding a
    # single value is expected.
    assert len(rows) == 1
    assert len(rows[0]) == 1
    return self.find_by_bond_topology_id(rows[0][0])
  return []
def bond_topologies_from_geom(bond_lengths, conformer_id, fate, bond_topology,
                              geometry, matching_parameters):
  """Return all BondTopology's that are plausible.

  Given a molecule described by `bond_topology` and `geometry`, return all
  possible BondTopology that are consistent with that.

  Outline:
    1. Hydrogens are attached with `hydrogen_to_nearest_atom` to form the
       starting topology.
    2. Every pair of non-hydrogen atoms no further apart than `THRESHOLD` is
       scored per bond type with `bond_lengths.probability_of_bond_types`.
    3. Every combination from the `SmuMolecule` search state is tried;
       candidates are de-duplicated on their with-hydrogens SMILES and
       filtered according to `matching_parameters`.
    4. Surviving candidates get a geometry score and a log-normalised
       topology score.

  NOTE(review): an earlier version of this docstring said `bond_topology`
  "will be put in a canonical form". In this function only the candidate
  topologies are passed to `utilities.canonical_bond_topology`; whether a
  helper mutates the input is not visible here -- confirm.

  Args:
    bond_lengths: object providing `probability_of_bond_types(atom_type,
      atom_type, distance)`; also forwarded to `geometry_score`.
    conformer_id: copied unchanged into the result.
    fate: outcome of calculations; copied unchanged into the result.
    bond_topology: the starting topology. Its `atoms`, `smiles` and
      `bond_topology_id` fields are read.
    geometry: coordinates for the bond_topology; `atom_positions` must have
      one entry per atom, otherwise no topologies are returned.
    matching_parameters: search options. `consider_not_bonded`,
      `ring_atom_count_cannot_decrease` and `smiles_with_h` are read here;
      the object is also forwarded to `SmuMolecule` and `place_bonds`.

  Returns:
    TopologyMatches with `starting_smiles`, `conformer_id` and `fate` always
    set. `bond_topology` is left empty for a single-atom input, an
    atom-count mismatch, a failed hydrogen attachment, or when no atom pair
    produced any bond scores.
  """
  result = dataset_pb2.TopologyMatches()  # To be returned.
  result.starting_smiles = bond_topology.smiles
  result.conformer_id = conformer_id
  result.fate = fate

  natoms = len(bond_topology.atoms)
  if natoms == 1:
    return result  # empty.

  if len(geometry.atom_positions) != natoms:
    return result  # empty

  # Indexed below as distances[i, j] with atom indices.
  distances = utilities.distances(geometry)

  # First join each Hydrogen to its nearest heavy atom, thereby
  # creating a starting BondTopology from which all others can grow
  starting_bond_topology = hydrogen_to_nearest_atom(bond_topology, distances)
  if starting_bond_topology is None:
    return result

  # Indices of every atom that is not a hydrogen.
  heavy_atom_indices = [
      i for i, t in enumerate(bond_topology.atoms)
      if t != dataset_pb2.BondTopology.AtomType.ATOM_H
  ]

  # For each atom pair, a list of possible bond types.
  # Key is a tuple of the two atom numbers, value is an np.array
  # with the score for each bond type.
  bonds_to_scores: Dict[Tuple[int, int], np.ndarray] = {}
  for (i, j) in itertools.combinations(heavy_atom_indices, 2):  # All pairs.
    dist = distances[i, j]
    # Pairs further apart than THRESHOLD are never considered.
    if dist > THRESHOLD:
      continue
    try:
      possible_bonds = bond_lengths.probability_of_bond_types(
          bond_topology.atoms[i], bond_topology.atoms[j], dist)
    except KeyError:  # Happens when this bond type has no data
      continue
    if not possible_bonds:
      continue
    # Note that this relies on the fact that BOND_SINGLE==1 etc..
    # The bond type value is used directly as the index into the 4-slot
    # array; slots without an entry in possible_bonds stay 0.
    btypes = np.zeros(4, np.float32)
    for key, value in possible_bonds.items():
      btypes[key] = value
    bonds_to_scores[(i, j)] = btypes

  if not bonds_to_scores:  # Seems unlikely.
    return result

  # Need to know when the starting smiles has been recovered.
  rdkit_mol = smu_utils_lib.bond_topology_to_molecule(bond_topology)
  starting_smiles = smu_utils_lib.compute_smiles_for_molecule(
      rdkit_mol, include_hs=True)

  # Baseline for the ring_atom_count_cannot_decrease filter below.
  initial_ring_atom_count = utilities.ring_atom_count_mol(rdkit_mol)

  # Avoid finding duplicates.
  all_found_smiles: Set[str] = set()

  mol = smu_molecule.SmuMolecule(starting_bond_topology, bonds_to_scores,
                                 matching_parameters)

  search_space = mol.generate_search_state()
  # Each `s` picks one element from every entry of the search space.
  for s in itertools.product(*search_space):
    bt = mol.place_bonds(list(s), matching_parameters)
    # A falsy result means no topology was produced for this combination.
    if not bt:
      continue

    rdkit_mol = smu_utils_lib.bond_topology_to_molecule(bt)
    # With consider_not_bonded set, drop candidates that fall apart into
    # more than one fragment.
    if matching_parameters.consider_not_bonded and len(
        Chem.GetMolFrags(rdkit_mol)) > 1:
      continue

    # De-duplication always uses the SMILES that includes hydrogens,
    # regardless of matching_parameters.smiles_with_h.
    found_smiles = smu_utils_lib.compute_smiles_for_molecule(
        rdkit_mol, include_hs=True)
    if found_smiles in all_found_smiles:
      continue

    # Recorded before the ring filter, so a SMILES rejected there is not
    # reconsidered later either.
    all_found_smiles.add(found_smiles)

    if matching_parameters.ring_atom_count_cannot_decrease:
      ring_atoms = utilities.ring_atom_count_mol(rdkit_mol)
      if ring_atoms < initial_ring_atom_count:
        continue
      # ring_atom_count is only populated when this option is enabled.
      bt.ring_atom_count = ring_atoms

    bt.bond_topology_id = bond_topology.bond_topology_id
    utilities.canonical_bond_topology(bt)

    if found_smiles == starting_smiles:
      bt.is_starting_topology = True

    # The SMILES stored on the candidate omits hydrogens unless
    # smiles_with_h is set.
    if not matching_parameters.smiles_with_h:
      found_smiles = smu_utils_lib.compute_smiles_for_molecule(
          rdkit_mol, include_hs=False)

    bt.geometry_score = geometry_score(bt, distances, bond_lengths)
    bt.smiles = found_smiles
    result.bond_topology.append(bt)

  # Highest `score` first; presumably `score` was filled in by place_bonds.
  if len(result.bond_topology) > 1:
    result.bond_topology.sort(key=lambda bt: bt.score, reverse=True)

  # NOTE(review): indentation was lost in the reviewed copy. The
  # normalisation is assumed to apply to every returned candidate, not only
  # when more than one was found -- confirm against version control.
  # topology_score is the log of each candidate's share of the summed
  # scores; the raw `score` field is cleared afterwards.
  score_sum = np.sum([bt.score for bt in result.bond_topology])
  for bt in result.bond_topology:
    bt.topology_score = np.log(bt.score / score_sum)
    bt.ClearField("score")

  return result