def string(self, val):
    """Set the molecule from a SMILES or InChI string.

    The value is handed to ``_rdkit.openAsRdkit`` (honouring
    ``self.minimise``) and the stored string is then regenerated from the
    resulting molecule with ``MolToSmiles``.

    Raises:
        ValueError: if ``val`` is a path that exists on the filesystem.
    """
    # Anything that resolves to an existing path is treated as a filename
    # and rejected up front.
    if _os.path.exists(val):
        raise ValueError(
            "Need a SMILES or InChI string instead of a filename")

    self._molecule = _rdkit.openAsRdkit(val, minimise=self.minimise)
    self._string = _rdmolfiles.MolToSmiles(self._molecule)
def test_sdf_file_parser_target_index(sdf_file, test_mols):
    """Parse with ``target_index`` and check features, labels and SMILES.

    Only the molecules at the selected indices are expected in the output,
    in the order given by the index list.
    """
    selected = [0, 2]
    preprocessor = EGCNPreprocessor(max_atoms=49, out_size=49)
    parser = SDFFileParser(preprocessor, labels='Fitness')
    parsed = parser.parse(sdf_file, return_smiles=True,
                          target_index=selected)
    dataset, smiles = parsed['dataset'], parsed['smiles']
    assert len(dataset) == 4

    # Every dataset entry except the last one is compared against the
    # preprocessor's input features, feature by feature and row by row.
    n_features = len(dataset) - 1
    for feat_idx in range(n_features):
        for row, mol_idx in enumerate(selected):
            expected_feats = preprocessor.get_input_feats(test_mols[mol_idx])
            np.testing.assert_array_almost_equal(
                dataset[feat_idx][row], expected_feats[feat_idx], decimal=3)

    # The entry at index 3 carries the parsed labels.
    labels = dataset[3]
    expected_labels = np.array([
        preprocessor.get_labels(test_mols[mol_idx], 'Fitness')
        for mol_idx in selected
    ])
    np.testing.assert_array_almost_equal(labels, expected_labels, decimal=3)

    # SMILES come back as a 1-D ndarray with one entry per parsed molecule.
    assert type(smiles) is np.ndarray
    assert smiles.ndim == 1
    assert len(smiles) == dataset[0].shape[0]
    expected_smiles = np.array([
        rdmolfiles.MolToSmiles(test_mols[mol_idx]) for mol_idx in selected
    ])
    np.testing.assert_array_equal(smiles, expected_smiles)
def _get_smiles(inchi_term):
    '''Convert an InChI term to SMILES, or return None on any failure.

    The InChI is parsed with warnings treated as errors; whatever goes wrong
    during parsing or SMILES generation results in None.
    '''
    smiles = None
    try:
        parsed = inchi.MolFromInchi(inchi_term, treatWarningAsError=True)
        smiles = rdmolfiles.MolToSmiles(parsed)
    except Exception:
        # Best-effort conversion: failures are reported as None.
        pass
    return smiles
def prepare_mol(self, mol: rdchem.Mol) -> Tuple[str, rdchem.Mol]:
    """Standardize a molecule and return it with its canonical SMILES.

    This method should be called before `get_input_feats`.

    Params:
    -------
    mol: rdkit.Chem.rdchem.Mol
        Molecule of interest.

    Returns:
    --------
    canonical_smiles: str
        Canonical SMILES generated from the input molecule, before any
        hydrogens are added or kekulization is applied.

    mol: rdkit.Chem.rdchem.Mol
        Molecule rebuilt from the canonical SMILES, with Hs added when
        `self.add_Hs` is set and kekulized when `self.kekulize` is set.
    """
    canonical_smiles = rdmolfiles.MolToSmiles(mol, canonical=True)

    # Round-trip through the canonical SMILES so the returned molecule is
    # built from the same string that is handed back to the caller.
    standardized = rdmolfiles.MolFromSmiles(canonical_smiles)

    if self.add_Hs:
        standardized = rdmolops.AddHs(standardized)

    if self.kekulize:
        # Called for its effect on the molecule; the return value is unused.
        rdmolops.Kekulize(standardized)

    return canonical_smiles, standardized
def smiles_from_seq(self, seq):
    """Calculates the SMILES of a given peptide dendrimer sequence.

    The sequence is split into generations, branching units, terminal and
    capping. The molecule is then assembled generation by generation, each
    generation being followed by the next branching unit while any remain.

    Arguments:
        seq {string} -- peptide dendrimer sequence

    Returns:
        string -- SMILES of the peptide, or '' when the sequence yields no
        building block at all
    """
    gs, bs, terminal, capping = self.split_seq_components(seq)

    # Start from the C-terminal modification when there is one; the empty
    # string marks "nothing built yet".
    if terminal:
        molecule = rdmolfiles.MolFromSmiles(self.T_SMILES[terminal[0]])
    else:
        molecule = ''

    # creates the dendrimer structure
    for gen in gs:
        for aa in gen:
            if aa == '-':
                # '-' is a marker, not a residue: raise the flag and move on.
                # NOTE(review): the flag is not read in this method;
                # presumably self.connect_mol consumes it - confirm.
                self.metbond = True
                continue

            if molecule == '':
                molecule = rdmolfiles.MolFromSmiles(self.AA_SMILES[aa])
            else:
                molecule = self.connect_mol(
                    molecule, rdmolfiles.MolFromSmiles(self.AA_SMILES[aa]))

        # After each generation, attach the next branching unit (if any).
        if bs:
            if bs[0] == '-':
                self.metbond = True
                bs.pop(0)

            if molecule == '':
                molecule = rdmolfiles.MolFromSmiles(self.B_SMILES[bs[0]])
            else:
                molecule = self.connect_mol(
                    molecule, rdmolfiles.MolFromSmiles(self.B_SMILES[bs[0]]))
            bs.pop(0)

    # adds capping to the N-terminal (the capping SMILES are already listed
    # without the OH, so no atom has to be removed after forming the bond)
    if capping:
        molecule = attach_capping(
            molecule, rdmolfiles.MolFromSmiles(self.C_SMILES[capping[0]]))

    # Nothing was assembled: there is no molecule to clean up or serialise,
    # so return an empty SMILES instead of failing on ''.GetAtoms().
    if molecule == '':
        return ''

    # clean the SMILES from all the atom-map tags
    for atom in molecule.GetAtoms():
        atom.SetAtomMapNum(0)

    molecule_smile = rdmolfiles.MolToSmiles(
        molecule, isomericSmiles=True).replace('[N]', 'N').replace('[C]', 'C')
    return molecule_smile
def protonated_filename(self, val):
    """Set the protonated input file and refresh the derived state.

    Everything runs inside the ``self.workdir`` context. Passing ``None``
    clears the filename and marks the object as not protonated. Any other
    value is validated with ``_fileio.checkFileExists``, loaded through
    ``_rdkit.openAsRdkit`` with hydrogens kept, and the stored string is
    regenerated from ``self.molecule``.
    """
    with self.workdir:
        if val is None:
            self._protonated_filename = None
            self._protonated = False
            return

        self._protonated_filename = _fileio.checkFileExists(val)
        self._molecule = _rdkit.openAsRdkit(
            self._protonated_filename,
            removeHs=False,
            minimise=self.minimise,
        )
        self._string = _rdmolfiles.MolToSmiles(self.molecule)
        self._protonated = True
def test_sdf_file_parser_return_smiles(sdf_file, test_mols):
    """Parse with ``return_smiles=True`` and check features and SMILES."""
    preprocessor = EGCNPreprocessor(max_atoms=49, out_size=49)
    parser = SDFFileParser(preprocessor)
    parsed = parser.parse(sdf_file, return_smiles=True)
    dataset, smiles = parsed['dataset'], parsed['smiles']
    assert len(dataset) == 3

    # Each dataset entry is compared against the preprocessor's input
    # features, feature by feature and molecule by molecule.
    for feat_idx in range(len(dataset)):
        for mol_idx in range(len(test_mols)):
            expected_feats = preprocessor.get_input_feats(test_mols[mol_idx])
            np.testing.assert_array_almost_equal(
                dataset[feat_idx][mol_idx], expected_feats[feat_idx],
                decimal=3)

    # SMILES come back as a 1-D ndarray with one entry per parsed molecule.
    assert type(smiles) is np.ndarray
    assert smiles.ndim == 1
    assert len(smiles) == dataset[0].shape[0]
    expected_smiles = np.array(
        [rdmolfiles.MolToSmiles(mol) for mol in test_mols])
    np.testing.assert_array_equal(smiles, expected_smiles)
def molecule(self, val):
    """Store an RDKit molecule and regenerate the stored string from it.

    Raises:
        TypeError: if ``val`` is not an instance of ``_rdchem.Mol``.
    """
    if not isinstance(val, _rdchem.Mol):
        raise TypeError("Need an object of type RDKit Mol")

    self._molecule = val
    self._string = _rdmolfiles.MolToSmiles(self._molecule)
def smiles_from_seq_cyclic(seq):
    """Build the SMILES of a peptide sequence and pass it through cyclize.

    Arguments:
        seq {string} -- peptide sequence; when it contains an 'X', every
            tag listed in NT and CT is stripped from it first and cyclize
            is called with 1, otherwise with 0

    Returns:
        string -- SMILES of the peptide, or '' when no building block could
        be assembled
    """
    # NOTE(review): both flags below are only ever assigned in this function
    # and never read here - confirm whether connect_mol was meant to see them.
    # metbond: used internally to recognize a methylated aa.
    metbond = False
    # methyl: refers to the possibility of having methylation in the
    # entire GA (can be set with exclude or allow methylation).
    methyl = False

    cy = 1 if 'X' in seq else 0
    if cy:
        for tag in NT:
            seq = seq.replace(tag, '')
        for tag in CT:
            seq = seq.replace(tag, '')

    gs, bs, terminal, capping = split_seq_components(seq)

    # Start from the C-terminal modification when there is one; the empty
    # string marks "nothing built yet".
    molecule = (rdmolfiles.MolFromSmiles(T_SMILES[terminal[0]])
                if terminal else '')

    if bs and verbose:
        print(
            'dendrimer, cyclization not possible, branching unit will not be considered'
        )

    # creates the linear peptide structure
    for gen in gs:
        for aa in gen:
            if aa == 'X':
                continue
            if aa == '-':
                metbond = True
                continue

            residue = rdmolfiles.MolFromSmiles(AA_SMILES[aa])
            if molecule == '':
                molecule = residue
            else:
                molecule = connect_mol(molecule, residue)

    # adds capping to the N-terminal (the capping SMILES are already listed
    # without the OH, so no atom has to be removed after forming the bond)
    if capping:
        molecule = attach_capping(
            molecule, rdmolfiles.MolFromSmiles(C_SMILES[capping[0]]))

    # Nothing assembled: return an empty SMILES without calling cyclize.
    if molecule == '':
        return ''

    molecule = cyclize(molecule, cy)

    # clean the SMILES from all the atom-map tags
    for atom in molecule.GetAtoms():
        atom.SetAtomMapNum(0)

    raw_smiles = rdmolfiles.MolToSmiles(molecule, isomericSmiles=True)
    return raw_smiles.replace('[N]', 'N').replace('[C]', 'C')
def smiles_from_seq(seq, cyclize):
    """Build the SMILES of a peptide (dendrimer) sequence.

    Arguments:
        seq {string} -- peptide dendrimer sequence
        cyclize -- truthy to build a linear peptide ('X' residues skipped,
            branching units ignored) and pass it through utils.cyclize;
            falsy to build the dendrimer, attaching one branching unit
            after each generation while any remain

    Returns:
        tuple -- (SMILES string, seq); the SMILES is '' when no building
        block could be assembled
    """
    gs, bs, terminal, capping = split_seq_components(seq)

    # Start from the C-terminal modification when there is one; the empty
    # string marks "nothing built yet".
    molecule = (rdmolfiles.MolFromSmiles(T_SMILES[terminal[0]])
                if terminal else '')

    if cyclize and bs:
        print(
            'dendrimer, cyclization not possible, branching unit will not be considered'
        )

    for gen in gs:
        # The flag set by a '-' marker starts cleared for every generation.
        metbond = False
        for aa in gen:
            if cyclize and aa == 'X':
                continue
            if aa == '-':
                metbond = True
                continue

            residue = rdmolfiles.MolFromSmiles(AA_SMILES[aa])
            if molecule == '':
                molecule = residue
            else:
                molecule = utils.connect_mol(molecule, residue, metbond)
                # The flag is cleared once a connection has been made.
                metbond = False

        # Branching units are only attached when building a dendrimer.
        if cyclize or not bs:
            continue

        if bs[0] == '-':
            metbond = True
            bs.pop(0)

        branching = rdmolfiles.MolFromSmiles(B_SMILES[bs[0]])
        if molecule == '':
            molecule = branching
        else:
            molecule = utils.connect_mol(molecule, branching, metbond)
            metbond = False
        bs.pop(0)

    # Nothing assembled: hand back an empty SMILES together with seq.
    if molecule == '':
        return '', seq

    # adds capping to the N-terminal (the capping SMILES are already listed
    # without the OH, so no atom has to be removed after forming the bond)
    if capping:
        molecule = utils.attach_capping(
            molecule, rdmolfiles.MolFromSmiles(C_SMILES[capping[0]]))

    if cyclize:
        cy = 1 if is_cyclic(seq) else 0
        molecule = utils.cyclize(molecule, cy)

    # clean the SMILES from all the atom-map tags
    for atom in molecule.GetAtoms():
        atom.SetAtomMapNum(0)

    raw_smiles = rdmolfiles.MolToSmiles(molecule, isomericSmiles=True)
    molecule_smile = raw_smiles.replace('[N]', 'N').replace('[C]', 'C')
    return molecule_smile, seq