def test_to_selfies():
    """Check ``dm.to_selfies`` on both a SMILES string and a parsed mol.

    The same expected SELFIES string must come back regardless of which of
    the two input forms is passed in.
    """
    smiles_in = "CC(=O)Oc1ccccc1C(=O)O"
    expected = "[C][C][Branch1_2][C][=O][O][C][=C][C][=C][C][=C][Ring1][Branch1_2][C][Branch1_2][C][=O][O]"

    # Parse first so a conversion failure surfaces before any assertion runs.
    parsed = dm.to_mol(smiles_in)

    for source in (smiles_in, parsed):
        assert dm.to_selfies(source) == expected
def _preprocess(i, row):
    """Clean the molecule behind ``row`` and attach identifiers plus Morgan on-bits.

    The SMILES read from ``row[smiles_column]`` is pushed through datamol's
    ``to_mol`` -> ``fix_mol`` -> ``sanitize_mol`` -> ``standardize_mol`` chain.
    A chirality-aware Morgan bit vector (radius 2, 8192 bits) is computed on
    the cleaned mol before any field of ``row`` is written.

    Args:
        i: Row index; not used by this function.
        row: Record holding the input SMILES under ``smiles_column``; it is
            updated in place.

    Returns:
        ``row`` with "standard_smiles", "selfies", "inchi", "inchikey" and
        "onbits_fp" (list of set bit indices) filled in.
    """
    morgan_settings = {
        "radius": 2,
        "nBits": 8192,
        "invariants": [],
        "fromAtoms": [],
        "useChirality": True,
        "useBondTypes": True,
        "useFeatures": False,
    }

    raw_smiles = str(row[smiles_column])
    cleaned = dm.to_mol(raw_smiles, ordered=True)
    cleaned = dm.fix_mol(cleaned)
    cleaned = dm.sanitize_mol(cleaned, sanifix=True, charge_neutral=False)
    cleaned = dm.standardize_mol(
        cleaned,
        disconnect_metals=False,
        normalize=True,
        reionize=True,
        uncharge=False,
        stereo=True,
    )

    bit_vector = rdMolDescriptors.GetMorganFingerprintAsBitVect(
        cleaned, **morgan_settings
    )

    plain_smiles = dm.to_smiles(cleaned)
    row["standard_smiles"] = dm.standardize_smiles(plain_smiles)
    row["selfies"] = dm.to_selfies(cleaned)
    row["inchi"] = dm.to_inchi(cleaned)
    row["inchikey"] = dm.to_inchikey(cleaned)
    row["onbits_fp"] = list(bit_vector.GetOnBits())
    return row
def _preprocess(i, row):
    """Standardize one row's molecule and attach identifiers and stereoisomers.

    The SMILES in ``row[smiles_column]`` is parsed, fixed, sanitized and
    standardized with datamol. Up to 20 unique stereoisomers are then
    enumerated with RDKit and stored as a sorted list of isomeric SMILES.

    Args:
        i: Row index; not used by this function.
        row: Record holding the input SMILES under ``smiles_column``; it is
            updated in place.

    Returns:
        ``row`` with "standard_smiles", "selfies", "inchi", "inchikey" and
        "enumerated_smiles" filled in. If a ValueError is raised while
        processing, the string fields are set to 'dropped' and
        "enumerated_smiles" to ``['dropped']``.
    """
    try:
        mol = dm.to_mol(str(row[smiles_column]), ordered=True)
        mol = dm.fix_mol(mol)
        mol = dm.sanitize_mol(mol, sanifix=True, charge_neutral=False)
        mol = dm.standardize_mol(
            mol,
            disconnect_metals=False,
            normalize=True,
            reionize=True,
            uncharge=False,
            stereo=True,
        )

        # Fixed ``rand`` value -- presumably so the enumeration is
        # reproducible between runs; confirm against the RDKit docs.
        opts = StereoEnumerationOptions(unique=True, maxIsomers=20, rand=0xf00d)
        isomers = EnumerateStereoisomers(mol, options=opts)
        # sorted() already returns a fresh list, so no copy loop is needed.
        smiles_list = sorted(
            Chem.MolToSmiles(isomer, isomericSmiles=True) for isomer in isomers
        )

        row["standard_smiles"] = dm.standardize_smiles(dm.to_smiles(mol))
        row["selfies"] = dm.to_selfies(mol)
        row["inchi"] = dm.to_inchi(mol)
        row["inchikey"] = dm.to_inchikey(mol)
        row["enumerated_smiles"] = smiles_list
        return row
    except ValueError:
        # NOTE(review): only ValueError is handled here; confirm that invalid
        # SMILES actually surface as ValueError from the calls above.
        row["standard_smiles"] = 'dropped'
        row["selfies"] = 'dropped'
        row["inchi"] = 'dropped'
        row["inchikey"] = 'dropped'
        # A one-element list: list('dropped') would split the sentinel into
        # seven single-character entries.
        row["enumerated_smiles"] = ['dropped']
        return row
def _preprocess(i, row):
    """Clean a row's molecule, enumerate its stereoisomers and fingerprint each.

    Takes the SMILES string in ``row[smiles_column]`` and generates a clean
    RDKit mol with datamol (fix, sanitize, standardize). The stereoisomers are
    then enumerated (at most 20, unique) while holding defined
    stereochemistry. Each isomer is re-cleaned and Morgan fingerprints
    (radius 2, 8192 bits) are generated with RDKit both with and without
    chirality. SMARTS are added for later searching.

    Args:
        i: Row index; not used by this function.
        row: Record holding the input SMILES under ``smiles_column``; it is
            updated in place.

    Returns:
        ``row`` with "standard_smiles", "smarts", "selfies",
        "enumerated_smiles", "achiral_fp" and "chiral_fp" filled in. The
        try/except deals with mol failures on conversion of an invalid SMILES
        string: on ValueError the string fields are set to 'dropped' and the
        list fields to ``['dropped']``.
    """
    # Fingerprint settings do not depend on the isomer, so build them once
    # instead of on every loop iteration.
    fingerprint_function = rdMolDescriptors.GetMorganFingerprintAsBitVect
    chiral_pars = {
        "radius": 2,
        "nBits": 8192,
        "invariants": [],
        "fromAtoms": [],
        "useChirality": True,
        "useBondTypes": True,
        "useFeatures": False,
    }
    achiral_pars = {**chiral_pars, "useChirality": False}

    try:
        mol = dm.to_mol(str(row[smiles_column]), ordered=True)
        mol = dm.fix_mol(mol)
        mol = dm.sanitize_mol(mol, sanifix=True, charge_neutral=False)
        mol = dm.standardize_mol(
            mol,
            disconnect_metals=False,
            normalize=True,
            reionize=True,
            uncharge=False,
            stereo=True,
        )

        opts = StereoEnumerationOptions(unique=True, maxIsomers=20, rand=0xf00d)
        isomers = EnumerateStereoisomers(mol, options=opts)
        enum_smiles = sorted(
            Chem.MolToSmiles(isomer, isomericSmiles=True) for isomer in isomers
        )

        smiles_list = []
        achiral_fp_lis = []
        chiral_fp_lis = []
        for smi in enum_smiles:
            # Each enumerated isomer goes through the same cleaning chain as
            # the input molecule before it is fingerprinted.
            mol = dm.to_mol(smi, ordered=True)
            mol = dm.fix_mol(mol)
            mol = dm.sanitize_mol(mol, sanifix=True, charge_neutral=False)
            mol = dm.standardize_mol(
                mol,
                disconnect_metals=False,
                normalize=True,
                reionize=True,
                uncharge=False,
                stereo=True,
            )
            chiral_fp = fingerprint_function(mol, **chiral_pars)
            achiral_fp = fingerprint_function(mol, **achiral_pars)
            smiles_list.append(dm.standardize_smiles(smi))
            achiral_fp_lis.append(list(achiral_fp.GetOnBits()))
            chiral_fp_lis.append(list(chiral_fp.GetOnBits()))

        # NOTE(review): ``mol`` was rebound inside the loop, so the three
        # fields below describe the last enumerated isomer rather than the
        # input molecule. Behaviour kept as-is -- confirm this is intended.
        row["standard_smiles"] = dm.standardize_smiles(dm.to_smiles(mol))
        row["smarts"] = dm.to_smarts(mol)
        row["selfies"] = dm.to_selfies(mol)
        row["enumerated_smiles"] = smiles_list
        row["achiral_fp"] = achiral_fp_lis
        row["chiral_fp"] = chiral_fp_lis
        return row
    except ValueError:
        # NOTE(review): only ValueError is handled here; confirm that invalid
        # SMILES actually surface as ValueError from the calls above.
        row["standard_smiles"] = 'dropped'
        row["smarts"] = 'dropped'
        row["selfies"] = 'dropped'
        # One-element lists: list('dropped') would split the sentinel into
        # seven single-character entries.
        row["enumerated_smiles"] = ['dropped']
        row["achiral_fp"] = ['dropped']
        row["chiral_fp"] = ['dropped']
        return row