def clean_mol(self, mol: Mol) -> Mol:
    """Return a standardised version of ``mol``.

    Two steps are applied, in this order:

    1. ``rdMolStandardize.ChargeParent`` reduces the input to its charge
       parent (the neutralised form of the non-salt parent fragment).
    2. The instance's tetrazole tautomer reaction
       (``self._tautomerTetrazole``) is applied to that result through
       ``self._apply_reaction``.

    Args:
        mol: The molecule to clean.

    Returns:
        The cleaned molecule.
    """
    # Salt removal, normalisation and charge clean-up are delegated to the
    # RDKit standardizer.
    charge_parent = rdMolStandardize.ChargeParent(mol)

    # Project-specific tautomer rule applied on top of the RDKit result.
    return self._apply_reaction(charge_parent, self._tautomerTetrazole)
def test3Parents(self):
    """Check the SMILES produced by each ``*Parent`` helper against a fixed expected string."""

    def assert_parent(source, parent_fn, expected, **options):
        # `source` is an already-parsed molecule; the result of `parent_fn`
        # is compared through its SMILES representation.
        self.assertEqual(Chem.MolToSmiles(parent_fn(source, **options)), expected)

    # Fragment parent
    sodium_benzoate = Chem.MolFromSmiles("[Na]OC(=O)c1ccccc1")
    assert_parent(sodium_benzoate, rdMolStandardize.FragmentParent,
                  "O=C([O-])c1ccccc1")

    # Charge parent
    ammonium_chloride = Chem.MolFromSmiles("C[NH+](C)(C).[Cl-]")
    assert_parent(ammonium_chloride, rdMolStandardize.ChargeParent, "CN(C)C")

    # Tautomer parent, with and without the initial standardization
    enol_salt = Chem.MolFromSmiles("[O-]CCCC=CO.[Na+]")
    assert_parent(enol_salt, rdMolStandardize.TautomerParent,
                  "O=CCCCC[O-].[Na+]")
    # same answer because of the standardization at the end
    assert_parent(enol_salt, rdMolStandardize.TautomerParent,
                  "O=CCCCC[O-].[Na+]", skipStandardize=True)

    # Stereo parent
    chiral = Chem.MolFromSmiles("C[C@](F)(Cl)C/C=C/[C@H](F)Cl")
    assert_parent(chiral, rdMolStandardize.StereoParent, "CC(F)(Cl)CC=CC(F)Cl")

    # Isotope parent
    labelled = Chem.MolFromSmiles("[12CH3][13CH3]")
    assert_parent(labelled, rdMolStandardize.IsotopeParent, "CC")

    # Super parent, with and without the initial standardization
    combined = Chem.MolFromSmiles(
        "[Na]Oc1c([12C@H](F)Cl)c(O[2H])c(C(=O)O)cc1CC=CO")
    assert_parent(combined, rdMolStandardize.SuperParent,
                  "O=CCCc1cc(C(=O)O)c(O)c(C(F)Cl)c1O")

    combined = Chem.MolFromSmiles(
        "[Na]Oc1c([12C@H](F)Cl)c(O[2H])c(C(=O)O)cc1CC=CO")
    assert_parent(combined, rdMolStandardize.SuperParent,
                  "O=CCCc1cc(C(=O)[O-])c(O)c(C(F)Cl)c1O.[Na+]",
                  skipStandardize=True)
def test20NoneHandling(self):
    """Each listed ``rdMolStandardize`` entry point must raise ``ValueError`` for ``None``."""
    entry_points = ("ChargeParent", "Cleanup", "FragmentParent", "Normalize",
                    "Reionize")
    for entry_point in entry_points:
        with self.assertRaises(ValueError):
            # The attribute is looked up inside the context manager, exactly
            # where the call itself happens.
            getattr(rdMolStandardize, entry_point)(None)
def clean_mol(smile, is_deep=True):
    """Normalise a SMILES string and return its canonical form.

    Before parsing, the bracket atoms ``[O]``, ``[C]``, ``[N]`` and ``[B]``
    are rewritten to their unbracketed form and the labels ``[2H]`` / ``[3H]``
    are rewritten to ``[H]``. The string is then parsed, optionally reduced to
    its charge parent, written back without isomeric information and
    canonicalised.

    Args:
        smile: Input SMILES string.
        is_deep: When true, ``rdMolStandardize.ChargeParent`` is applied to
            the parsed molecule before it is written back to SMILES.

    Returns:
        The canonical SMILES string, or ``None`` when the input could not be
        parsed or standardised (``'Parsing Error:'`` plus the pre-processed
        SMILES is printed in that case).
    """
    smile = (
        smile.replace('[O]', 'O').replace('[C]', 'C')
        .replace('[N]', 'N').replace('[B]', 'B')
        .replace('[2H]', '[H]').replace('[3H]', '[H]')
    )

    canonical = None
    try:
        mol = Chem.MolFromSmiles(smile)
        # MolFromSmiles signals an unparsable string by returning None rather
        # than raising, so that case is handled explicitly.
        if mol is not None:
            if is_deep:
                mol = rdMolStandardize.ChargeParent(mol)
            non_isomeric = Chem.MolToSmiles(mol, isomericSmiles=False)
            canonical = Chem.CanonSmiles(non_isomeric)
    except Exception:
        # Deliberately broad (RDKit raises several exception types), but no
        # longer a bare `except:` so KeyboardInterrupt / SystemExit propagate.
        canonical = None

    if canonical is None:
        print('Parsing Error:', smile)
    return canonical
def calculate_single(self, smiles) -> Tuple:
    """Standardise a single SMILES entry.

    Args:
        smiles: The input SMILES string; a float NaN is treated as a missing
            entry.

    Returns:
        A 3-tuple ``(standardised_smiles, success, error_message)``. On
        success the tuple is ``(smiles, True, None)``; on any failure it is
        ``(None, False, <reason>)``.
    """
    # NaN is detected by value (NaN is the only float unequal to itself).
    # An identity test against one particular NaN object misses NaN values
    # that are different objects, e.g. ones produced by arithmetic.
    if isinstance(smiles, float) and smiles != smiles:
        return None, False, "No smiles entry."

    try:
        # Read SMILES and convert it to an RDKit mol object.
        mol = MolFromSmiles(smiles)
    except (TypeError, ValueError, AttributeError) as e:
        return None, False, str(e)

    # MolFromSmiles returns None when the SMILES could not be converted.
    if mol is None:
        return None, False, "failed to parse smiles {}".format(smiles)

    # Size check based on the non-hydrogen atom count.
    # NOTE(review): `>=` also rejects molecules with exactly `max_num_atoms`
    # atoms although the message says "exceeds" - confirm this is intended.
    if mol.GetNumAtoms() >= self.max_num_atoms:
        return (
            None,
            False,
            "number of non-H atoms {0} exceeds limit of {1} for smiles {2}"
            .format(mol.GetNumAtoms(), self.max_num_atoms, smiles),
        )

    try:
        mol = rdMolStandardize.ChargeParent(mol)
        mol = self.isotope_parent(mol)
        if self.include_stereoinfo is False:
            Chem.RemoveStereochemistry(mol)
        mol = self.tautomerizer.Canonicalize(mol)
        mol_clean_tmp = self.my_standardizer(mol)
        # Convert the mol object back to SMILES.
        smi_clean_tmp = MolToSmiles(mol_clean_tmp)
        # Round-trip through the parser to double check that the standardised
        # SMILES is itself a valid molecule.
        mol_clean = MolFromSmiles(smi_clean_tmp)
        smi_clean = MolToSmiles(mol_clean)
    except (TypeError, ValueError, AttributeError) as e:
        return None, False, str(e)

    return smi_clean, True, None
def test4ChargeParent(self):
    """The charge parent of the ``C[NH+](C)(C).[Cl-]`` input must be written as ``CN(C)C``."""
    salt = Chem.MolFromSmiles("C[NH+](C)(C).[Cl-]")
    parent_smiles = Chem.MolToSmiles(rdMolStandardize.ChargeParent(salt))
    self.assertEqual(parent_smiles, "CN(C)C")
def get_simplified_smiles_for_chemicals(self) -> pd.DataFrame:
    """
    Get the SMILES for every chemical substance in the robokop neo4j graph database and
    create a simplified SMILES from each.

    The simplified SMILES is the SMILES of the molecule's charge parent with all
    stereochemistry removed. The simplified SMILES values will be used as a grouping
    mechanism and saved in the redis database.

    When both self._do_KGX and self._do_redis are 0 (test mode) the records are read from
    './tests/datafile.json' instead of the database.

    Records whose SMILES cannot be parsed or simplified are reported through
    self.print_debug_msg() and skipped.

    :return: a data frame with the columns 'chem_id', 'original_SMILES', 'simplified_SMILES'
             and 'name', one row per successfully processed record
    """
    columns = ['chem_id', 'original_SMILES', 'simplified_SMILES', 'name']

    # Rows are collected in a plain list and converted to a data frame once at the end.
    # DataFrame.append() copied the whole frame for every record and no longer exists in
    # pandas 2.0, where each call would fail and be reported as a per-record error.
    rows: list = []

    # Create the query. This is of course robokop specific
    # Query modified to exclude all chemical substances that have wildcard definitions
    c_query: str = f'match (c:chemical_substance) where c.smiles is not NULL and c.smiles <> "" and NOT c.smiles CONTAINS "*" RETURN c.id, c.smiles, c.name order by c.smiles {self._debug_record_limit}'

    self.print_debug_msg("Querying target database for chemical substances.", True)

    # check to see if we are in test mode
    if self._do_KGX != 0 or self._do_redis != 0:
        # execute the query
        records: list = self.run_neo4j_query(c_query)
    else:
        # open the test data file and use that instead of the database
        with open('./tests/datafile.json') as json_file:
            records = json.load(json_file)

    # nothing to do without records
    if not records:
        self.print_debug_msg("No records to process.", True)
        return pd.DataFrame(rows, columns=columns)

    self.print_debug_msg(f"{len(records)} chemical substance records will be processed.", True)

    # loop through the records, counting from 1
    for count, r in enumerate(records, start=1):
        # inform user of progress
        if count % 25000 == 0:
            self.print_debug_msg(f'get_simplified_smiles_for_chemicals() - At data record index {count}.', True)

        try:
            # Construct a molecule from a SMILES string
            molecule: Mol = Chem.MolFromSmiles(r['c.smiles'])
        except Exception as e:
            # alert the user there was an issue and continue
            self.print_debug_msg(f"Error - Exception trying to get a molecule for record {count}, chem id: {r['c.id']} with original SMILES: {r['c.smiles']}, Exception {e}. Proceeding.", True)
            continue

        # did we get the molecule
        if molecule is None:
            # Couldn't parse the molecule
            self.print_debug_msg(f"Error - Got an empty molecule for record {count}, chem id: {r['c.id']} with smiles: {r['c.smiles']}. Proceeding.", True)
            continue

        try:
            # get the uncharged version of the largest fragment
            molecule_uncharged: Mol = rdMolStandardize.ChargeParent(molecule)

            # Remove all stereo-chemistry info from the molecule
            RemoveStereochemistry(molecule_uncharged)

            # get the simplified SMILES value
            simplified_smiles: str = Chem.MolToSmiles(molecule_uncharged)

            # convert the curie prefix to the new standard
            if self._do_curie_update == 1:
                chem_id = r['c.id'].replace("KEGG:", "KEGG.COMPOUND:").replace("CHEMBL:", "CHEMBL.COMPOUND:")
            else:
                chem_id = r['c.id']

            # fall back to the chemical id when there is no usable name
            name = r['c.name']
            if name is None or name == '' or name == 'NULL':
                name_fixed = chem_id
            else:
                # insure there are no dbl quotes in the name, it throws off the CSV file
                name_fixed = name.replace('"', "'")

            # save the new record
            rows.append({
                'chem_id': chem_id,
                'original_SMILES': r['c.smiles'],
                'simplified_SMILES': simplified_smiles,
                'name': name_fixed,
            })
        except Exception as e:
            # alert the user that something was discovered in the original graph record
            self.print_debug_msg(f"Error - Could not get a simplified SMILES for record {count}, chem id: {r['c.id']}, Original SMILES: {r['c.smiles']}, Exception: {e}")

    # return to the caller
    return pd.DataFrame(rows, columns=columns)
## 'CHEBI:30470','CHEBI:36301','CHEBI:38284','CHEBI:48998','CHEBI:37189', # 'CHEBI:60532','CHEBI:51221','CHEBI:29416', 'CHEBI:36163','CHEBI:29296', # 'CHEBI:51508','CHEBI:30665','CHEBI:29886','CHEBI:85715','CHEBI:49851', # 'CHEBI:30197','CHEBI:30125','CHEBI:37856','CHEBI:38283','CHEBI:10098', # 'CHEBI:132769','CHEBI:133489','CHEBI:134067','CHEBI:141330','CHEBI:15432', # 'CHEBI:26355','CHEBI:28163','CHEBI:29295','CHEBI:29417','CHEBI:29418', # 'CHEBI:29422','CHEBI:29440','CHEBI:29796','CHEBI:29880','CHEBI:30126', # 'CHEBI:30238']: # continue smiles = x[2] if smiles == '[empty]': continue try: mol = Chem.MolFromSmiles(smiles) except Exception as e: print(f"error with {smiles}. Proceeding") continue if mol is None: #Couldn't parse continue try: print(f'{cid}\t{smiles}') molp = rdMolStandardize.ChargeParent(mol) RemoveStereochemistry(molp) newsmi = Chem.MolToSmiles(molp) #chems.append(chem) outf.write(f"{cid}\t{smiles}\t{newsmi}\n") except Exception as e: print(f"error with {x}") #exit()
def structure_standardization(smi: str) -> str:
    """
    Standardize a SMILES string with RDKit and return its canonical SMILES.

    The input is parsed into a mol object and its atom count is compared with
    the configured ``max_num_atoms``. Molecules within the limit are reduced
    to their charge parent, stripped of isotope labels, optionally stripped of
    stereochemistry (when ``include_stereoinfo`` is ``False``), tautomer
    canonicalized and finally passed through the MolVS-style standardizer
    defined below.

    Each failure is written to the log and yields ``np.nan``:
    an unreadable SMILES ("Reading Error"), a molecule above the size limit
    ("Molecule too large"), or a ``ValueError`` / ``AttributeError`` raised
    during standardization ("Standardization error").

    Args:
        smi (str): Non-standardized smiles string

    Returns:
        str: standardized smiles string (``np.nan`` on failure)
    """
    settings = ConfigDict.get_parameters()["standardization"]
    max_num_atoms = settings["max_num_atoms"]
    max_num_tautomers = settings["max_num_tautomers"]
    include_stereoinfo = settings["include_stereoinfo"]

    # Tautomer enumerator/canonicalizer
    tautomerizer = rdMolStandardize.TautomerEnumerator()
    tautomerizer.SetMaxTautomers(max_num_tautomers)
    # Keep stereo information of keto/enol tautomerization
    tautomerizer.SetRemoveSp3Stereo(False)

    def isotope_parent(mol: Chem.Mol) -> Chem.Mol:
        """
        Return a copy of ``mol`` with the isotope label of every atom reset to 0.

        Args:
            mol (Chem.Mol): input rdkit mol object

        Returns:
            Chem.Mol: isotope parent rdkit mol object
        """
        parent = copy.deepcopy(mol)
        for atom in parent.GetAtoms():
            atom.SetIsotope(0)
        return parent

    def my_standardizer(mol: Chem.Mol) -> Chem.Mol:
        """
        MolVS-style standardization of a copy of ``mol``.

        Steps: sanitize, remove hydrogens, disconnect metals, normalize,
        reionize, then reassign stereochemistry.

        Args:
            mol (Chem.Mol): non-standardized rdkit mol object

        Returns:
            Chem.Mol: standardized rdkit mol object
        """
        work = copy.deepcopy(mol)
        Chem.SanitizeMol(work)
        work = Chem.RemoveHs(work)
        work = rdMolStandardize.MetalDisconnector().Disconnect(work)
        work = rdMolStandardize.Normalizer().normalize(work)
        work = rdMolStandardize.Reionizer().reionize(work)
        Chem.AssignStereochemistry(work, force=True, cleanIt=True)
        # TODO: Check this removes symmetric stereocenters
        return work

    # Read SMILES and convert it to RDKit mol object.
    mol = MolFromSmiles(smi)

    # Guard: the input SMILES could not be converted into a mol object.
    if mol is None:
        logging.error("Reading Error, " + smi)
        return np.nan

    # Guard: molecule size, based on the atom count of the parsed molecule.
    if mol.GetNumAtoms() > max_num_atoms:
        logging.error("Molecule too large, " + smi)
        return np.nan

    try:
        mol = rdMolStandardize.ChargeParent(mol)
        mol = isotope_parent(mol)
        # Stereo removal is the only step that depends on the setting; the
        # remaining steps are shared by both configurations.
        if include_stereoinfo is False:
            Chem.RemoveStereochemistry(mol)
        mol = tautomerizer.Canonicalize(mol)
        # convert mol object back to SMILES
        return MolToSmiles(my_standardizer(mol))
    except (ValueError, AttributeError) as e:
        # write failed molecules during standardization to log file
        logging.error("Standardization error, " + smi + ", Error Type: " + str(e))
        return np.nan
# Move the model to the GPU when requested on the command line.
if args.gpu:
    model = model.cuda()

# Map smiles to embedding
# (not read in this part of the script; kept in case later code uses it)
embeddings = {}

# Read the SMILES list: the first line is a title and is skipped, and the
# surrounding whitespace / newlines are stripped from every other line.
with open(args.smiles_list, "r") as smiles_file:
    next(smiles_file, None)
    input_smiles = [line.strip() for line in smiles_file]

# Create charge parents
input_mols = []
for raw_smiles in input_smiles:
    parsed = Chem.MolFromSmiles(raw_smiles)
    if parsed is None:
        # Fail with the offending input in the message instead of the
        # generic ValueError ChargeParent raises when handed None.
        raise ValueError(f"Could not parse SMILES: {raw_smiles!r}")
    input_mols.append(parsed)
standardized_mols = [rdMolStandardize.ChargeParent(i) for i in input_mols]
input_smiles = [Chem.MolToSmiles(i) for i in standardized_mols]

# Encode the standardized SMILES and bring the result back as a numpy array.
out_tensors = model.encode_from_smiles(input_smiles)
output = out_tensors.cpu().detach().numpy()

# (not read in this part of the script; kept in case later code uses it)
feature_mapping = {}

# Persist the SMILES -> embedding mapping. The files are opened through
# context managers so the handles are closed (and the dump flushed) before
# the file is read back.
pickle_obj = dict(zip(input_smiles, output))
with open(f"{args.out}.p", "wb") as pickle_out:
    pickle.dump(pickle_obj, pickle_out)

# Read the file written just above back in as a sanity check. This is the
# script's own output, not external data.
with open(f"{args.out}.p", "rb") as pickle_in:
    loaded_pickle = pickle.load(pickle_in)

print(loaded_pickle)