def insert_core_compound(self, compound_dict, requests=None):
    """Save a compound into the core database unless it is already there.

    A shallow copy of ``compound_dict`` is stripped of its
    expansion-specific fields, completed with the calculated fields needed
    by the API and written as an upsert that only uses ``$setOnInsert``,
    so a compound whose ``_id`` already exists in the core database is
    left untouched. If a list of requests is given, the request is
    appended to it for later bulk writing; otherwise a single write is
    issued against ``self.core_compounds``.

    :param compound_dict: Compound Dictionary. Must contain ``_id`` and
        ``SMILES``. It is not modified.
    :type compound_dict: dict
    :param requests: List of requests for bulk insert
    :type requests: list or None
    :raises KeyError: If ``compound_dict`` has no ``_id`` or ``SMILES``.
    """
    core_dict = copy(compound_dict)
    cpd_id = core_dict['_id']
    # 'SMILES' is required: the molecule every calculated field depends on
    # is parsed from it, so no SMILES fallback is needed further down.
    # NOTE(review): the result of MolFromSmiles is used unchecked below --
    # confirm callers only pass parsable SMILES.
    mol_object = AllChem.MolFromSmiles(core_dict['SMILES'])

    # Fields that only make sense inside a single expansion are not kept
    # in the core database.
    for key in ('Generation', 'Expand', 'Type', 'Product_of',
                'Reactant_in'):
        core_dict.pop(key, None)

    # Store all different representations of the molecule (Formula,
    # InChI key, etc.) as well as its properties in a dictionary
    if 'Inchi' not in core_dict:
        core_dict['Inchi'] = AllChem.MolToInchi(mol_object)
    if 'Inchikey' not in core_dict:
        core_dict['Inchikey'] = AllChem.InchiToInchiKey(core_dict['Inchi'])

    core_dict['Mass'] = AllChem.CalcExactMolWt(mol_object)
    core_dict['Formula'] = AllChem.CalcMolFormula(mol_object)
    core_dict['logP'] = AllChem.CalcCrippenDescriptors(mol_object)[0]
    core_dict['NP_likeness'] = nps.scoreMol(mol_object, self.nps_model)
    core_dict['Spectra'] = {}
    # Record which expansion it's coming from
    core_dict['MINES'] = []

    query = {'_id': cpd_id}
    update = {'$setOnInsert': core_dict}
    # Identity test: an empty request list must still be appended to.
    if requests is not None:
        requests.append(pymongo.UpdateOne(query, update, upsert=True))
    else:
        self.core_compounds.update_one(query, update, upsert=True)
    return None
def _get_core_cpd_insert(cpd_dict: dict) -> pymongo.UpdateOne:
    """Generate the upsert request that adds a compound to the core database.

    The core fields present in ``cpd_dict`` are copied, the molecule is
    rebuilt from its SMILES and the calculated properties (mass, charge,
    formula, logP, RDKit fingerprint) are filled in. The returned request
    only uses ``$setOnInsert``, so it does not change a document whose
    ``_id`` already exists.

    :param cpd_dict: Compound dictionary. Must contain non-None ``_id``
        and ``SMILES`` entries. It is not modified.
    :type cpd_dict: dict
    :return: Upsert request keyed on the compound ``_id``
    :rtype: pymongo.UpdateOne
    :raises KeyError: If ``_id`` or ``SMILES`` is missing or None.
    """
    # NOTE(review): "InchiKey" is copied here, but the key checked and
    # written below is spelled "Inchikey". An incoming "Inchikey" is
    # therefore always recomputed and an incoming "InchiKey" ends up stored
    # next to it. Left as is to keep the stored schema unchanged -- confirm
    # which spelling is intended.
    core_keys = ("_id", "SMILES", "Inchi", "InchiKey", "Mass", "Formula")
    core_dict = {}
    for key in core_keys:
        value = cpd_dict.get(key)
        # Identity test: only a missing/None value is skipped, without
        # going through the value's own __eq__.
        if value is not None:
            core_dict[key] = value

    # "SMILES" is required: every calculated field depends on this molecule.
    mol_object = AllChem.MolFromSmiles(core_dict["SMILES"])
    # Indices of the fingerprint bits that are set
    fingerprint = AllChem.RDKFingerprint(mol_object, fpSize=512)
    rdk_fp = [index for index, bit in enumerate(fingerprint) if bit]

    # Store all different representations of the molecule (Formula,
    # InChI key, etc.) as well as its properties in a dictionary
    if "Inchi" not in core_dict:
        core_dict["Inchi"] = AllChem.MolToInchi(mol_object)
    if "Inchikey" not in core_dict:
        core_dict["Inchikey"] = AllChem.InchiToInchiKey(core_dict["Inchi"])

    core_dict["Mass"] = AllChem.CalcExactMolWt(mol_object)
    core_dict["Charge"] = AllChem.GetFormalCharge(mol_object)
    core_dict["Formula"] = AllChem.CalcMolFormula(mol_object)
    core_dict["logP"] = AllChem.CalcCrippenDescriptors(mol_object)[0]
    core_dict["RDKit_fp"] = rdk_fp
    core_dict["len_RDKit_fp"] = len(rdk_fp)
    # core_dict['NP_likeness'] = nps.scoreMol(mol_object, nps_model)
    core_dict["Spectra"] = {}
    # Record which expansion it's coming from
    core_dict["MINES"] = []

    return pymongo.UpdateOne({"_id": core_dict["_id"]},
                             {"$setOnInsert": core_dict},
                             upsert=True)
def insert_compound(self, mol_object, compound_dict=None, bulk=None,
                    kegg_db="KEGG", pubchem_db='PubChem-8-28-2015',
                    modelseed_db='ModelSEED'):
    """Save an RDKit molecule as a compound entry in the MINE.

    The fields needed by the API are calculated from ``mol_object`` and
    merged into ``compound_dict`` (calculated values win). A preexisting
    compound with the same _id is overwritten.

    :param mol_object: The compound to be stored
    :type mol_object: RDKit Mol object
    :param compound_dict: Additional information about the compound to be
        stored. Overwritten by calculated values.
    :type compound_dict: dict
    :param bulk: A pymongo bulk operation object. If None, the compound is
        immediately saved to the database
    :param kegg_db: The ID of the KEGG Mongo database
    :type kegg_db: str
    :param pubchem_db: The ID of the PubChem Mongo database
    :type pubchem_db: str
    :param modelseed_db: The ID of the ModelSEED Mongo database
    :type modelseed_db: str
    :return: The hashed _id of the compound
    :rtype: str
    """
    if compound_dict is None:
        compound_dict = {}

    # Molecule representations and properties. The assignment order is
    # kept as is because it fixes the field order of a newly built dict.
    inchi = AllChem.MolToInchi(mol_object)
    compound_dict['SMILES'] = AllChem.MolToSmiles(mol_object, True)
    compound_dict['Inchi'] = inchi
    compound_dict['Inchikey'] = AllChem.InchiToInchiKey(inchi)
    compound_dict['Mass'] = AllChem.CalcExactMolWt(mol_object)
    compound_dict['Formula'] = AllChem.CalcMolFormula(mol_object)
    compound_dict['Charge'] = AllChem.GetFormalCharge(mol_object)

    # Fingerprints are stored as the indices of the bits that are set
    maccs_bits = list(
        AllChem.GetMACCSKeysFingerprint(mol_object).GetOnBits())
    compound_dict['MACCS'] = maccs_bits
    compound_dict['len_MACCS'] = len(maccs_bits)
    rdkit_bits = list(AllChem.RDKFingerprint(mol_object).GetOnBits())
    compound_dict['RDKit'] = rdkit_bits
    compound_dict['len_RDKit'] = len(rdkit_bits)

    compound_dict['logP'] = AllChem.CalcCrippenDescriptors(mol_object)[0]

    is_coreactant = compound_dict.get('Type') == 'Coreactant'
    compound_dict['_id'] = utils.compound_hash(compound_dict['SMILES'],
                                               is_coreactant)
    compound_dict.pop('_atom_count', None)
    cpd_id = compound_dict['_id']

    # Caching this for rapid reaction mass change calculation
    self._mass_cache[cpd_id] = compound_dict['Mass']

    # Reaction references that arrive as a non-empty string are parsed
    # back into the Python literal they spell out.
    for field in ('Reactant_in', 'Product_of'):
        raw_value = compound_dict.get(field)
        if isinstance(raw_value, str) and raw_value:
            compound_dict[field] = ast.literal_eval(raw_value)

    # Store links to external databases where compound is present
    if compound_dict['Inchikey']:
        external_sources = (
            (kegg_db, [('Pathways', 'Pathways'), ('Names', 'Names'),
                       ('DB_links', 'DB_links'), ('Enzymes', 'Enzymes')]),
            (pubchem_db, [('COMPOUND_CID', 'DB_links.PubChem')]),
            (modelseed_db, [('DB_links', 'DB_links')]),
        )
        for db_name, field_map in external_sources:
            if db_name:
                compound_dict = self.link_to_external_database(
                    db_name, compound=compound_dict,
                    fields_to_copy=field_map)

    # Calculate natural product likeness score and store in dict; the
    # model is read on first use and kept on the instance.
    if not self.np_model:
        self.np_model = np.readNPModel()
    compound_dict["NP_likeness"] = np.scoreMol(mol_object, self.np_model)

    compound_dict = utils.convert_sets_to_lists(compound_dict)

    # Assign an id to the compound
    if self.id_db:
        id_collection = self.id_db.compounds
        known = id_collection.find_one(
            {"Inchikey": compound_dict['Inchikey']},
            {'MINE_id': 1, "Pos_CFM_spectra": 1, "Neg_CFM_spectra": 1})
        if not known:
            # Unknown compound: the new id is the number of current ids
            # in the MINE
            compound_dict['MINE_id'] = id_collection.count()
            id_collection.save(compound_dict)
        else:
            # Known compound: reuse its MINE id and any stored spectra
            compound_dict['MINE_id'] = known['MINE_id']
            for spectra_key in ('Pos_CFM_spectra', 'Neg_CFM_spectra'):
                if spectra_key in known:
                    compound_dict[spectra_key] = known[spectra_key]

    # If bulk insertion, upsert (insert and update) the database
    if bulk:
        bulk.find({'_id': compound_dict['_id']}).upsert().replace_one(
            compound_dict)
    else:
        self.compounds.save(compound_dict)
    return compound_dict['_id']
def CalcCrippenLogPAndMR(mol):
    """Return the Crippen descriptors of ``mol`` as a ``(LogP, MR)`` tuple.

    Thin wrapper around ``AllChem.CalcCrippenDescriptors``; the result is
    unpacked into exactly two values before being returned.
    """
    log_p, molar_refractivity = AllChem.CalcCrippenDescriptors(mol)
    return (log_p, molar_refractivity)
def calculate_descriptors(self, mol):
    """
    Build the MUV descriptor vector of a molecule.

    Parameters
    ----------
    mol : Mol
        Molecule.

    Returns
    -------
    ndarray
        In order: total atom count, heavy atom count, one count per
        element (B, Br, C, Cl, F, I, N, O, P, S), hydrogen bond acceptors,
        hydrogen bond donors, cLogP, number of chiral centers and number
        of ring systems.
    """
    # Atomic number of every element that gets its own count
    atomic_numbers = {
        'B': 5, 'Br': 35, 'C': 6, 'Cl': 17, 'F': 9,
        'I': 53, 'N': 7, 'O': 8, 'P': 15, 'S': 16,
    }
    tetrahedral_tags = (ChiralType.CHI_TETRAHEDRAL_CW,
                        ChiralType.CHI_TETRAHEDRAL_CCW)

    # prep: everything below is computed on the molecule with hydrogens
    # added and stereochemistry reassigned
    mol = Chem.AddHs(mol)
    Chem.AssignStereochemistry(mol, cleanIt=True, force=True,
                               flagPossibleStereoCenters=True)

    # atom counts, elements in alphabetical order of their symbol
    counts = self.atom_counts(mol)
    descriptors = [mol.GetNumAtoms(), mol.GetNumHeavyAtoms()]
    for symbol in sorted(atomic_numbers):
        number = atomic_numbers[symbol]
        descriptors.append(counts[number] if number in counts else 0)

    # hydrogen bond acceptors / donors
    descriptors.append(AllChem.CalcNumHBA(mol))
    descriptors.append(AllChem.CalcNumHBD(mol))

    # cLogP (the second Crippen descriptor is not used)
    log_p, _ = AllChem.CalcCrippenDescriptors(mol)
    descriptors.append(log_p)

    # number of chiral centers
    descriptors.append(sum(1 for atom in mol.GetAtoms()
                           if atom.GetChiralTag() in tetrahedral_tags))

    # number of ring systems (not the number of rings)
    descriptors.append(self.count_ring_systems(mol))
    return np.asarray(descriptors)