def molecular_weight_filter_wrapper(*args, **kwargs):
    """Call ``original_func`` and keep only the sufficiently light results.

    All positional and keyword arguments are forwarded unchanged to
    ``original_func``. For each item it produces, the item's ``.mol``
    attribute is handed to ``AllChem.CalcExactMolWt``; the item is kept when
    that weight is at most ``config.MAX_MW``.

    :return: list of the items that passed the filter, in the order produced
    """
    return [
        candidate
        for candidate in original_func(*args, **kwargs)
        if AllChem.CalcExactMolWt(candidate.mol) <= config.MAX_MW
    ]
def set_computable(self):
    """Fill in the identifier fields that are derived from ``self._mol``.

    ``self._mol`` is parsed with ``tool_chemical.read_string("mol", ...)``
    and the result is used to set ``_smiles`` (non-isomeric), ``_inchi``,
    ``_inchikey``, ``_molecular_formula`` and ``_molecular_weight``.
    """
    parsed = tool_chemical.read_string("mol", self._mol)

    # Line notations and hashed identifiers.
    self._smiles = Chem.MolToSmiles(parsed, isomericSmiles=False)
    self._inchi = inchi.MolToInchi(parsed)
    self._inchikey = inchi.MolToInchiKey(parsed)

    # Composition-derived values.
    self._molecular_formula = Chem.CalcMolFormula(parsed)
    self._molecular_weight = Chem.CalcExactMolWt(parsed)
def featurize(aa):
    """Compute a dictionary of RDKit descriptors for a sequence string.

    :param aa: sequence string handed to ``Chem.MolFromFASTA``

    NOTE(review): no ``return`` statement is visible in this excerpt, so as
    written the ``descriptors`` dict is built and then discarded -- confirm
    whether the function continues beyond this point.
    """
    mol = Chem.MolFromFASTA(aa)
    # Make hydrogens explicit before the descriptors are computed.
    mol = Chem.AddHs(mol)
    # One entry per descriptor; 'MolWT' is the exact (monoisotopic) weight.
    descriptors = {
        'MolWT': AllChem.CalcExactMolWt(mol),
        'LogP': Chem.Crippen.MolLogP(mol),
        'HBondDonors': AllChem.CalcNumLipinskiHBD(mol),
        'HBondAcceptors': AllChem.CalcNumLipinskiHBA(mol),
        'nAromaticRings': AllChem.CalcNumAromaticRings(mol),
        'nHeteroAtoms': AllChem.CalcNumHeteroatoms(mol),
        'nRotatableBonds': AllChem.CalcNumRotatableBonds(mol)
    }
def insert_core_compound(self, compound_dict, requests=None):
    """Upsert a compound into the core database, or queue the request.

    A shallow copy of ``compound_dict`` is stripped of the expansion-specific
    fields, completed with the values calculated from its SMILES and written
    with ``$setOnInsert``, so a compound that is already present in the core
    database is left untouched.

    :param compound_dict: Compound dictionary; must contain ``'_id'`` and
        ``'SMILES'``. It is not modified.
    :type compound_dict: dict
    :param requests: List collecting requests for a later bulk write. When
        given (even if empty) a ``pymongo.UpdateOne`` is appended to it;
        when ``None`` the compound is written immediately.
    :type requests: list or None
    :raises KeyError: if ``'_id'`` or ``'SMILES'`` is missing
    """
    core_dict = copy(compound_dict)
    cpd_id = core_dict['_id']
    # 'SMILES' is mandatory: it is the only source of the RDKit molecule used
    # for every calculated field below, so there is no point in trying to
    # regenerate it afterwards.
    mol_object = AllChem.MolFromSmiles(core_dict['SMILES'])

    # Fields that describe one particular expansion do not belong in the
    # core record.
    for key in ('Generation', 'Expand', 'Type', 'Product_of', 'Reactant_in'):
        core_dict.pop(key, None)

    # Store all different representations of the molecule (Formula, InChI
    # key, etc.) as well as its properties in the dictionary
    if 'Inchi' not in core_dict:
        core_dict['Inchi'] = AllChem.MolToInchi(mol_object)
    if 'Inchikey' not in core_dict:
        core_dict['Inchikey'] = AllChem.InchiToInchiKey(core_dict['Inchi'])

    core_dict['Mass'] = AllChem.CalcExactMolWt(mol_object)
    core_dict['Formula'] = AllChem.CalcMolFormula(mol_object)
    core_dict['logP'] = AllChem.CalcCrippenDescriptors(mol_object)[0]
    core_dict['NP_likeness'] = nps.scoreMol(mol_object, self.nps_model)
    core_dict['Spectra'] = {}
    # Record which expansion it's coming from
    core_dict['MINES'] = []

    query = {'_id': cpd_id}
    update = {'$setOnInsert': core_dict}
    if requests is not None:
        requests.append(pymongo.UpdateOne(query, update, upsert=True))
    else:
        self.core_compounds.update_one(query, update, upsert=True)
    return None
def convert_to_nM(radek):
    """Convert a record's activity from ug.mL-1 to nM, in place.

    Records whose ``"ACTIVITY_UNITS"`` is anything other than ``"ug.mL-1"``
    are returned untouched. Otherwise ``"ACTIVITY"`` is replaced by the
    converted value and ``"ACTIVITY_UNITS"`` is set to ``"nM"``.

    :param radek: mapping with the keys ``"ACTIVITY_UNITS"``, ``"ACTIVITY"``
        and ``"MOL_OBJECT"`` (the latter is passed to
        ``AllChem.CalcExactMolWt``)
    :return: the same ``radek`` object
    """
    if radek["ACTIVITY_UNITS"] != "ug.mL-1":
        return radek

    # float(), not int(): activities are frequently fractional and int()
    # would silently truncate them (or reject strings such as "0.5").
    act = float(radek["ACTIVITY"])
    molwt = AllChem.CalcExactMolWt(radek["MOL_OBJECT"])
    # ug/mL divided by g/mol is mmol/L, and 1 mmol/L equals 1e6 nmol/L.
    radek["ACTIVITY"] = (act / molwt) * 1000000
    radek["ACTIVITY_UNITS"] = "nM"
    return radek
def test_api_addMolecule(self):
    """POSTing a molfile to /api/addMolecule stores the derived properties.

    The most recently stored ``Molecule`` is compared against values
    recomputed from ``self.propane`` with RDKit.
    """
    reply = self.client.post(path="/api/addMolecule",
                             data={"molfile": self.propane})
    self.assertEqual(reply.status_code, 200)

    reference = AllChem.MolFromMolBlock(self.propane)
    stored = Molecule.objects.last()

    # The expected weight is rounded to two decimals through string
    # formatting before the comparison.
    expected_mw = float("{0:.2f}".format(AllChem.CalcExactMolWt(reference)))
    self.assertEqual(expected_mw, stored.mw)
    self.assertEqual(AllChem.MolToSmiles(reference), stored.smiles)
    self.assertEqual(AllChem.CalcMolFormula(reference), stored.sum_formula)

    expected_inchi = AllChem.MolToInchi(reference)
    self.assertEqual(expected_inchi, stored.inchi)
    self.assertEqual(AllChem.InchiToInchiKey(expected_inchi),
                     stored.inchi_key)
def _make_compound_info(mol_object):
    """Assemble the descriptive record for an RDKit molecule.

    The returned dict holds the SMILES, InChIKey, average mass
    (``Descriptors.MolWt``), exact mass, formula and formal charge, the set
    bits of the MACCS and RDKit fingerprints (each as ``{str(bit): 1}``) and
    an empty ``'dblinks'`` dict.
    """

    def on_bits(fingerprint):
        # Map every set bit index (as a string) to 1.
        return {str(bit): 1 for bit in fingerprint.GetOnBits()}

    # Entries are computed in the same order in which they appear in the
    # record.
    info = {}
    info['smiles'] = AllChem.MolToSmiles(mol_object, True)
    info['inchikey'] = AllChem.InchiToInchiKey(
        AllChem.MolToInchi(mol_object))
    info['mass'] = Descriptors.MolWt(mol_object)
    info['exactmass'] = AllChem.CalcExactMolWt(mol_object)
    info['formula'] = AllChem.CalcMolFormula(mol_object)
    info['charge'] = AllChem.GetFormalCharge(mol_object)

    fingerprints = {}
    fingerprints['maccs'] = on_bits(
        AllChem.GetMACCSKeysFingerprint(mol_object))
    fingerprints['rdkit'] = on_bits(AllChem.RDKFingerprint(mol_object))
    info['fingerprints'] = fingerprints

    info['dblinks'] = {}
    return info
def filter_size(in_lines, maxweight=650, Verbose=False):
    """Remove compounds whose exact molecular weight reaches ``maxweight``.

    This needs to be run after the structure standardization and desalting
    step.

    Rows are selected by position, so the frame may carry any index (for
    example one left over from an earlier filter), and the frame is sliced
    once instead of being copied for every removed row.

    :param in_lines: DataFrame with a ``'canonical_smiles'`` column
    :param maxweight: rows with a weight greater than or equal to this value
        are removed
    :param Verbose: when true, print the number of remaining compounds
    :return: new DataFrame with the surviving rows and a fresh 0..n-1 index
    """
    keep_positions = [
        position
        for position, smiles in enumerate(in_lines['canonical_smiles'])
        if Chem.CalcExactMolWt(Chem.MolFromSmiles(smiles)) < maxweight
    ]
    in_lines = in_lines.iloc[keep_positions]
    if Verbose:
        print('Number of compounds after molecular weight filter: ',
              len(in_lines))
    return in_lines.reset_index(drop=True)
def sdf_parser(soubor):
    """Store the molecules of an SDF file as ``Molecule`` rows.

    Records that RDKit cannot parse are skipped. A molecule whose InChIKey is
    already present in the database is reported and not stored again.

    :param soubor: SDF file handed to ``Chem.SDMolSupplier``
    :return: number of parsable molecules encountered, whether or not they
        were newly stored
    """
    mol_counter = 0
    suppl = Chem.SDMolSupplier(soubor)
    for mol in suppl:
        if mol is None:
            continue
        print(mol.GetNumAtoms())
        mol_counter += 1
        new_inchi = Chem.MolToInchi(mol)
        new_inchikey = Chem.InchiToInchiKey(new_inchi)
        # Check whether the molecule is already in the database by its
        # InChIKey, which should be unique.
        if Molecule.objects.filter(inchikey=new_inchikey).exists():
            print(mol, "already exists")
            continue

        new_smiles = Chem.MolToSmiles(mol)
        new_summaryForm = AllChem.CalcMolFormula(mol)
        new_molweigth = AllChem.CalcExactMolWt(mol)
        # The name must be decided for every record: without the fallback a
        # record lacking the synonym property would reuse the previous
        # record's name (or fail outright on the first record).
        if mol.HasProp('PUBCHEM_SUBSTANCE_SYNONYM'):
            new_name = mol.GetProp('PUBCHEM_SUBSTANCE_SYNONYM').split(
                "\n")[0]
        else:
            new_name = ''
        newInsertedMolecule = Molecule(name=new_name,
                                       smiles=new_smiles,
                                       mol_weight=new_molweigth,
                                       inchi=new_inchi,
                                       inchikey=new_inchikey,
                                       summary_formula=new_summaryForm)
        newInsertedMolecule.save()
    return mol_counter
def _get_core_cpd_insert(cpd_dict: dict) -> pymongo.UpdateOne:
    """Generate the upsert request for a core compound.

    Only the core keys of ``cpd_dict`` that are present and not ``None`` are
    carried over; the remaining fields are calculated from the SMILES. The
    request uses ``$setOnInsert`` with ``upsert=True``.

    :param cpd_dict: compound dictionary; must provide ``"_id"`` and
        ``"SMILES"``
    :return: the ``pymongo.UpdateOne`` request
    :raises KeyError: if ``"_id"`` or ``"SMILES"`` is missing or ``None``
    """
    core_keys = ["_id", "SMILES", "Inchi", "InchiKey", "Mass", "Formula"]
    core_dict = {}
    for key in core_keys:
        value = cpd_dict.get(key)
        if value is not None:
            core_dict[key] = value

    # "SMILES" is mandatory: every calculated field below is derived from
    # this molecule, so it cannot be regenerated afterwards.
    mol_object = AllChem.MolFromSmiles(core_dict["SMILES"])
    # Indices of the set bits of the 512-bit RDKit fingerprint.
    rdk_fp = [
        i for i, val in enumerate(
            AllChem.RDKFingerprint(mol_object, fpSize=512)) if val
    ]

    # Store all different representations of the molecule (Formula, InChI
    # key, etc.) as well as its properties in the dictionary
    if "Inchi" not in core_dict:
        core_dict["Inchi"] = AllChem.MolToInchi(mol_object)
    # NOTE(review): core_keys copies "InchiKey" while this check and the
    # stored field use "Inchikey", so a copied "InchiKey" never suppresses
    # the recalculation and both spellings can end up in the record --
    # confirm which spelling the database schema expects.
    if "Inchikey" not in core_dict:
        core_dict["Inchikey"] = AllChem.InchiToInchiKey(core_dict["Inchi"])

    core_dict["Mass"] = AllChem.CalcExactMolWt(mol_object)
    core_dict["Charge"] = AllChem.GetFormalCharge(mol_object)
    core_dict["Formula"] = AllChem.CalcMolFormula(mol_object)
    core_dict["logP"] = AllChem.CalcCrippenDescriptors(mol_object)[0]
    core_dict["RDKit_fp"] = rdk_fp
    core_dict["len_RDKit_fp"] = len(rdk_fp)
    core_dict["Spectra"] = {}
    # Record which expansion it's coming from
    core_dict["MINES"] = []

    return pymongo.UpdateOne({"_id": core_dict["_id"]},
                             {"$setOnInsert": core_dict},
                             upsert=True)
def save(self, smiles=None, molfile=None, rdmol=None, inchi=None, name=None,
         update=False, *args, **kwargs):
    """Create the molecule from a structure, or plainly save an existing one.

    With ``update`` true the instance is saved as is. Otherwise an RDKit
    molecule is built from the first structure argument given (precedence:
    ``molfile``, ``smiles``, ``rdmol``, ``inchi``), all derived fields are
    filled in and the instance is saved.

    :param smiles: SMILES string
    :param molfile: mol block text
    :param rdmol: ready-made RDKit molecule
    :param inchi: InChI string
    :param name: name to store; when falsy the molecule's ``LONGNAME``
        property is used if present, else ``None``
    :param update: skip the structure handling and only save
    :raises MoleculeCreationError: if no structure argument was given or no
        molecule could be built from it
    :raises MoleculeExistsInDatabase: if the resulting InChI is empty, not
        longer than one character, or already stored for another molecule
    """
    if update:
        super(Molecule, self).save(*args, **kwargs)
        return

    # Start from "no molecule" so that a call without any structure argument
    # raises MoleculeCreationError rather than UnboundLocalError.
    mol = None
    if molfile:
        mol = AllChem.MolFromMolBlock(molfile)
    elif smiles:
        mol = AllChem.MolFromSmiles(smiles)
    elif rdmol:
        mol = rdmol
    elif inchi:
        mol = AllChem.MolFromInchi(inchi)
    if not mol:
        raise self.MoleculeCreationError

    # From here on both notations are the ones regenerated by RDKit.
    inchi = AllChem.MolToInchi(mol)
    smiles = AllChem.MolToSmiles(mol)
    is_storable = (inchi
                   and Molecule.objects.filter(inchi=inchi).count() == 0
                   and len(inchi) > 1)
    if not is_storable:
        raise self.MoleculeExistsInDatabase(smiles)

    self.inchi = inchi
    self.mw = float("{0:.2f}".format(AllChem.CalcExactMolWt(mol)))
    self.sum_formula = AllChem.CalcMolFormula(mol)
    self.fingerprint = AllChem.GetMorganFingerprintAsBitVect(
        mol, 4, nBits=1024).ToBitString()
    self.inchi_key = AllChem.InchiToInchiKey(self.inchi)
    self.molfile = AllChem.MolToMolBlock(mol)
    self.smiles = smiles
    self.rdmol = mol

    # generating SVG image
    if self.smiles not in self.EXCLUDED_MOLECULES:
        binMol = AllChem.Mol(self.rdmol.ToBinary())
        if not binMol.GetNumConformers():
            # No coordinates yet: lay the molecule out in 2D first.
            rdDepictor.Compute2DCoords(self.rdmol)
        drawer = rdMolDraw2D.MolDraw2DSVG(100, 100)
        drawer.DrawMolecule(self.rdmol)
        drawer.FinishDrawing()
        svg = drawer.GetDrawingText().replace('svg:', '')
        # remove first line containing XML meta information
        self.image_svg = "\n".join(svg.split("\n")[1:]).strip()
    else:
        self.image_svg = None

    if name:
        self.name = name
    else:
        try:
            self.name = mol.GetProp("LONGNAME")
        except KeyError:
            self.name = None

    # Internal ids continue from the primary key of the latest row.
    if Molecule.objects.all().count() == 0:
        self.internal_id = "MI-J-1"
    else:
        self.internal_id = "MI-J-{}".format(
            Molecule.objects.latest("id").id + 1)
    super(Molecule, self).save(*args, **kwargs)
inpath = path + '../data/' # read, filter and write the commercial compounds count = 0 outfile = gzip.open(path + 'commercial_cmps_cleaned.dat.gz', 'w') outfile.write("#Identifier\tSMILES\n") for line in gzip.open(inpath + 'parent.smi.gz', 'r'): if line[0] == "#": continue line = line.rstrip().split() # contains: [smiles, identifier] m = Chem.MolFromSmiles(line[0]) if m is None: continue # number of heavy atoms num_ha = m.GetNumHeavyAtoms() if num_ha < 15 or num_ha > 50: continue # molecular weight mw = AllChem.CalcExactMolWt(m) if mw < 200 or mw > 700: continue # number of rotatable bonds num_rb = AllChem.CalcNumRotatableBonds(m) if num_rb > 8: continue # number of H-bond donors and acceptors num_hba = AllChem.CalcNumHBA(m) num_hbd = AllChem.CalcNumHBD(m) if num_hba > 10 or num_hbd > 5: continue # keep the molecule outfile.write("%s\t%s\n" % (line[0], line[1])) count += 1 outfile.close() print "number of molecules that passed the filters:", count
def run(
    mol2=None,
    smiles=None,
    standardise=STANDARDISE_DEF,
    num_conf=NUM_CONF_DEF,
    first=FIRST_DEF,
    pool_multiplier=POOL_MULTIPLIER_DEF,
    rmsd_cutoff=RMSD_CUTOFF_DEF,
    max_energy_diff=MAX_ENERGY_DIFF_DEF,
    forcefield=FORCEFIELD_DEF,
    seed=SEED_DEF,
    params=None,
    prioritize=False,
    out_dir=OUTDIR_DEF,
    compress=COMPRESS_DEF,
    overwrite=False,
    values_file=None,
    log=None,
    num_proc=None,
    parallel_mode=None,
    verbose=False,
):
    """Run conformer generation.

    Exactly one of ``mol2`` and ``smiles`` must be given; each is a sequence
    of input files. The master process builds the molecule iterator and logs
    the settings, every process then takes part in
    ``para.run_gen(generate_conformers, ...)``, and the master optionally
    stores the results through an ``HDF5Buffer``.

    :param params: when not ``None`` it is read with ``read_params`` and
        overrides ``standardise``, ``num_conf``, ``first``,
        ``pool_multiplier``, ``rmsd_cutoff``, ``max_energy_diff``,
        ``forcefield`` and ``seed``
    :param prioritize: process molecules in ascending order of rotatable
        bond count, then exact molecular weight
    :param values_file: file the master writes results to; it is appended to
        when it exists and ``overwrite`` is not ``True``
    :param log: passed to ``setup_logging`` together with ``verbose``
    :param num_proc: passed to ``Parallelizer`` with ``parallel_mode``
    :raises ValueError: if ``forcefield`` is not in ``FORCEFIELD_CHOICES``

    The remaining arguments are forwarded to ``generate_conformers``.
    """
    setup_logging(log, verbose=verbose)

    if params is not None:
        params = read_params(params)
        standardise = get_value(params, "preprocessing", "standardise", bool)
        num_conf = get_value(params, "conformer_generation", "num_conf", int)
        first = get_value(params, "conformer_generation", "first", int)
        pool_multiplier = get_value(params, "conformer_generation",
                                    "pool_multiplier", int)
        rmsd_cutoff = get_value(params, "conformer_generation", "rmsd_cutoff",
                                float)
        max_energy_diff = get_value(params, "conformer_generation",
                                    "max_energy_diff", float)
        forcefield = get_value(params, "conformer_generation", "forcefield")
        seed = get_value(params, "conformer_generation", "seed", int)

    # check args
    if forcefield not in FORCEFIELD_CHOICES:
        raise ValueError(
            "Specified forcefield {} is not in valid options {!r}".format(
                forcefield, FORCEFIELD_CHOICES))

    para = Parallelizer(num_proc=num_proc, parallel_mode=parallel_mode)

    # Check to make sure args make sense. Only the master reports the
    # problem, but every process exits.
    # NOTE(review): `parser` is not defined in this function; presumably a
    # module-level argument parser -- confirm it exists at call time.
    if mol2 is None and smiles is None:
        if para.is_master():
            parser.print_usage()
            logging.error("Please provide mol2 file or a SMILES file.")
        sys.exit()

    if mol2 is not None and smiles is not None:
        if para.is_master():
            parser.print_usage()
            logging.error("Please provide only a mol2 file OR a SMILES file.")
        sys.exit()

    if num_proc and num_proc < 1:
        if para.is_master():
            parser.print_usage()
            logging.error(
                "Please provide more than one processor with `--num_proc`.")
        sys.exit()

    # Set up input type
    if mol2 is not None:
        in_type = "mol2"
    elif smiles is not None:
        in_type = "smiles"

    if para.is_master():
        if in_type == "mol2":
            logging.info("Input type: mol2 file(s)")
            logging.info("Input file number: {:d}".format(len(mol2)))
            mol_iter = (mol_from_mol2(_mol2_file,
                                      _name,
                                      standardise=standardise)
                        for _mol2_file, _name in mol2_generator(*mol2))
        else:
            logging.info("Input type: Detected SMILES file(s)")
            logging.info("Input file number: {:d}".format(len(smiles)))
            mol_iter = (mol_from_smiles(_smiles,
                                        _name,
                                        standardise=standardise)
                        for _smiles, _name in smiles_generator(*smiles))

        if prioritize:
            logging.info(("Prioritizing mols with low rotatable bond number"
                          " and molecular weight first."))
            mols_with_properties = [(
                AllChem.CalcNumRotatableBonds(mol),
                AllChem.CalcExactMolWt(mol),
                mol,
            ) for mol in mol_iter if mol is not None]
            # Sort on the two numbers only: without a key, two molecules
            # with equal values would make sorted() compare the molecule
            # objects themselves, which raises TypeError on Python 3. The
            # sort is stable, so ties keep their input order.
            data_iterator = make_data_iterator(
                (x[-1] for x in sorted(mols_with_properties,
                                       key=lambda item: item[:2])))
        else:
            data_iterator = make_data_iterator(
                (x for x in mol_iter if x is not None))

        # Set up parallel-specific options
        logging.info("Parallel Type: {}".format(para.parallel_mode))

        # Set other options
        touch_dir(out_dir)

        if not num_conf:
            num_conf = -1

        logging.info("Out Directory: {}".format(out_dir))
        logging.info("Overwrite Existing Files: {}".format(overwrite))
        if values_file is not None:
            if os.path.exists(values_file) and overwrite is not True:
                value_args = (values_file, "a")
                logging.info("Values file: {} (append)".format((values_file)))
            else:
                value_args = (values_file, "w")
                logging.info("Values file: {} (new file)".format(
                    (values_file)))
        if num_conf is None or num_conf == -1:
            logging.info("Target Conformer Number: auto")
        else:
            logging.info("Target Conformer Number: {:d}".format(num_conf))
        if first is None or first == -1:
            logging.info("First Conformers Number: all")
        else:
            logging.info("First Conformers Number: {:d}".format(first))
        logging.info("Pool Multiplier: {:d}".format(pool_multiplier))
        logging.info("RMSD Cutoff: {:.4g}".format(rmsd_cutoff))
        if max_energy_diff is None:
            logging.info("Maximum Energy Difference: None")
        else:
            logging.info("Maximum Energy Difference: {:.4g} kcal".format(
                max_energy_diff))
        logging.info("Forcefield: {}".format(forcefield.upper()))
        if seed != -1:
            logging.info("Seed: {:d}".format(seed))

        logging.info("Starting.")
    else:
        # Non-master processes contribute no input of their own.
        data_iterator = iter([])

    gen_conf_kwargs = {
        "out_dir": out_dir,
        "num_conf": num_conf,
        "rmsd_cutoff": rmsd_cutoff,
        "max_energy_diff": max_energy_diff,
        "forcefield": forcefield,
        "pool_multiplier": pool_multiplier,
        "first": first,
        "seed": seed,
        "save": True,
        "overwrite": overwrite,
        "compress": compress,
    }

    run_kwargs = {"kwargs": gen_conf_kwargs}

    results_iterator = para.run_gen(generate_conformers, data_iterator,
                                    **run_kwargs)

    # `value_args` was set above under exactly this condition.
    if para.is_master() and values_file is not None:
        hdf5_buffer = HDF5Buffer(*value_args)

    for result, data in results_iterator:
        if (para.is_master() and values_file is not None
                and result is not False):
            values_to_hdf5(hdf5_buffer, result)

    if para.is_master() and values_file is not None:
        hdf5_buffer.flush()
        hdf5_buffer.close()
def __call__(self, mol):
    """Return the exact molecular weight of ``mol`` as a one-element tuple.

    An RDKit molecule is constructed from ``mol.binary`` and passed to
    ``AllChem.CalcExactMolWt``.
    """
    rebuilt = Chem.Mol(mol.binary)
    exact_weight = AllChem.CalcExactMolWt(rebuilt)
    return (exact_weight, )
# NOTE(review): this is the body of a loop whose header lies outside this
# excerpt; `can`, `name`, `mol`, `conn`, `missingname` and `options` are
# defined there.
if len(can) > 250:  # canonical SMILES this long is treated as way too big
    sys.stderr.write('%s is too large. Omitted.\n' % name)
    continue
cursor = conn.cursor()
# Look the canonical SMILES up in the structures table.
cursor.execute('SELECT sdfloc FROM structures WHERE smile = %s',
               (can, ))
row = cursor.fetchone()
# "New" means: no row at all, no recorded sdf location, or the recorded
# file no longer exists on disk.
isnew = (row == None) or (row[0] == None) or (not os.path.exists(row[0]))
sdfloc = None
if row == None:  # insert without sdfs to get unique id
    cursor.execute(
        'INSERT INTO structures (smile,weight) VALUES(%s,%s) ',
        (can, Chem.CalcExactMolWt(mol)))
elif row[0] and not os.path.exists(row[0]) and 'conformer' in row[
        0]:  # hacky workaround for a past mistake with prefixes
    sdfloc = row[0]  # previously generated, but lost!
# get unique id
cursor.execute('SELECT id FROM structures WHERE smile = %s', (can, ))
result = cursor.fetchone()
uniqueid = result[0]
# we always update the name unless otherwise specified
if not missingname and not options.nonames:
    cursor.execute(
        'SELECT * FROM names WHERE smile = %s and name = %s',
        (can, name))
# NOTE(review): this is the body of a loop whose header lies outside this
# excerpt; `smile`, `name`, `conn`, `missingname` and `options` are defined
# there.
mol = Chem.MolFromSmiles(smile)
Chem.SanitizeMol(mol)
# to be sure, canonicalize smile (with iso)
can = Chem.MolToSmiles(mol,isomericSmiles=True)
if len(can) > 250:  # canonical SMILES this long is treated as way too big
    sys.stderr.write('%s is too large. Omitted.\n' % name)
    continue
cursor = conn.cursor()
# Look the canonical SMILES up in the structures table.
cursor.execute('SELECT sdfloc FROM structures WHERE smile = %s', (can,))
row = cursor.fetchone()
# "New" means: no row at all, no recorded sdf location, or the recorded
# file no longer exists on disk.
isnew = (row == None) or (row[0] == None) or (not os.path.exists(row[0]))
sdfloc = None
if row == None:  # insert without sdfs to get unique id
    cursor.execute('INSERT INTO structures (smile,weight) VALUES(%s,%s) ', (can, Chem.CalcExactMolWt(mol)))
elif row[0] and not os.path.exists(row[0]) and 'conformer' in row[0]:  # hacky workaround for a past mistake with prefixes
    sdfloc = row[0]  # previously generated, but lost!
# get unique id
cursor.execute('SELECT id FROM structures WHERE smile = %s', (can,))
result = cursor.fetchone();
uniqueid = result[0]
# we always update the name unless otherwise specified
if not missingname and not options.nonames:
    cursor.execute('SELECT * FROM names WHERE smile = %s and name = %s', (can,name))
    row = cursor.fetchone()
    if row == None:
        # Only add the (smile, name) pair when it is not stored yet.
        cursor.execute('INSERT IGNORE INTO names (smile,name) VALUES(%s,%s)', (can,name))
# NOTE(review): the commit is assumed to sit at loop-body level so that the
# structures insert is committed even when names are skipped -- confirm
# against the original indentation.
conn.commit()
def create_sdf_ligs(conn, libraryid, cprefixes):
    """Split ``input.sdf.gz`` into one gzipped SDF file per molecule.

    Consecutive records with the same canonical isomeric SMILES are treated
    as conformers of one molecule and written to the same file. Each new
    molecule gets (or reuses) an id from the ``structures`` table of the
    ``conformers`` database, is assigned to the next prefix of ``cprefixes``
    in round-robin order and is listed in ``ligs.in`` in the current
    directory as ``<file> <id> <name>``.

    :param conn: not used by this function
    :param libraryid: library identifier used in the output directory name
    :param cprefixes: sequence of directory prefixes to distribute files over
    :return: ``True`` on success, ``False`` if processing failed as a whole
        (the traceback is printed); failures of individual records are
        printed and skipped
    """
    whichprefix = -1
    lastsmi = ''
    fname = False
    confconn = MySQLdb.connect(host="localhost", user="******", db="conformers")
    # NOTE(review): inserts made through `confconn` are never committed in
    # this function -- confirm that autocommit (or a non-transactional
    # engine) is relied upon.
    ligout = None
    infile = None
    try:
        ligout = open("ligs.in", 'wt')
        infile = gzip.open('input.sdf.gz')
        mols = Chem.ForwardSDMolSupplier(infile)
        molcnt = 0
        for mol in mols:
            try:
                if mol is None:
                    continue
                Chem.SanitizeMol(mol)
                can = Chem.MolToSmiles(mol, isomericSmiles=True)
                if can != lastsmi:  # a different molecule
                    if len(can) > 250:  # way too big
                        continue
                    molcnt += 1
                    # get/assign a unique id
                    cursor = confconn.cursor()
                    cursor.execute(
                        'SELECT id FROM structures WHERE smile = %s',
                        (can, ))
                    row = cursor.fetchone()
                    if row is None:
                        # insert without sdfs to get unique id
                        cursor.execute(
                            'INSERT INTO structures (smile,weight) VALUES(%s,%s) ',
                            (can, Chem.CalcExactMolWt(mol)))
                        cursor.execute(
                            'SELECT id FROM structures WHERE smile = %s',
                            (can, ))
                        row = cursor.fetchone()
                    uniqueid = row[0]
                    # we do not store the user supplied conformers, but
                    # leave sdfloc blank

                    if fname:
                        # we have a file we have previously opened
                        writer.close()
                        out.close()
                    whichprefix = (whichprefix + 1) % len(cprefixes)
                    subdir = "%s/user/%s/" % (cprefixes[whichprefix],
                                              libraryid)
                    if not os.path.isdir(subdir):
                        os.makedirs(subdir)
                    fname = "%s/%d.sdf.gz" % (subdir, uniqueid)
                    out = gzip.open(fname, 'wt')
                    writer = Chem.SDWriter(out)
                    if mol.HasProp('_Name'):
                        name = mol.GetProp('_Name')
                    else:
                        name = str(molcnt)
                    ligout.write('%s %d %s\n' % (fname, uniqueid, name))
                    # Remember the molecule whose file is now open. Without
                    # this every conformer looked like a new molecule and
                    # reopened (truncating) the same output file.
                    lastsmi = can
                # have file setup for this molecule, may be conformer
                writer.write(mol)
            except Exception:  # catch rdkit issues
                traceback.print_exc()
                continue
        if fname:
            writer.close()
            out.close()
        return True
    except Exception:
        traceback.print_exc()
        return False
    finally:
        # Close the listing and the input on every path so that ligs.in is
        # completely written before the caller reads it.
        for handle in (infile, ligout):
            if handle is not None:
                handle.close()
def insert_compound(self, mol_object, compound_dict=None, bulk=None,
                    kegg_db="KEGG", pubchem_db='PubChem-8-28-2015',
                    modelseed_db='ModelSEED'):
    """This method saves a RDKit Molecule as a compound entry in the MINE.
    Calculates necessary fields for API and includes additional
    information passed in the compound dict. Overwrites preexisting
    compounds in MINE on _id collision.

    The passed ``compound_dict`` is filled in place with the calculated
    values before it is stored, and the compound's mass is remembered in
    ``self._mass_cache`` under its ``_id``.

    :param mol_object: The compound to be stored
    :type mol_object: RDKit Mol object
    :param compound_dict: Additional information about the compound to be
        stored. Overwritten by calculated values.
    :type compound_dict: dict
    :param bulk: A pymongo bulk operation object. If None, the compound is
        immediately inserted in the database
    :param kegg_db: The ID of the KEGG Mongo database
    :type kegg_db: str
    :param pubchem_db: The ID of the PubChem Mongo database
    :type pubchem_db: str
    :param modelseed_db: The ID of the ModelSEED Mongo database
    :type modelseed_db: str
    :return: The hashed _id of the compound
    :rtype: str
    """
    if compound_dict is None:
        compound_dict = {}

    # Store all different representations of the molecule (SMILES, Formula,
    # InChI key, etc.) as well as its properties in a dictionary
    compound_dict['SMILES'] = AllChem.MolToSmiles(mol_object, True)
    compound_dict['Inchi'] = AllChem.MolToInchi(mol_object)
    compound_dict['Inchikey'] = AllChem.InchiToInchiKey(
        compound_dict['Inchi'])
    compound_dict['Mass'] = AllChem.CalcExactMolWt(mol_object)
    compound_dict['Formula'] = AllChem.CalcMolFormula(mol_object)
    compound_dict['Charge'] = AllChem.GetFormalCharge(mol_object)
    # Get indices where bits are 1
    compound_dict['MACCS'] = list(
        AllChem.GetMACCSKeysFingerprint(mol_object).GetOnBits())
    compound_dict['len_MACCS'] = len(compound_dict['MACCS'])
    # Get indices where bits are 1
    compound_dict['RDKit'] = list(
        AllChem.RDKFingerprint(mol_object).GetOnBits())
    compound_dict['len_RDKit'] = len(compound_dict['RDKit'])
    compound_dict['logP'] = AllChem.CalcCrippenDescriptors(mol_object)[0]
    # The second argument tells the hash whether this is a coreactant.
    compound_dict['_id'] = utils.compound_hash(
        compound_dict['SMILES'],
        ('Type' in compound_dict and compound_dict['Type'] == 'Coreactant'))
    if '_atom_count' in compound_dict:
        del compound_dict['_atom_count']
    # Caching this for rapid reaction mass change calculation
    self._mass_cache[compound_dict['_id']] = compound_dict['Mass']

    # If the compound is a reactant and the reaction list arrived as a
    # non-empty string, parse it back into a Python literal.
    if "Reactant_in" in compound_dict and isinstance(
            compound_dict['Reactant_in'], str) \
            and compound_dict['Reactant_in']:
        compound_dict['Reactant_in'] = ast.literal_eval(
            compound_dict['Reactant_in'])

    # Same treatment for the list of reactions the compound is a product of.
    if "Product_of" in compound_dict \
            and isinstance(compound_dict['Product_of'], str) \
            and compound_dict['Product_of']:
        compound_dict['Product_of'] = ast.literal_eval(
            compound_dict['Product_of'])

    # Store links to external databases where compound is present
    if compound_dict['Inchikey']:
        if kegg_db:
            compound_dict = self.link_to_external_database(
                kegg_db, compound=compound_dict,
                fields_to_copy=[('Pathways', 'Pathways'),
                                ('Names', 'Names'),
                                ('DB_links', 'DB_links'),
                                ('Enzymes', 'Enzymes')])

        if pubchem_db:
            compound_dict = self.link_to_external_database(
                pubchem_db, compound=compound_dict,
                fields_to_copy=[('COMPOUND_CID', 'DB_links.PubChem')])

        if modelseed_db:
            compound_dict = self.link_to_external_database(
                modelseed_db, compound=compound_dict,
                fields_to_copy=[('DB_links', 'DB_links')])

    # Calculate natural product likeness score and store in dict. The model
    # is loaded on first use and then kept on the instance.
    # NOTE(review): `np` here provides readNPModel/scoreMol, so it is
    # presumably an NP-likeness scorer module rather than numpy -- confirm
    # against the file's imports.
    if not self.np_model:
        self.np_model = np.readNPModel()
    compound_dict["NP_likeness"] = np.scoreMol(mol_object, self.np_model)

    compound_dict = utils.convert_sets_to_lists(compound_dict)
    # Assign an id to the compound
    if self.id_db:
        mine_comp = self.id_db.compounds.find_one(
            {"Inchikey": compound_dict['Inchikey']}, {
                'MINE_id': 1,
                "Pos_CFM_spectra": 1,
                "Neg_CFM_spectra": 1
            })
        # If compound already exists in MINE, store its MINE id in the dict
        if mine_comp:
            compound_dict['MINE_id'] = mine_comp['MINE_id']
            if 'Pos_CFM_spectra' in mine_comp:
                compound_dict['Pos_CFM_spectra'] = mine_comp[
                    'Pos_CFM_spectra']
            if 'Neg_CFM_spectra' in mine_comp:
                compound_dict['Neg_CFM_spectra'] = mine_comp[
                    'Neg_CFM_spectra']
        # If compound does not exist, create new id based on number of
        # current ids in the MINE
        else:
            compound_dict['MINE_id'] = self.id_db.compounds.count()
            self.id_db.compounds.save(compound_dict)

    # If bulk insertion, upsert (insert and update) the database
    if bulk:
        bulk.find({'_id': compound_dict['_id']}).upsert().\
            replace_one(compound_dict)
    else:
        self.compounds.save(compound_dict)
    return compound_dict['_id']