Пример #1
0
    def molecular_weight_filter_wrapper(*args, **kwargs):
        """Call the wrapped function and keep only results within the weight limit.

        Each item produced by ``original_func(*args, **kwargs)`` is kept when
        the exact molecular weight of its ``mol`` attribute is at most
        ``config.MAX_MW``.

        :return: The accepted items, in the order they were produced
        :rtype: list
        """
        return [
            candidate for candidate in original_func(*args, **kwargs)
            if AllChem.CalcExactMolWt(candidate.mol) <= config.MAX_MW
        ]
Пример #2
0
 def set_computable(self):
     """Fill in the attributes that are derived from the stored structure.

     The text in ``self._mol`` is parsed with
     ``tool_chemical.read_string("mol", ...)``; the resulting molecule is then
     used to set the non-isomeric SMILES, InChI, InChIKey, molecular formula
     and exact molecular weight attributes.
     """
     parsed_mol = tool_chemical.read_string("mol", self._mol)

     # Structure identifiers
     self._smiles = Chem.MolToSmiles(parsed_mol, isomericSmiles=False)
     self._inchi = inchi.MolToInchi(parsed_mol)
     self._inchikey = inchi.MolToInchiKey(parsed_mol)

     # Composition properties
     self._molecular_formula = Chem.CalcMolFormula(parsed_mol)
     self._molecular_weight = Chem.CalcExactMolWt(parsed_mol)
Пример #3
0
def featurize(aa):
    """Compute a set of RDKit descriptors for a sequence string.

    The sequence ``aa`` is turned into a molecule with ``Chem.MolFromFASTA``
    and explicit hydrogens are added before the descriptors are calculated.

    NOTE(review): the visible body builds ``descriptors`` but contains no
    ``return`` statement, so as shown the function returns ``None``. The
    snippet looks truncated -- confirm against the full source.
    """
    mol = Chem.MolFromFASTA(aa)
    # All descriptors below are computed on the hydrogen-added molecule
    mol = Chem.AddHs(mol)
    descriptors = {
        'MolWT': AllChem.CalcExactMolWt(mol),
        'LogP': Chem.Crippen.MolLogP(mol),
        'HBondDonors': AllChem.CalcNumLipinskiHBD(mol),
        'HBondAcceptors': AllChem.CalcNumLipinskiHBA(mol),
        'nAromaticRings': AllChem.CalcNumAromaticRings(mol),
        'nHeteroAtoms': AllChem.CalcNumHeteroatoms(mol),
        'nRotatableBonds': AllChem.CalcNumRotatableBonds(mol)
    }
Пример #4
0
    def insert_core_compound(self, compound_dict, requests=None):
        """Generate a mongo request that saves a compound into the core database.

        A shallow copy of ``compound_dict`` is stripped of the
        expansion-specific fields and completed with the fields calculated
        from its SMILES string. The document is written with ``$setOnInsert``
        and ``upsert=True``, so nothing is changed if a compound with the
        same ``_id`` already exists in the core database.

        If a list of requests is given, the request is appended to it for
        later bulk writing. Otherwise a single write is made immediately.

        :param compound_dict: Compound dictionary; must contain ``_id`` and
            ``SMILES``
        :type compound_dict: dict
        :param requests: List of requests for bulk insert
        :type requests: list or None
        :raises KeyError: If ``_id`` or ``SMILES`` is missing
        :return: None
        """
        core_dict = copy(compound_dict)
        cpd_id = core_dict['_id']
        # SMILES is required here, so no fallback for a missing SMILES is
        # needed further down.
        mol_object = AllChem.MolFromSmiles(core_dict['SMILES'])

        # Fields that describe one particular expansion are not stored in
        # the core database.
        for key in ('Generation', 'Expand', 'Type', 'Product_of',
                    'Reactant_in'):
            core_dict.pop(key, None)

        # Store the different representations of the molecule (InChI,
        # InChI key, formula, ...) as well as its properties
        if 'Inchi' not in core_dict:
            core_dict['Inchi'] = AllChem.MolToInchi(mol_object)
        if 'Inchikey' not in core_dict:
            core_dict['Inchikey'] = AllChem.InchiToInchiKey(core_dict['Inchi'])
        core_dict['Mass'] = AllChem.CalcExactMolWt(mol_object)
        core_dict['Formula'] = AllChem.CalcMolFormula(mol_object)
        core_dict['logP'] = AllChem.CalcCrippenDescriptors(mol_object)[0]
        core_dict['NP_likeness'] = nps.scoreMol(mol_object, self.nps_model)
        core_dict['Spectra'] = {}
        # Record which expansion it's coming from
        core_dict['MINES'] = []

        selector = {'_id': cpd_id}
        update = {'$setOnInsert': core_dict}
        # ``is not None`` so that an empty request list is still appended to
        if requests is not None:
            requests.append(pymongo.UpdateOne(selector, update, upsert=True))
        else:
            self.core_compounds.update_one(selector, update, upsert=True)

        return None
Пример #5
0
def convert_to_nM(radek):
    """Convert a row's activity from ug.mL-1 to nM.

    Rows whose ``ACTIVITY_UNITS`` is anything other than ``"ug.mL-1"`` are
    returned untouched. Otherwise ``ACTIVITY`` and ``ACTIVITY_UNITS`` are
    overwritten on the row itself, using the exact molecular weight of
    ``MOL_OBJECT``.

    :param radek: Mapping-like row with ``ACTIVITY``, ``ACTIVITY_UNITS`` and
        (for ug.mL-1 rows) ``MOL_OBJECT`` entries
    :return: The same row object that was passed in
    """
    # Only activities given in ug.mL-1 are converted
    if radek["ACTIVITY_UNITS"] != "ug.mL-1":
        return radek

    # float() rather than int(): int() silently truncates fractional
    # activities (0.5 -> 0) and rejects strings such as "0.5".
    act = float(radek["ACTIVITY"])
    molwt = AllChem.CalcExactMolWt(radek["MOL_OBJECT"])

    # (ug/mL) / (g/mol) = mmol/L, and 1 mmol/L = 1e6 nM
    radek["ACTIVITY"] = (act / molwt) * 1000000
    radek["ACTIVITY_UNITS"] = "nM"

    return radek
    def test_api_addMolecule(self):
        """POST a molfile to /api/addMolecule and verify the stored record.

        The most recently stored ``Molecule`` must carry the weight (two
        decimals), SMILES, formula, InChI and InChI key computed from the
        same molblock.
        """
        response = self.client.post(path="/api/addMolecule", data={"molfile": self.propane})
        self.assertEqual(response.status_code, 200)

        reference = AllChem.MolFromMolBlock(self.propane)
        stored = Molecule.objects.last()

        expected_mw = float(format(AllChem.CalcExactMolWt(reference), ".2f"))
        self.assertEqual(expected_mw, stored.mw)

        expected_smiles = AllChem.MolToSmiles(reference)
        self.assertEqual(expected_smiles, stored.smiles)

        expected_formula = AllChem.CalcMolFormula(reference)
        self.assertEqual(expected_formula, stored.sum_formula)

        expected_inchi = AllChem.MolToInchi(reference)
        self.assertEqual(expected_inchi, stored.inchi)

        expected_key = AllChem.InchiToInchiKey(expected_inchi)
        self.assertEqual(expected_key, stored.inchi_key)
def _make_compound_info(mol_object):
    """Build the info dictionary describing a molecule.

    The result holds the SMILES, InChI key, average and exact mass, formula,
    formal charge, the on-bits of the MACCS and RDKit fingerprints (each as a
    ``{str(bit): 1}`` mapping) and an empty ``dblinks`` dictionary.
    """
    info = {}
    info['smiles'] = AllChem.MolToSmiles(mol_object, True)

    inchi_string = AllChem.MolToInchi(mol_object)
    info['inchikey'] = AllChem.InchiToInchiKey(inchi_string)

    info['mass'] = Descriptors.MolWt(mol_object)
    info['exactmass'] = AllChem.CalcExactMolWt(mol_object)
    info['formula'] = AllChem.CalcMolFormula(mol_object)
    info['charge'] = AllChem.GetFormalCharge(mol_object)

    maccs_bits = AllChem.GetMACCSKeysFingerprint(mol_object).GetOnBits()
    maccs = {str(bit): 1 for bit in maccs_bits}
    rdkit_bits = AllChem.RDKFingerprint(mol_object).GetOnBits()
    rdkit = {str(bit): 1 for bit in rdkit_bits}
    info['fingerprints'] = {'maccs': maccs, 'rdkit': rdkit}

    info['dblinks'] = {}
    return info
Пример #8
0
def filter_size(in_lines, maxweight=650, Verbose=False):
    """Remove compounds whose exact molecular weight reaches ``maxweight``.

    This needs to be run after the structure standardization and desalting
    step.

    The rows to remove are collected in a single pass over the
    ``canonical_smiles`` column and dropped with one ``drop`` call, instead
    of re-copying the frame once per removed row. Rows are addressed by
    their actual index labels, so the frame does not need a default
    ``0..n-1`` index.

    :param in_lines: Frame with a ``canonical_smiles`` column
    :type in_lines: pandas.DataFrame
    :param maxweight: Weight limit; rows with a weight greater than or equal
        to it are removed
    :param Verbose: If true, print the number of remaining compounds
    :return: The filtered frame with a fresh ``0..n-1`` index
    :rtype: pandas.DataFrame
    """
    too_heavy = [
        label for label, smiles in in_lines['canonical_smiles'].items()
        if Chem.CalcExactMolWt(Chem.MolFromSmiles(smiles)) >= maxweight
    ]
    if too_heavy:
        in_lines = in_lines.drop(too_heavy)

    if Verbose:
        print('Number of compounds after molecular weight filter: ',
              len(in_lines))

    return in_lines.reset_index(drop=True)
Пример #9
0
def sdf_parser(soubor):
    """Store the molecules of an SDF file as ``Molecule`` records.

    Entries the supplier cannot read are skipped. A molecule whose InChI key
    is already present in the database is reported and not stored again.

    The name is resolved separately for every molecule: the first line of
    the ``PUBCHEM_SUBSTANCE_SYNONYM`` property if present, otherwise the
    ``_Name`` property, otherwise an empty string. This prevents a missing
    synonym from raising ``UnboundLocalError`` on the first molecule or
    silently reusing the previous molecule's name.

    :param soubor: Path of the SDF file, passed to ``Chem.SDMolSupplier``
    :return: Number of readable molecules in the file, including the ones
        that were already stored
    :rtype: int
    """
    mol_counter = 0
    suppl = Chem.SDMolSupplier(soubor)
    for mol in suppl:
        if mol is None:
            continue
        print(mol.GetNumAtoms())
        mol_counter += 1
        new_inchi = Chem.MolToInchi(mol)
        new_inchikey = Chem.InchiToInchiKey(new_inchi)

        # Check whether the molecule is already in the database by its
        # InChI key, which is expected to be unique
        if Molecule.objects.filter(inchikey=new_inchikey).exists():
            print(mol, "already exists")
            continue

        new_smiles = Chem.MolToSmiles(mol)
        new_summaryForm = AllChem.CalcMolFormula(mol)
        new_molweigth = AllChem.CalcExactMolWt(mol)

        if mol.HasProp('PUBCHEM_SUBSTANCE_SYNONYM'):
            new_name = mol.GetProp('PUBCHEM_SUBSTANCE_SYNONYM').split("\n")[0]
        elif mol.HasProp('_Name'):
            new_name = mol.GetProp('_Name')
        else:
            new_name = ''

        newInsertedMolecule = Molecule(name=new_name,
                                       smiles=new_smiles,
                                       mol_weight=new_molweigth,
                                       inchi=new_inchi,
                                       inchikey=new_inchikey,
                                       summary_formula=new_summaryForm)
        newInsertedMolecule.save()

    return mol_counter
Пример #10
0
def _get_core_cpd_insert(cpd_dict: dict) -> pymongo.UpdateOne:
    """Generate the upsert request for a core compound.

    Only the core keys of ``cpd_dict`` that hold a value are carried over;
    the remaining fields are calculated from the SMILES string. The request
    uses ``$setOnInsert`` with ``upsert=True``, so an existing document with
    the same ``_id`` is left unchanged.

    :param cpd_dict: Compound dictionary; must contain ``_id`` and ``SMILES``
    :raises KeyError: If ``_id`` or ``SMILES`` is missing or ``None``
    :return: The request to hand to a bulk write
    """
    core_keys = ["_id", "SMILES", "Inchi", "InchiKey", "Mass", "Formula"]
    core_dict = {}
    for key in core_keys:
        value = cpd_dict.get(key)
        if value is not None:
            core_dict[key] = value

    # SMILES is required here, so no fallback for a missing SMILES is needed
    mol_object = AllChem.MolFromSmiles(core_dict["SMILES"])
    # Indices of the bits that are set in the 512-bit RDKit fingerprint
    rdk_fp = list(AllChem.RDKFingerprint(mol_object, fpSize=512).GetOnBits())

    # Store the different representations of the molecule (InChI,
    #  InChI key, etc.) as well as its properties in a dictionary
    if "Inchi" not in core_dict:
        core_dict["Inchi"] = AllChem.MolToInchi(mol_object)
    # NOTE(review): core_keys copies "InchiKey" while this check uses
    # "Inchikey", so an incoming "InchiKey" is kept AND "Inchikey" is added.
    # Confirm which spelling the database schema expects.
    if "Inchikey" not in core_dict:
        core_dict["Inchikey"] = AllChem.InchiToInchiKey(core_dict["Inchi"])

    core_dict["Mass"] = AllChem.CalcExactMolWt(mol_object)
    core_dict["Charge"] = AllChem.GetFormalCharge(mol_object)
    core_dict["Formula"] = AllChem.CalcMolFormula(mol_object)
    core_dict["logP"] = AllChem.CalcCrippenDescriptors(mol_object)[0]
    core_dict["RDKit_fp"] = rdk_fp
    core_dict["len_RDKit_fp"] = len(rdk_fp)
    core_dict["Spectra"] = {}
    # Record which expansion it's coming from
    core_dict["MINES"] = []

    return pymongo.UpdateOne({"_id": core_dict["_id"]},
                             {"$setOnInsert": core_dict},
                             upsert=True)
Пример #11
0
    def save(self,
             smiles=None,
             molfile=None,
             rdmol=None,
             inchi=None,
             name=None,
             update=False,
             *args,
             **kwargs):
        """Create the record from a structure, or persist an existing one.

        With ``update=True`` the instance is passed straight to the parent
        ``save``. Otherwise a molecule is built from the first structure
        source supplied (checked in the order ``molfile``, ``smiles``,
        ``rdmol``, ``inchi``), all derived fields are filled in, and the
        record is stored.

        :param smiles: SMILES string of the structure
        :param molfile: Mol block of the structure
        :param rdmol: Ready-made RDKit molecule
        :param inchi: InChI string of the structure
        :param name: Name to store; if omitted, the molecule's ``LONGNAME``
            property is used when present, else ``None``
        :param update: If true, skip all structure handling
        :raises MoleculeCreationError: If no structure source was supplied or
            no molecule could be built from it
        :raises MoleculeExistsInDatabase: If the InChI is empty, is a single
            character, or is already stored
        """
        if update:
            super(Molecule, self).save(*args, **kwargs)
            return

        # Start from None so that a call without any structure source raises
        # MoleculeCreationError rather than UnboundLocalError.
        mol = None
        if molfile:
            mol = AllChem.MolFromMolBlock(molfile)
        elif smiles:
            mol = AllChem.MolFromSmiles(smiles)
        elif rdmol:
            mol = rdmol
        elif inchi:
            mol = AllChem.MolFromInchi(inchi)

        if not mol:
            raise self.MoleculeCreationError

        inchi = AllChem.MolToInchi(mol)
        smiles = AllChem.MolToSmiles(mol)

        is_new = (inchi and Molecule.objects.filter(inchi=inchi).count() == 0
                  and len(inchi) > 1)
        if not is_new:
            raise self.MoleculeExistsInDatabase(smiles)

        self.inchi = inchi
        self.mw = float("{0:.2f}".format(AllChem.CalcExactMolWt(mol)))
        self.sum_formula = AllChem.CalcMolFormula(mol)
        self.fingerprint = AllChem.GetMorganFingerprintAsBitVect(
            mol, 4, nBits=1024).ToBitString()
        self.inchi_key = AllChem.InchiToInchiKey(self.inchi)
        self.molfile = AllChem.MolToMolBlock(mol)
        self.smiles = smiles
        self.rdmol = mol

        # generating SVG image
        if self.smiles not in self.EXCLUDED_MOLECULES:
            binMol = AllChem.Mol(self.rdmol.ToBinary())

            # 2D coordinates are only computed when the molecule has no
            # conformer yet
            if not binMol.GetNumConformers():
                rdDepictor.Compute2DCoords(self.rdmol)

            drawer = rdMolDraw2D.MolDraw2DSVG(100, 100)
            drawer.DrawMolecule(self.rdmol)
            drawer.FinishDrawing()
            svg = drawer.GetDrawingText().replace('svg:', '')

            # remove first line containing XML meta information
            self.image_svg = "\n".join(svg.split("\n")[1:]).strip()
        else:
            self.image_svg = None

        if name:
            self.name = name
        else:
            try:
                self.name = mol.GetProp("LONGNAME")
            except KeyError:
                self.name = None

        if Molecule.objects.all().count() == 0:
            self.internal_id = "MI-J-1"
        else:
            self.internal_id = "MI-J-{}".format(
                Molecule.objects.latest("id").id + 1)

        super(Molecule, self).save(*args, **kwargs)
Пример #12
0
inpath = path + '../data/'

# read, filter and write the commercial compounds
# Both gzip files are opened in a ``with`` block so they are closed even if
# a line fails to process.
# NOTE(review): the header announces "Identifier<TAB>SMILES" but each row is
# written as SMILES<TAB>identifier -- confirm which order consumers expect.
count = 0
with gzip.open(path + 'commercial_cmps_cleaned.dat.gz', 'w') as outfile, \
        gzip.open(inpath + 'parent.smi.gz', 'r') as infile:
    outfile.write("#Identifier\tSMILES\n")
    for line in infile:
        if line[0] == "#": continue
        line = line.rstrip().split()
        # contains: [smiles, identifier]; skip blank or incomplete lines
        # instead of failing with an IndexError
        if len(line) < 2: continue
        m = Chem.MolFromSmiles(line[0])
        if m is None: continue
        # number of heavy atoms
        num_ha = m.GetNumHeavyAtoms()
        if num_ha < 15 or num_ha > 50: continue
        # molecular weight
        mw = AllChem.CalcExactMolWt(m)
        if mw < 200 or mw > 700: continue
        # number of rotatable bonds
        num_rb = AllChem.CalcNumRotatableBonds(m)
        if num_rb > 8: continue
        # number of H-bond donors and acceptors
        num_hba = AllChem.CalcNumHBA(m)
        num_hbd = AllChem.CalcNumHBD(m)
        if num_hba > 10 or num_hbd > 5: continue
        # keep the molecule
        outfile.write("%s\t%s\n" % (line[0], line[1]))
        count += 1
# Single pre-formatted argument: prints the same text whether ``print`` is
# the Python 2 statement or the Python 3 function.
print("number of molecules that passed the filters: %d" % count)
Пример #13
0
def run(
    mol2=None,
    smiles=None,
    standardise=STANDARDISE_DEF,
    num_conf=NUM_CONF_DEF,
    first=FIRST_DEF,
    pool_multiplier=POOL_MULTIPLIER_DEF,
    rmsd_cutoff=RMSD_CUTOFF_DEF,
    max_energy_diff=MAX_ENERGY_DIFF_DEF,
    forcefield=FORCEFIELD_DEF,
    seed=SEED_DEF,
    params=None,
    prioritize=False,
    out_dir=OUTDIR_DEF,
    compress=COMPRESS_DEF,
    overwrite=False,
    values_file=None,
    log=None,
    num_proc=None,
    parallel_mode=None,
    verbose=False,
):
    """Run conformer generation.

    Sets up logging, optionally overrides the generation settings from a
    params file, validates the arguments, builds the stream of input
    molecules on the master process and hands it to
    ``Parallelizer.run_gen`` together with ``generate_conformers``. When
    ``values_file`` is given, the master writes every result that is not
    ``False`` into an ``HDF5Buffer``.

    Exactly one of ``mol2`` and ``smiles`` must be given; both are unpacked
    into their generator and measured with ``len``.
    (Presumably sequences of file paths -- confirm against the callers.)

    When ``params`` is not ``None`` it is read with ``read_params`` and the
    values found there replace ``standardise``, ``num_conf``, ``first``,
    ``pool_multiplier``, ``rmsd_cutoff``, ``max_energy_diff``, ``forcefield``
    and ``seed``.

    Raises ``ValueError`` if ``forcefield`` is not in ``FORCEFIELD_CHOICES``.
    Calls ``sys.exit()`` if neither or both of ``mol2``/``smiles`` are given,
    or if ``num_proc`` is truthy and below 1.
    """
    setup_logging(log, verbose=verbose)

    # Values from the params file take precedence over the keyword arguments
    if params is not None:
        params = read_params(params)
        standardise = get_value(params, "preprocessing", "standardise", bool)
        num_conf = get_value(params, "conformer_generation", "num_conf", int)
        first = get_value(params, "conformer_generation", "first", int)
        pool_multiplier = get_value(params, "conformer_generation",
                                    "pool_multiplier", int)
        rmsd_cutoff = get_value(params, "conformer_generation", "rmsd_cutoff",
                                float)
        max_energy_diff = get_value(params, "conformer_generation",
                                    "max_energy_diff", float)
        forcefield = get_value(params, "conformer_generation", "forcefield")
        seed = get_value(params, "conformer_generation", "seed", int)

    # check args
    if forcefield not in FORCEFIELD_CHOICES:
        raise ValueError(
            "Specified forcefield {} is not in valid options {!r}".format(
                forcefield, FORCEFIELD_CHOICES))

    para = Parallelizer(num_proc=num_proc, parallel_mode=parallel_mode)

    # Check to make sure args make sense
    # NOTE(review): ``parser`` is not defined in this function; it is
    # presumably a module-level argument parser -- confirm.
    # Only the master reports the problem, but every process exits.
    if mol2 is None and smiles is None:
        if para.is_master():
            parser.print_usage()
            logging.error("Please provide mol2 file or a SMILES file.")
        sys.exit()

    if mol2 is not None and smiles is not None:
        if para.is_master():
            parser.print_usage()
            logging.error("Please provide only a mol2 file OR a SMILES file.")
        sys.exit()

    # ``num_proc=0`` is falsy, so only negative values reach this branch
    if num_proc and num_proc < 1:
        if para.is_master():
            parser.print_usage()
            logging.error(
                "Please provide more than one processor with `--num_proc`.")
        sys.exit()

    # Set up input type
    if mol2 is not None:
        in_type = "mol2"
    elif smiles is not None:
        in_type = "smiles"

    # Only the master builds the molecule stream and logs the settings
    if para.is_master():
        if in_type == "mol2":
            logging.info("Input type: mol2 file(s)")
            logging.info("Input file number: {:d}".format(len(mol2)))
            mol_iter = (mol_from_mol2(_mol2_file,
                                      _name,
                                      standardise=standardise)
                        for _mol2_file, _name in mol2_generator(*mol2))
        else:
            logging.info("Input type: Detected SMILES file(s)")
            logging.info("Input file number: {:d}".format(len(smiles)))
            mol_iter = (mol_from_smiles(_smiles,
                                        _name,
                                        standardise=standardise)
                        for _smiles, _name in smiles_generator(*smiles))

        if prioritize:
            logging.info(("Prioritizing mols with low rotatable bond number"
                          " and molecular weight first."))
            # Prioritizing reads every molecule into memory; the tuples are
            # sorted by rotatable bond count, then exact molecular weight.
            # NOTE(review): if both values tie, the sort falls through to
            # comparing the mol objects themselves -- confirm that is safe.
            mols_with_properties = [(
                AllChem.CalcNumRotatableBonds(mol),
                AllChem.CalcExactMolWt(mol),
                mol,
            ) for mol in mol_iter if mol is not None]
            data_iterator = make_data_iterator(
                (x[-1] for x in sorted(mols_with_properties)))
        else:
            # Molecules that could not be created (None) are skipped
            data_iterator = make_data_iterator(
                (x for x in mol_iter if x is not None))

        # Set up parallel-specific options
        logging.info("Parallel Type: {}".format(para.parallel_mode))

        # Set other options
        touch_dir(out_dir)

        # A falsy num_conf (None or 0) becomes -1, which is logged below as
        # "auto". This replacement only happens on the master.
        if not num_conf:
            num_conf = -1

        logging.info("Out Directory: {}".format(out_dir))
        logging.info("Overwrite Existing Files: {}".format(overwrite))
        if values_file is not None:
            # Append to an existing values file unless overwriting is
            # explicitly requested
            if os.path.exists(values_file) and overwrite is not True:
                value_args = (values_file, "a")
                logging.info("Values file: {} (append)".format((values_file)))
            else:
                value_args = (values_file, "w")
                logging.info("Values file: {} (new file)".format(
                    (values_file)))
        if num_conf is None or num_conf == -1:
            logging.info("Target Conformer Number: auto")
        else:
            logging.info("Target Conformer Number: {:d}".format(num_conf))
        if first is None or first == -1:
            logging.info("First Conformers Number: all")
        else:
            logging.info("First Conformers Number: {:d}".format(first))
        logging.info("Pool Multiplier: {:d}".format(pool_multiplier))
        logging.info("RMSD Cutoff: {:.4g}".format(rmsd_cutoff))
        if max_energy_diff is None:
            logging.info("Maximum Energy Difference: None")
        else:
            logging.info("Maximum Energy Difference: {:.4g} kcal".format(
                max_energy_diff))
        logging.info("Forcefield: {}".format(forcefield.upper()))
        if seed != -1:
            logging.info("Seed: {:d}".format(seed))

        logging.info("Starting.")
    else:
        # Non-master processes contribute no input of their own
        data_iterator = iter([])

    # Keyword arguments passed on for every generate_conformers call
    gen_conf_kwargs = {
        "out_dir": out_dir,
        "num_conf": num_conf,
        "rmsd_cutoff": rmsd_cutoff,
        "max_energy_diff": max_energy_diff,
        "forcefield": forcefield,
        "pool_multiplier": pool_multiplier,
        "first": first,
        "seed": seed,
        "save": True,
        "overwrite": overwrite,
        "compress": compress,
    }

    run_kwargs = {"kwargs": gen_conf_kwargs}

    results_iterator = para.run_gen(generate_conformers, data_iterator,
                                    **run_kwargs)

    # ``value_args`` was set in the master branch above under the same
    # ``values_file is not None`` condition
    if para.is_master() and values_file is not None:
        hdf5_buffer = HDF5Buffer(*value_args)

    # The results are consumed on every process; only the master stores them
    for result, data in results_iterator:
        if (para.is_master() and values_file is not None
                and result is not False):
            values_to_hdf5(hdf5_buffer, result)

    if para.is_master() and values_file is not None:
        hdf5_buffer.flush()
        hdf5_buffer.close()
 def __call__(self, mol):
     """Return the exact molecular weight of ``mol`` as a one-element tuple.

     The RDKit molecule is rebuilt from ``mol.binary`` before the weight is
     calculated.
     """
     rd_mol = Chem.Mol(mol.binary)
     exact_weight = AllChem.CalcExactMolWt(rd_mol)
     return (exact_weight, )
Пример #15
0
            # Skip molecules whose canonical SMILES is longer than 250
            # characters
            if len(can) > 250:  #way too big
                sys.stderr.write('%s is too large. Omitted.\n' % name)
                continue
            cursor = conn.cursor()
            # Look up the stored SDF location for this SMILES
            cursor.execute('SELECT sdfloc FROM structures WHERE smile = %s',
                           (can, ))
            #if smile is not in structures
            row = cursor.fetchone()
            # "new" means: no row, no stored location, or the stored file is
            # missing on disk
            isnew = (row == None) or (row[0]
                                      == None) or (not os.path.exists(row[0]))
            sdfloc = None
            if row == None:
                #insert without sdfs to get unique id
                cursor.execute(
                    'INSERT INTO structures (smile,weight) VALUES(%s,%s) ',
                    (can, Chem.CalcExactMolWt(mol)))
            elif row[0] and not os.path.exists(row[0]) and 'conformer' in row[
                    0]:  #hacky workaround for mistake I made w/prefixes
                sdfloc = row[0]  #previously generated, but lost!

            #get unique id
            # Re-select so that a freshly inserted row yields its id as well
            cursor.execute('SELECT id FROM structures WHERE smile = %s',
                           (can, ))
            result = cursor.fetchone()
            uniqueid = result[0]

            #we always update the name unless otherwise specified
            if not missingname and not options.nonames:
                # Check whether this (smile, name) pair is already recorded
                cursor.execute(
                    'SELECT * FROM names WHERE smile = %s and name = %s',
                    (can, name))
Пример #16
0
 # Parse and sanitize the input SMILES
 mol = Chem.MolFromSmiles(smile)
 Chem.SanitizeMol(mol)
 #to be sure, canonicalize smile (with iso)
 can = Chem.MolToSmiles(mol,isomericSmiles=True)
 # Skip molecules whose canonical SMILES is longer than 250 characters
 if len(can) > 250: #way too big
     sys.stderr.write('%s is too large. Omitted.\n' % name)
     continue
 cursor = conn.cursor()
 # Look up the stored SDF location for this SMILES
 cursor.execute('SELECT sdfloc FROM structures WHERE smile = %s', (can,))
 #if smile is not in structures
 row = cursor.fetchone()
 # "new" means: no row, no stored location, or the stored file is missing
 # on disk
 isnew = (row == None) or (row[0] == None) or (not os.path.exists(row[0]))
 sdfloc = None
 if row == None:
     #insert without sdfs to get unique id 
     cursor.execute('INSERT INTO structures (smile,weight) VALUES(%s,%s) ', (can, Chem.CalcExactMolWt(mol)))
 elif row[0] and not os.path.exists(row[0]) and 'conformer' in row[0]: #hacky workaround for mistake I made w/prefixes
     sdfloc = row[0] #previously generated, but lost!
     
 #get unique id
 # Re-select so that a freshly inserted row yields its id as well
 cursor.execute('SELECT id FROM structures WHERE smile = %s', (can,))
 result = cursor.fetchone();
 uniqueid = result[0]
 
 #we always update the name unless otherwise specified
 if not missingname and not options.nonames:
     # Only insert the (smile, name) pair if it is not recorded yet
     cursor.execute('SELECT * FROM names WHERE smile = %s and name = %s', (can,name))
     row = cursor.fetchone()
     if row == None: 
          cursor.execute('INSERT IGNORE INTO names (smile,name) VALUES(%s,%s)', (can,name))
 # Persist the inserts made above
 conn.commit()
Пример #17
0
def create_sdf_ligs(conn, libraryid, cprefixes):
    """Break ``input.sdf.gz`` up into one gzipped SDF file per molecule.

    Consecutive records with the same canonical isomeric SMILES are treated
    as conformers of one molecule and written to the same file. Every
    molecule gets its id from the ``structures`` table of the ``conformers``
    database (a row is inserted when the SMILES is unknown) and is listed in
    a ``ligs.in`` file in the current directory as ``<file> <id> <name>``.
    The output files are spread round-robin over ``cprefixes``.

    Records that RDKit cannot process are reported with a traceback and
    skipped. All opened files and the database connection are closed before
    returning.

    NOTE(review): no ``commit()`` is issued on the ``conformers`` connection
    after the inserts; confirm whether the table or connection settings make
    that unnecessary.

    :param conn: Not used by this function
    :param libraryid: Library id, used as the sub-directory name
    :param cprefixes: Directory prefixes to distribute the files over
    :return: True if successful, False if an error aborted the run
    :rtype: bool
    """
    whichprefix = -1
    lastsmi = ''
    fname = False
    confconn = MySQLdb.connect(host="localhost",
                               user="******",
                               db="conformers")
    ligout = None
    infile = None
    writer = None
    out = None

    try:
        ligout = open("ligs.in", 'wt')
        infile = gzip.open('input.sdf.gz')
        mols = Chem.ForwardSDMolSupplier(infile)
        molcnt = 0
        for mol in mols:
            try:
                if mol is None:
                    continue
                Chem.SanitizeMol(mol)
                can = Chem.MolToSmiles(mol, isomericSmiles=True)

                if can != lastsmi:  # a different molecule
                    if len(can) > 250:  # way too big
                        continue
                    molcnt += 1
                    # get/assign a unique id
                    cursor = confconn.cursor()
                    cursor.execute(
                        'SELECT id FROM structures WHERE smile = %s', (can, ))
                    row = cursor.fetchone()
                    if row is None:
                        # insert without sdfs to get unique id
                        cursor.execute(
                            'INSERT INTO structures (smile,weight) VALUES(%s,%s) ',
                            (can, Chem.CalcExactMolWt(mol)))
                        cursor.execute(
                            'SELECT id FROM structures WHERE smile = %s',
                            (can, ))
                        row = cursor.fetchone()

                    uniqueid = row[0]
                    # we do not store the user supplied conformers, but
                    # leave sdfloc blank
                    if fname:
                        # we have a file we have previously opened
                        writer.close()
                        out.close()
                    whichprefix = (whichprefix + 1) % len(cprefixes)
                    subdir = "%s/user/%s/" % (cprefixes[whichprefix],
                                              libraryid)
                    if not os.path.isdir(subdir):
                        os.makedirs(subdir)
                    fname = "%s/%d.sdf.gz" % (subdir, uniqueid)
                    out = gzip.open(fname, 'wt')
                    writer = Chem.SDWriter(out)

                    if mol.HasProp('_Name'):
                        name = mol.GetProp('_Name')
                    else:
                        name = str(molcnt)

                    ligout.write('%s %d %s\n' % (fname, uniqueid, name))
                    # Remember the molecule only once its file is set up, so
                    # that following records with the same SMILES are
                    # appended as conformers instead of starting a new file.
                    lastsmi = can

                # have file setup for this molecule, may be conformer
                writer.write(mol)

            except Exception:  # catch rdkit issues
                traceback.print_exc()
                continue

        if fname:
            writer.close()
            out.close()
            writer = out = None
        return True
    except Exception:
        traceback.print_exc()
        return False
    finally:
        # Release whatever is still open; a failure while closing must not
        # replace the result that is being returned.
        for handle in (writer, out, ligout, infile, confconn):
            if handle is None:
                continue
            try:
                handle.close()
            except Exception:
                traceback.print_exc()
Пример #18
0
    def insert_compound(self,
                        mol_object,
                        compound_dict=None,
                        bulk=None,
                        kegg_db="KEGG",
                        pubchem_db='PubChem-8-28-2015',
                        modelseed_db='ModelSEED'):
        """Save an RDKit molecule as a compound entry in the MINE.

        Calculates necessary fields for API and includes additional
        information passed in the compound dict. Overwrites preexisting
        compounds in MINE on _id collision.

        The calculated fields are written into the passed ``compound_dict``
        itself, and the compound's mass is stored in ``self._mass_cache``
        under its ``_id``.

        :param mol_object: The compound to be stored
        :type mol_object: RDKit Mol object
        :param compound_dict: Additional information about the compound to be
            stored. Overwritten by calculated values.
        :type compound_dict: dict
        :param bulk: A pymongo bulk operation object. If None, the compound
         is immediately saved in the database
        :param kegg_db: The ID of the KEGG Mongo database
        :type kegg_db: str
        :param pubchem_db: The ID of the PubChem Mongo database
        :type pubchem_db: str
        :param modelseed_db: The ID of the ModelSEED Mongo database
        :type modelseed_db: str
        :return: The hashed _id of the compound
        :rtype: str
        """

        if compound_dict is None:
            compound_dict = {}

        # Store all different representations of the molecule (SMILES, Formula,
        #  InChI key, etc.) as well as its properties in a dictionary
        compound_dict['SMILES'] = AllChem.MolToSmiles(mol_object, True)
        compound_dict['Inchi'] = AllChem.MolToInchi(mol_object)
        compound_dict['Inchikey'] = AllChem.InchiToInchiKey(
            compound_dict['Inchi'])
        compound_dict['Mass'] = AllChem.CalcExactMolWt(mol_object)
        compound_dict['Formula'] = AllChem.CalcMolFormula(mol_object)
        compound_dict['Charge'] = AllChem.GetFormalCharge(mol_object)
        # Get indices where bits are 1
        compound_dict['MACCS'] = list(
            AllChem.GetMACCSKeysFingerprint(mol_object).GetOnBits())
        compound_dict['len_MACCS'] = len(compound_dict['MACCS'])
        # Get indices where bits are 1
        compound_dict['RDKit'] = list(
            AllChem.RDKFingerprint(mol_object).GetOnBits())
        compound_dict['len_RDKit'] = len(compound_dict['RDKit'])
        compound_dict['logP'] = AllChem.CalcCrippenDescriptors(mol_object)[0]
        # The _id is hashed from the SMILES; the second argument tells the
        # hash function whether the compound is a coreactant
        compound_dict['_id'] = utils.compound_hash(
            compound_dict['SMILES'],
            ('Type' in compound_dict
             and compound_dict['Type'] == 'Coreactant'))
        # '_atom_count' is removed so that it is not stored in the database
        if '_atom_count' in compound_dict:
            del compound_dict['_atom_count']
        # Caching this for rapid reaction mass change calculation
        self._mass_cache[compound_dict['_id']] = compound_dict['Mass']

        # If the compound is a reactant, then make sure the reactant name is
        # in a correct format: a non-empty string is parsed into the Python
        # literal it represents.
        if "Reactant_in" in compound_dict and isinstance(
                compound_dict['Reactant_in'], str) \
                and compound_dict['Reactant_in']:
            compound_dict['Reactant_in'] = ast.literal_eval(
                compound_dict['Reactant_in'])
        # If the compound is a product, then make sure the product name is
        # in a correct format (same string-to-literal conversion as above).
        if "Product_of" in compound_dict \
                and isinstance(compound_dict['Product_of'], str) \
                and compound_dict['Product_of']:
            compound_dict['Product_of'] = ast.literal_eval(
                compound_dict['Product_of'])

        # Store links to external databases where compound is present.
        # Skipped entirely when no InChI key could be generated; each
        # database can be disabled by passing a falsy ID.
        if compound_dict['Inchikey']:
            if kegg_db:
                compound_dict = self.link_to_external_database(
                    kegg_db,
                    compound=compound_dict,
                    fields_to_copy=[('Pathways', 'Pathways'),
                                    ('Names', 'Names'),
                                    ('DB_links', 'DB_links'),
                                    ('Enzymes', 'Enzymes')])

            if pubchem_db:
                compound_dict = self.link_to_external_database(
                    pubchem_db,
                    compound=compound_dict,
                    fields_to_copy=[('COMPOUND_CID', 'DB_links.PubChem')])

            if modelseed_db:
                compound_dict = self.link_to_external_database(
                    modelseed_db,
                    compound=compound_dict,
                    fields_to_copy=[('DB_links', 'DB_links')])

        # Calculate natural product likeness score and store in dict.
        # The model is loaded on first use and kept on the instance.
        # NOTE(review): ``np`` provides readNPModel/scoreMol here, so it is
        # presumably the NP-likeness scoring module rather than numpy --
        # confirm the import alias.
        if not self.np_model:
            self.np_model = np.readNPModel()
        compound_dict["NP_likeness"] = np.scoreMol(mol_object, self.np_model)

        compound_dict = utils.convert_sets_to_lists(compound_dict)
        # Assign an id to the compound
        if self.id_db:
            mine_comp = self.id_db.compounds.find_one(
                {"Inchikey": compound_dict['Inchikey']}, {
                    'MINE_id': 1,
                    "Pos_CFM_spectra": 1,
                    "Neg_CFM_spectra": 1
                })
            # If compound already exists in MINE, store its MINE id in the dict
            # and carry over any spectra stored with it
            if mine_comp:
                compound_dict['MINE_id'] = mine_comp['MINE_id']
                if 'Pos_CFM_spectra' in mine_comp:
                    compound_dict['Pos_CFM_spectra'] = mine_comp[
                        'Pos_CFM_spectra']
                if 'Neg_CFM_spectra' in mine_comp:
                    compound_dict['Neg_CFM_spectra'] = mine_comp[
                        'Neg_CFM_spectra']
            # If compound does not exist, create new id based on number of
            # current ids in the MINE, and register the compound in the id
            # database
            else:
                compound_dict['MINE_id'] = self.id_db.compounds.count()
                self.id_db.compounds.save(compound_dict)

        # If bulk insertion, upsert (insert and update) the database
        if bulk:
            bulk.find({'_id': compound_dict['_id']}).upsert().\
                replace_one(compound_dict)
        else:
            self.compounds.save(compound_dict)
        return compound_dict['_id']