Пример #1
0
def standarize_mol_by_inchi(mol, neutralize=True):
    """Standardize a molecule by round-tripping it through an InChI string.

    Hydrogens are added to ``mol``, an InChI is generated from the result
    (``FixedH=False``, ``RecMet=False``), optionally passed through
    ``neutralize_inchi``, and a molecule is rebuilt from that InChI.

    :param mol: input molecule
    :param neutralize: when true, the InChI is run through
        ``neutralize_inchi`` before the molecule is rebuilt
    :return: the rebuilt molecule after ``AddHs(..., explicitOnly=True)``
    """
    with_hs = AddHs(mol)
    # The status code and message returned alongside the InChI are not used.
    sinchi, _code, _msg = generate_inchi(with_hs, FixedH=False, RecMet=False)
    final_inchi = neutralize_inchi(sinchi) if neutralize else sinchi
    rebuilt = MolFromInchi(final_inchi, removeHs=False)
    return AddHs(rebuilt, explicitOnly=True)
Пример #2
0
def xyz_to_rdmol(nxyz, smiles):
    """Build a molecule from SMILES and attach one conformer taken from ``nxyz``.

    :param nxyz: sequence with one row per atom; everything after the first
        entry of a row is used as that atom's position
        (presumably ``[Z, x, y, z]`` rows -- confirm against callers)
    :param smiles: SMILES string handed to ``get_mol``
    :return: the hydrogen-added molecule with the new conformer attached
    """
    mol = AddHs(get_mol(smiles))

    conformer = Conformer(len(nxyz))
    for atom_idx, row in enumerate(nxyz):
        # Skip the leading entry of the row; the remainder is the position.
        conformer.SetAtomPosition(atom_idx, row[1:])

    mol.AddConformer(conformer)
    return mol
Пример #3
0
def generate_conformers(mol, add_hydrogens=True,
                             rmsd_threshold=2.0,    # Arbitrarily selected
                             num_conformers=None,   # None means best guess
                             parallelism=None,
                             forcefield=DEFAULT_FORCEFIELD,
                             log=logging):
    """Embed, optimize and RMSD-filter conformers of ``mol``.

    :param mol: molecule to generate conformers for
    :param add_hydrogens: when true, ``AddHs`` is applied before embedding
    :param rmsd_threshold: used both as the embedding prune threshold and as
        the post-optimization similarity cut-off
    :param num_conformers: number of conformers to request; when ``None`` the
        value comes from ``get_num_confs_for_mol``
    :param parallelism: forwarded to ``optimize_conformers``
    :param forcefield: forwarded to ``optimize_conformers``
    :param log: object providing ``info`` and ``debug`` (the ``logging``
        module by default)
    :return: ``(mol, selected)`` where ``selected`` is a list of
        ``(conformer_id, energy)`` pairs in ascending energy order
    """
    if add_hydrogens:
        log.info("Adding implicit hydrogens")
        mol = AddHs(mol)

    if num_conformers is None:
        num_conformers = get_num_confs_for_mol(mol)

    log.info("Attempting to generate {0} conformations with min RMSD of {1:.4f}".format(num_conformers, rmsd_threshold))
    orig_conf_ids = EmbedMultipleConfs(mol, numConfs=num_conformers,
                                            pruneRmsThresh=rmsd_threshold,
                                            ignoreSmoothingFailures=True)  # Prevents crashes in some situations
    log.info("Generated {0} initial conformations".format(len(orig_conf_ids)))

    log.info("Optimizing and calculating energies using {0}".format(forcefield))
    conf_energy = optimize_conformers(mol, interfragment=True,
                                           parallelism=parallelism,
                                           forcefield=forcefield)
    # dict.iteritems() exists only on Python 2; items() works on both 2 and 3.
    sorted_by_energy = sorted(conf_energy.items(), key=operator.itemgetter(1))

    log.info("Filtering similar conformers")
    selected = []
    min_rmsd, max_rmsd = float('inf'), float('-inf')
    for idx, id_energy in enumerate(sorted_by_energy):
        conf_id, energy = id_energy
        keep = True
        # Compare against every conformer that sorts after this one; the
        # current conformer is the one dropped when a close match is found.
        for comp_id, other_energy in sorted_by_energy[idx+1:]:
            rmsd = AlignMol(mol, mol, prbCid=comp_id, refCid=conf_id)
            if rmsd <= rmsd_threshold:
                mol.RemoveConformer(conf_id)
                keep = False
                break
            else:
                if rmsd < min_rmsd:
                    min_rmsd = rmsd
                if rmsd > max_rmsd:
                    max_rmsd = rmsd
        if keep:
            selected.append(id_energy)

    log.debug("Removed {0} after post-optimization RMSD filtering".format(len(orig_conf_ids) - len(selected)))
    log.info("RMSD: min={0:.4f} max={1:.4f}".format(min_rmsd, max_rmsd))
    return mol, selected
Пример #4
0
def embed_etkdg(mol, seed):
    """Return a hydrogen-added copy of ``mol`` embedded with ETKDG.

    :param mol: molecule to embed
    :param seed: value assigned to the ETKDG parameters' ``randomSeed``
    :return: the molecule produced by ``AddHs`` after ``EmbedMolecule`` ran
        on it (the embedding status code is not checked)
    """
    from rdkit.Chem import AllChem, AddHs

    etkdg_params = AllChem.ETKDG()
    etkdg_params.randomSeed = seed

    with_hs = AddHs(mol)
    AllChem.EmbedMolecule(with_hs, etkdg_params)
    return with_hs
Пример #5
0
def rdkit_3d_descirptors(mols, regex="(NPR1)|(NPR2)|(PMI1)|(PMI2)|(PMI3)|(SpherocityIndex)|(InertialShapeFactor)|"
                                     "(Eccentricity)|(Asphericity)"):
    """ Embed molecules in 3D and evaluate matching ``Descriptors3D`` functions.

    :param mols: iterable of RDKit molecules
    :param regex: {str} regular expression matched (from the start of the
        name) against the attribute names of ``Descriptors3D``
    :return: {pandas DataFrame} one column per matched descriptor, one row per
        molecule
    """
    # Work on hydrogen-added copies and give each one a 3D embedding.
    embedded = [AddHs(m) for m in mols]
    for m in embedded:
        AllChem.EmbedMolecule(m, AllChem.ETKDG())

    # Map descriptor name -> list of values, one per molecule.
    pattern = re.compile(regex)
    rslt = {}
    for name in Descriptors3D.__dict__:
        if not pattern.match(name):
            continue
        print("\t%s..." % name)
        func = getattr(Descriptors3D, name)
        pbar = ProgressBar()
        rslt[name] = [func(mol) for mol in pbar(embedded)]
    return pd.DataFrame(rslt)
Пример #6
0
    def calculate(self):
        """
        Build an RDKit molecule from the SMILES and record the outcome.

        The SMILES text is the configured ``long_prefix`` followed by the
        elements of ``self.smiles``. When it parses, the molecule gets
        hydrogens, an embedding, a UFF optimization and its InChI is stored.
        Any exception is printed and treated as an invalid SMILES. The
        good/bad counter in ``p.tree_info`` is then bumped under
        ``p.lock_update_data`` and used as the SMILES id.

        :return: RDKit Mol object, or None when the SMILES is invalid
        """
        try:
            smiles_text = ("".join(p.config['long_prefix']) +
                           "".join(self.smiles.element))
            m = MolFromSmiles(smiles_text)
            self.smiles.properties[p.s_valid] = False
            if m is not None:
                m = AddHs(m)
                AllChem.EmbedMolecule(m)
                AllChem.UFFOptimizeMolecule(m)
                self.smiles.properties["InChI"] = MolToInchi(m)
        except Exception as e:
            print("Error rdkit : " + repr(e))
            m = None

        is_valid = m is not None
        if is_valid:
            self.smiles.properties[p.s_valid] = True

        # One counter per outcome; its new value doubles as the SMILES id.
        counter_key = p.info_good if is_valid else p.info_bad
        with p.lock_update_data:
            p.tree_info[counter_key] += 1
            self.smiles.properties[p.s_id] = p.tree_info[counter_key]
        return m
Пример #7
0
    def add_hs(self,
               inplace=False,
               add_coords=True,
               explicit_only=False,
               only_on_atoms=False):
        """ Return a copy of self with hydrogens added.

        Args:
            inplace (bool):
                Requesting in-place addition is not supported and raises.
            add_coords (bool):
                Forwarded to ``AddHs`` as ``addCoords``.
            explicit_only (bool):
                Forwarded to ``AddHs`` as ``explicitOnly``.
            only_on_atoms (iterable):
                Forwarded to ``AddHs`` as ``onlyOnAtoms``.
        Returns:
            skchem.Mol:
                New instance of this class built from the ``AddHs`` result.
        Raises:
            NotImplementedError:
                If ``inplace`` is true.
        """
        if inplace:
            raise NotImplementedError(
                'Inplace addition of Hs is not yet supported.')

        options = dict(addCoords=add_coords,
                       onlyOnAtoms=only_on_atoms,
                       explicitOnly=explicit_only)
        with_hs = AddHs(self, **options)
        return self.__class__.from_super(with_hs)
Пример #8
0
 def add_hydrogen(cls, mol_in, addCoords=True):
     """Return ``mol_in`` with hydrogens added via ``AddHs``.

     ``AddHs`` is called with ``explicitOnly=False``.

     :param    mol_in: RDKit Mol
     :param addCoords: forwarded to ``AddHs`` as ``addCoords``, bool
     :return  mol_out: RDKit Mol returned by ``AddHs``
     """
     return AddHs(mol_in, explicitOnly=False, addCoords=addCoords)
Пример #9
0
    def generate(self,
                 max_generated_conformers=50,
                 prune_thresh=0.01,
                 maxattempts_per_conformer=5,
                 output=None,
                 threads=1):
        '''
        Generates conformers

        Adds hydrogens to ``self.mol`` and embeds up to
        ``max_generated_conformers`` conformers. If none are produced, the
        embedding is retried with random starting coordinates and ten times
        as many attempts.

        Note  the number max_generated _conformers required is related to the
        number of rotatable bonds

        :param max_generated_conformers: number of conformers requested
        :param prune_thresh: RMS pruning threshold passed to the embedder
        :param maxattempts_per_conformer: ``maxAttempts`` for the first try
        :param output: optional object with a ``write`` method for progress
            messages; when None, messages are dropped
        :param threads: ``numThreads`` passed to the embedder
        :return: the conformer ids returned by ``EmbedMultipleConfs``
        '''
        def report(message):
            # ``output`` defaults to None, so writing unconditionally crashed.
            if output is not None:
                output.write(message)

        self.mol = AddHs(self.mol, addCoords=True)
        self.initial_confs = EmbedMultipleConfs(
            self.mol,
            numConfs=max_generated_conformers,
            pruneRmsThresh=prune_thresh,
            maxAttempts=maxattempts_per_conformer,
            useRandomCoords=False,
            # Despite what the documentation says -1 is a seed!!
            # It doesn't mean random generation
            numThreads=threads,
            randomSeed=random.randint(1, 10000000))
        if len(self.initial_confs) == 0:
            retry_attempts = 10 * maxattempts_per_conformer
            report((f"Generated  {len(self.initial_confs)} "
                    "initial confs\n"))
            # Report the attempt count that is actually passed below.
            report((f"Trying again with {retry_attempts} "
                    "attempts and random coords\n"))

            self.initial_confs = EmbedMultipleConfs(
                self.mol,
                numConfs=max_generated_conformers,
                pruneRmsThresh=prune_thresh,
                useRandomCoords=True,
                maxAttempts=retry_attempts,
                # Despite what the documentation says -1 is a seed!!
                # It doesn't mean random generation
                numThreads=threads,
                randomSeed=random.randint(1, 10000000))

        report("Generated " + str(len(self.initial_confs)) +
               " initial confs\n")
        return self.initial_confs
Пример #10
0
def docksmile(smile, filename):

    '''
    Converts a smile string to pdbqt and runs autodock vina,
    returns the binding energy of its top pose

    Vina configuration details in config.txt

    :param smile: SMILES string of the ligand
    :param filename: base name (no extension) of the files written to /tmp
    :return: tuple ``(smile, energy)``; energy is ``np.nan`` when embedding,
        conversion or docking did not produce a result
    :raises TypeError: if ``smile`` is not a ``str``
    :raises ValueError: if ``smile`` cannot be parsed
    '''
    if not isinstance(smile, str):
        raise TypeError('Input is not a class of string')

    m = MolFromSmiles(smile)
    # assert valid smiles
    if m is None:
        # A single formatted message; passing two arguments made the
        # exception text render as a tuple.
        raise ValueError('%s is not a valid smile string' % smile)
    mh = AddHs(m)
    embed = AllChem.EmbedMolecule(mh, useRandomCoords=False)

    # check if rdkit successfully generates structure
    if embed != 0:
        print('RDkit fails to embed molecule', smile, '; file:%s.pdb'%filename)
        return smile, np.nan

    pdb_path = '/tmp/' + filename + '.pdb'
    pdbqt_path = '/tmp/' + filename + '.pdbqt'

    # generate pdb file; the context manager guarantees the handle is closed
    # (and the data flushed) before the converter reads it
    pdb = MolToPDBBlock(mh, flavor=4)
    with open(pdb_path, 'w') as pdb_file:
        pdb_file.write(pdb)

    # convert pdb to pdbqt
    # NOTE(review): subprocess.run is called without check=True, so
    # CalledProcessError is not raised on a non-zero exit status; failure is
    # detected by the missing output file below.
    try:
        subprocess.run([py_path, lig_path, '-l', pdb_path, '-o', pdbqt_path])
    except subprocess.CalledProcessError as e:
        print(e.output)
    if not os.path.exists(pdbqt_path):
        print("%s does't exist" % (filename+'.pdbqt'))
        return smile, np.nan

    try:
        result = subprocess.run(['sh', './run_spike_open_docking.sh', filename], stdout=subprocess.PIPE)
        result = result.stdout.decode('utf-8')
    except subprocess.CalledProcessError as er:
        print(er.output)
        print(smile, '; file:%s.pdbqt'%filename)
        return smile, np.nan

    # read energy from output: the row whose first four characters are
    # '   1' carries the energy in its third whitespace-separated field
    energy = np.nan
    for line in re.split('\n', result):
        if line[0:4] == '   1':
            energy = float(re.split(' +', line)[2])
    return smile, energy
Пример #11
0
 def __init__(self, smiles, forcefield="mmff"):
     '''
     Set up the generator for one molecule.

     :param smiles: SMILES string; kept as-is and parsed into ``self.mol``
     :param forcefield: force-field name stored for later use
     '''
     self.smiles = smiles
     self.forcefield = forcefield
     self.mol = MolFromSmiles(smiles)
     # Result holders, filled in by later processing steps.
     self.initial_confs = None
     self.conf_energies = []
     self.full_clusters = []
Пример #12
0
def processline(t, step, line):
    """Run one benchmark step on ``line`` and add its size to ``lensum``.

    :param t: object whose ``incr()`` returns a true value when processing
        should stop
    :param step: integer selecting the operation to perform (0 only measures
        the line length; other values also parse ``line`` as SMILES)
    :param line: input text, parsed with ``MolFromSmiles`` for non-zero steps
    :return: 1 when ``t.incr()`` asked to stop, otherwise 0
    :raises ValueError: for a step value that is not implemented
    """
    global lensum
    if t.incr():
        return 1
    if step == 0:
        lensum += len(line)
    else:
        # Parsed for every non-zero step, even those that do not use ``m``.
        m = MolFromSmiles(line)
        if step == 100:
            lensum += len(line)
        elif step == 105:
            lensum += len(sha256(line).hexdigest())
        elif step in (110, 120):
            # NOTE(review): printing text to a file opened in binary mode
            # looks Python 2 specific -- confirm the target interpreter.
            with open(tmpname, 'wb+') as f:
                print(line, file=f)
                if step == 120:
                    # Flush Python's buffer first; os.fsync only syncs what
                    # has already reached the file descriptor.
                    f.flush()
                    os.fsync(f.fileno())
            lensum += os.stat(tmpname).st_size
        elif step == 210:
            lensum += m.GetNumAtoms()
        elif step == 220:
            lensum += m.GetNumBonds()
        elif step == 300:
            lensum += len(MolToSmiles(m))
        elif step == 400:
            lensum += len(MolToMolBlock(m))
        elif step == 420:
            m2 = AddHs(m)
            EmbedMolecule(m2, randomSeed=2020)
            m2 = RemoveHs(m2)
            m2.SetProp("_Name", "test")
            lensum += len(MolToMolBlock(m2))
        elif step == 600:
            lensum += mol2file(m, 'svg')
        elif step == 610:
            lensum += mol2file(m, 'png')
        else:
            raise ValueError("Not implemented step " + str(step))

    return 0
Пример #13
0
def get_max_atom_bond_size(smiles_iterator, explicit_hs=True):
    """ Scan a set of SMILES and report the largest molecule sizes seen.

    Returns a dict with ``max_atoms`` (largest atom count) and ``max_bonds``
    (twice the largest bond count). When ``explicit_hs`` is true, hydrogens
    are added before counting. """

    largest_atoms = 0
    largest_bonds = 0
    for smiles in tqdm(smiles_iterator):
        mol = MolFromSmiles(smiles)
        if explicit_hs:
            mol = AddHs(mol)

        n_atoms = len(mol.GetAtoms())
        n_bonds = len(mol.GetBonds())
        if n_atoms > largest_atoms:
            largest_atoms = n_atoms
        if n_bonds > largest_bonds:
            largest_bonds = n_bonds

    return {'max_atoms': largest_atoms, 'max_bonds': largest_bonds * 2}
Пример #14
0
def rdmols_from_document(document, build_from="inchi", add_hs=True):
    """
    Rebuild rdmols from a document's stored depictions.

    :param document: dict holding 'list_stoechiometry' and, depending on
        ``build_from``, 'list_list_inchis' or 'list_list_smiles'
    :param build_from: the type of depiction to be used to build back the rdmols, str in ["inchi", "smiles"]
    :param add_hs: add Hs to RDKit mol object, default is True
    :returns: tuple ``(list_list_rdmols, list_stoechiometry)`` -- the nested
        list of rebuilt rdmols and the stoichiometry list taken from the
        document unchanged
    """
    assert build_from in ["inchi", "smiles"]
    assert add_hs in [True, False]

    list_stoechiometry = document['list_stoechiometry']

    from_inchi = build_from == 'inchi'
    if not from_inchi and build_from != 'smiles':
        # Only reachable when the assertions above are disabled.
        raise NotImplementedError()

    source_key = 'list_list_inchis' if from_inchi else 'list_list_smiles'
    list_list_rdmols = []
    for depictions in document[source_key]:
        list_rdmols = []
        for depiction in depictions:
            if from_inchi:
                rd_mol = MolFromInchi(depiction, sanitize=True)
            else:
                rd_mol = MolFromSmiles(depiction, sanitize=True)
            if add_hs:
                rd_mol = AddHs(rd_mol)
            list_rdmols.append(rd_mol)
        list_list_rdmols.append(list_rdmols)

    return list_list_rdmols, list_stoechiometry
Пример #15
0
    def _predict_rt(self, smiles: str) -> Optional[float]:
        """Predict Retention Time from SMILES string using provided predictor.

        Parameters
        ----------
        smiles : str
            SMILES string of input compound.

        Returns
        -------
        predicted_rt : Optional[float]
            Predicted retention time, or None when any value in the first row
            of the fingerprint is not a finite float.
        """
        mol = AddHs(MolFromSmiles(smiles))
        fp = self.fp_calculator(mol)

        # Reduce the fingerprint mapping to a single-row array of the
        # configured features.
        if self.rt_important_features:
            selected = [fp[feature] for feature in self.rt_important_features]
            fp = np.array(selected).reshape(1, -1)

        def is_finite_float(val: float) -> bool:
            """Return True for a float that is neither NaN nor infinite."""
            return isinstance(val, float) and not (
                np.isnan(val) or np.isinf(val))

        if not all([is_finite_float(val) for val in fp[0]]):
            return None

        return self.rt_predictor.predict(fp)[0]
Пример #16
0
def smiles_reaction_matrix(smarts, *sources, **kwargs):
    """Yield filtered reaction products as delimited text lines.

    The reaction built from ``smarts`` is applied to the SMILES lists loaded
    from ``sources``. Each product whose molecular weight is at most
    ``molValue`` and whose logP is below ``logValue`` is yielded as
    ``smiles<sep>reactant_names<sep>mwt<sep>logp`` followed by a newline,
    where the reactant names are the reactants' ``_Name`` properties joined
    with '.'.

    :param smarts: reaction SMARTS
    :param sources: inputs passed one by one to ``load_smiles_file``
    :param kwargs: optional ``sep`` (default ' '), ``molValue`` (default 400)
        and ``logValue`` (default 4.0)
    """
    sep = kwargs.get('sep', ' ')
    molValue = int(kwargs.get('molValue', 400))
    logValue = float(kwargs.get('logValue', 4.0))
    reaction = ReactionFromSmarts(smarts)
    smilesLists = [load_smiles_file(source) for source in sources]
    for reactants, product in reaction_matrix(reaction, *smilesLists):
        product_id = '.'.join(r.GetProp("_Name") for r in reactants)
        for mol in product:
            smiles = MolToSmiles(mol, isomericSmiles=True)
            # Refresh cached properties (non-strict) before the descriptor
            # calls below. The hydrogen-added copy that used to be built here
            # was never read, so it is no longer created.
            mol.UpdatePropertyCache(strict=False)
            mwt = MolWt(mol)
            if mwt > molValue:
                continue
            logp = MolLogP(mol)
            if logp < logValue:
                yield sep.join((smiles, product_id, str(mwt), str(logp)))+"\n"
Пример #17
0
    def construct_feature_matrices(self, smiles, train=True):
        """ Tokenize the atoms and directed bonds of the molecule described
        by ``smiles``.

        ``train`` is copied onto both tokenizers and, when true, the largest
        atom and bond counts seen so far are updated on ``self``.

        Returns
        dict with entries
        'n_atom' : number of atoms in the molecule
        'n_bond' : number of (undirected) bonds in the molecule
        'bond_indices' : for every directed edge, the index of its bond
        'atom' : (n_atom,) array of atom classes
        'bond' : array of bond classes, one per directed edge
        'connectivity' : array of (source atom, target atom) pairs, one per
            directed edge

        """

        self.atom_tokenizer.train = train
        self.bond_tokenizer.train = train

        logger = logging.getLogger(__name__)
        mol = MolFromSmiles(smiles)
        if self.explicit_hs:
            mol = AddHs(mol)

        n_atom = mol.GetNumAtoms()
        real_bonds = mol.GetNumBonds()

        # Every bond is stored once per direction; a molecule without bonds
        # still gets one (all-zero) edge slot.
        n_bond = 2 * real_bonds
        if n_bond == 0:
            n_bond = 1
            logger.warning(f'Found molecule {smiles} with zero bonds')

        atom_feature_matrix = np.zeros(n_atom, dtype='int')
        bond_feature_matrix = np.zeros(n_bond, dtype='int')
        bond_indices = np.zeros(n_bond, dtype='int')
        connectivity = np.zeros((n_bond, 2), dtype='int')

        edge = 0
        for row, atom in enumerate(mol.GetAtoms()):
            atom_feature_matrix[row] = self.atom_tokenizer(
                self.atom_features(atom))

            origin = atom.GetIdx()
            for bond in atom.GetBonds():
                begin_idx = bond.GetBeginAtomIdx()
                # The bond is stored "backwards" relative to this atom when
                # the atom is not the bond's begin atom.
                rev = begin_idx != origin

                bond_feature_matrix[edge] = self.bond_tokenizer(
                    self.bond_features(bond, flipped=rev))
                bond_indices[edge] = bond.GetIdx()

                end_idx = bond.GetEndAtomIdx()
                if rev:
                    connectivity[edge, 0] = end_idx
                    connectivity[edge, 1] = begin_idx
                else:
                    connectivity[edge, 0] = begin_idx
                    connectivity[edge, 1] = end_idx

                edge += 1

        # Remember the largest molecule encountered while training.
        if train:
            if n_atom > self.max_atoms:
                self.max_atoms = n_atom
            if real_bonds > self.max_bonds:
                self.max_bonds = real_bonds

        return {
            'n_atom': n_atom,
            'n_bond': real_bonds,  # the real number of bonds
            'bond_indices': bond_indices,
            'atom': atom_feature_matrix,
            'bond': bond_feature_matrix,
            'connectivity': connectivity,
        }
Пример #18
0
#!/usr/bin/python2
# Little harness for timing how long it takes to embed a molecule
# which seems extremely variable on one machine,
from __future__ import print_function, division
import sys, time, os
from rdkit.Chem import MolFromSmiles, AddHs, RemoveHs
from rdkit.Chem.AllChem import EmbedMolecule

if __name__ == "__main__":
    # Switches come from the environment; each one defaults to 0.
    dotimestamp = int(os.getenv('MOLEMBED_TIME', '0'))
    doaddh = int(os.getenv('MOLEMBED_ADDH', '0'))
    rseed = int(os.getenv('MOLEMBED_SEED', '0'))

    t0 = time.time()
    # All input is read up front, before any molecule is processed.
    for line in sys.stdin.readlines():
        s = line.strip()
        if not dotimestamp:
            print(s)
        else:
            # Milliseconds elapsed since the previous line was printed.
            t1 = time.time()
            dt = (t1 - t0) * 1e3
            print('%.3f' % dt, s)
            t0 = t1

        m = MolFromSmiles(s)
        m2 = AddHs(m) if doaddh else m
        EmbedMolecule(m2, randomSeed=rseed)
Пример #19
0
    def addhs(self, mol):
        """Return the result of RDKit's ``AddHs`` applied to ``mol``."""
        # Imported here rather than at module level, so RDKit is only
        # loaded when this method is called.
        from rdkit.Chem import AddHs

        return AddHs(mol)
Пример #20
0
    def construct_feature_matrices(self, smiles):
        """ Tokenize the atoms and directed bonds of the molecule described
        by ``smiles``.

        Returns
        dict with entries
        'n_atom' : number of atoms in the molecule
        'n_bond' : number of directed edges (twice the bond count, or 1 for
            a molecule without bonds)
        'atom' : (n_atom,) array of atom classes
        'bond' : (n_bond,) array of bond classes
        'connectivity' : (n_bond, 2) array of source atom, target atom pairs.

        """

        mol = MolFromSmiles(smiles)
        if self.explicit_hs:
            mol = AddHs(mol)

        n_atom = len(mol.GetAtoms())

        # Every bond is stored once per direction; a molecule without bonds
        # still gets one (all-zero) edge slot.
        n_bond = 2 * len(mol.GetBonds())
        if n_bond == 0:
            n_bond = 1

        atom_feature_matrix = np.zeros(n_atom, dtype='int')
        bond_feature_matrix = np.zeros(n_bond, dtype='int')
        connectivity = np.zeros((n_bond, 2), dtype='int')

        # Atoms are fetched by index from the atom sequence before any
        # tokenizing starts.
        atom_seq = mol.GetAtoms()
        atoms = [atom_seq[pos] for pos in range(n_atom)]

        edge = 0
        for row, atom in enumerate(atoms):
            atom_feature_matrix[row] = self.atom_tokenizer(
                self.atom_features(atom))

            origin = atom.GetIdx()
            for bond in atom.GetBonds():
                begin_idx = bond.GetBeginAtomIdx()
                # The bond is stored "backwards" relative to this atom when
                # the atom is not the bond's begin atom.
                rev = begin_idx != origin

                bond_feature_matrix[edge] = self.bond_tokenizer(
                    self.bond_features(bond, flipped=rev))

                end_idx = bond.GetEndAtomIdx()
                if rev:
                    connectivity[edge, 0] = end_idx
                    connectivity[edge, 1] = begin_idx
                else:
                    connectivity[edge, 0] = begin_idx
                    connectivity[edge, 1] = end_idx

                edge += 1

        return {
            'n_atom': n_atom,
            'n_bond': n_bond,
            'atom': atom_feature_matrix,
            'bond': bond_feature_matrix,
            'connectivity': connectivity,
        }
#! /usr/bin/env python

import sys
from rdkit.Chem import SDMolSupplier, MolToPDBFile, AllChem, AddHs, RemoveHs
from rdkit.Chem.Draw import MolsToGridImage

# Read every molecule from the SD file given as the first argument.
spl = SDMolSupplier(sys.argv[1])
mols = [m for m in spl]

# Embed and MMFF-optimize each ligand, then write it as ligand_<i>.pdb.
for i, m in enumerate(mols):
    m = AddHs(m)
    AllChem.EmbedMolecule(m, useBasicKnowledge=True, maxAttempts=100)
    AllChem.MMFFOptimizeMolecule(m)
    # RemoveHs returns a new molecule instead of modifying its argument, so
    # the result must be kept; otherwise the PDB still contains the
    # hydrogens added above.
    m = RemoveHs(m)
    MolToPDBFile(m, 'ligand_%d.pdb' % i)

# Overview image of the molecules as they were read from the file.
img = MolsToGridImage(mols,
                      legends=["ligand_%d" % i for i in range(len(mols))])
img.save('ligands.png')
Пример #22
0
class ConformerGenerator(object):
    '''
    Generates conformations of molecules from 2D representation.
    '''
    def __init__(self, smiles, forcefield="mmff"):
        '''
        Set up the generator for one molecule.

        :param smiles: SMILES string; kept as-is and parsed into ``self.mol``
        :param forcefield: force-field name checked later by ``minimise``
            ("mmff" or "uff")
        '''
        self.smiles = smiles
        self.forcefield = forcefield
        self.mol = MolFromSmiles(smiles)
        # Result holders, filled in by generate / minimise / cluster.
        self.initial_confs = None
        self.conf_energies = []
        self.full_clusters = []

    def generate(self,
                 max_generated_conformers=50,
                 prune_thresh=0.01,
                 maxattempts_per_conformer=5,
                 output=None,
                 threads=1):
        '''
        Generates conformers

        Adds hydrogens to ``self.mol`` and embeds up to
        ``max_generated_conformers`` conformers. If none are produced, the
        embedding is retried with random starting coordinates and ten times
        as many attempts.

        Note  the number max_generated _conformers required is related to the
        number of rotatable bonds

        :param max_generated_conformers: number of conformers requested
        :param prune_thresh: RMS pruning threshold passed to the embedder
        :param maxattempts_per_conformer: ``maxAttempts`` for the first try
        :param output: optional object with a ``write`` method for progress
            messages; when None, messages are dropped
        :param threads: ``numThreads`` passed to the embedder
        :return: the conformer ids returned by ``EmbedMultipleConfs``
        '''
        def report(message):
            # ``output`` defaults to None, so writing unconditionally crashed.
            if output is not None:
                output.write(message)

        self.mol = AddHs(self.mol, addCoords=True)
        self.initial_confs = EmbedMultipleConfs(
            self.mol,
            numConfs=max_generated_conformers,
            pruneRmsThresh=prune_thresh,
            maxAttempts=maxattempts_per_conformer,
            useRandomCoords=False,
            # Despite what the documentation says -1 is a seed!!
            # It doesn't mean random generation
            numThreads=threads,
            randomSeed=random.randint(1, 10000000))
        if len(self.initial_confs) == 0:
            retry_attempts = 10 * maxattempts_per_conformer
            report((f"Generated  {len(self.initial_confs)} "
                    "initial confs\n"))
            # Report the attempt count that is actually passed below.
            report((f"Trying again with {retry_attempts} "
                    "attempts and random coords\n"))

            self.initial_confs = EmbedMultipleConfs(
                self.mol,
                numConfs=max_generated_conformers,
                pruneRmsThresh=prune_thresh,
                useRandomCoords=True,
                maxAttempts=retry_attempts,
                # Despite what the documentation says -1 is a seed!!
                # It doesn't mean random generation
                numThreads=threads,
                randomSeed=random.randint(1, 10000000))

        report("Generated " + str(len(self.initial_confs)) +
               " initial confs\n")
        return self.initial_confs

    def minimise(self, output=None):
        '''
        Minimises every initial conformer with the configured force field.

        Appends ``(conformer_index, energy)`` pairs to ``self.conf_energies``
        and leaves that list sorted by ascending energy. With "mmff", a
        conformer for which no MMFF force field can be built falls back to
        UFF.

        :param output: object with a ``write`` method for progress messages
        :return: ``self.mol``
        :raises ValueError: if ``self.forcefield`` is neither "mmff" nor "uff"
        '''

        has_slashes = "\\" in self.smiles or "/" in self.smiles
        if has_slashes:
            output.write(("WARNING: Smiles string contains slashes, "
                          "which specify cis/trans stereochemistry.\n"))
            output.write(("Force-field minimization may change the "
                          "stereochemistry.\n"))

        if self.forcefield not in ("mmff", "uff"):
            raise ValueError("Unrecognised force field")

        use_mmff = self.forcefield == "mmff"
        if use_mmff:
            props = MMFFGetMoleculeProperties(self.mol)

        for conf_idx in range(len(self.initial_confs)):
            if use_mmff:
                potential = MMFFGetMoleculeForceField(self.mol,
                                                      props,
                                                      confId=conf_idx)
                if potential is None:
                    output.write("MMFF not available, using UFF\n")
                    potential = UFFGetMoleculeForceField(self.mol,
                                                         confId=conf_idx)
                    assert potential is not None
                output.write(f"Minimising conformer number {conf_idx}\n")
            else:
                potential = UFFGetMoleculeForceField(self.mol,
                                                     confId=conf_idx)
                assert potential is not None

            potential.Minimize()
            energy = potential.CalcEnergy()
            self.conf_energies.append((conf_idx, energy))

        self.conf_energies = sorted(self.conf_energies,
                                    key=lambda pair: pair[1])
        return self.mol

    def cluster(self,
                rms_tolerance=0.1,
                max_ranked_conformers=10,
                energy_window=5,
                Report_e_tol=10,
                output=None):
        '''
        Removes duplicates after minimization

        Walks ``self.conf_energies`` in order, groups conformers into
        clusters stored in ``self.full_clusters`` and returns the first
        member of each cluster.

        :param rms_tolerance: RMS at or below which a conformer is added to
            the current cluster
        :param max_ranked_conformers: together with ``self.factormax`` (3)
            limits how many clusters are built and returned
        :param energy_window: energy difference beyond which the inner
            comparison loop stops
        :param Report_e_tol: energy difference from the first conformer
            beyond which clustering stops
        :param output: object with a ``write`` method for progress messages
        :return: list of ``[conformer, energy, rms]`` entries, one per cluster
        '''
        self.counter = 0
        self.factormax = 3
        self.mol_no_h = RemoveHs(self.mol)
        calcs_performed = 0
        self.full_clusters = []
        confs = self.conf_energies[:]
        ignore = []
        ignored = 0

        for i, pair_1 in enumerate(confs):
            if i == 0:
                # The first entry is the reference for the report window.
                index_0, energy_0 = pair_1
            output.write((f"clustering cluster {i} of "
                          f"{len(self.conf_energies)}\n"))
            index_1, energy_1 = pair_1
            if abs(energy_1 - energy_0) > Report_e_tol:
                output.write(("Breaking because hit Report Energy Window, "
                              f"E was {energy_1} kcal/mol "
                              f"and minimum was {energy_0} \n"))

                break
            if i in ignore:
                # Count one skipped pass (this used to add the index ``i``,
                # which inflated the reported number of ignore passes).
                ignored += 1
                continue
            self.counter += 1
            if self.counter == self.factormax * max_ranked_conformers:
                output.write('Breaking because hit MaxNConfs \n')
                break
            clustered = [[self.mol.GetConformer(id=index_1), energy_1, 0.00]]
            ignore.append(i)
            for j, pair_2 in enumerate(confs):
                # NOTE(review): this skips only positions 0 and 1 whatever
                # ``i`` is; ``j > i`` may have been intended -- confirm.
                if j > 1:
                    index_2, energy_2 = pair_2
                    if j in ignore:
                        ignored += 1
                        continue
                    if abs(energy_1 - energy_2) > energy_window:
                        break
                    # NOTE(review): the RMS comparison below only runs for
                    # near-identical energies, and a conformer passing both
                    # tests is appended twice -- possibly an indentation
                    # slip; left unchanged pending confirmation.
                    if abs(energy_1 - energy_2) <= 1e-3:
                        clustered.append([
                            self.mol.GetConformer(id=index_2), energy_2, 0.00
                        ])
                        ignore.append(j)
                        rms = GetConformerRMS(self.mol_no_h, index_1, index_2)
                        calcs_performed += 1
                        if rms <= rms_tolerance:
                            clustered.append([
                                self.mol.GetConformer(id=index_2), energy_2,
                                rms
                            ])
                            ignore.append(j)
            self.full_clusters.append(clustered)
        output.write(f"{ignored} ignore passes made\n")
        output.write((f"{calcs_performed} overlays needed out "
                      f"of a possible {len(self.conf_energies) ** 2}\n"))

        # Keep the representative (first member) of the leading clusters.
        limit = self.factormax * max_ranked_conformers
        ranked_clusters = [
            cluster[0] for cluster in self.full_clusters[:limit]
        ]

        return ranked_clusters

    def recluster(self,
                  path,
                  rms_tolerance=0.1,
                  max_ranked_conformers=10,
                  energy_window=5,
                  output=None,
                  clustered_confs=None,
                  molecule=None,
                  key=None,
                  fallback_to_align=False):
        """Prune near-duplicate conformers by pairwise RMSD.

        Every kept conformer is compared against the later ones; a later
        conformer whose RMSD (direct, or against its point-inverted
        geometry) is within ``rms_tolerance`` is recorded in
        ``self.removed`` and its ``.xyz`` file is deleted.  Once
        ``max_ranked_conformers`` conformers have been kept, the files of
        all remaining conformers are deleted and processing stops.

        Args:
            path: Directory handed to ``obfit_rmsd`` / ``align_rmsd`` and
                used for the temporary inverted-geometry file.
            rms_tolerance: RMSD at or below which two conformers are
                treated as duplicates.
            max_ranked_conformers: Number of conformers to keep.
            energy_window: Largest energy difference for which a pair is
                still compared; the inner scan stops at the first pair
                beyond it (presumably ``clustered_confs`` is sorted by
                ascending energy -- TODO confirm against the caller).
            output: Object with a ``write`` method used for logging.
            clustered_confs: Sequence of ``(conformer, energy)`` pairs;
                ``None`` is treated as an empty sequence.
            molecule: Passed to ``obfit_rmsd`` as ``str(molecule)``.
            key: Filename prefix; conformer ``n`` (1-based) is
                ``<key>_Conf_<n>.xyz``.
            fallback_to_align: When True, retry a failed ``obfit_rmsd``
                comparison with ``align_rmsd``.

        Side effects:
            Sets ``self.removed`` (0-based indices of removed conformers)
            and ``self.counter`` (number of conformers kept).
        """
        if clustered_confs is None:
            clustered_confs = []

        def pair_rmsd(name_a, name_b):
            """Return the RMSD between two structures, or None on failure."""
            try:
                return obfit_rmsd(name_a, name_b, str(molecule), path=path)
            except (subprocess.CalledProcessError, ValueError) as e:
                if not fallback_to_align:
                    return None
                output.write(
                    'obfit failed, falling back to obabel --align\n')
                output.write(f'Exception {e}\n')
                try:
                    return align_rmsd(name_a, name_b, path)
                except ValueError:
                    return None

        self.removed = []
        self.counter = 0
        for i, conf_a in enumerate(clustered_confs):
            if self.counter == max_ranked_conformers:
                # Enough conformers kept: drop the files of everything left.
                for k in range(i, len(clustered_confs)):
                    leftover = key + "_Conf_" + str(k + 1) + ".xyz"
                    if os.path.isfile(leftover):
                        os.remove(leftover)
                        output.write("Removed " + leftover + "\n")
                break
            if i in self.removed:
                continue
            self.counter += 1
            for j, conf_b in enumerate(clustered_confs[i + 1:], start=i + 1):
                if conf_b[1] - conf_a[1] > energy_window:
                    break
                if j in self.removed:
                    continue

                name_a = key + "_Conf_" + str(i + 1)
                name_b = key + "_Conf_" + str(j + 1)
                rms = pair_rmsd(name_a, name_b)
                if rms is None:
                    continue

                output.write("Comparing " + str(i + 1) + " " + str(j + 1) +
                             ' RMSD ' + str(rms) + "\n")
                if rms > rms_tolerance:
                    # Retry against the point-inverted geometry of conf_b.
                    pos = _atomic_pos_from_conformer(conf_b[0])
                    elements = _extract_atomic_type(conf_b[0])
                    inverted = [[-float(coor[k]) for k in range(3)]
                                for coor in pos]
                    coords = list(zip(elements, inverted))

                    inv_name = name_b + "_inv"
                    filename = os.path.join(path, inv_name + ".xyz")
                    write_xyz(coords=coords,
                              filename=filename,
                              comment=conf_b[1])
                    try:
                        rmsinv = pair_rmsd(name_a, inv_name)
                    finally:
                        # Remove exactly the file written above, even when
                        # the comparison failed.
                        os.remove(filename)
                    if rmsinv is None:
                        continue

                    rms = min(rms, rmsinv)
                    output.write((f"Comparing {i + 1} {j + 1} "
                                  f"RMSD after checking inversion {rms}\n"))
                if rms <= rms_tolerance:
                    self.removed.append(j)
                    output.write("Removed Conf_" + str(j + 1) + "\n")
                    # NOTE(review): conformer files are removed relative to
                    # the working directory, not ``path`` -- confirm this
                    # matches where the caller writes them.
                    os.remove(key + "_Conf_" + str(j + 1) + ".xyz")
Пример #23
0
    def process(self):
        """Convert the raw archives into one collated dataset and save it.

        Loads ``self.raw_paths[0]`` and ``self.raw_paths[1]`` with
        ``np.load``, turns the ``E``, ``N``, ``R``, ``D``, ``Q`` and ``Z``
        arrays into tensors (``R``/``D`` come from the ``*_qm`` or
        ``*_mmff`` entries depending on ``self.qm``), runs
        ``self.pre_transform`` on every sample and stores the collated
        result at ``self.processed_paths[0]``.

        When ``self.sep_heavy_atom`` is set, only the samples whose
        ``numberHA`` column (read from ``self.raw_paths[2]`` or
        ``self.raw_paths[3]``) equals ``self.num_heavy_atom`` are kept,
        and they all come from a single part.
        """
        data1 = np.load(self.raw_paths[0])
        data2 = np.load(self.raw_paths[1])
        r_key = 'R_qm' if self.qm else 'R_mmff'
        d_key = 'D_qm' if self.qm else 'D_mmff'

        def build_feed_dict(raw):
            """Wrap the arrays of one raw archive as tensors."""
            return {
                'E': torch.as_tensor(raw['E']),
                'N': torch.as_tensor(raw['N']),
                'R': torch.as_tensor(raw[r_key]),
                'D': torch.as_tensor(raw[d_key]),
                'Q': torch.as_tensor(raw['Q']),
                'Z': torch.as_tensor(raw['Z'])
            }

        data1_feed_dict = build_feed_dict(data1)
        data2_feed_dict = build_feed_dict(data2)

        data1_size = data1['E'].shape[0]
        data2_size = data2['E'].shape[0]

        if not self.sep_heavy_atom:
            data_size = data1_size + data2_size
        else:
            in_part1 = (self.num_heavy_atom < 14)
            heavy_atom_data = pd.read_csv(self.raw_paths[2] if in_part1 else self.raw_paths[3])
            num_heavy_atom = torch.as_tensor(heavy_atom_data['numberHA']).long()
            atom_mask = (num_heavy_atom == self.num_heavy_atom)
            atom_mask = atom_mask.view(-1)
            data_dict_used = data1_feed_dict if in_part1 else data2_feed_dict
            for key in data_dict_used:
                data_dict_used[key] = data_dict_used[key][atom_mask]
            # Route every index through the filtered dict: alias it as
            # part 1 and make part 1 span the whole selection.  Without
            # updating data1_size, a filtered part-2 selection larger than
            # the raw part 1 would wrap around and repeat early samples.
            data_size = data_dict_used['E'].shape[0]
            data1_feed_dict = data_dict_used
            data1_size = data_size

        data_list = []
        for i in tqdm(range(data_size)):
            if i < data1_size:
                tmp_data = _get_ith_data(i, **data1_feed_dict)
            else:
                tmp_data = _get_ith_data(i - data1_size, **data2_feed_dict)
            tmp_data = self.pre_transform(tmp_data, edge_version='cutoff', do_sort_edge=True, cal_efg=False,
                                          cutoff=self.cutoff, boundary_factor=None, use_center=None,
                                          mol=AddHs(MolFromSmiles('C')),
                                          cal_3body_term=self.cal_3body_term, bond_atom_sep=self.bond_atom_sep,
                                          record_long_range=self.record_long_range)
            data_list.append(tmp_data)

        print('collating...')
        data1, slices = self.collate(data_list)
        print('saving...')
        torch.save((data1, slices), self.processed_paths[0])
Пример #24
0
def _addHs(mol, explicitOnly=False, addCoords=False):
    """Forward to ``AddHs`` with the given hydrogen-addition options."""
    options = {"explicitOnly": explicitOnly, "addCoords": addCoords}
    return AddHs(mol, **options)
Пример #25
0
def MolToMol2Block(mol, confId=-1, addHs=False, addCharges=False):
    """Returns a Mol2 string block for a molecule
      ARGUMENTS:

        - mol: the molecule
        - confId: (optional) selects which conformation to output (-1 = default)
                  if set to None will return all conformers
        - addHs: (optional) add explicit hydrogens before writing
        - addCharges: (optional) compute Gasteiger charges before writing

      RETURNS:

        a string (one Mol2 block per selected conformer, concatenated;
        empty when confId is None and the molecule has no conformers)
    """

    #
    # References
    # - Format specs http://www.tripos.com/data/support/mol2.pdf
    # - Atom typing http://www.sdsc.edu/CCMS/Packages/cambridge/pluto/atom_types.html
    #

    # add explicit hydrogens (since mol2 reader requires them)
    if addHs:
        h_coords = mol.GetNumConformers() > 0 and mol.GetConformer(-1).Is3D()
        try:
            mol = AddHs(mol, addCoords=h_coords)
        except RuntimeError:
            mol = AddHs(mol, addCoords=False)

    # compute charges
    if addCharges:
        ComputeGasteigerCharges(mol)

    # None means "every conformer": collect the real conformer ids
    if confId is None:
        conf_ids = [conf.GetId() for conf in mol.GetConformers()]
    else:
        conf_ids = (confId, )

    # FIXME "USER_CHARGES" could become 'Gasteiger charges'
    # FIXME "SMALL" means small molecule but could become "PROTEIN"
    molecule = "@<TRIPOS>MOLECULE\n{}\n{} {} 0 0 0\nSMALL\nGASTEIGER\n\n".format(
        mol.GetProp("_Name") if mol.HasProp("_Name") else "UNK",
        mol.GetNumAtoms(), mol.GetNumBonds())

    def bond_type(bond):
        # aromatic and amide bonds get Mol2 keywords, the rest their order
        if bond.GetBondTypeAsDouble() == 1.5:
            return "ar"
        if _amide_bond(bond):
            return "am"
        return str(int(bond.GetBondTypeAsDouble()))

    def charge(atom):
        # the stored charge may use a decimal comma; 0.0 when not computed
        if atom.HasProp('_GasteigerCharge'):
            return float(atom.GetProp('_GasteigerCharge').replace(',', '.'))
        return 0.0

    # the bond section does not depend on the conformer, so build it once
    bond_lines = [
        "{:>5} {:>5} {:>5} {:>2}".format(
            bid + 1,
            b.GetBeginAtomIdx() + 1,
            b.GetEndAtomIdx() + 1,
            bond_type(b))
        for bid, b in enumerate(mol.GetBonds())
    ]
    bond_lines = "\n".join(["@<TRIPOS>BOND"] + bond_lines + ["\n"])

    blocks = []
    for conf_id in conf_ids:
        pos = _get_positions(mol, conf_id)
        atom_lines = [
            "{:>4} {:>4} {:>13.4f} {:>9.4f} {:>9.4f} {:<5} {} {} {:>7.4f}".
            format(a.GetIdx() + 1,
                   a.GetSymbol(),
                   float(pos[a.GetIdx()][0]),
                   float(pos[a.GetIdx()][1]),
                   float(pos[a.GetIdx()][2]),
                   _sybyl_atom_type(a), 1, "UNL",
                   charge(a))
            for a in mol.GetAtoms()
        ]
        atom_lines = "\n".join(["@<TRIPOS>ATOM"] + atom_lines) + "\n"

        blocks.append(molecule + atom_lines + bond_lines)
    return "".join(blocks)