示例#1
0
def write_sdf_file(scaffold_graph, output_file):
    """Write the scaffolds of a scaffoldgraph to an SDF file.

    Scaffold nodes are sorted by their ``'hierarchy'`` node attribute and
    numbered from 0 in that order; that number is used as the scaffold ID.
    Every scaffold whose SMILES can be parsed is written as one record with:

        title (``_Name``): scaffold ID
        HIERARCHY: hierarchy level of the scaffold
        SMILES: the scaffold SMILES (the node key)
        SUBSCAFFOLDS: comma-separated IDs of the node's predecessors

    Scaffolds whose SMILES cannot be parsed are skipped.

    Parameters
    ----------
    scaffold_graph (sg.ScaffoldGraph): graph to be converted
    output_file (str): path to output file
    """
    sorted_scaffolds = sorted(scaffold_graph.get_scaffold_nodes(data=True),
                              key=lambda x: x[1]['hierarchy'])
    # Scaffold ID == position in the hierarchy-sorted list.
    mapping = {node: idx for idx, (node, _) in enumerate(sorted_scaffolds)}
    writer = SDWriter(output_file)
    try:
        for scaffold, data in sorted_scaffolds:
            molecule = MolFromSmiles(scaffold)
            if molecule is None:
                continue
            subscaffolds = scaffold_graph.predecessors(scaffold)
            # SetProp only accepts strings, so the integer ID is converted.
            molecule.SetProp('_Name', str(mapping[scaffold]))
            # Node data stores the level under the lowercase 'hierarchy'
            # key (the same key the sort above relies on).
            molecule.SetIntProp('HIERARCHY', data['hierarchy'])
            molecule.SetProp('SMILES', scaffold)
            molecule.SetProp(
                'SUBSCAFFOLDS',
                ', '.join(str(mapping[s]) for s in subscaffolds))
            writer.write(molecule)
    finally:
        # Always flush and release the file, even if a record fails.
        writer.close()
示例#2
0
def load_smiles_file(it):
    """Yield named molecules parsed from lines of SMILES text.

    Each non-blank line must contain at least two whitespace-separated
    fields: a SMILES string followed by an identifier. Additional fields
    are ignored. The identifier is stored in the molecule's ``_Name``
    property.

    Blank (empty or whitespace-only) lines are skipped, as are lines whose
    SMILES cannot be parsed.

    Parameters
    ----------
    it : iterable
        Iterable of lines; each item is converted with ``str()``.

    Yields
    ------
    The parsed molecule for every line with a valid SMILES.

    Raises
    ------
    ValueError
        If a non-blank line contains only a single field.
    """
    for line in it:
        fields = str(line).split()
        if not fields:
            # Blank line (e.g. trailing newline at end of file).
            continue
        smiles, cid = fields[:2]
        mol = MolFromSmiles(smiles)
        if mol is not None:
            mol.SetProp('_Name', cid)
            yield mol
示例#3
0
def filled_fragmentsdb(fragmentsdb, myshelve):
    """Populate ``fragmentsdb`` with shelve fragments, one molecule and two PDB records.

    The fragments from ``myshelve`` are added first, followed by a single
    molecule named ``1muu_GDX_frag7`` and PDB metadata for the structures
    ``1muu`` and ``2n2k``.

    Returns the same ``fragmentsdb`` object that was passed in.
    """
    fragmentsdb.add_fragments_from_shelve(myshelve)

    fragment_mol = MolFromSmiles(
        '[*]COP(=O)([O-])OP(=O)([O-])OC1OC(C(=O)[O-])C(O)C(O)C1O')
    fragment_mol.SetProp('_Name', '1muu_GDX_frag7')
    fragmentsdb.add_molecule(fragment_mol)

    pdb_1muu = dict(
        chainId='A',
        structureId='1muu',
        structureTitle='2.0 A crystal structure of GDP-mannose dehydrogenase',
        ecNo='1.1.1.132',
        uniprotAcc='P11759',
        compound='GDP-mannose 6-dehydrogenase',
        uniprotRecommendedName='GDP-mannose 6-dehydrogenase',
    )
    # A PDB entry without any fragment; it is expected to be skipped.
    pdb_2n2k = dict(
        chainId='A',
        structureId='2n2k',
        structureTitle='Ensemble structure of the closed state of Lys63-linked diubiquitin in the absence of a ligand',
        ecNo=None,
        uniprotAcc='P0CG48',
        compound='ubiquitin',
        uniprotRecommendedName='Polyubiquitin-C',
    )
    fragmentsdb.add_pdbs([pdb_1muu, pdb_2n2k])
    return fragmentsdb
示例#4
0
    def __next__(self):
        """Return the next molecule from the supplier, or None if it fails to parse.

        The supplier yields indexable records: item 0 is the SMILES string,
        item 1 the molecule name and, when ``self.data_cols`` is set,
        item 2 the values paired with those column names. Name and data
        values are stored as string properties on the molecule.

        ``self.cursor`` is advanced once per record, whether or not the
        record could be parsed.
        """
        record = next(self.supplier)
        try:
            mol = MolFromSmiles(record[0])
            mol.SetProp('_Name', str(record[1]))
            if self.data_cols is not None:
                for column, cell in zip(self.data_cols, record[2]):
                    mol.SetProp(str(column), str(cell))
        except AttributeError:
            # Raised when MolFromSmiles returned None for the SMILES.
            logger.warning('Molecule {} : {} could not be parsed'.format(
                self.cursor, record[0]))
            mol = None

        self.cursor += 1
        return mol
示例#5
0
    def parse_smi(self, smi):
        """Parse a SMILES string into a Mol, caching the result by SMILES.

        ``smi`` is split on whitespace: the first token is the SMILES and an
        optional second token is the molecule name.

        If the SMILES is already in ``self.mol`` the cached object is
        returned as-is (not a copy, and without applying the name given in
        this call). Otherwise the SMILES is parsed, stored in ``self.mol``
        and, if a name was supplied, the name is set as its ``_Name``
        property. Once the cache holds ``self.maxsmi`` entries, one entry is
        evicted with ``popitem()`` before the new one is stored.

        Returns the molecule, or None when ``smi`` is empty/blank or the
        SMILES cannot be parsed.
        """
        smiles_name = smi.split() if smi else []
        if not smiles_name:
            # Empty or whitespace-only input has no SMILES token to parse.
            return None
        asmi = smiles_name[0]
        aname = smiles_name[1] if len(smiles_name) > 1 else None

        if asmi in self.mol:
            # Returning a copy would be safer but slower; callers share
            # the cached object.
            return self.mol[asmi]

        newmol = MolFromSmiles(asmi)
        if not newmol:
            return None

        # Make room when the cache is full. The emptiness check guards
        # against popitem() raising KeyError when maxsmi <= 0.
        if len(self.mol) >= self.maxsmi and self.mol:
            self.mol.popitem()
        self.mol[asmi] = newmol
        if aname:
            newmol.SetProp("_Name", aname)
        return newmol
示例#6
0
 def write_scaffold(self, scaffold):
     """Write one scaffold to ``self.output``.

     When ``self.args.sdf`` is falsy a tab-separated text line
     (id, hierarchy, SMILES, sub-scaffold ids) is written. Otherwise the
     scaffold SMILES is parsed and the molecule is written with the
     properties ``_Name``, ``HIERARCHY``, ``SMILES`` and ``SUBSCAFFOLDS``;
     a warning is logged if the SMILES cannot be parsed.
     """
     child_ids = ', '.join(str(sub.id) for sub in scaffold.subscaffolds)

     if not self.args.sdf:
         row = '{0}\t{1}\t{2}\t{3}\n'.format(
             scaffold.id, scaffold.hierarchy, scaffold.smiles, child_ids)
         self.output.write(row)
         return

     molecule = MolFromSmiles(scaffold.smiles)
     if molecule is None:
         logger.warning(f'Failed to parse scaffold: {scaffold.smiles}')
         return

     molecule.SetProp('_Name', str(scaffold.id))
     molecule.SetIntProp('HIERARCHY', scaffold.hierarchy)
     molecule.SetProp('SMILES', scaffold.smiles)
     molecule.SetProp('SUBSCAFFOLDS', child_ids)
     self.output.write(molecule)
示例#7
0
def classify(sdf, label, lambdas):
    """Annotate molecules of an SD file with a class and write them to a new file.

    For every molecule read from ``sdf`` the property ``label`` is converted
    with ``floatify``. The first entry of ``lambdas`` whose predicate
    returns a truthy value for that number determines the class, which is
    stored in the property ``<label>_class``. Classified molecules are
    written to ``<sdf without '.sdf'>_class.sdf``.

    Molecules are skipped (with a message on stderr) when they cannot be
    read, when the property cannot be converted, or when no predicate
    matches. If writing a molecule fails, it is rebuilt from its ``SMILES``
    property with 2D coordinates and written again; note that the rebuilt
    molecule carries only the class property. If that fails too, the
    molecule is skipped.

    Progress (the 0-based molecule index and class hits) is printed to
    stdout.

    Parameters
    ----------
    sdf : str
        Path of the input SD file.
    label : str
        Name of the molecule property to classify.
    lambdas : dict
        Maps class name -> predicate taking the numeric property value.

    Raises
    ------
    KeyError
        If a readable molecule does not have the property ``label``.
    """
    new_filename = "%s_class.sdf" % sdf.split('.sdf')[0]
    new_label = label + "_class"
    sdm = ForwardSDMolSupplier(sdf,
                               strictParsing=False,
                               removeHs=False,
                               sanitize=False)
    sdw = SDWriter(new_filename)
    try:
        for counter, mol in enumerate(sdm):
            print(counter)
            sys.stdout.flush()
            if mol is None:
                print("%d rdkit couldn't read molecule" % counter,
                      file=sys.stderr)
                sys.stderr.flush()
                continue

            raw_value = mol.GetProp(label)
            prop = floatify(raw_value)
            if prop is None:
                print("couldn't convert %s to float or int...skip" % raw_value,
                      file=sys.stderr)
                sys.stderr.flush()
                continue

            c = None
            for k, l in lambdas.items():
                if l(prop):
                    c = k
                    print("hit %s" % k)
                    sys.stdout.flush()
                    break
            if c is None:
                print("%d no prop range matched '%s' ..skip" %
                      (counter, raw_value),
                      prop,
                      type(prop),
                      file=sys.stderr)
                sys.stderr.flush()
                sys.stdout.flush()
                continue

            mol.SetProp(new_label, c)
            try:
                sdw.write(mol)
            except Exception:
                # The molecule was read unsanitized, so writing may fail;
                # fall back to a fresh molecule built from its SMILES.
                print(
                    "couldn't write mol %d to file, try to build mol from smiles"
                    % counter,
                    file=sys.stderr)
                try:
                    rebuilt = MolFromSmiles(mol.GetProp("SMILES"))
                    if rebuilt is None:
                        raise ValueError("SMILES could not be parsed")
                    AllChem.Compute2DCoords(rebuilt)
                    rebuilt.SetProp(new_label, c)
                    sdw.write(rebuilt)
                except Exception:
                    # Covers a missing SMILES property, an unparsable
                    # SMILES and a second write failure alike.
                    print("couldn't write mol %d to file...skip" % counter,
                          file=sys.stderr)
    finally:
        # Close the writer even on unexpected errors so that the records
        # written so far are flushed to disk.
        sdw.close()
示例#8
0
def write_sdf_file(scaffold_graph, output_file):
    """Write an SDF file from a ScaffoldGraph.

    All scaffolds in the scaffoldgraph are written to the
    SDF, while molecules are ignored. Scaffolds are sorted
    in ascending order according to their hierarchy level.
    Scaffolds whose SMILES cannot be parsed are skipped.

    The output follows the standard SDF specification with
    the added property fields:

        TITLE field: scaffold ID
        SUBSCAFFOLDS field: list of sub-scaffold IDs
        HIERARCHY field: hierarchy level of scaffold
        SMILES field: scaffold canonical SMILES

    The scaffold ID is the 0-based position of the scaffold in the
    hierarchy-sorted list.

    Parameters
    ----------
    scaffold_graph : scaffoldgraph.core.ScaffoldGraph
        ScaffoldGraph to be written to an SDF.
    output_file : str
        Filepath to an output file.

    """
    sorted_scaffolds = sorted(scaffold_graph.get_scaffold_nodes(data=True),
                              key=lambda x: x[1]['hierarchy'])
    mapping = {node: idx for idx, (node, _) in enumerate(sorted_scaffolds)}
    writer = SDWriter(output_file)
    try:
        for scaffold, data in sorted_scaffolds:
            molecule = MolFromSmiles(scaffold)
            if molecule is None:
                continue
            subscaffolds = scaffold_graph.predecessors(scaffold)
            # SetProp only accepts strings, so the integer ID is converted.
            molecule.SetProp('_Name', str(mapping[scaffold]))
            # Node data stores the level under the lowercase 'hierarchy'
            # key (the same key the sort above relies on).
            molecule.SetIntProp('HIERARCHY', data['hierarchy'])
            molecule.SetProp('SMILES', scaffold)
            molecule.SetProp(
                'SUBSCAFFOLDS',
                ', '.join(str(mapping[s]) for s in subscaffolds))
            writer.write(molecule)
    finally:
        # Always flush and release the file, even if a record fails.
        writer.close()
示例#9
0
    def testCreateRd(self):
        """Create a pharmacophore from an RDKit molecule and verify its contents.

        ``self.string`` holds "<smiles> <name>". The resulting pharmacophore
        must have ``self.numnodes`` nodes, ``self.numedges`` edges and the
        per-type node counts given in ``self.types``.
        """
        from rdkit.Chem import MolFromSmiles
        import decaf.toolkits.rd as rd

        smiles, title = self.string.split()
        molecule = MolFromSmiles(smiles)
        molecule.SetProp("_Name", title)
        phar = rd.phar_from_mol(molecule)

        self.assertEqual(phar.numnodes, self.numnodes)
        # Positive entries are halved; presumably the edge matrix is
        # symmetric so every edge is counted twice - TODO confirm.
        edge_count = np.sum(phar.edges > 0) / 2.0
        self.assertEqual(edge_count, self.numedges)

        # Tally how many nodes carry each expected pharmacophoric type.
        type_counts = dict.fromkeys(self.types, 0)
        for idx in range(phar.numnodes):
            for type_name in phar.nodes[idx]["type"].keys():
                type_counts[type_name] += 1
        self.assertEqual(type_counts, self.types)
示例#10
0
    def __next__(self):
        """Return the next molecule from the supplier, or None if it fails to parse.

        The supplier yields ``(smiles, name)`` pairs. The name is stored as
        the molecule's ``_Name`` property. ``self.cursor`` is advanced once
        per record, whether or not the record could be parsed.
        """
        smiles, name = next(self.supplier)

        try:
            mol = MolFromSmiles(smiles)
            mol.SetProp('_Name', str(name))
        except AttributeError:
            # Raised when MolFromSmiles returned None for the SMILES.
            logger.warning('Molecule {} : {} could not be parsed'.format(
                self.cursor, smiles))
            mol = None

        self.cursor += 1
        return mol
示例#11
0
def csv_to_sdf(csv_file, sdf_file, smiles_col, class_col, delim=','):
    """Convert a delimited text file of SMILES and activity classes to an SD file.

    The first line of ``csv_file`` is treated as a header and skipped.
    Each remaining line is split on ``delim`` (plain split; double quotes
    are simply removed from the two fields that are used). The activity
    class is translated through ``activity_label_to_id_map`` and stored in
    the ``TL`` property of the molecule parsed from the SMILES.

    Blank lines are skipped. Lines whose SMILES cannot be parsed are
    skipped with a warning instead of aborting the conversion.

    Parameters
    ----------
    csv_file : str
        Path of the input file.
    sdf_file : str
        Path of the SD file to write.
    smiles_col : int
        Index of the column holding the SMILES.
    class_col : int
        Index of the column holding the activity class.
    delim : str
        Field delimiter of the input file.

    Raises
    ------
    KeyError
        If an activity class is not in ``activity_label_to_id_map``.
    IndexError
        If a line has fewer columns than ``smiles_col``/``class_col`` need.
    """
    import warnings

    with open(csv_file) as fh:
        sdw = SDWriter(sdf_file)
        try:
            next(fh, None)  # skip the header line
            # Data starts on line 2 of the file (1-based, after the header).
            for line_no, line in enumerate(fh, start=2):
                line = line.strip()
                if not line:
                    continue
                line_split = line.split(delim)
                smiles = line_split[smiles_col].replace('"', '')
                act_class = line_split[class_col].replace('"', '')
                act_newLabel = activity_label_to_id_map[act_class]
                mol = MolFromSmiles(smiles)
                if mol is None:
                    warnings.warn(
                        "line %d: could not parse SMILES %r, skipping"
                        % (line_no, smiles))
                    continue
                # SetProp only accepts strings; str() is a no-op for them.
                mol.SetProp("TL", str(act_newLabel))
                sdw.write(mol)
        finally:
            # Flush and release the output file even if a line fails.
            sdw.close()
示例#12
0
def get_mols_from_smiles(smiles: List[str], **kwargs) -> List[Mol]:
    """
    Converts a list of smiles to a list of mol objects and adds the provided properties.

    Parameters
    ----------
    smiles : List[str]
        Iterable containing smiles strings

    kwargs : Dict[str, List[Any]]
        Every provided keyword argument will be added as a property to the resulting
        molecules. The key is the property name and the value has to be a list of any
        values (each value is stored as its ``str()`` representation). The list for
        each keyword argument must have the same length as the list of smiles.

    Returns
    -------
    List[Mol]
        One entry per input smiles, in input order. The entry is ``None`` for a
        smiles string that could not be parsed, regardless of whether properties
        were supplied.

    Raises
    ------
    IndexError
        If a property list is shorter than the list of smiles.
    """
    mols = []
    for ix, smi in enumerate(smiles):
        mol = MolFromSmiles(smi)
        # MolFromSmiles returns None for an unparsable smiles; keep the None
        # placeholder so the result stays aligned with the input.
        if mol is not None:
            for prop, values in kwargs.items():
                mol.SetProp(prop, str(values[ix]))
        mols.append(mol)
    return mols
示例#13
0
def main(args, output=sys.stdout, log=logging):
    """Command-line entry point: generate conformers and write them as SDF.

    Reads a molecule from a mol2 file or a SMILES string, sanitizes it,
    generates conformers with ``generate_conformers`` and writes the
    selected conformers to ``output`` via ``dump_conformers_sdf``.

    Parameters
    ----------
    args : list of str
        Command-line arguments (without the program name).
    output : file-like
        Destination passed to ``dump_conformers_sdf``.
    log : logger-like
        Object providing ``info`` and ``critical``.

    Returns
    -------
    int
        0 on success. The process exits with status 2 (``sys.exit``) when
        the molecule cannot be parsed or sanitized, or when no conformer
        was selected.
    """
    # The text must be passed as ``description``; the first positional
    # argument of ArgumentParser is ``prog``.
    parser = argparse.ArgumentParser(
        description=("RDKit-based conformer generation proof-of-concept. "
                     "This program accepts either a mol2 file or a SMILES "
                     "string and produces an SD file"))
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument('-m', '--mol2', type=str,
                             help="Mol2 file to generate conformers for")
    input_group.add_argument('-s', '--smiles', type=str,
                             help="SMILES string of molecule")

    parser.add_argument('-N', '--name', type=str, default=None,
                        help="Molecule name")
    parser.add_argument('-H', '--no-hydrogens', action='store_true',
                        default=False,
                        help="Do NOT explicitly add implicit Hydrogens to conformers [default: %(default)s]")
    parser.add_argument('-r', '--rmsd-threshold', type=float,
                        default=2.0,
                        help="Only accept conformers that have an RMSD of at least this value from previously seen conformers [default: %(default)s]")
    parser.add_argument('-n', '--num-conformers', type=int,
                        default=None,
                        help="Number of conformers to initially generate [default: auto]")
    parser.add_argument('-F', '--forcefield', type=str,
                        default=DEFAULT_FORCEFIELD,
                        choices=FORCEFIELDS.keys(),
                        help="Forcefield to use for optimization [default: %(default)s]")
    parser.add_argument('-P', '--parallelism', type=int,
                        default=None,
                        help="Number of processes to use [default: 1]")
    params = parser.parse_args(args)

    # Load input molecule (exactly one of --mol2/--smiles is given).
    if params.mol2 is not None:
        mol = MolFromMol2File(params.mol2, sanitize=False)
    else:
        mol = MolFromSmiles(params.smiles, sanitize=False)

    if mol is None:
        log.critical("Could not parse molecule!")
        sys.exit(2)

    try:
        SanitizeMol(mol)
    except ValueError as e:
        log.critical("Could not sanitize molecule: {0}:".format(str(e)))
        sys.exit(2)
    except Exception:  # This is `Boost.Python.ArgumentError`
        log.critical("Could not parse molecule!")
        sys.exit(2)

    # Assign user-provided name if applicable
    if params.name is not None:
        mol.SetProp(RD_NAME, params.name)
    elif not mol.HasProp(RD_NAME):
        mol.SetProp(RD_NAME, 'Ligand')

    # Generate 3D conformers
    embedded, selected = generate_conformers(
        mol,
        add_hydrogens=not params.no_hydrogens,
        rmsd_threshold=params.rmsd_threshold,
        num_conformers=params.num_conformers,
        parallelism=params.parallelism,
        forcefield=params.forcefield,
        log=log)

    log.info("Conformers selected: {0}".format(len(selected)))
    if not selected:
        # Nothing to report or write; indexing below would fail.
        log.critical("No conformers were selected!")
        sys.exit(2)
    log.info("Energy: min={0:.4f} kcal/mol max={1:.4f} kcal/mol".format(
        selected[0][1], selected[-1][1]))

    # Conformer ids in the order of `selected`, whose items are
    # (conformer id, energy) pairs.
    sorted_by_energy = [item[0] for item in selected]

    # Render SDF file
    names = dump_conformers_sdf(embedded, output, conf_ids=sorted_by_energy,
                                renumber=True)

    for name, (conf_id, energy) in zip(names, selected):
        log.info("\t{0}: {1:0.4f} kcal/mol".format(name, energy))

    return 0