Пример #1
0
def main():
    """Annotate each input molecule with plane-of-best-fit values and write it out.

    Every readable molecule is embedded in 3D, then ``PBFev`` and ``PBFRD``
    are evaluated on it.  The ``PBFRD`` result is stored in the ``distance``
    property and each element of the ``PBFev`` result in ``angle_<index>``.
    Molecules are collected and handed to ``write_out`` in one go at the end.
    """

    ### command line args definitions ########################################
    parser = argparse.ArgumentParser(
        description='Calculate plane of best fit for molecules')
    parameter_utils.add_default_io_args(parser)
    args = parser.parse_args()
    utils.log("PBFEV args: ", args)
    input_file, output, suppl, writer, output_base = rdkit_utils.default_open_input_output(
        args.input, args.informat, args.output, 'PBFEV', args.outformat)
    i = 0
    count = 0
    errors = 0
    out_results = []
    for mol in suppl:
        i += 1
        # The None check must precede any RDKit call: the supplier yields
        # None for records it cannot parse.
        if mol is None:
            continue
        # EmbedMolecule returns -1 when no conformer could be generated;
        # such a molecule has no coordinates to fit a plane to.
        if AllChem.EmbedMolecule(mol) < 0:
            errors += 1
            utils.log("Molecule", i, "could not be embedded, skipping")
            continue
        out_vector = PBFev(mol)
        if out_vector is None:
            continue
        rd = PBFRD(mol)
        mol.SetDoubleProp("distance", rd)
        for j, angle in enumerate(out_vector):
            mol.SetDoubleProp("angle" + "_" + str(j), angle)
        out_results.append(mol)
    count = write_out(out_results, count, writer, args.outformat)
    utils.log("Handled " + str(i) + " molecules, resulting in " + str(count) +
              " outputs")
    if errors > 0:
        utils.log("WARNING:", errors, "molecules failed to embed")
    writer.flush()
    writer.close()
    input_file.close()
    output.close()
Пример #2
0
def main():
    """Generate constrained conformers for each input molecule against a reference.

    The reference molecule is the ``--refmolidx``-th record (1-based) of the
    ``--refmol`` file.  Each readable input molecule is passed to
    ``generate_conformers`` together with the reference, and the per-molecule
    output and error counts are summed.  Metrics are written when ``--meta``
    is set.

    Raises:
        ValueError: if ``--refmolidx`` does not select a readable molecule in
            the reference file.
    """
    parser = argparse.ArgumentParser(description='RDKit constrained conformer generator')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-n', '--num', type=int, default=10, help='number of conformers to generate')
    parser.add_argument('-r', '--refmol', help="Reference molecule file")
    parser.add_argument('--refmolidx', help="Reference molecule index in file", type=int, default=1)
    parser.add_argument('-c', '--core_smi', help='Core substructure. If not specified - guessed using MCS', default='')

    args = parser.parse_args()
    # Get the reference molecule
    # NOTE(review): the file name is also passed as the format argument here;
    # left unchanged - confirm against default_open_input's contract.
    ref_mol_input, ref_mol_suppl = rdkit_utils.default_open_input(args.refmol, args.refmol)
    counter = 0
    # Bound up front so that an index that never matches (e.g. 0 or negative)
    # is reported as a ValueError instead of a NameError further down.
    ref_mol = None
    # Get the specified reference molecule. Default is the first
    for mol in ref_mol_suppl:
        counter += 1
        if counter == args.refmolidx:
            ref_mol = mol
            break
    ref_mol_input.close()

    if counter < args.refmolidx:
        raise ValueError("Invalid refmolidx. " + str(args.refmolidx) + " was specified but only " + str(counter) + " molecules were present in refmol.")
    if ref_mol is None:
        # Either the index was below 1 or the selected record was unreadable.
        raise ValueError("Invalid refmolidx. No readable molecule found at index " + str(args.refmolidx) + " in refmol.")

    # handle metadata
    source = "constrained_conf_gen.py"
    datasetMetaProps = {"source": source, "description": "Constrained conformer generation using RDKit " + rdBase.rdkitVersion}
    clsMappings = {"EmbedRMS": "java.lang.Float"}
    fieldMetaProps = [{"fieldName": "EmbedRMS", "values": {"source": source, "description": "Embedding RMS value"}}]

    # Get the molecules
    input_file, suppl = rdkit_utils.default_open_input(args.input, args.informat)
    output, WRITER, output_base = rdkit_utils.\
        default_open_output(args.output, "const_conf_gen", args.outformat,
                            valueClassMappings=clsMappings,
                            datasetMetaProps=datasetMetaProps,
                            fieldMetaProps=fieldMetaProps)

    inputs = 0
    totalCount = 0
    totalErrors = 0
    for mol in suppl:
        inputs += 1
        if mol:
            count, errors = generate_conformers(inputs, mol, args.num, ref_mol, WRITER, args.core_smi)
            totalCount += count
            totalErrors += errors

    input_file.close()
    WRITER.close()
    # Close the underlying output stream as well, as the sibling scripts do
    # after default_open_output.
    output.close()

    if totalErrors > 0:
        utils.log("WARNING:", totalErrors, "conformers failed to generate")

    # write metrics
    if args.meta:
        metrics = {'__InputCount__': inputs, '__OutputCount__': totalCount, 'RDKitConstrainedConformer': totalCount}
        if totalErrors > 0:
            metrics['__ErrorCount__'] = totalErrors
        utils.write_metrics(output_base, metrics)
Пример #3
0
def main():
    """Score input molecules against a pickled feature map (FeatureStein).

    Loads the feature map named by ``--feat-map`` into the module-level
    ``fmaps``, opens the input and output, delegates the scoring to
    ``process`` and optionally writes metrics when ``--metrics`` is set.
    """

    # Example usage
    # python -m pipelines.xchem.featurestein_score -i ../../data/mpro/poses.sdf.gz -f mpro-fstein.p -o fstein

    global fmaps

    parser = argparse.ArgumentParser(description='FeatureStein scoring with RDKit')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-f', '--feat-map', help='Feature Map pickle to score with')
    parser.add_argument('--no-gzip', action='store_true', help='Do not compress the output (STDOUT is never compressed')
    parser.add_argument('--metrics', action='store_true', help='Write metrics')

    args = parser.parse_args()
    utils.log("FeatureStein Args: ", args)

    source = "featurestein_score.py"
    datasetMetaProps = {"source": source, "description": "FeatureStein scoring using RDKit " + rdBase.rdkitVersion}

    clsMappings = {}
    fieldMetaProps = []
    clsMappings[field_FeatureSteinQualityScore] = "java.lang.Float"
    clsMappings[field_FeatureSteinQuantityScore] = "java.lang.Float"
    # One entry per field. A single dict literal with repeated "fieldName" /
    # "values" keys keeps only the last pair, which dropped the quality score.
    fieldMetaProps.append({"fieldName": field_FeatureSteinQualityScore,
                           "values": {"source": source, "description": "FeatureStein quality score"}})
    fieldMetaProps.append({"fieldName": field_FeatureSteinQuantityScore,
                           "values": {"source": source, "description": "FeatureStein quantity score"}})

    # SECURITY: pickle.load executes arbitrary code embedded in the file.
    # Only pass feature maps that come from a trusted source.
    with open(args.feat_map, 'rb') as pkl_file:
        fmaps = pickle.load(pkl_file)
    utils.log('FeatureMap has', fmaps.GetNumFeatures(), "features")

    inputs_file, inputs_supplr = rdkit_utils.default_open_input(args.input, args.informat)
    output, writer, output_base = rdkit_utils.default_open_output(args.output,
                        'featurestein', args.outformat,
                        valueClassMappings=clsMappings,
                        datasetMetaProps=datasetMetaProps,
                        fieldMetaProps=fieldMetaProps,
                        compress=not args.no_gzip)

    # this does the processing
    total, success, errors = process(inputs_supplr, writer)

    inputs_file.close()
    writer.flush()
    writer.close()
    output.close()

    if args.metrics:
        utils.write_metrics(output_base, {'__InputCount__': total, '__OutputCount__': success, '__ErrorCount__': errors, 'RDKitFeatureMap': success})
Пример #4
0
def main():
    """Entry point of the 3D clustering script.

    Parses the default I/O arguments, opens input and output with (currently
    empty) metadata, passes the molecule supplier to ``combine_conformers``
    and closes all streams.
    """
    arg_parser = argparse.ArgumentParser(description='RDKit cluster 3D')
    parameter_utils.add_default_io_args(arg_parser)
    args = arg_parser.parse_args()
    utils.log("Cluster_3d Args: ", args)

    source = "cluster_3d.py"
    dataset_props = dict(source=source,
                         description="Cluster 3D using RDKit " + rdBase.rdkitVersion)

    # No value class mappings are declared at present. Candidates that were
    # left disabled in the original script:
    #   RMSToCentroid / EnergyDelta / EnergyAbs        -> java.lang.Float
    #   ConformerNum / ClusterCentroid / ClusterNum /
    #   StructureNum                                   -> java.lang.Integer
    value_classes = {}

    # No field metadata is declared at present. Disabled descriptions:
    #   RMSToCentroid   - RMS distance to the cluster centroid
    #   EnergyDelta     - Energy difference to lowest energy structure
    #   EnergyAbs       - Absolute energy
    #   ConformerNum    - Conformer number
    #   ClusterCentroid - Conformer number of the cluster centroid
    #   ClusterNum      - Cluster number
    #   StructureNum    - Structure number this conformer was generated from
    field_props = []

    opened = rdkit_utils.default_open_input_output(
        args.input, args.informat, args.output,
        'conformers', args.outformat,
        valueClassMappings=value_classes,
        datasetMetaProps=dataset_props,
        fieldMetaProps=field_props)
    source_file, sink, suppl, writer, output_base = opened

    # The return value is not used any further in this function.
    combine_conformers(suppl)

    # The input handle may be falsy, hence the guard.
    if source_file:
        source_file.close()
    writer.flush()
    writer.close()
    sink.close()
Пример #5
0
def main():
    """Score input poses against a set of fragments (XCos).

    Opens the fragment file, the pose input and the output, delegates the
    scoring to ``process`` with the ``--score-threshold`` value and closes
    every stream that was opened.
    """

    # Example usage:
    # python -m pipelines.xchem.xcos -f ../../data/mpro/hits-17.sdf.gz -i ../../data/mpro/poses.sdf.gz  -o xcos

    parser = argparse.ArgumentParser(description='XCos scoring with RDKit')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-f', '--fragments', required=True, help='Fragments to compare')
    parser.add_argument('-ff', '--fragments-format', help='Fragments format')
    parser.add_argument('-t', '--score-threshold', type=float, default=0.4,
                        help='Minimum shape overlay and feature map score required for scoring a bit to a fragment')
    parser.add_argument('--no-gzip', action='store_true', help='Do not compress the output (STDOUT is never compressed')
    # NOTE(review): --metrics is accepted but nothing is written for it here;
    # process() does not hand back counts in this script - confirm intent.
    parser.add_argument('--metrics', action='store_true', help='Write metrics')

    args = parser.parse_args()
    utils.log("XCos Args: ", args)

    source = "xcos.py"
    datasetMetaProps = {"source": source, "description": "XCos scoring using RDKit " + rdBase.rdkitVersion}

    clsMappings = {}
    fieldMetaProps = []

    clsMappings[field_XCosRefMols] = "java.lang.String"
    clsMappings[field_XCosNumHits] = "java.lang.Integer"
    clsMappings[field_XCosScore1] = "java.lang.Float"

    fieldMetaProps.append({"fieldName": field_XCosRefMols, "values": {"source": source, "description": "XCos reference fragments"}})
    fieldMetaProps.append({"fieldName": field_XCosNumHits, "values": {"source": source, "description": "XCos number of hits"}})
    fieldMetaProps.append({"fieldName": field_XCosScore1, "values": {"source": source, "description": "XCos score 1"}})

    frags_input, frags_suppl = rdkit_utils.default_open_input(args.fragments, args.fragments_format)

    inputs_file, inputs_supplr = rdkit_utils.default_open_input(args.input, args.informat)
    output, writer, output_base = rdkit_utils.default_open_output(args.output,
                                                                  'xcos', args.outformat,
                                                                  valueClassMappings=clsMappings,
                                                                  datasetMetaProps=datasetMetaProps,
                                                                  fieldMetaProps=fieldMetaProps,
                                                                  compress=not args.no_gzip)

    # this does the processing
    process(inputs_supplr, frags_suppl, writer, threshold=args.score_threshold)

    # Release every stream, mirroring the shutdown sequence of the sibling
    # scripts (inputs, then flush/close writer, then the raw output).
    if frags_input is not None:
        frags_input.close()
    if inputs_file is not None:
        inputs_file.close()
    writer.flush()
    writer.close()
    output.close()
Пример #6
0
def main():
    """Align input molecules to a query molecule with Open3DAlign.

    The query is aligned to a copy of itself first to obtain the reference
    ("perfect") score.  For each readable input molecule, conformers are
    optionally generated (``--num`` > 0) and ``doO3Dalign`` is called; its
    return values are summed into the output count.  A ``ValueError`` for a
    single molecule is logged and counted, and processing continues.
    """

    parser = argparse.ArgumentParser(description='Open3DAlign with RDKit')
    parser.add_argument('query', help='query molfile')
    parser.add_argument(
        '--qmolidx',
        help="Query molecule index in SD file if not the first",
        type=int,
        default=1)
    parser.add_argument('--crippen',
                        action='store_true',
                        help='Use Crippen (logP) contributions')
    parser.add_argument(
        '-t',
        '--threshold',
        type=float,
        help='score cuttoff relative to alignment of query to itself')
    parser.add_argument(
        '-n',
        '--num',
        default=0,
        type=int,
        help=
        'number of conformers to generate, if None then input structures are assumed to already be 3D'
    )
    parser.add_argument('-a',
                        '--attempts',
                        default=0,
                        type=int,
                        help='number of attempts to generate conformers')
    parser.add_argument('-r',
                        '--rmsd',
                        type=float,
                        default=1.0,
                        help='prune RMSD threshold for excluding conformers')
    parser.add_argument(
        '-e',
        '--emin',
        type=int,
        default=0,
        help=
        'energy minimisation iterations for generated conformers (default of 0 means none)'
    )
    parameter_utils.add_default_io_args(parser)

    args = parser.parse_args()
    utils.log("o3dAlign Args: ", args)

    # TODO - handle molecules with multiple fragments
    # TODO - allow to specify threshold as fraction of perfect score?

    qmol = rdkit_utils.read_single_molecule(args.query, index=args.qmolidx)
    qmol = Chem.RemoveHs(qmol)
    qmol2 = Chem.Mol(qmol)

    # NOTE(review): the metadata source says "conformers.py" although this is
    # the Open3DAlign script - left unchanged, confirm whether it is intended.
    source = "conformers.py"
    datasetMetaProps = {
        "source": source,
        "description": "Open3DAlign using RDKit " + rdBase.rdkitVersion
    }
    clsMappings = {"O3DAScore": "java.lang.Float"}
    fieldMetaProps = [{
        "fieldName": "O3DAScore",
        "values": {
            "source": source,
            "description": "Open3DAlign alignment score"
        }
    }]
    if args.num > 0:
        # we generate the conformers so will add energy info
        clsMappings["EnergyDelta"] = "java.lang.Float"
        clsMappings["EnergyAbs"] = "java.lang.Float"
        fieldMetaProps.append({
            "fieldName": "EnergyDelta",
            "values": {
                "source": source,
                "description": "Energy difference to lowest energy conformer"
            }
        })
        fieldMetaProps.append({
            "fieldName": "EnergyAbs",
            "values": {
                "source": source,
                "description": "Absolute energy"
            }
        })

    input_file, output, suppl, writer, output_base = rdkit_utils.\
        default_open_input_output(args.input, args.informat, args.output,
                                  'o3dAlign', args.outformat,
                                  valueClassMappings=clsMappings,
                                  datasetMetaProps=datasetMetaProps,
                                  fieldMetaProps=fieldMetaProps)

    if args.crippen:
        pyO3A = rdMolAlign.GetCrippenO3A(qmol2, qmol)
    else:
        pyO3A = rdMolAlign.GetO3A(qmol2, qmol)

    perfect_align = pyO3A.Align()
    perfect_score = pyO3A.Score()
    utils.log('Perfect score:', perfect_align, perfect_score,
              Chem.MolToSmiles(qmol, isomericSmiles=True), qmol.GetNumAtoms())

    i = 0
    count = 0
    total = 0
    errors = 0
    for mol in suppl:
        if mol is None:
            i += 1
            continue
        try:
            if args.num > 0:
                mol.RemoveAllConformers()
                # Pass the user's --emin through; it was previously parsed but
                # replaced by a hard-coded 0. The default of 0 keeps the old
                # behaviour when the option is not given.
                conformerProps, minEnergy = conformers.process_mol_conformers(
                    mol, i, args.num, args.attempts, args.rmsd, None, None,
                    args.emin)
                mol = Chem.RemoveHs(mol)
                count += doO3Dalign(i,
                                    mol,
                                    qmol,
                                    args.crippen,
                                    args.threshold,
                                    perfect_score,
                                    writer,
                                    conformerProps=conformerProps,
                                    minEnergy=minEnergy)
            else:
                mol = Chem.RemoveHs(mol)
                count += doO3Dalign(i, mol, qmol, args.crippen, args.threshold,
                                    perfect_score, writer)
            total += mol.GetNumConformers()
        except ValueError as e:
            errors += 1
            # str(e) instead of e.message: the attribute does not exist on
            # Python 3 exceptions and would raise inside this handler.
            utils.log("Molecule", i, "failed to align:", str(e))
        i += 1

    input_file.close()
    writer.flush()
    writer.close()
    output.close()

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': i,
                '__OutputCount__': count,
                '__ErrorCount__': errors,
                'RDKitO3DAlign': total
            })
Пример #7
0
def main():
    """Entry point of the SuCOS scoring script.

    Reads one target molecule, declares the score fields written to the
    output (Tanimoto or Protrude variant depending on ``--tanimoto``), hands
    the input supplier to ``process`` and writes metrics when ``--meta`` is
    set.
    """
    cli = argparse.ArgumentParser(description='SuCOS with RDKit')
    parameter_utils.add_default_io_args(cli)
    cli.add_argument('-r', '--refmol',
                     help='Molecule to compare against in Molfile (.mol) or SDF (.sdf) format')
    cli.add_argument('-tm', '--target',
                     help='Target molecule to compare against')
    cli.add_argument('-tf', '--target-format',
                     help='Target molecule format')
    cli.add_argument('-ti', '--targetidx', type=int, default=1,
                     help='Target molecule index in file if not the first')
    cli.add_argument('--tanimoto', action='store_true',
                     help='Include Tanimoto distance in score')
    cli.add_argument('--score_mode', choices=['all', 'closest', 'best'],
                     help="choose the scoring mode for the feature map, default is 'all'.")

    args = cli.parse_args()
    utils.log("SuCOS Args: ", args)

    score_mode = parse_score_mode(args.score_mode)

    target_mol = rdkit_utils.read_single_molecule(
        args.target, index=args.targetidx, format=args.target_format)
    utils.log("Target mol has", str(target_mol.GetNumHeavyAtoms()),
              "heavy atoms")

    source = "sucos.py"
    dataset_props = {
        "source": source,
        "description": "SuCOS using RDKit " + rdBase.rdkitVersion,
    }

    value_classes = {}
    field_props = []

    def register(field, description):
        # Declare a float-valued output field together with its description.
        value_classes[field] = "java.lang.Float"
        field_props.append({
            "fieldName": field,
            "values": {"source": source, "description": description},
        })

    register(field_SuCOS_Score, "SuCOS score")
    register(field_SuCOS_FMScore, "SuCOS Feature Map score")
    # Exactly one of the two shape-related scores is declared.
    if args.tanimoto:
        register(field_SuCOS_TaniScore, "SuCOS Tanimoto score")
    else:
        register(field_SuCOS_ProtrudeScore, "SuCOS Protrude score")

    opened = rdkit_utils.default_open_input_output(
        args.input, args.informat, args.output,
        'sucos', args.outformat,
        valueClassMappings=value_classes,
        datasetMetaProps=dataset_props,
        fieldMetaProps=field_props)
    inputs_file, output, inputs_supplr, writer, output_base = opened

    # this does the processing
    count, total, errors = process(
        target_mol, inputs_supplr, writer,
        tani=args.tanimoto, score_mode=score_mode)

    inputs_file.close()
    writer.flush()
    writer.close()
    output.close()

    if not args.meta:
        return
    run_metrics = {
        '__InputCount__': count,
        '__OutputCount__': total,
        '__ErrorCount__': errors,
        'RDKitSuCOS': total,
    }
    utils.write_metrics(output_base, run_metrics)
Пример #8
0
def main():
    """Score each input molecule against a single reference with SuCOS.

    Reads the reference via ``--target`` / ``--targetidx``, calls
    ``get_SucosScore`` for every readable input molecule and writes the
    molecule out.  A ``ValueError`` for a single molecule is logged and
    counted, and processing continues.  Metrics are written when ``--meta``
    is set.
    """

    parser = argparse.ArgumentParser(description='SuCOS with RDKit')
    parser.add_argument('--target', help='molecule to compare against')
    parser.add_argument(
        '--targetidx',
        help="Target molecule index in SD file if not the first",
        type=int,
        default=1)
    parameter_utils.add_default_io_args(parser)

    args = parser.parse_args()
    utils.log("SuCOS Args: ", args)

    # TODO - handle molecules with multiple fragments

    ref_mol = rdkit_utils.read_single_molecule(args.target,
                                               index=args.targetidx)
    utils.log("Reference mol has", str(ref_mol.GetNumHeavyAtoms()),
              "heavy atoms")

    source = "sucos.py"
    datasetMetaProps = {
        "source": source,
        "description": "SuCOS using RDKit " + rdBase.rdkitVersion
    }
    # Key the type mapping by the same constant used for the field metadata
    # and for get_SucosScore, so the three cannot drift apart.
    clsMappings = {field_SuCOS_Score: "java.lang.Float"}
    fieldMetaProps = [{
        "fieldName": field_SuCOS_Score,
        "values": {
            "source": source,
            "description": "SuCOS score"
        }
    }]

    input_file, output, suppl, writer, output_base = rdkit_utils.\
        default_open_input_output(args.input, args.informat, args.output,
                                  'sucos', args.outformat,
                                  valueClassMappings=clsMappings,
                                  datasetMetaProps=datasetMetaProps,
                                  fieldMetaProps=fieldMetaProps)

    count = 0
    total = 0
    errors = 0
    for mol in suppl:
        count += 1
        if mol is None:
            continue
        #utils.log("Mol has", str(mol.GetNumHeavyAtoms()), "heavy atoms")
        try:
            fm_score = get_SucosScore(ref_mol, mol, field_SuCOS_Score)
            utils.log("Score:", str(fm_score))
            writer.write(mol)
            total += 1
        except ValueError as e:
            errors += 1
            # str(e) instead of e.message: the attribute does not exist on
            # Python 3 exceptions and would raise inside this handler.
            utils.log("Molecule", count, "failed to score:", str(e))

    input_file.close()
    writer.flush()
    writer.close()
    output.close()

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': count,
                '__OutputCount__': total,
                '__ErrorCount__': errors,
                'RDKitSuCOS': total
            })
Пример #9
0
def main():
    """Similarity-screen input molecules against a query structure.

    The query comes from ``--qsmiles`` or ``--qmolfile``.  Each readable input
    molecule is optionally reduced to one fragment, passed through
    ``filter.filter`` with the heavy-atom and weight limits, and written out
    with its similarity when that lies within ``[--simmin, --simmax]``.
    Metrics are written when ``--meta`` is set.

    Raises:
        ValueError: if no query is given or the query cannot be parsed.
    """

    ### command line args definitions ########################################

    parser = argparse.ArgumentParser(description='RDKit screen')
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--qsmiles',
        help='query structure as smiles (incompatible with -qmolfile arg)')
    group.add_argument(
        '--qmolfile',
        help=
        'query structure as filename in molfile format (incompatible with -qsmiles arg)'
    )
    parser.add_argument('--simmin',
                        type=float,
                        default=0.7,
                        help='similarity lower cutoff (1.0 means identical)')
    parser.add_argument('--simmax',
                        type=float,
                        default=1.0,
                        help='similarity upper cutoff (1.0 means identical)')
    parser.add_argument('-d',
                        '--descriptor',
                        type=str.lower,
                        choices=list(descriptors.keys()),
                        default='rdkit',
                        help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument('-m',
                        '--metric',
                        type=str.lower,
                        choices=list(metrics.keys()),
                        default='tanimoto',
                        help='similarity metric (default tanimoto)')
    parser.add_argument(
        '-f',
        '--fragment',
        choices=['hac', 'mw'],
        help=
        'Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight )'
    )
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Quiet mode')

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    # argparse has already lower-cased both values (type=str.lower) and
    # checked them against the dictionary keys (choices=...).
    descriptor = descriptors[args.descriptor]
    metric = metrics[args.metric]

    if args.qsmiles:
        query_rdkitmol = Chem.MolFromSmiles(args.qsmiles)
    elif args.qmolfile:
        query_rdkitmol = Chem.MolFromMolFile(args.qmolfile)
    else:
        raise ValueError('No query structure specified')

    # Both RDKit readers return None for input they cannot parse; fail with a
    # clear message instead of passing None to the descriptor function.
    if query_rdkitmol is None:
        raise ValueError('Query structure could not be parsed')

    query_fp = descriptor(query_rdkitmol)

    input_file, output, suppl, writer, output_base = rdkit_utils.default_open_input_output(
        args.input,
        args.informat,
        args.output,
        'screen',
        args.outformat,
        thinOutput=args.thin)

    i = 0
    count = 0
    for mol in suppl:
        i += 1
        if mol is None:
            continue
        if args.fragment:
            mol = mol_utils.fragment(mol, args.fragment, quiet=args.quiet)
        if not filter.filter(mol,
                             minHac=args.hacmin,
                             maxHac=args.hacmax,
                             minMw=args.mwmin,
                             maxMw=args.mwmax,
                             quiet=args.quiet):
            continue
        target_fp = descriptor(mol)
        sim = metric(query_fp, target_fp)

        if args.simmin <= sim <= args.simmax:
            count += 1
            if not args.quiet:
                utils.log(i, sim)
            mol.SetDoubleProp(field_Similarity, sim)
            writer.write(mol)

    utils.log("Found", count, "similar molecules")

    writer.flush()
    writer.close()
    input_file.close()
    output.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitScreen': i
        })
Пример #10
0
def main():
    """Generate conformers for each input molecule and write them as SD records.

    Input is either a single ``--smiles`` string or the default input
    file/stream.  For every readable molecule hydrogens are added,
    ``process_mol_conformers`` is run, hydrogens are removed again and the
    result is written with ``write_conformers`` to ``--outfile`` (stdout by
    default).  Metrics are written when ``--meta`` is set and an output base
    name is available.
    """
    ### command line args definitions ########################################

    parser = argparse.ArgumentParser(description='RDKit conformers')
    parser.add_argument('-n',
                        '--num',
                        type=int,
                        default=1,
                        help='number of conformers to generate')
    parser.add_argument('-a',
                        '--attempts',
                        type=int,
                        default=0,
                        help='number of attempts')
    parser.add_argument('-r',
                        '--rmsd',
                        type=float,
                        default=1.0,
                        help='prune RMSD threshold')
    parser.add_argument(
        '-c',
        '--cluster',
        type=str.lower,
        choices=['none', 'rmsd', 'tfd'],
        help='Cluster method (RMSD or TFD). If None then no clustering')
    parser.add_argument(
        '-t',
        '--threshold',
        type=float,
        help='cluster threshold (default of 2.0 for RMSD and 0.3 for TFD)')
    parser.add_argument(
        '-e',
        '--emin',
        type=int,
        default=0,
        help='energy minimisation iterations (default of 0 means none)')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument(
        '-s',
        '--smiles',
        help=
        'input structure as smiles (incompatible with using files or stdin for input)'
    )
    parser.add_argument('-f',
                        '--outfile',
                        type=argparse.FileType('w+'),
                        default=sys.stdout,
                        help="path to the result file, default it sdtout")
    args = parser.parse_args()

    # Fill in the method-specific default threshold when none was given.
    if not args.threshold:
        if args.cluster == 'tfd':
            args.threshold = 0.3
        elif args.cluster == 'rmsd':
            args.threshold = 2.0

    utils.log("Conformers Args: ", args)

    source = "conformers.py"
    datasetMetaProps = {
        "source": source,
        "description":
        "Conformer generation using RDKit " + rdBase.rdkitVersion
    }
    clsMappings = {
        "RMSToCentroid": "java.lang.Float",
        "EnergyDelta": "java.lang.Float",
        "EnergyAbs": "java.lang.Float",
        "ConformerNum": "java.lang.Integer",
        "ClusterCentroid": "java.lang.Integer",
        "ClusterNum": "java.lang.Integer",
        "StructureNum": "java.lang.Integer"
    }
    fieldMetaProps = [{
        "fieldName": "RMSToCentroid",
        "values": {
            "source": source,
            "description": "RMS distance to the cluster centroid"
        }
    }, {
        "fieldName": "EnergyDelta",
        "values": {
            "source": source,
            "description": "Energy difference to lowest energy structure"
        }
    }, {
        "fieldName": "EnergyAbs",
        "values": {
            "source": source,
            "description": "Absolute energy"
        }
    }, {
        "fieldName": "ConformerNum",
        "values": {
            "source": source,
            "description": "Conformer number"
        }
    }, {
        "fieldName": "ClusterCentroid",
        "values": {
            "source": source,
            "description": "Conformer number of the cluster centroid"
        }
    }, {
        "fieldName": "ClusterNum",
        "values": {
            "source": source,
            "description": "Cluster number"
        }
    }, {
        "fieldName": "StructureNum",
        "values": {
            "source": source,
            "description": "Structure number this conformer was generated from"
        }
    }]

    if args.smiles:
        mol = Chem.MolFromSmiles(args.smiles)
        suppl = [mol]
        input_file = None
        # Nothing is opened in SMILES mode, so no output base name exists.
        # Bind the name anyway so later code cannot hit a NameError.
        output_base = None
    else:
        # NOTE(review): the writer (and output) opened here are superseded by
        # the ThickSDWriter below and are never closed - confirm whether the
        # default output should be opened at all.
        input_file, output, suppl, writer, output_base = rdkit_utils. \
            default_open_input_output(args.input, args.informat, args.output,
                                      'conformers', args.outformat,
                                      valueClassMappings=clsMappings,
                                      datasetMetaProps=datasetMetaProps,
                                      fieldMetaProps=fieldMetaProps)

    # (A disabled block that wrote a '<output_base>_types.txt' file describing
    # the field types used to live here.)

    i = 0
    count = 0
    writer = rdkit_utils.ThickSDWriter(args.outfile)
    for mol in suppl:
        # Unreadable molecules are skipped and not counted in i.
        if mol is None:
            continue
        m = Chem.AddHs(mol)
        conformerPropsDict, minEnergy = process_mol_conformers(
            m, i, args.num, args.attempts, args.rmsd, args.cluster,
            args.threshold, args.emin)
        m = Chem.RemoveHs(m)
        write_conformers(m, i, conformerPropsDict, minEnergy, writer)
        count = count + m.GetNumConformers()
        i += 1

    if input_file:
        input_file.close()
    writer.flush()
    writer.close()

    if args.meta:
        if args.smiles:
            # Previously this path raised NameError on the unbound output_base
            # after all conformers had already been written.
            utils.log("WARNING: metrics not written, no output base name in SMILES mode")
        else:
            utils.write_metrics(output_base, {
                '__InputCount__': i,
                '__OutputCount__': count,
                'RDKitConformer': count
            })
Пример #11
0
def main():
    """Command-line entry point for PLI scoring of docked structures.

    Parses the command line, opens the input supplier and the output writer,
    publishes the PDB path, the writer and (when given) the score threshold
    through the module-level globals ``PDB_PATH``, ``WRITER`` and
    ``THRESHOLD``, and then maps ``run_dock`` over the input molecules using a
    thread pool. When ``args.meta`` is set a metrics file is written from the
    module-level ``COUNTER`` and ``SUCCESS`` values.
    """
    global PDB_PATH, WRITER, THRESHOLD
    parser = argparse.ArgumentParser(
        description='PLI scoring - Docking calculation.')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument(
        '--no-gzip',
        action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    parser.add_argument('-pdb', '--pdb_file', help="PDB file for scoring")
    parser.add_argument('-t',
                        '--threshold',
                        type=float,
                        help="The maximum score to allow",
                        default=None)
    parser.add_argument(
        '--threads',
        type=int,
        help="Number of threads to used. Default is the number of cores",
        default=None)
    parser.add_argument('--thin', action='store_true', help='Thin output mode')

    args = parser.parse_args()
    utils.log("PLI Args: ", args)

    # Open up the input file
    input, suppl = rdkit_utils.default_open_input(args.input, args.informat)
    # Open the output file
    s_now = datetime.datetime.utcnow().strftime("%d-%b-%Y %H:%M:%S UTC")
    source = 'pipelines/docking/plip.py'
    output, WRITER, output_base = \
        rdkit_utils.default_open_output(args.output, "plip", args.outformat,
                                        compress=not args.no_gzip,
                                        thinOutput=args.thin,
                                        valueClassMappings={'pliff_iscore': 'java.lang.Float',
                                                            'pliff_cscore': 'java.lang.Float',
                                                            'pliff_nb_score': 'java.lang.Float',
                                                            'pliff_gscore': 'java.lang.Float',
                                                            'pliff_score': 'java.lang.Float',
                                                            'pliff_tscore': 'java.lang.Float'},
                                        datasetMetaProps={'created': s_now,
                                                          'source': source,
                                                          'description': 'PLI scoring of docked structures'}
                                        )

    PDB_PATH = args.pdb_file
    # Compare against None (not truthiness) so an explicit threshold of 0 is
    # honoured rather than silently ignored.
    if args.threshold is not None:
        THRESHOLD = args.threshold

    # Iterate over the molecules
    # WARNING - if using parallel processing the order of molecules is not preserved. Set args.threads to 1 to ensure this.
    if args.threads is not None:
        num_threads = args.threads
    else:
        num_threads = multiprocessing.cpu_count()
    pool = ThreadPool(num_threads)
    try:
        pool.map(run_dock, suppl)
    finally:
        # Always release the worker threads, even if a worker raised.
        pool.close()
        pool.join()

    # Close the writer first, then the underlying handles (same order as the
    # other entry points in this collection).
    WRITER.close()
    if input:
        input.close()
    if output:
        output.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': COUNTER,
            '__OutputCount__': SUCCESS,
            'PLI': COUNTER
        })
def main():
    """Command-line entry point for the RDKit molecule filter.

    Reads molecules from the input, optionally reduces each one to a single
    fragment, drops those rejected by this module's ``filter`` predicate, and
    writes the survivors. Before writing, fields can be renamed, deleted and
    transformed with a regular expression. Output may be split into numbered
    chunk files. When ``args.meta`` is set a metrics file is written.

    Raises:
        ValueError: if a ``--rename`` or ``--transform`` argument is malformed.
    """
    ### command line args defintions #########################################

    parser = argparse.ArgumentParser(description='RDKit filter')
    parser.add_argument(
        '-f',
        '--fragment',
        choices=['hac', 'mw'],
        help=
        'Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)'
    )
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    parser.add_argument('--rotbmin',
                        type=float,
                        help='Min rotatable bond count')
    parser.add_argument('--rotbmax',
                        type=float,
                        help='Max rotatable bond count')
    parser.add_argument('--logpmin', type=float, help='Min logP')
    parser.add_argument('--logpmax', type=float, help='Max logP')
    parser.add_argument('-l',
                        '--limit',
                        type=int,
                        help='Limit output to this many records')
    parser.add_argument(
        '-c',
        '--chunksize',
        type=int,
        help=
        'Split output into chunks of size c. Output will always be files. Names like filter1.sdf.gz, filter2.sdf.gz ...'
    )
    parser.add_argument(
        '-d',
        '--digits',
        type=int,
        default=0,
        help=
        'When splitting zero pad the file name to this many digits so that they are in sorted order. Names like filter001.sdf.gz, filter002.sdf.gz ...'
    )
    parser.add_argument('-r',
                        '--rename',
                        action='append',
                        help='Rename field (fromname:toname)')
    parser.add_argument(
        '-t',
        '--transform',
        action='append',
        help='Transform field value(fieldname:regex:type). ' +
        'Regex is in the form of /regex/substitution/ (the 3 slashes are required). '
        +
        'Type is of int, float, boolean or string. The type is optional and if not specified then string is assumed. '
        +
        'Transformation occurs after field renaming so specify the new name.')
    parser.add_argument('--delete', action='append', help='Delete field')
    parser.add_argument(
        '--no-gzip',
        action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    # WARNING: thin output is not appropriate when using --fragment
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument(
        '-q',
        '--quiet',
        action='store_true',
        help='Quiet mode - suppress reporting reason for filtering')
    parameter_utils.add_default_io_args(parser)
    args = parser.parse_args()
    utils.log("Filter Args: ", args)

    # Source field name -> new field name. A value of None means "delete".
    field_renames = {}
    if args.rename:
        for rename_spec in args.rename:
            parts = rename_spec.split(':')
            if len(parts) != 2:
                raise ValueError('Invalid field rename argument:', rename_spec)
            field_renames[parts[0]] = parts[1]
    if args.delete:
        for f in args.delete:
            field_renames[f] = None

    # Per-field transform definition, all three dicts keyed by field name.
    field_regexes = {}
    field_replacements = {}
    field_types = {}
    if args.transform:
        for transform_spec in args.transform:
            parts = transform_spec.split(':')
            if len(parts) < 2 or len(parts) > 3:
                raise ValueError('Invalid field transform argument:',
                                 transform_spec)
            # "/regex/substitution/" splits into ['', regex, substitution, ''].
            terms = parts[1].split('/')
            utils.log("|".join(terms) + str(len(terms)))
            if len(terms) < 3:
                # Report a bad expression as a usage error, not an IndexError.
                raise ValueError('Invalid field transform argument:',
                                 transform_spec)
            field_regexes[parts[0]] = re.compile(terms[1])
            field_replacements[parts[0]] = terms[2]
            if len(parts) == 3:
                type_name = parts[2]
            else:
                type_name = 'string'
            field_types[parts[0]] = type_name
            utils.log("Created transform of " + terms[1] + " to " + terms[2] +
                      " using type of " + type_name)

    input, suppl = rdkit_utils.default_open_input(args.input, args.informat)

    chunkNum = 1
    if args.chunksize:
        if args.output:
            output_base = args.output
        else:
            output_base = 'filter'
        output_base_chunk = output_base + str(chunkNum).zfill(args.digits)
        output, writer, output_base_chunk = rdkit_utils.default_open_output(
            output_base_chunk,
            output_base_chunk,
            args.outformat,
            thinOutput=args.thin,
            compress=not args.no_gzip)
    else:
        output, writer, output_base_chunk = rdkit_utils.default_open_output(
            args.output,
            "filter",
            args.outformat,
            thinOutput=args.thin,
            compress=not args.no_gzip)
        output_base = output_base_chunk

    utils.log("Writing to " + output_base_chunk)

    # String values (after strip/lower) that a 'boolean' transform treats as
    # False; any other non-empty value is True.
    false_strings = ('', 'false', 'f', 'no', 'n', '0')

    i = 0  # molecules read from the input
    count = 0  # molecules written to the output
    for mol in suppl:
        if args.limit and count >= args.limit:
            break
        i += 1
        if mol is None: continue
        if args.fragment:
            mol = mol_utils.fragment(mol, args.fragment, quiet=args.quiet)
        # NOTE: `filter` is called with keyword arguments, which the builtin
        # does not accept - presumably a same-named function in this module.
        if not filter(mol,
                      minHac=args.hacmin,
                      maxHac=args.hacmax,
                      minMw=args.mwmin,
                      maxMw=args.mwmax,
                      minRotb=args.rotbmin,
                      maxRotb=args.rotbmax,
                      minLogp=args.logpmin,
                      maxLogp=args.logpmax,
                      quiet=args.quiet):
            continue
        if args.chunksize:
            if count > 0 and count % args.chunksize == 0:
                # new chunk, so create new writer
                writer.close()
                output.close()
                chunkNum += 1
                output_base_chunk = output_base + str(chunkNum).zfill(
                    args.digits)
                utils.log("Writing to " + output_base_chunk)
                output, writer, output_base_chunk = rdkit_utils.default_open_output(
                    output_base_chunk,
                    output_base_chunk,
                    args.outformat,
                    thinOutput=args.thin,
                    compress=not args.no_gzip)

        # Renames and deletions are applied before the transforms.
        for from_name in field_renames:
            to_name = field_renames[from_name]
            if mol.HasProp(from_name):
                val = mol.GetProp(from_name)
                mol.ClearProp(from_name)
                if to_name:
                    mol.SetProp(to_name, val)

        for fieldname in field_regexes:
            # GetProp raises KeyError for a missing property instead of
            # returning None, so test for presence first.
            if not mol.HasProp(fieldname):
                continue
            original = mol.GetProp(fieldname)
            regex = field_regexes[fieldname]
            q = regex.sub(field_replacements[fieldname], original)
            type_name = field_types[fieldname]
            if type_name == 'int':
                mol.SetIntProp(fieldname, int(q))
            elif type_name == 'float':
                mol.SetDoubleProp(fieldname, float(q))
            elif type_name == 'boolean':
                # bool("false") is True, so interpret the text explicitly.
                mol.SetBoolProp(fieldname,
                                q.strip().lower() not in false_strings)
            else:
                mol.SetProp(fieldname, q)

        count += 1
        writer.write(mol)

    utils.log("Filtered", i, "down to", count, "molecules")
    if args.chunksize:
        utils.log("Wrote", chunkNum, "chunks")
        if (args.digits > 0 and len(str(chunkNum)) > args.digits):
            utils.log(
                "WARNING: not enough digits specified for the number of chunks"
            )

    writer.flush()
    writer.close()
    input.close()
    output.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitFilter': i
        })
Пример #13
0
def main():
    """Command-line entry point for the RDKit standardizer.

    Parses the arguments, opens the input supplier and output writer, passes
    every readable molecule through ``standardize`` and writes the result.
    Unreadable records (``None``) are counted as errors and skipped. When
    ``args.meta`` is set the input, output and error counts are written as
    metrics.
    """

    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='RDKit Standardize')
    parser.add_argument(
        '--fragment-method',
        choices=['hac', 'mw'],
        help='Approach to find biggest fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)')
    parser.add_argument('--neutralize', action='store_true',
                        help='Neutralize the molecule')

    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='Quiet mode')
    parser.add_argument('--thin', action='store_true', help='Thin output mode')

    args = parser.parse_args()
    utils.log("Standardize Args: ", args)

    # Dataset-level metadata; no per-field metadata or class mappings here.
    dataset_meta = {
        "source": "standardize.py",
        "description": "Standardize using RDKit " + rdBase.rdkitVersion
    }

    opened = rdkit_utils.default_open_input_output(
        args.input, args.informat, args.output,
        'standardize', args.outformat,
        thinOutput=False, valueClassMappings={},
        datasetMetaProps=dataset_meta,
        fieldMetaProps=[])
    input, output, suppl, writer, output_base = opened

    num_read = 0
    num_written = 0
    num_failed = 0
    for mol in suppl:
        num_read += 1
        if mol is None:
            # The supplier could not produce a molecule for this record.
            num_failed += 1
        else:
            writer.write(
                standardize(mol, args.neutralize, args.fragment_method))
            num_written += 1

    input.close()
    writer.flush()
    writer.close()
    output.close()

    if not args.meta:
        return

    metrics = {
        '__InputCount__': num_read,
        '__OutputCount__': num_written,
        '__ErrorCount__': num_failed,
        'RDKitStandardize': num_written
    }
    utils.write_metrics(output_base, metrics)
Пример #14
0
def main():
    """Command-line entry point for calculating protein-ligand interactions.

    Parses the arguments, collects the requested "key" interactions per
    interaction type, builds the output metadata (one field description per
    interaction type), opens the input and output, and delegates the work to
    ``process``. When ``--metrics`` is given a metrics file is written.
    """
    # Example usage
    # python -m pipelines.xchem.calc_interactions -p ../../data/mpro/Mpro-x0387_0.pdb -i ../../data/mpro/hits-17.sdf.gz -o output

    parser = argparse.ArgumentParser(description='Calculate interactions')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-p',
                        '--protein',
                        nargs='*',
                        help="File with protein (PDB or MOL2 format")
    # NOTE reading mol2 format seems to be problematical.
    parser.add_argument('-pf',
                        '--protein-format',
                        choices=['pdb', 'mol2'],
                        help="Protein file format")
    parser.add_argument('--strict',
                        action='store_true',
                        help='Strict filtering')
    parser.add_argument(
        '--exact-protein',
        action='store_true',
        help='Exact matching of hydrogens and charges for protein')
    parser.add_argument(
        '--exact-ligand',
        action='store_true',
        help='Exact matching of hydrogens and charges for ligand')
    parser.add_argument('--keep-hs-protein',
                        action='store_true',
                        help='Keep hydrogens on the protein')
    parser.add_argument('--keep-hs-ligand',
                        action='store_true',
                        help='Keep hydrogens on the ligand')
    parser.add_argument('--key-hbond',
                        nargs='*',
                        help='List of canonical H-bond interactions to count')
    parser.add_argument(
        '--key-hydrophobic',
        nargs='*',
        help='List of canonical hydrophobic interactions to count')
    parser.add_argument(
        '--key-salt-bridge',
        nargs='*',
        help='List of canonical salt bridge interactions to count')
    parser.add_argument(
        '--key-pi-stacking',
        nargs='*',
        help='List of canonical pi stacking interactions to count')
    parser.add_argument(
        '--key-pi-cation',
        nargs='*',
        help='List of canonical pi cation interactions to count')
    parser.add_argument(
        '--key-halogen',
        nargs='*',
        help='List of canonical halogen bond interactions to count')
    parser.add_argument(
        '--rfscores',
        nargs='*',
        help="Pickle(s) for RFScore model e.g. RFScore_v1_pdbbind2016.pickle")
    parser.add_argument(
        '--nnscores',
        nargs='*',
        help="Pickle(s) for NNScore model e.g. NNScore_pdbbind2016.pickle")
    parser.add_argument(
        '--plecscores',
        nargs='*',
        help=
        "Pickle(s) for PLECScore model e.g. PLEClinear_p5_l1_pdbbind2016_s65536.pickle"
    )
    parser.add_argument('-r', '--report-file', help="File for the report")
    parser.add_argument('-c',
                        '--compare',
                        help="Compare interactions with this report")

    parser.add_argument(
        '--no-gzip',
        action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    parser.add_argument('--metrics', action='store_true', help='Write metrics')

    args = parser.parse_args()
    utils.log("Calculate interactions Args: ", args)

    # Interaction type -> list of key interactions; only types for which the
    # user supplied a non-empty list are included.
    key_inters = {}
    requested_keys = (
        (interactions.I_TYPE_HBOND, args.key_hbond),
        (interactions.I_TYPE_HYDROPHOBIC, args.key_hydrophobic),
        (interactions.I_TYPE_SALT_BRIDGE, args.key_salt_bridge),
        (interactions.I_TYPE_PI_STACKING, args.key_pi_stacking),
        (interactions.I_TYPE_PI_CATION, args.key_pi_cation),
        (interactions.I_TYPE_HALOGEN, args.key_halogen),
    )
    for interaction_type, keys in requested_keys:
        if keys:
            key_inters[interaction_type] = keys

    source = "calc_interactions.py using ODDT"
    datasetMetaProps = {
        "source": source,
        "description": "Calculate interactions using ODDT"
    }

    clsMappings = {}
    fieldMetaProps = []
    interaction_fields = (
        (interactions.I_NAME_HBOND, "Hydrogen bond interactions"),
        (interactions.I_NAME_HALOGEN, "Halogen bond interactions"),
        (interactions.I_NAME_HYDROPHOBIC, "Hydrophobic interactions"),
        (interactions.I_NAME_SALT_BRIDGE, "Salt bridge interactions"),
        (interactions.I_NAME_PI_STACKING, "Pi stacking interactions"),
        (interactions.I_NAME_PI_CATION, "Pi cation interactions"),
    )
    for field_name, description in interaction_fields:
        clsMappings[field_name] = "java.lang.String"
        # One metadata entry per field. A single dict literal with repeated
        # "fieldName"/"values" keys would keep only the last pair.
        fieldMetaProps.append({
            "fieldName": field_name,
            "values": {
                "source": source,
                "description": description
            }
        })
    clsMappings['NumTotalInteractions'] = "java.lang.Integer"
    clsMappings['NumKeyInteractions'] = "java.lang.Integer"
    clsMappings['KeyInteractions'] = "java.lang.String"

    # NOTE(review): the supplier opened here is not handed to process(),
    # which receives args.input instead - confirm that this is intended.
    inputs_file, inputs_supplr = rdkit_utils.default_open_input(
        args.input, args.informat)
    output, writer, output_base = rdkit_utils.default_open_output(
        args.output,
        'calc_interactions',
        args.outformat,
        valueClassMappings=clsMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps,
        compress=not args.no_gzip)

    # this does the processing
    count, errors = process(args.protein,
                            args.input,
                            writer,
                            key_inters,
                            protein_format=args.protein_format,
                            filter_strict=args.strict,
                            exact_protein=args.exact_protein,
                            exact_ligand=args.exact_ligand,
                            keep_hs_protein=args.keep_hs_protein,
                            keep_hs_ligand=args.keep_hs_ligand,
                            report_file=args.report_file,
                            compare_file=args.compare,
                            rfscores=args.rfscores,
                            nnscores=args.nnscores,
                            plecscores=args.plecscores)
    utils.log('Processing complete.', count, 'records processed.', errors,
              'errors')

    inputs_file.close()
    writer.flush()
    writer.close()
    output.close()

    if args.metrics:
        # The input count previously referenced an undefined name, which
        # raised NameError. `count` is what the log line above reports as
        # "records processed".
        # NOTE(review): confirm whether errors should be added to the input
        # count or subtracted from the output count.
        utils.write_metrics(
            output_base, {
                '__InputCount__': count,
                '__OutputCount__': count,
                '__ErrorCount__': errors,
                'ODDTInteraction': count
            })
Пример #15
0
def main():
    """Command-line entry point for the RDKit reaction maker.

    Parses the arguments, opens the input molecules, the reagent library and
    the output writer, then calls ``perform_reaction`` on the poised filter
    for every readable input molecule, carrying a running output count
    through the calls. When ``args.meta`` is set a metrics file is written.

    Raises:
        ValueError: if ``--multi`` is requested without an output location.
    """
    ### command line args defintions #########################################

    ### Define the reactions available
    poised_filter = True
    if poised_filter:
        from poised_filter import Filter
        filter_to_use = Filter()

    parser = argparse.ArgumentParser(description='RDKit rxn process')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Quiet mode')
    parser.add_argument('-m',
                        '--multi',
                        action='store_true',
                        help='Output one file for each reaction')
    parser.add_argument('-r',
                        '--reaction',
                        choices=filter_to_use.poised_reactions.keys(),
                        help='Name of reaction to be run')
    parser.add_argument('-rl',
                        '--reagent_lib',
                        help="Input SD file, if not defined the STDIN is used")
    parser.add_argument(
        '-rlf',
        '--reagent_lib_format',
        choices=['sdf', 'json'],
        help="Input format. When using STDIN this must be specified.")

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    if not args.output and args.multi:
        raise ValueError(
            "Must specify output location when writing individual result files"
        )

    input, suppl = rdkit_utils.default_open_input(args.input, args.informat)
    reagent_input, reagent_suppl = rdkit_utils.default_open_input(
        args.reagent_lib, args.reagent_lib_format)
    output, writer, output_base = rdkit_utils.default_open_output(
        args.output, "rxn_maker", args.outformat)

    i = 0  # molecules read from the input
    count = 0  # running total returned by perform_reaction

    if args.multi:
        # The per-reaction writers are created next to the output location.
        dir_base = os.path.dirname(args.output)
        writer_dict = filter_to_use.get_writers(dir_base)
    else:
        writer_dict = None
        dir_base = None

    for mol in suppl:
        i += 1
        if mol is None: continue
        # Return a dict/class here - indicating which filters passed
        count = filter_to_use.perform_reaction(mol, args.reaction,
                                               reagent_suppl, writer, count)

    utils.log("Created", count, "molecules from a total of ", i,
              "input molecules")

    writer.flush()
    writer.close()
    if input:
        input.close()
    # The reagent library handle was previously left open.
    if reagent_input:
        reagent_input.close()
    if output:
        output.close()
    # close the individual writers
    if writer_dict:
        for key in writer_dict:
            writer_dict[key].close()

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': i,
                '__OutputCount__': count,
                'RxnSmartsFilter': count
            })
Пример #16
0
def main():
    """Command-line entry point for the molecule standardizer / enumerator.

    In standardize mode each molecule is passed through the selected
    standardization method and tagged with a ``Standardized`` property of
    ``True``, ``False`` or ``Error``; an existing ``uuid`` is carried over.
    Otherwise tautomers and/or stereoisomers are enumerated and each result
    gets the source molecule's properties, its index and (when present) its
    UUID. Results are written through ``write_out``.

    Returns:
        The final output count as returned by ``write_out``.

    Raises:
        ValueError: if standardization is combined with enumeration, or if
            SMILES molecule format is requested with SDF output.
    """

    ### command line args defintions #########################################

    parser = argparse.ArgumentParser(
        description='RDKit molecule standardizer / enumerator')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-et',
                        '--enumerate_tauts',
                        action='store_true',
                        help='Enumerate all tautomers')
    parser.add_argument('-es',
                        '--enumerate_stereo',
                        action='store_true',
                        help='Enumerate all stereoisomers')
    parser.add_argument(
        '-st',
        '--standardize',
        action='store_true',
        help='Standardize molecules. Cannot  be true if enumerate is on.')
    parser.add_argument('-stm',
                        '--standardize_method',
                        default="molvs",
                        choices=STANDARD_MOL_METHODS.keys(),
                        help="Choose the method to standardize.")
    parser.add_argument('-mf',
                        '--mol_format',
                        choices=['smiles', 'mol_2d', 'mol_3d'],
                        help="Format for molecules.")

    args = parser.parse_args()

    utils.log("Sanifier Args: ", args)

    if args.standardize and args.enumerate_tauts:
        raise ValueError("Cannot Enumerate Tautomers and Standardize")

    if args.standardize and args.enumerate_stereo:
        raise ValueError("Cannot Enumerate Stereo and Standardize")

    if args.outformat == 'sdf' and args.mol_format == 'smiles':
        raise ValueError("Smiles cannot be used when outputting as SDF")

    if args.standardize:
        getStandardMolecule = STANDARD_MOL_METHODS[args.standardize_method]

    # handle metadata
    source = "sanifier.py"
    datasetMetaProps = {
        "source": source,
        "description": "Enumerate tautomers and stereoisomers"
    }
    clsMappings = {
        "EnumTautIsoSourceMolUUID": "java.lang.String",
        "EnumTautIsoSourceMolIdx": "java.lang.Integer"
    }
    fieldMetaProps = [{
        "fieldName": "EnumTautIsoSourceMolUUID",
        "values": {
            "source": source,
            "description": "UUID of source molecule"
        }
    }, {
        "fieldName": "EnumTautIsoSourceMolIdx",
        "values": {
            "source": source,
            "description": "Index of source molecule"
        }
    }]

    oformat = utils.determine_output_format(args.outformat)

    input,output,suppl,writer,output_base = rdkit_utils. \
        default_open_input_output(args.input, args.informat, args.output,
                                  'sanifier', args.outformat,
                                  thinOutput=False, valueClassMappings=clsMappings,
                                  datasetMetaProps=datasetMetaProps,
                                  fieldMetaProps=fieldMetaProps)

    i = 0  # molecules read (also used as the 1-based source index)
    count = 0  # running output count maintained by write_out
    errors = 0  # standardization failures
    for mol in suppl:
        i += 1
        if mol is None: continue

        if args.standardize:
            # we keep the original UUID as there is still a 1-to-1 relationship between the input and outputs
            # GetProp raises KeyError when the property is absent, so check
            # first; a molecule without a uuid is still standardized.
            oldUUID = mol.GetProp("uuid") if mol.HasProp("uuid") else None
            inputCanSmiles = Chem.MolToSmiles(mol,
                                              isomericSmiles=True,
                                              canonical=True)
            try:
                std = getStandardMolecule(mol)
                outputCanSmiles = Chem.MolToSmiles(std,
                                                   isomericSmiles=True,
                                                   canonical=True)
                if oldUUID:
                    std.SetProp("uuid", oldUUID)
                #utils.log("Standardized", i, inputCanSmiles, ">>", outputCanSmiles)
                if inputCanSmiles == outputCanSmiles:
                    std.SetProp("Standardized", "False")
                else:
                    std.SetProp("Standardized", "True")
            except Exception:
                # Best effort: fall back to the input molecule, but let
                # KeyboardInterrupt / SystemExit propagate.
                errors += 1
                utils.log("Error standardizing", sys.exc_info()[0])
                std = mol
                std.SetProp("Standardized", "Error")

            count = write_out([std], count, writer, args.mol_format,
                              args.outformat)
        else:
            # we want a new UUID generating as we are generating new molecules
            if mol.HasProp('uuid'):
                parentUuid = mol.GetProp("uuid")
            else:
                parentUuid = None

            results = []

            if args.enumerate_tauts:
                utils.log("Enumerating tautomers")
                results = enumerateTautomers(mol)
            else:
                results.append(mol)

            if args.enumerate_stereo:
                utils.log("Enumerating steroisomers")
                # Stereo enumeration is applied to every tautomer (or to the
                # single input molecule when tautomers were not enumerated).
                mols = results
                results = []
                for m in mols:
                    enumerated = enumerateStereoIsomers(m)
                    results.extend(enumerated)

            for m in results:
                # copy the src mol props
                for name in mol.GetPropNames():
                    m.SetProp(name, mol.GetProp(name))
                # add our new props
                m.ClearProp("uuid")
                m.SetIntProp("EnumTautIsoSourceMolIdx", i)
                if parentUuid:
                    m.SetProp("EnumTautIsoSourceMolUUID", parentUuid)

            count = write_out(results, count, writer, args.mol_format,
                              args.outformat)

    utils.log("Handled " + str(i) + " molecules, resulting in " + str(count) +
              " outputs")

    writer.flush()
    writer.close()
    input.close()
    output.close()

    # re-write the metadata as we now know the size
    if oformat == 'json':
        utils.write_squonk_datasetmetadata(output_base,
                                           False,
                                           clsMappings,
                                           datasetMetaProps,
                                           fieldMetaProps,
                                           size=count)

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': i,
                '__OutputCount__': count,
                '__ErrorCount__': errors,
                'RDKitSanify': count
            })

    return count
Пример #17
0
def main():
    """Command-line entry point for the SuCOS Max scoring tool.

    Parses the arguments, declares the dataset/field metadata for the
    SuCOS Max and SuCOS Cumulative score fields, runs ``process`` over the
    input molecules against the target molecules and, when ``--meta`` is
    set, writes the run metrics.
    """
    parser = argparse.ArgumentParser(description='Max SuCOS scores with RDKit')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-tm', '--target-molecules',
                        help='Target molecules to compare against')
    parser.add_argument('-tf', '--targets-format',
                        help='Target molecules format')
    parser.add_argument('-n', '--name-field',
                        help='Name of field with molecule name')
    parser.add_argument(
        '--no-gzip', action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    parser.add_argument('--filter-value', type=float,
                        help='Filter out values with scores less than this.')
    parser.add_argument('--filter-field',
                        help='Field to use to filter values.')

    args = parser.parse_args()
    utils.log("Max SuCOSMax Args: ", args)

    source = "sucos_max.py"
    datasetMetaProps = {
        "source": source,
        "description": "SuCOSMax using RDKit " + rdBase.rdkitVersion
    }

    # (field name, Java value class, human readable description), in the
    # order the fields are declared in the metadata.
    field_specs = [
        (field_SuCOSMax_Score, "java.lang.Float", "SuCOS Max score"),
        (field_SuCOSMax_FMScore, "java.lang.Float",
         "SuCOS Max Feature Map score"),
        (field_SuCOSMax_ProtrudeScore, "java.lang.Float",
         "SuCOS Max Protrude score"),
        (field_SuCOSMax_Index, "java.lang.Integer",
         "SuCOS Max target index"),
        (field_SuCOSCum_Score, "java.lang.Float", "SuCOS Cumulative score"),
        (field_SuCOSCum_FMScore, "java.lang.Float",
         "SuCOS Cumulative Feature Map score"),
        (field_SuCOSCum_ProtrudeScore, "java.lang.Float",
         "SuCOS Cumulative Protrude score"),
    ]
    # The target-name field is only declared when a name field was given.
    if args.name_field:
        field_specs.append((field_SuCOSMax_Target, "java.lang.String",
                            "SuCOS Max target name"))

    clsMappings = {}
    fieldMetaProps = []
    for field_name, java_class, description in field_specs:
        clsMappings[field_name] = java_class
        fieldMetaProps.append({
            "fieldName": field_name,
            "values": {
                "source": source,
                "description": description
            }
        })

    inputs_file, inputs_supplr = rdkit_utils.default_open_input(
        args.input, args.informat)
    output, writer, output_base = rdkit_utils.default_open_output(
        args.output,
        'sucos-max',
        args.outformat,
        valueClassMappings=clsMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps,
        compress=not args.no_gzip)
    targets_file, targets_supplr = rdkit_utils.default_open_input(
        args.target_molecules, args.targets_format)

    count, total, errors = process(inputs_supplr, targets_supplr, writer,
                                   args.name_field, args.filter_value,
                                   args.filter_field)

    # Release the inputs first, then flush and close the output side.
    inputs_file.close()
    targets_file.close()
    writer.flush()
    writer.close()
    output.close()

    if args.meta:
        run_metrics = {
            '__InputCount__': count,
            '__OutputCount__': total,
            '__ErrorCount__': errors,
            'RDKitSuCOS': total
        }
        utils.write_metrics(output_base, run_metrics)
Пример #18
0
def main():
    """Command-line entry point for the MaxMinPicker diverse-subset tool.

    Fingerprints optional seed molecules and the candidate molecules, runs
    ``performPick`` and writes the picked candidates in their original
    input order, tagging each with a 1-based ``PickIndex`` property that
    records its position in the pick list. When ``--meta`` is set the run
    metrics are written as well.

    Raises:
        ValueError: if no descriptor is resolved, or if neither ``--num``
            nor ``--threshold`` is given.
    """

    ### command line args definitions #########################################

    # The description and the descriptor help previously carried text copied
    # from other tools ('RDKit Butina Cluster', 'default rdkit'); they now
    # match this tool and its actual default.
    parser = argparse.ArgumentParser(description='RDKit MaxMinPicker')
    parser.add_argument('-t', '--threshold', type=float, default=0.0, help='similarity threshold (1.0 means identical)')
    parser.add_argument('-d', '--descriptor', type=str.lower, choices=list(descriptors.keys()), default='morgan2', help='descriptor or fingerprint type (default morgan2)')
    parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')
    parser.add_argument('-n', '--num', type=int, help='maximum number to pick for diverse subset selection')
    parser.add_argument('-s', '--seed-molecules', help='optional file containing any seed molecules that have already been picked')
    parser.add_argument('--fragment-method', choices=['hac', 'mw'], default='hac', help='Approach to find biggest fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)')
    parser.add_argument('--output-fragment', action='store_true', help='Output the biggest fragment rather than the original molecule')
    parameter_utils.add_default_io_args(parser)

    args = parser.parse_args()
    utils.log("MaxMinPicker Args: ", args)

    descriptor = descriptors[args.descriptor]
    if descriptor is None:
        raise ValueError('No descriptor specified')

    if not args.num and not args.threshold:
        raise ValueError('--num or --threshold arguments must be specified, or both')

    # handle metadata
    source = "max_min_picker.py"
    datasetMetaProps = {"source":source, "description": "MaxMinPicker using RDKit " + rdBase.rdkitVersion}

    ### generate fingerprints
    fps = []
    mols = []
    errors = 0

    # first the initial seeds, if specified
    firstPicks = []
    num_seeds = 0
    if args.seed_molecules:
        seedsInput,seedsSuppl = rdkit_utils.default_open_input(args.seed_molecules, None)
        start = time.time()
        errors += mol_utils.fragmentAndFingerprint(seedsSuppl, mols, fps, descriptor, fragmentMethod=args.fragment_method, outputFragment=args.output_fragment, quiet=args.quiet)
        end = time.time()
        seedsInput.close()
        num_seeds = len(fps)
        utils.log("Read", len(fps), "fingerprints for seeds in", end-start, "secs,", errors, "errors")
        firstPicks = list(range(num_seeds))

    # now the molecules to pick from
    input,output,suppl,writer,output_base = rdkit_utils.default_open_input_output(args.input, args.informat, args.output, 'max_min_picker',
                                                                            args.outformat, datasetMetaProps=datasetMetaProps)
    # reset the mols list as we don't need the seeds, only the candidates
    # (fps keeps the seed fingerprints in front of the candidate ones)
    mols = []
    start = time.time()
    errs = mol_utils.fragmentAndFingerprint(suppl, mols, fps, descriptor, fragmentMethod=args.fragment_method, outputFragment=args.output_fragment, quiet=args.quiet)
    end = time.time()
    errors += errs

    input.close()
    num_fps = len(fps)
    num_candidates = num_fps - num_seeds
    utils.log("Read", num_candidates, "fingerprints for candidates in", end-start, "secs,", errs, "errors")

    if not args.num:
        num_to_pick = num_candidates
    elif args.num > num_candidates:
        num_to_pick = num_candidates
        utils.log("WARNING: --num argument (", args.num, ") is larger than the total number of candidates (", num_candidates, ") - resetting to", num_candidates)
    else:
        num_to_pick = args.num

    ### do picking
    utils.log("MaxMinPicking with descriptor", args.descriptor, "and threshold", args.threshold, ",", num_seeds, "seeds,", num_candidates, "candidates", num_fps, "total")
    start = time.time()
    picks, thresh = performPick(fps, num_to_pick + num_seeds, args.threshold, firstPicks)
    end = time.time()
    num_picks = len(picks)

    # the first num_seeds entries of picks are the seeds, skip them
    candidate_picks = list(picks[num_seeds:])

    utils.log("Found", num_picks, "molecules in", end-start, "secs, final threshold", thresh)
    utils.log("Picks:", candidate_picks)
    del fps

    # we want to return the results in the order they were in the input so first we record the order in the pick list
    indices = {idx: rank for rank, idx in enumerate(candidate_picks)}
    # now do the sort
    sorted_picks = sorted(candidate_picks)
    # now write out the mols in the correct order recording the value in the pick list as the PickIndex property
    for idx in sorted_picks:
        mol = mols[idx - num_seeds] # mols array only contains the candidates
        mol.SetIntProp("PickIndex", indices[idx] + 1)
        writer.write(mol)
    i = len(sorted_picks)
    utils.log("Output", i, "molecules")

    writer.flush()
    writer.close()
    output.close()

    if args.meta:
        metrics = {}
        status_str = "{} compounds picked. Final threshold was {}.".format(i, thresh)
        if errors > 0:
            metrics['__ErrorCount__'] = errors
            status_str = status_str + " {} errors.".format(errors)

        metrics['__StatusMessage__'] = status_str
        metrics['__InputCount__'] = num_fps
        metrics['__OutputCount__'] = i
        metrics['RDKitMaxMinPicker'] = num_picks

        utils.write_metrics(output_base, metrics)
Пример #19
0
def main():
    """Command-line entry point for the reaction SMARTS filter.

    Every input molecule is passed through the POISED reaction filter.
    Molecules that match at least one reaction are written to the main
    output with the synthons of each matching reaction attached; with
    ``--multi`` they are additionally written to one file per reaction.

    Raises:
        ValueError: if ``--multi`` is used without an output location.
    """
    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='RDKit rxn smarts filter')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='Quiet mode')
    parser.add_argument('-m', '--multi', action='store_true',
                        help='Output one file for each reaction')
    parser.add_argument('--thin', action='store_true', help='Thin output mode')

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    if args.multi and not args.output:
        raise ValueError(
            "Must specify output location when writing individual result files"
        )

    ### Define the filter chooser - lots of logic possible
    # SMARTS patterns are defined in poised_filter.py. Currently this is hardcoded.
    # Should make this configurable so that this can be specified by the user at some stage.
    poised_filter = True
    if poised_filter:
        from .poised_filter import Filter
        filter_to_use = Filter()
    rxn_names = filter_to_use.get_rxn_names()
    utils.log("Using", len(rxn_names), "reaction filters")

    # handle metadata
    source = "rxn_smarts_filter.py"
    datasetMetaProps = {
        "source": source,
        "description": "Reaction SMARTS filter"
    }
    clsMappings = {}
    fieldMetaProps = []
    for rxn_name in rxn_names:
        # this is the Java class type for an array of MoleculeObjects
        clsMappings[rxn_name] = "[Lorg.squonk.types.MoleculeObject;"
        field_values = {
            "source": source,
            "description": "Sythons from " + rxn_name + " reaction"
        }
        fieldMetaProps.append({"fieldName": rxn_name, "values": field_values})

    input, output, suppl, writer, output_base = rdkit_utils.default_open_input_output(
        args.input,
        args.informat,
        args.output,
        'rxn_smarts_filter',
        args.outformat,
        thinOutput=args.thin,
        valueClassMappings=clsMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps)

    # per-reaction writers are only created in multi mode
    writer_dict = None
    dir_base = None
    if args.multi:
        dir_base = os.path.dirname(args.output)
        writer_dict = filter_to_use.get_writers(dir_base)

    i = 0
    count = 0
    for i, mol in enumerate(suppl, 1):
        if mol is None:
            continue
        # Return a dict/class here - indicating which filters passed
        filter_pass = filter_to_use.pass_filter(mol)
        utils.log("Found", str(len(filter_pass)), "matches")
        if not filter_pass:
            continue

        count += 1
        props = {}
        for reaction in filter_pass:
            synthons = filter_pass[reaction]
            # Write the reaction name as a newline separated list of the synthons to the mol object
            # this is used in SDF output
            mol.SetProp(reaction, "\n".join(synthons))
            # now write to the props in a way that can be used for the JSON output:
            # one MoleculeObject-style dict per synthon SMILES
            props[reaction] = [
                utils.generate_molecule_object_dict(smiles, "smiles", None)
                for smiles in synthons
            ]
            if args.multi:
                rxn_writer = writer_dict[reaction]
                rxn_writer.write(mol)
                rxn_writer.flush()
        # write the output.
        # In JSON format the props will override values set on the mol
        # In SDF format the props are ignored so the values in the mol are used
        writer.write(mol, props)
        writer.flush()

    utils.log("Matched", count, "molecules from a total of", i)
    if dir_base:
        utils.log("Individual SD files found in: " + dir_base)

    writer.flush()
    writer.close()
    if input:
        input.close()
    if output:
        output.close()
    # close the individual writers
    if writer_dict:
        for rxn_writer in writer_dict.values():
            rxn_writer.close()

    if args.meta:
        run_metrics = {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RxnSmartsFilter': count
        }
        utils.write_metrics(output_base, run_metrics)
Пример #20
0
def main():
    """Command-line entry point for SMoG2016 docking score calculation.

    Copies the receptor PDB file to a fixed path, opens the input and
    output, then maps ``run_dock`` over the input molecules with a thread
    pool while the working directory is the SMoG2016 install directory.
    Sets the module globals ``WRITER``, ``THRESHOLD`` and ``PDB_PATH``
    (presumably read by ``run_dock`` - confirm against its definition).
    When ``--meta`` is set the metrics are written from the module globals
    ``COUNTER`` and ``SUCCESS``.

    The thread pool, the writer, the input/output handles and the original
    working directory are now released/restored even if docking raises.
    """
    global WRITER, THRESHOLD
    global PDB_PATH
    parser = argparse.ArgumentParser(
        description='SMoG2016 - Docking calculation.')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument(
        '--no-gzip',
        action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    parser.add_argument('-pdb', '--pdb_file', help="PDB file for scoring")
    parser.add_argument('-t',
                        '--threshold',
                        help="The maximum score to allow",
                        default=None)
    parser.add_argument(
        '--threads',
        type=int,
        help="Number of threads to used. Default is the number of cores",
        default=None)
    parser.add_argument('--thin', action='store_true', help='Thin output mode')

    args = parser.parse_args()

    utils.log("SMoG2016 Args: ", args)

    smog_path = "/usr/local/SMoG2016/"
    if args.threshold:
        THRESHOLD = float(args.threshold)
    else:
        THRESHOLD = None

    PDB_PATH = "/tmp/pdb_file.pdb"
    # Now copy it to pdb_file.pdb -> silly SMOG bug requires underscore in the filename!
    shutil.copy(args.pdb_file, PDB_PATH)

    # Open up the input file
    input_file, suppl = rdkit_utils.default_open_input(args.input,
                                                       args.informat)
    # Open the output file
    output, WRITER, output_base = rdkit_utils.\
        default_open_output(args.output, "SMoG2016",
                            args.outformat, compress=not args.no_gzip)

    # Cd to the route of the action
    # TODO - can this be done without changing dir? It gives problems in finding the input files and in writing the metrics
    cwd = os.getcwd()
    os.chdir(smog_path)
    try:
        # Iterate over the molecules
        # WARNING - if using parallel processing the order of molecules is not preserved. Set args.threads to 1 to ensure this.
        if args.threads is None:
            threads = multiprocessing.cpu_count()
        else:
            threads = args.threads
        pool = ThreadPool(threads)
        try:
            pool.map(run_dock, suppl)
        finally:
            # release the worker threads once all molecules are processed
            pool.close()
            pool.join()
    finally:
        # Close the files; the handles may be None (e.g. when streaming)
        WRITER.close()
        if input_file:
            input_file.close()
        if output:
            output.close()
        # always go back to where we started so relative paths (metrics) work
        os.chdir(cwd)

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': COUNTER,
                '__OutputCount__': SUCCESS,
                'SMoG2016': COUNTER
            })

    utils.log("SMoG2016 complete")
Пример #21
0
def main():
    """Command-line entry point for multi-query similarity screening.

    Fingerprints every query structure, then compares each (optionally
    fragmented and property-filtered) input molecule against all queries.
    A molecule is written when at least one similarity lies within
    [--simmin, --simmax]; it is annotated with the per-query scores, the
    best score, the best query name (or 1-based index) and the hit count.

    Returns:
        The number of molecules written.

    Raises:
        ValueError: if none of --qsmiles, --qsdf or --qjson is given.
    """

    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='RDKit screen')
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--qsmiles',
        help=
        'filename of query structures as smiles (incompatible with --qsdf and --qjson args)'
    )
    group.add_argument(
        '--qsdf',
        help=
        'filename of query structures as sdfile (incompatible with --qsmiles and --qjson args)'
    )
    group.add_argument(
        '--qjson',
        help=
        'filename of query structures as MoleculeObject JSON (incompatible with --qsmiles and --qsdf args)'
    )
    parser.add_argument('--qsmilesTitleLine',
                        action='store_true',
                        help='the smiles file has a title line')
    parser.add_argument('--qsmilesDelimiter',
                        default='\t',
                        help='delimiter for smiles file (default is tab)')
    parser.add_argument(
        '--qsmilesColumn',
        type=int,
        default=0,
        help='column in smiles file with the smiles (default is first column)')
    parser.add_argument(
        '--qsmilesNameColumn',
        type=int,
        default=1,
        help='column in smiles file with ID (default is second column)')
    parser.add_argument(
        '--qprop',
        help=
        'property name in query molecules to report. If not defined (or property is not present) '
        +
        'then name property is not written. JSON format uses the UUID as default'
    )

    parser.add_argument('--simmin',
                        type=float,
                        default=0.7,
                        help='similarity lower cutoff (1.0 means identical)')
    parser.add_argument('--simmax',
                        type=float,
                        default=1.0,
                        help='similarity upper cutoff (1.0 means identical)')
    parser.add_argument('-d',
                        '--descriptor',
                        type=str.lower,
                        choices=list(descriptors.keys()),
                        default='rdkit',
                        help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument('-m',
                        '--metric',
                        type=str.lower,
                        choices=list(metrics.keys()),
                        default='tanimoto',
                        help='similarity metric (default tanimoto)')
    parser.add_argument(
        '-f',
        '--fragment',
        choices=['hac', 'mw'],
        help=
        'Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight )'
    )
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Quiet mode')

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    descriptor = descriptors[args.descriptor.lower()]
    metric = metrics[args.metric.lower()]

    propName = args.qprop
    if args.qsmiles:
        queryMolsupplier = rdkit_utils.default_open_input_smiles(
            args.qsmiles,
            delimiter=args.qsmilesDelimiter,
            smilesColumn=args.qsmilesColumn,
            nameColumn=args.qsmilesNameColumn,
            titleLine=args.qsmilesTitleLine)
        queryInput = None
    elif args.qsdf:
        queryInput, queryMolsupplier = rdkit_utils.default_open_input_sdf(
            args.qsdf)
    elif args.qjson:
        queryInput, queryMolsupplier = rdkit_utils.default_open_input_json(
            args.qjson, lazy=False)
        if not propName:
            propName = "uuid"
    else:
        raise ValueError('No query structure specified')

    # One (fingerprint, name) pair per successfully parsed query. The name is
    # resolved once here rather than once per target molecule; it is None
    # when no property was requested or the query does not carry it.
    queries = []
    utils.log("Preparing query fingerprints")
    queriesRead = 0
    for q in queryMolsupplier:
        queriesRead += 1
        if q:
            if propName and q.HasProp(propName):
                queryName = str(q.GetProp(propName))
            else:
                queryName = None
            queries.append((descriptor(q), queryName))
        else:
            utils.log("WARNING: Failed to parse Molecule", queriesRead)
    if queryInput:
        queryInput.close()

    input, output, suppl, writer, output_base = rdkit_utils.default_open_input_output(
        args.input, args.informat, args.output, 'screen_multi', args.outformat)

    # OK, all looks good so we can hope that things will run OK.
    # But before we start lets write the metadata so that the results can be handled.
    #if args.meta:
    #    t = open(output_base + '_types.txt', 'w')
    #    t.write(field_Similarity + '=integer\n')
    #    t.flush()
    #    t.close()

    i = 0
    count = 0
    for mol in suppl:
        i += 1
        if mol is None: continue
        if args.fragment:
            mol = mol_utils.fragment(mol, args.fragment, quiet=args.quiet)
        if not filter.filter(mol,
                             minHac=args.hacmin,
                             maxHac=args.hacmax,
                             minMw=args.mwmin,
                             maxMw=args.mwmax,
                             quiet=args.quiet):
            continue
        targetFp = descriptor(mol)
        hits = 0
        bestScore = 0
        bestIdx = None
        bestName = None
        for idx, (queryFp, name) in enumerate(queries, 1):
            sim = metric(queryFp, targetFp)
            if args.simmin <= sim <= args.simmax:
                hits += 1
                if not args.quiet:
                    utils.log(i, idx, sim)
                # the first hit always becomes the best so far, so bestIdx is
                # defined even when the similarity is exactly 0
                if bestIdx is None or sim > bestScore:
                    bestScore = sim
                    bestIdx = idx
                    bestName = name
                if name:
                    mol.SetDoubleProp(field_Similarity + "_" + name, sim)
                else:
                    mol.SetDoubleProp(
                        field_Similarity + "_" + str(idx) + "_Score", sim)

        if hits > 0:
            count += 1
            mol.SetDoubleProp(field_Similarity + "_BestScore", bestScore)
            if bestName:
                mol.SetProp(field_Similarity + "_BestName", bestName)
            else:
                mol.SetIntProp(field_Similarity + "_BestIndex", bestIdx)
            mol.SetIntProp(field_Similarity + "_Count", hits)
            writer.write(mol)

    utils.log("Found", count, "similar molecules")

    writer.flush()
    writer.close()
    input.close()
    output.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': i,
            '__OutputCount__': count,
            'RDKitScreen': count
        })

    return count
Пример #22
0
            if my_mol.HasProp("uuid"):
                cleaned.SetProp("SourceMolUUID", my_mol.GetProp("uuid"))
            cleaned.SetIntProp("SourceMolNum", molIdx)
            cleaned.SetIntProp("ConformerNum", count + 1)
            outputfile.write(cleaned)
            count += 1
        except ValueError:
            errors += 1
            logging.exception('')
    return count, errors


if __name__ == '__main__':
    # Command-line definition for the constrained conformer generator:
    # the default I/O arguments plus the conformer-specific options.
    parser = argparse.ArgumentParser(
        description='RDKit constrained conformer generator')
    parameter_utils.add_default_io_args(parser)
    # how many conformers to generate per molecule
    parser.add_argument('-n', '--num', type=int, default=10,
                        help='number of conformers to generate')
    # reference molecule file and the index of the molecule within it
    parser.add_argument('-r', '--refmol', help="Reference molecule file")
    parser.add_argument('--refmolidx', type=int, default=1,
                        help="Reference molecule index in file")
    # optional core substructure
    parser.add_argument(
        '-c', '--core_smi', default='',
        help='Core substructure. If not specified - guessed using MCS')
Пример #23
0
def main():
    """Command-line entry point for the interaction filter.

    Opens the input and output, runs ``execute`` with the grouping, scoring
    and statistics fields, and writes a ``<output_base>.report`` file next to
    the output. For JSON output the dataset metadata is re-written once the
    size is known; with ``--meta`` the run metrics are written too.
    """
    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='Filter interactions')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-f',
                        '--group-by-field',
                        required=True,
                        help='Field to group records by (must be sequential)')
    parser.add_argument('-s',
                        '--score-field',
                        required=True,
                        help='Field to use to rank records within a group')
    parser.add_argument('-d',
                        '--score-descending',
                        action='store_true',
                        help='Sort records in descending order')
    parser.add_argument('-x',
                        '--stats-fields',
                        nargs='*',
                        help='Field to use to for summary statistics')

    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Quiet mode')
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument(
        '--no-gzip',
        action='store_true',
        help='Do not compress the output (STDOUT is never compressed')

    args = parser.parse_args()
    utils.log("filter_interactions: ", args)

    # handle metadata
    source = "filter_interactions.py"
    datasetMetaProps = {
        "source": source,
        "description": "Filter by interactions"
    }
    clsMappings = {
        # "EnumChargesSrcMolUUID": "java.lang.String",
        # "EnumChargesSrcMolIdx": "java.lang.Integer"
    }
    fieldMetaProps = [
        # {"fieldName": "EnumChargesSrcMolUUID", "values": {"source": source, "description": "UUID of source molecule"}},
        # {"fieldName": "EnumChargesSrcMolIdx", "values": {"source": source, "description": "Index of source molecule"}}
    ]

    input, suppl = rdkit_utils.default_open_input(args.input, args.informat)
    output, writer, output_base = rdkit_utils.default_open_output(
        args.output,
        'filter_interactions',
        args.outformat,
        thinOutput=False,
        valueClassMappings=clsMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps,
        compress=not args.no_gzip)
    # The context manager guarantees the report file is flushed and closed
    # even when execute() raises.
    with open(output_base + '.report', 'wt') as report_file:
        count, total, errors = execute(suppl, writer, report_file,
                                       args.group_by_field, args.score_field,
                                       args.score_descending,
                                       args.stats_fields)

    utils.log(count, total, errors)

    if input:
        input.close()
    writer.flush()
    writer.close()
    output.close()

    # re-write the metadata as we now know the size
    if args.outformat == 'json':
        utils.write_squonk_datasetmetadata(output_base,
                                           False,
                                           clsMappings,
                                           datasetMetaProps,
                                           fieldMetaProps,
                                           size=total)

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': count,
                '__OutputCount__': total,
                '__ErrorCount__': errors,
                'FilterInteractions': total
            })
Пример #24
0
def main():
    """Command-line entry point for the molecule standardiser / enumerator.

    In standardize mode each input molecule is standardized with the chosen
    method and written once, flagged with a ``Standardised`` property of
    "True", "False" or "Error". Otherwise the molecule is optionally expanded
    into tautomers and/or stereoisomers, and every result is written with
    ``SourceMolNum`` (and ``SourceMolUUID`` when the source has a uuid).

    Returns:
        The output count as accumulated by ``write_out``.

    Raises:
        ValueError: if standardization is combined with enumeration, or if
            SMILES molecule format is requested together with SDF output.
    """

    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(
        description='RDKit molecule standardiser / enumerator')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-et',
                        '--enumerate_tauts',
                        action='store_true',
                        help='Enumerate all tautomers')
    parser.add_argument('-es',
                        '--enumerate_stereo',
                        action='store_true',
                        help='Enumerate all stereoisomers')
    parser.add_argument(
        '-st',
        '--standardize',
        action='store_true',
        help='Standardize molecules. Cannot  be true if enumerate is on.')
    parser.add_argument('-stm',
                        '--standardize_method',
                        default="molvs",
                        choices=STANDARD_MOL_METHODS.keys(),
                        help="Choose the method to standardize.")
    parser.add_argument('-mf',
                        '--mol_format',
                        choices=['smiles', 'mol_2d', 'mol_3d'],
                        help="Format for molecules.")

    args = parser.parse_args()

    utils.log("Sanifier Args: ", args)

    if args.standardize and args.enumerate_tauts:
        raise ValueError("Cannot Enumerate Tautomers and Standardise")

    if args.standardize and args.enumerate_stereo:
        raise ValueError("Cannot Enumerate Stereo and Standardise")

    if args.outformat == 'sdf' and args.mol_format == 'smiles':
        raise ValueError("Smiles cannot be used when outputting as SDF")

    if args.standardize:
        getStandardMolecule = STANDARD_MOL_METHODS[args.standardize_method]

    input, output, suppl, writer, output_base = rdkit_utils.default_open_input_output(
        args.input, args.informat, args.output, 'sanify', args.outformat)
    i = 0
    count = 0
    errors = 0
    for mol in suppl:
        i += 1
        if mol is None: continue

        if args.standardize:
            # we keep the original UUID as there is still a 1-to-1 relationship between the input and outputs
            # (the uuid is optional, so only read it when it is present)
            oldUUID = mol.GetProp("uuid") if mol.HasProp("uuid") else None
            inputCanSmiles = Chem.MolToSmiles(mol,
                                              isomericSmiles=True,
                                              canonical=True)
            try:
                std = getStandardMolecule(mol)
                outputCanSmiles = Chem.MolToSmiles(std,
                                                   isomericSmiles=True,
                                                   canonical=True)
                if oldUUID:
                    std.SetProp("uuid", oldUUID)
                #utils.log("Standardized", i, inputCanSmiles, ">>", outputCanSmiles)
                if inputCanSmiles == outputCanSmiles:
                    std.SetProp("Standardised", "False")
                else:
                    std.SetProp("Standardised", "True")
            except Exception:
                # best effort: fall back to the input molecule and flag it,
                # but let KeyboardInterrupt / SystemExit propagate
                errors += 1
                utils.log("Error standardizing", sys.exc_info()[0])
                std = mol
                std.SetProp("Standardised", "Error")

            count = write_out([std], count, writer, args.mol_format,
                              args.outformat)
        else:
            # we want a new UUID generating as we are generating new molecules
            if mol.HasProp('uuid'):
                parentUuid = mol.GetProp("uuid")
            else:
                parentUuid = None

            results = [mol]

            if args.enumerate_tauts:
                utils.log("Enumerating tautomers")
                results = enumerateTautomers(mol)

            if args.enumerate_stereo:
                utils.log("Enumerating steroisomers")
                mols = results
                results = []
                for m in mols:
                    results.extend(enumerateStereoIsomers(m))

            for m in results:
                m.ClearProp("uuid")
                m.SetIntProp("SourceMolNum", i)
                if parentUuid:
                    m.SetProp("SourceMolUUID", parentUuid)

            count = write_out(results, count, writer, args.mol_format,
                              args.outformat)

    utils.log("Handled " + str(i) + " molecules, resulting in " + str(count) +
              " outputs")

    writer.flush()
    writer.close()
    input.close()
    output.close()

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': i,
                '__OutputCount__': count,
                '__ErrorCount__': errors,
                'RDKitSanify': count
            })

    return count
Пример #25
0
def main():
    """Command-line entry point for the RMSD filter.

    Parses the command-line arguments, opens the input supplier and output
    writer through the ``rdkit_utils`` helpers, hands the records to
    ``process`` together with the grouping field and RMSD cutoff, logs the
    returned counters and, when ``--metrics`` is given, writes them as
    metrics.

    Example usage::

        python -m pipelines.xchem.rmsd_filter -i ../../data/mpro/poses.sdf.gz -o output -c 0.5
    """
    parser = argparse.ArgumentParser(description='RSMD filter')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-c', '--cutoff-rmsd', type=float, help='RMSD cutoff')
    parser.add_argument('-f',
                        '--field',
                        default='_Name',
                        help='Field to group records')
    parser.add_argument(
        '--no-gzip',
        action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    parser.add_argument('--metrics', action='store_true', help='Write metrics')

    args = parser.parse_args()
    utils.log("RSMD filter Args: ", args)

    # Dataset-level metadata handed to the output writer.
    source = "rmsd_filter.py"
    datasetMetaProps = {
        "source": source,
        "description": "RMSD filter " + rdBase.rdkitVersion
    }

    # This filter registers no field-level metadata, so both are left empty.
    clsMappings = {}
    fieldMetaProps = []

    inputs_file, inputs_supplr = rdkit_utils.default_open_input(
        args.input, args.informat)
    output, writer, output_base = rdkit_utils.default_open_output(
        args.output,
        'rmsd_filter',
        args.outformat,
        valueClassMappings=clsMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps,
        compress=not args.no_gzip)

    # this does the processing
    count, groups, kept, errors = process(inputs_supplr, writer, args.field,
                                          args.cutoff_rmsd)
    utils.log('Processing complete.', count, 'records processed with', groups,
              'groups.', kept, 'records retained.', errors, 'errors')

    inputs_file.close()
    writer.flush()
    writer.close()
    output.close()

    if args.metrics:
        # Report the counters returned by process(): records read as the
        # input count and records retained as the output count.
        # NOTE(review): the 'RDKitFeatureMap' key is kept unchanged because
        # downstream consumers may depend on it; it looks copied from the
        # FeatureStein tool - confirm the intended key name.
        utils.write_metrics(
            output_base, {
                '__InputCount__': count,
                '__OutputCount__': kept,
                '__ErrorCount__': errors,
                'RDKitFeatureMap': kept
            })
Пример #26
0
def main():
    """Command-line entry point for charge enumeration.

    Parses the command-line arguments, opens the input supplier and output
    writer, runs ``enumerateMol`` on every non-``None`` molecule and passes
    the results to ``writeEnumeratedMols``. Afterwards the dataset metadata
    is re-written for JSON output (now that the size is known) and, when
    ``args.meta`` is set, the metrics are written.
    """
    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='Enumerate charges')
    parser.add_argument(
        '--fragment-method',
        choices=['hac', 'mw'],
        help=
        'Approach to find biggest fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)'
    )
    parser.add_argument('--min-ph',
                        help='The min pH to consider',
                        type=float,
                        default=5.0)
    parser.add_argument('--max-ph',
                        help='The max pH to consider',
                        type=float,
                        default=9.0)

    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Quiet mode')
    parser.add_argument('--thin', action='store_true', help='Thin output mode')

    args = parser.parse_args()
    utils.log("Enumerate charges: ", args)

    # handle metadata
    source = "enumerate_charges.py"
    datasetMetaProps = {
        "source": source,
        "description": "Enumerate charges using Dimorphite-dl"
    }
    clsMappings = {
        "EnumChargesSrcMolUUID": "java.lang.String",
        "EnumChargesSrcMolIdx": "java.lang.Integer"
    }
    fieldMetaProps = [{
        "fieldName": "EnumChargesSrcMolUUID",
        "values": {
            "source": source,
            "description": "UUID of source molecule"
        }
    }, {
        "fieldName": "EnumChargesSrcMolIdx",
        "values": {
            "source": source,
            "description": "Index of source molecule"
        }
    }]

    oformat = utils.determine_output_format(args.outformat)

    # Local is named input_file (not `input`) to avoid shadowing the builtin.
    input_file, output, suppl, writer, output_base = \
        rdkit_utils.default_open_input_output(
            args.input, args.informat, args.output,
            'enumerateCharges', args.outformat,
            thinOutput=False, valueClassMappings=clsMappings,
            datasetMetaProps=datasetMetaProps,
            fieldMetaProps=fieldMetaProps)

    count = 0  # molecules read (None entries excluded)
    total = 0  # accumulated first return value of writeEnumeratedMols
    errors = 0  # accumulated second return value of writeEnumeratedMols
    min_ph = args.min_ph
    max_ph = args.max_ph

    # this hacky bit is needed because the dimorphite entrypoint assumes its args are passed using argparse
    # but it doesn't understand our args, so we need to switch between the two sets of args.
    dimorphite_sys_argv = sys.argv[:1]
    dimorphite_sys_argv.append('--min_ph')
    dimorphite_sys_argv.append(str(min_ph))
    dimorphite_sys_argv.append('--max_ph')
    dimorphite_sys_argv.append(str(max_ph))
    fragment = args.fragment_method
    for mol in suppl:
        if mol is None:
            continue
        count += 1
        orig_sys_argv = sys.argv[:]
        sys.argv = dimorphite_sys_argv
        try:
            enum_mols = enumerateMol(mol, fragment)
        finally:
            # Always put our own argv back, even if enumeration raises, so
            # the process-wide sys.argv is never left pointing at the
            # Dimorphite arguments.
            sys.argv = orig_sys_argv
        t, e = writeEnumeratedMols(mol, enum_mols, writer, count)
        total += t
        errors += e

    utils.log(count, total, errors)

    if input_file:
        input_file.close()
    writer.flush()
    writer.close()
    output.close()

    # re-write the metadata as we now know the size
    if oformat == 'json':
        utils.write_squonk_datasetmetadata(output_base,
                                           False,
                                           clsMappings,
                                           datasetMetaProps,
                                           fieldMetaProps,
                                           size=total)

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': count,
                '__OutputCount__': total,
                '__ErrorCount__': errors,
                'EnumerateChargesDimporphite': total
            })
Пример #27
0
def main():
    """Command-line entry point for Butina clustering with RDKit.

    Parses the command-line arguments, fragments and fingerprints the input
    molecules, clusters the fingerprints with ``ClusterFps`` and, when
    ``--num`` is given, reduces the clusters with ``SelectDiverseSubset``.
    Every molecule whose index appears in the final cluster lookup is
    written out with its cluster number set as a property.

    Raises:
        ValueError: if the descriptor name is not known, or if ``--field``
            is given without ``--num`` or without one of ``--min``/``--max``.
    """

    ### command line args definitions #########################################

    parser = argparse.ArgumentParser(description='RDKit Butina Cluster')
    parser.add_argument('-t', '--threshold', type=float, default=0.7, help='similarity clustering threshold (1.0 means identical)')
    parser.add_argument('-d', '--descriptor', type=str.lower, choices=list(descriptors.keys()), default='rdkit', help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument('-m', '--metric', type=str.lower, choices=list(metrics.keys()), default='tanimoto', help='similarity metric (default tanimoto)')
    parser.add_argument('-n', '--num', type=int, help='maximum number to pick for diverse subset selection')
    parser.add_argument('-e', '--exclude', type=float, default=0.9, help='threshold for excluding structures in diverse subset selection (1.0 means identical)')
    parser.add_argument('--fragment-method', choices=['hac', 'mw'], default='hac', help='Approach to find biggest fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)')
    parser.add_argument('--output-fragment', action='store_true', help='Output the biggest fragment rather than the original molecule')
    parser.add_argument('-f', '--field', help='field to use to optimise diverse subset selection')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--min', action='store_true', help='pick lowest value specified by the --field option')
    group.add_argument('--max', action='store_true', help='pick highest value specified by the --field option')

    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')
    parser.add_argument('--thin', action='store_true', help='Thin output mode')

    args = parser.parse_args()
    utils.log("Cluster Args: ", args)

    # .get() rather than indexing, so an unknown name reaches the explicit
    # ValueError below instead of surfacing as a bare KeyError.
    descriptor = descriptors.get(args.descriptor)
    if descriptor is None:
        raise ValueError('Invalid descriptor name ' + args.descriptor)

    if args.field and not args.num:
        raise ValueError('--num argument must be specified for diverse subset selection')
    if args.field and not (args.min or args.max):
        raise ValueError('--min or --max argument must be specified for diverse subset selection')

    # handle metadata
    source = "cluster_butina.py"
    datasetMetaProps = {"source":source, "description": "Butina clustering using RDKit " + rdBase.rdkitVersion}
    clsMappings = {"Cluster": "java.lang.Integer"}
    fieldMetaProps = [{"fieldName":"Cluster", "values": {"source":source, "description":"Cluster number"}}]

    # Local is named input_file (not `input`) to avoid shadowing the builtin.
    input_file, output, suppl, writer, output_base = \
        rdkit_utils.default_open_input_output(
            args.input, args.informat, args.output,
            'cluster_butina', args.outformat,
            thinOutput=False, valueClassMappings=clsMappings,
            datasetMetaProps=datasetMetaProps,
            fieldMetaProps=fieldMetaProps)

    ### fragment and generate fingerprints
    # mols and fps are passed in empty and read afterwards, so the helper is
    # expected to fill them in place; its return value is not used here.
    mols = []
    fps = []
    mol_utils.fragmentAndFingerprint(suppl, mols, fps, descriptor, fragmentMethod=args.fragment_method, outputFragment=args.output_fragment, quiet=args.quiet)

    input_file.close()

    ### do clustering
    utils.log("Clustering with descriptor", args.descriptor, "metric", args.metric, "and threshold", args.threshold)
    # The threshold is a similarity, so 1.0 - threshold is passed on.
    clusters, dists, matrix = ClusterFps(fps, args.metric, 1.0 - args.threshold)

    utils.log("Found", len(clusters), "clusters")

    ### generate diverse subset if specified
    if args.num:
        utils.log("Generating diverse subset")
        # diverse subset selection is specified
        finalClusters = SelectDiverseSubset(mols, clusters, dists, args.num, args.field, args.max, args.exclude, args.quiet)
    else:
        finalClusters = clusters

    utils.log("Found", len(finalClusters), "clusters")
    lookup = ClustersToMap(finalClusters)

    if not args.quiet:
        utils.log("Final Clusters:", finalClusters)

    ### write the results
    result_count = 0
    for idx, mol in enumerate(mols):
        # Only molecules whose index is present in the lookup are written.
        if idx not in lookup:
            continue
        if args.thin:
            rdkit_utils.clear_mol_props(mol, ["uuid"])
        mol.SetIntProp(field_Cluster, lookup[idx])
        writer.write(mol)
        result_count += 1
    input_count = len(mols)

    writer.flush()
    writer.close()
    output.close()

    if args.meta:
        status_str = str(result_count) + ' results from ' + str(len(finalClusters)) + ' clusters'
        utils.write_metrics(output_base, {'__StatusMessage__':status_str, '__InputCount__':input_count, '__OutputCount__':result_count, 'RDKitCluster':input_count})
Пример #28
0
def main():
    """Command-line entry point for FeatureStein scoring.

    Parses the command-line arguments, builds the feature maps from the
    fragments file (stored in the module-level ``fmaps``), then passes the
    input molecules and the output writer to ``score_molecules``. When
    ``--metrics`` is given the returned counters are written as metrics.

    Example usage::

        python -m pipelines.xchem.featurestein_generate_and_score -i ../../data/mpro/poses.sdf.gz -f ../../data/mpro/hits-17.sdf.gz -o output_fs
    """
    # The feature maps are published as a module global by this function.
    global fmaps

    parser = argparse.ArgumentParser(
        description='FeatureStein scoring with RDKit')
    parameter_utils.add_default_io_args(parser)
    parser.add_argument('-f',
                        '--fragments',
                        help='Fragments to use to generate the feature map')
    parser.add_argument('-ff', '--fragments-format', help='Fragments format')
    parser.add_argument(
        '--no-gzip',
        action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    parser.add_argument('--metrics', action='store_true', help='Write metrics')

    args = parser.parse_args()
    utils.log("FeatureStein Args: ", args)

    source = "featurestein_generate_and_score.py"
    datasetMetaProps = {
        "source": source,
        "description":
        "FeatureStein scoring using RDKit " + rdBase.rdkitVersion
    }

    clsMappings = {
        field_FeatureSteinQualityScore: "java.lang.Float",
        field_FeatureSteinQuantityScore: "java.lang.Float"
    }
    # One metadata entry per field. These must be separate dicts: a single
    # dict literal with repeated "fieldName"/"values" keys keeps only the
    # last pair, which would drop the quality score entry.
    fieldMetaProps = [{
        "fieldName": field_FeatureSteinQualityScore,
        "values": {
            "source": source,
            "description": "FeatureStein quality score"
        }
    }, {
        "fieldName": field_FeatureSteinQuantityScore,
        "values": {
            "source": source,
            "description": "FeatureStein quantity score"
        }
    }]

    # generate the feature maps
    frags_input, frags_suppl = rdkit_utils.default_open_input(
        args.fragments, args.fragments_format)

    fmaps = create_featuremap(frags_suppl)
    frags_input.close()

    # read the ligands to be scored
    inputs_file, inputs_supplr = rdkit_utils.default_open_input(
        args.input, args.informat)
    output, writer, output_base = rdkit_utils.default_open_output(
        args.output,
        'featurestein',
        args.outformat,
        valueClassMappings=clsMappings,
        datasetMetaProps=datasetMetaProps,
        fieldMetaProps=fieldMetaProps,
        compress=not args.no_gzip)

    # do the scoring
    total, success, errors = score_molecules(inputs_supplr, writer)
    utils.log('Scored', success, 'molecules.', errors, 'errors.')

    inputs_file.close()
    writer.flush()
    writer.close()
    output.close()

    if args.metrics:
        utils.write_metrics(
            output_base, {
                '__InputCount__': total,
                '__OutputCount__': success,
                '__ErrorCount__': errors,
                'RDKitFeatureMap': success
            })