Пример #1
0
def main():
    """Command-line entry point for the SMoG2016 scoring run.

    Parses the command line, copies the supplied PDB file to a fixed path,
    opens the input and output through ``utils`` and maps ``run_dock`` over
    the input molecules while the working directory is the SMoG2016 install
    directory. Sets the module globals ``PDB_PATH``, ``WRITER`` and
    ``THRESHOLD``. When ``--meta`` is given, a metrics file is written from
    the ``COUNTER`` and ``SUCCESS`` values (not assigned here; presumably
    module globals updated by ``run_dock`` - verify).

    NOTE(review): ``--thin`` is parsed but not used anywhere in this function.
    """
    global WRITER, THRESHOLD
    global PDB_PATH
    parser = argparse.ArgumentParser(
        description='SMoG2016 - Docking calculation.')
    utils.add_default_io_args(parser)
    parser.add_argument(
        '--no-gzip',
        action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    parser.add_argument('-pdb', '--pdb_file', help="PDB file for scoring")
    parser.add_argument('-t',
                        '--threshold',
                        help="The maximum score to allow",
                        default=None)
    parser.add_argument('--thin', action='store_true', help='Thin output mode')

    args = parser.parse_args()

    utils.log("SMoG2016 Args: ", args)

    smog_path = "/usr/local/SMoG2016_Rev1/"
    # The threshold arrives as a string (no argparse type), so convert here.
    THRESHOLD = float(args.threshold) if args.threshold else None

    PDB_PATH = "/tmp/pdb_file.pdb"
    # Copy the PDB to PDB_PATH -> silly SMOG bug requires underscore in the filename!
    shutil.copy(args.pdb_file, PDB_PATH)

    # Open up the input file
    input, suppl = utils.default_open_input(args.input, args.informat)
    # Open the output file
    output, WRITER, output_base = utils.default_open_output(
        args.output, "SMoG2016", args.outformat, compress=not args.no_gzip)

    # Cd to the route of the action
    # TODO - can this be done without changing dir? It gives problems in finding the input files and in writing the metrics
    cwd = os.getcwd()
    os.chdir(smog_path)

    # Iterate over the molecules
    # TODO - restore parallel processing, but need to ensure the order of molecules is preserved
    pool = ThreadPool(1)
    try:
        pool.map(run_dock, suppl)
    finally:
        # Always release the worker thread, close the writer and restore the
        # original working directory, even if a docking call raised.
        pool.close()
        pool.join()
        WRITER.close()
        os.chdir(cwd)

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': COUNTER,
                '__OutputCount__': SUCCESS,
                'SMoG2016': COUNTER
            })

    utils.log("SMoG2016 complete")
Пример #2
0
def main():
    """Command-line entry point that maps ``run_dock`` over the input molecules.

    Parses the command line, opens the input and output through ``utils``,
    sets the module globals ``PDB_PATH``, ``WRITER`` and (when supplied)
    ``THRESHOLD``, then runs ``run_dock`` on every record using a thread pool.
    When ``--meta`` is given, a metrics file is written from the ``COUNTER``
    and ``SUCCESS`` values (not assigned here; presumably module globals
    updated by ``run_dock`` - verify).

    NOTE(review): the parser description says SMoG2016 while the log message
    and output name refer to PLI/plip - looks like a copy-paste; confirm.
    """
    global PDB_PATH, WRITER, THRESHOLD
    parser = argparse.ArgumentParser(
        description='SMoG2016 - Docking calculation.')
    utils.add_default_io_args(parser)
    parser.add_argument(
        '--no-gzip',
        action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    parser.add_argument('-pdb', '--pdb_file', help="PDB file for scoring")
    parser.add_argument('-t',
                        '--threshold',
                        type=float,
                        help="The maximum score to allow",
                        default=None)
    parser.add_argument(
        '--threads',
        type=int,
        help="Number of threads to used. Default is the number of cores",
        default=None)
    parser.add_argument('--thin', action='store_true', help='Thin output mode')

    args = parser.parse_args()
    utils.log("PLI Args: ", args)

    # Open up the input file
    input, suppl = utils.default_open_input(args.input, args.informat)
    # Open the output file
    output, WRITER, output_base = utils.default_open_output(
        args.output,
        "plip",
        args.outformat,
        compress=not args.no_gzip,
        thinOutput=args.thin)

    PDB_PATH = args.pdb_file
    # Compare against None so that an explicit threshold of 0 is honoured;
    # when no threshold is given the existing THRESHOLD value is left as is.
    if args.threshold is not None:
        THRESHOLD = args.threshold

    # Iterate over the molecules
    # WARNING - if using parallel processing the order of molecules is not preserved. Set args.threads to 1 to ensure this.
    if args.threads is not None:
        num_threads = args.threads
    else:
        num_threads = multiprocessing.cpu_count()
    pool = ThreadPool(num_threads)
    try:
        pool.map(run_dock, suppl)
    finally:
        # Release the workers and close the output even if a task raised.
        pool.close()
        pool.join()
        WRITER.close()

    if args.meta:
        utils.write_metrics(output_base, {
            '__InputCount__': COUNTER,
            '__OutputCount__': SUCCESS,
            'PLI': COUNTER
        })
Пример #3
0
def split(input, informat, fieldName, outputBase, writeMetrics):
    """Split the input into separate SD files keyed by a field value.

    Each record is written to a file named ``outputBase + <value>`` (with
    ``.sdf`` appended in the returned names), where ``<value>`` is the string
    form of the record's ``fieldName`` property. Records that are ``None``,
    lack the field, or have an empty value are not written.

    :param input: input source passed to ``utils.default_open_input``
    :param informat: input format passed to ``utils.default_open_input``
    :param fieldName: name of the property whose value selects the output file
    :param outputBase: prefix for the generated file names
    :param writeMetrics: when true, write a metrics file via ``utils.write_metrics``
    :return: list of the generated file names, in order of first use
    """

    input, suppl = utils.default_open_input(input, informat)

    i = 0
    written = 0
    writers = {}
    outputs = []
    filenames = []
    for mol in suppl:
        i += 1
        if mol is None:
            continue
        if not mol.HasProp(fieldName):
            utils.log("Skipping molecule", i, "- did not contain field",
                      fieldName)
            continue
        value = mol.GetProp(fieldName)
        if not value:
            continue
        s = str(value)
        # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
        if s in writers:
            writer = writers[s]
        else:
            # First record with this value: open a new output for it.
            name = outputBase + s
            output, writer = utils.default_open_output_sdf(
                name, outputBase, False, False)
            filenames.append(name + '.sdf')
            outputs.append(output)
            writers[s] = writer
        writer.write(mol)
        written += 1

    utils.log("Generated", len(writers), "outputs from", i, "records")

    input.close()
    for writer in writers.values():
        writer.close()
    for o in outputs:
        o.close()

    if writeMetrics:
        utils.write_metrics(outputBase, {
            '__InputCount__': i,
            '__OutputCount__': written,
            'Splitter': i
        })

    return filenames
Пример #4
0
def split(input, informat, fieldName, outputBase):
    """Split the input into separate SD files keyed by a field value.

    Each record is written to a file named ``outputBase + <value>`` (with
    ``.sdf`` appended in the returned names), where ``<value>`` is the string
    form of the record's ``fieldName`` property. Records that are ``None``,
    lack the field, or have an empty value are not written.

    :param input: input source passed to ``utils.default_open_input``
    :param informat: input format passed to ``utils.default_open_input``
    :param fieldName: name of the property whose value selects the output file
    :param outputBase: prefix for the generated file names
    :return: list of the generated file names, in order of first use
    """

    input, suppl = utils.default_open_input(input, informat)

    i = 0
    writers = {}
    outputs = []
    filenames = []
    for mol in suppl:
        i += 1
        if mol is None:
            continue
        # Guard the property lookup so a record without the field is skipped
        # instead of aborting the whole split.
        if not mol.HasProp(fieldName):
            utils.log("Skipping molecule", i, "- did not contain field",
                      fieldName)
            continue
        value = mol.GetProp(fieldName)
        if not value:
            continue
        s = str(value)
        # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
        if s in writers:
            writer = writers[s]
        else:
            # First record with this value: open a new output for it.
            name = outputBase + s
            output, writer = utils.default_open_output_sdf(
                name, outputBase, False, False)
            filenames.append(name + '.sdf')
            outputs.append(output)
            writers[s] = writer
        writer.write(mol)

    utils.log("Generated", len(writers), "outputs from", i, "records")

    input.close()
    for writer in writers.values():
        writer.close()
    for o in outputs:
        o.close()

    return filenames
Пример #5
0
def main():
    """Command-line entry point that runs a named reaction over the input.

    Parses the command line, opens the input molecules, the reagent library
    and the output through ``utils``, then calls
    ``Filter.perform_reaction`` for every non-None input molecule, carrying
    the running product count through each call. Optionally writes a metrics
    file when ``--meta`` is given.

    :raises ValueError: if ``--multi`` is requested without an output location
    """
    ### command line args defintions #########################################

    ### Define the reactions available
    # Imported here so the reaction definitions are only loaded when main() runs.
    from poised_filter import Filter
    filter_to_use = Filter()

    parser = argparse.ArgumentParser(description='RDKit rxn process')
    utils.add_default_io_args(parser)
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='Quiet mode')
    parser.add_argument('-m',
                        '--multi',
                        action='store_true',
                        help='Output one file for each reaction')
    parser.add_argument('-r',
                        '--reaction',
                        choices=filter_to_use.poised_reactions.keys(),
                        help='Name of reaction to be run')
    parser.add_argument('-rl',
                        '--reagent_lib',
                        help="Input SD file, if not defined the STDIN is used")
    parser.add_argument(
        '-rlf',
        '--reagent_lib_format',
        choices=['sdf', 'json'],
        help="Input format. When using STDIN this must be specified.")

    args = parser.parse_args()
    utils.log("Screen Args: ", args)

    if not args.output and args.multi:
        raise ValueError(
            "Must specify output location when writing individual result files"
        )

    input, suppl = utils.default_open_input(args.input, args.informat)
    reagent_input, reagent_suppl = utils.default_open_input(
        args.reagent_lib, args.reagent_lib_format)
    output, writer, output_base = utils.default_open_output(
        args.output, "rxn_maker", args.outformat)

    i = 0
    count = 0

    # NOTE(review): the per-reaction writers are opened and closed here but
    # are never passed to perform_reaction - confirm whether --multi is wired up.
    if args.multi:
        writer_dict = filter_to_use.get_writers(os.path.dirname(args.output))
    else:
        writer_dict = None

    for mol in suppl:
        i += 1
        if mol is None:
            continue
        # Return a dict/class here - indicating which filters passed
        count = filter_to_use.perform_reaction(mol, args.reaction,
                                               reagent_suppl, writer, count)

    utils.log("Created", count, "molecules from a total of ", i,
              "input molecules")

    writer.flush()
    writer.close()
    if input:
        input.close()
    # The reagent library stream was previously left open.
    if reagent_input:
        reagent_input.close()
    if output:
        output.close()
    # close the individual writers
    if writer_dict:
        for key in writer_dict:
            writer_dict[key].close()

    if args.meta:
        utils.write_metrics(
            output_base, {
                '__InputCount__': i,
                '__OutputCount__': count,
                'RxnSmartsFilter': count
            })
Пример #6
0
def main():
    """Command-line entry point that writes a Butina cluster similarity matrix.

    Reads the input molecules, computes a descriptor for each, clusters them
    with ``cluster_butina.ClusterFps`` and writes one row per molecule pair
    whose matrix value exceeds ``--matrixThreshold`` (in both orientations),
    plus one self-pair row with value 1.0 for every molecule. Optionally
    writes a metrics file when ``--meta`` is given.

    :raises ValueError: if the selected descriptor maps to ``None``
    """

    ### command line args defintions #########################################

    parser = argparse.ArgumentParser(description='RDKit Butina Cluster Matrix')
    utils.add_default_input_args(parser)
    parser.add_argument('-o', '--output', help="Base name for output file (no extension). If not defined then SDTOUT is used for the structures and output is used as base name of the other files.")
    parser.add_argument('-of', '--outformat', choices=['tsv', 'json'], default='tsv', help="Output format. Defaults to 'tsv'.")
    parser.add_argument('--meta', action='store_true', help='Write metadata and metrics files')
    parser.add_argument('-t', '--threshold', type=float, default=0.7, help='Similarity clustering threshold (1.0 means identical)')
    parser.add_argument('-mt', '--matrixThreshold', type=float, default=0.5, help='Threshold for outputting values (1.0 means identical)')
    parser.add_argument('-d', '--descriptor', type=str.lower, choices=list(cluster_butina.descriptors.keys()), default='rdkit', help='descriptor or fingerprint type (default rdkit)')
    parser.add_argument('-m', '--metric', type=str.lower, choices=list(cluster_butina.metrics.keys()), default='tanimoto', help='similarity metric (default tanimoto)')
    parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')

    args = parser.parse_args()
    utils.log("Cluster Matrix Args: ", args)

    descriptor = cluster_butina.descriptors[args.descriptor]
    if descriptor is None:
        raise ValueError('Invalid descriptor name ' + args.descriptor)

    input, suppl = utils.default_open_input(args.input, args.informat)

    # handle metadata
    source = "cluster_butina_matrix.py"
    datasetMetaProps = {"source": source, "description": "Butina clustering using RDKit " + rdBase.rdkitVersion}
    clsMappings = {
        "Cluster1": "java.lang.Integer",
        "Cluster2": "java.lang.Integer",
        "ID1": "java.lang.String",
        "ID2": "java.lang.String",
        "M1": "java.lang.String",
        "M2": "java.lang.String",
        "Similarity": "java.lang.Float"
    }
    fieldMetaProps = [{"fieldName": "Cluster", "values": {"source": source, "description": "Cluster number"}}]

    # Column order of the output, preserved by the ordered dict.
    fieldNames = collections.OrderedDict()
    for field in ('ID1', 'ID2', 'Cluster1', 'Cluster2', 'Similarity', 'M1', 'M2'):
        fieldNames[field] = field

    writer, output_base = utils.create_simple_writer(args.output, 'cluster_butina_matrix', args.outformat, fieldNames,
                                                     valueClassMappings=clsMappings, datasetMetaProps=datasetMetaProps, fieldMetaProps=fieldMetaProps)

    ### generate fingerprints
    # Count every record read (including unparsable ones) so the metrics
    # report the real input size rather than a leftover loop index.
    input_count = 0
    mols = []
    for mol in suppl:
        input_count += 1
        if mol is not None:
            mols.append(mol)
    fps = [descriptor(x) for x in mols]
    input.close()

    ### do clustering
    utils.log("Clustering with descriptor", args.descriptor, "metric", args.metric, "and threshold", args.threshold)
    clusters, dists, matrix, = cluster_butina.ClusterFps(fps, args.metric, 1.0 - args.threshold)
    utils.log("Found", len(clusters), "clusters")

    MapClusterToMols(clusters, mols)

    if not args.quiet:
        utils.log("Clusters:", clusters)

    writer.writeHeader()

    size = len(matrix)
    count = 0
    for i in range(size):
        # self-pair row for molecule i
        writer.write(create_values(mols, i, i, 1.0))
        count += 1
        for j in range(len(matrix[i])):
            dist = matrix[i][j]
            if dist > args.matrixThreshold:
                # the matrix is the lower left segment without the diagonal
                x = j
                y = i + 1
                writer.write(create_values(mols, x, y, dist))
                writer.write(create_values(mols, y, x, dist))
                count += 2
    # Row i of the matrix refers to molecule i + 1, so the last molecule's
    # self-pair is not produced by the loop above. Only write it when that
    # index is valid (e.g. not for empty input), and count it as output.
    if size < len(mols):
        writer.write(create_values(mols, size, size, 1.0))
        count += 1

    writer.writeFooter()
    writer.close()

    if args.meta:
        utils.write_metrics(output_base, {'__InputCount__': input_count, '__OutputCount__': count, 'RDKitCluster': input_count})
Пример #7
0
                        default=10,
                        help='number of conformers to generate')
    parser.add_argument('-r', '--refmol', help="Reference molecule file")
    parser.add_argument('--refmolidx',
                        help="Reference molecule index in file",
                        type=int,
                        default=1)
    parser.add_argument(
        '-c',
        '--core_smi',
        help='Core substructure. If not specified - guessed using MCS',
        default='')

    args = parser.parse_args()
    # Get the reference molecule
    ref_mol_input, ref_mol_suppl = utils.default_open_input(
        args.refmol, args.refmol)
    counter = 0
    # Get the specified reference molecule. Default is the first
    for mol in ref_mol_suppl:
        counter += 1
        if counter == args.refmolidx:
            ref_mol = mol
            break
    ref_mol_input.close()

    if counter < args.refmolidx:
        raise ValueError("Invalid refmolidx. " + str(args.refmolidx) +
                         " was specified but only " + str(counter) +
                         " molecules were present in refmol.")

    # handle metadata
Пример #8
0
def main():
    """Command-line entry point for the RDKit molecule filter.

    Reads molecules from the input, optionally reduces each to a single
    fragment, applies the heavy-atom-count / molecular-weight filter and
    writes the survivors either to one output or to consecutively numbered
    chunk files. Optionally writes a metrics file when ``--meta`` is given.
    """

    ### command line args defintions #########################################

    parser = argparse.ArgumentParser(description='RDKit filter')
    parser.add_argument(
        '-f', '--fragment', choices=['hac', 'mw'],
        help='Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight )')
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    parser.add_argument(
        '-l', '--limit', type=int, help='Limit output to this many records')
    parser.add_argument(
        '-c', '--chunksize', type=int,
        help='Split output into chunks of size c. Output will always be files. Names like filter1.sdf.gz, filter2.sdf.gz ...')
    parser.add_argument(
        '-d', '--digits', type=int, default=0,
        help='When splitting zero pad the file name to this many digits so that they are in sorted order. Names like filter001.sdf.gz, filter002.sdf.gz ...')
    parser.add_argument(
        '--no-gzip', action='store_true',
        help='Do not compress the output (STDOUT is never compressed')
    # WARNING: thin output is not appropriate when using --fragment
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument(
        '-q', '--quiet', action='store_true',
        help='Quiet mode - suppress reporting reason for filtering')
    utils.add_default_io_args(parser)
    args = parser.parse_args()
    utils.log("Filter Args: ", args)

    in_stream, suppl = utils.default_open_input(args.input, args.informat)

    gzip_output = not args.no_gzip

    def chunk_name(number):
        # Base name of chunk ``number``, zero padded to --digits characters.
        return output_base + str(number).zfill(args.digits)

    chunk_index = 1
    if args.chunksize:
        # Chunked output always goes to files; fall back to a fixed base name.
        output_base = args.output if args.output else 'filter'
        first_chunk = chunk_name(chunk_index)
        out_stream, writer, current_base = utils.default_open_output(
            first_chunk, first_chunk, args.outformat, compress=gzip_output)
    else:
        out_stream, writer, current_base = utils.default_open_output(
            args.output, "filter", args.outformat, compress=gzip_output)
        output_base = current_base

    utils.log("Writing to " + current_base)

    total = 0
    count = 0
    for mol in suppl:
        if args.limit and count >= args.limit:
            break
        total += 1
        if mol is None:
            continue
        if args.fragment:
            mol = fragment(mol, args.fragment, quiet=args.quiet)
        passed = filter(mol,
                        minHac=args.hacmin,
                        maxHac=args.hacmax,
                        minMw=args.mwmin,
                        maxMw=args.mwmax,
                        quiet=args.quiet)
        if not passed:
            continue
        # Roll over to the next chunk file once the current one is full.
        if args.chunksize and count > 0 and count % args.chunksize == 0:
            writer.close()
            out_stream.close()
            chunk_index += 1
            next_chunk = chunk_name(chunk_index)
            utils.log("Writing to " + next_chunk)
            out_stream, writer, _ = utils.default_open_output(
                next_chunk, next_chunk, args.outformat, compress=gzip_output)

        count += 1
        writer.write(mol)

    utils.log("Filtered", total, "down to", count, "molecules")
    if args.chunksize:
        utils.log("Wrote", chunk_index, "chunks")
        if args.digits > 0 and len(str(chunk_index)) > args.digits:
            utils.log(
                "WARNING: not enough digits specified for the number of chunks"
            )

    writer.flush()
    writer.close()
    in_stream.close()
    out_stream.close()

    if args.meta:
        metrics = {
            '__InputCount__': total,
            '__OutputCount__': count,
            'RDKitFilter': total
        }
        utils.write_metrics(output_base, metrics)
Пример #9
0
def main():
    """Command-line entry point for MaxMin diverse subset picking.

    Fingerprints the optional seed molecules and the candidate molecules,
    calls ``performPick`` with the seeds as initial picks, and writes the
    picked candidates in their original input order. Each written molecule
    gets a 1-based ``PickIndex`` property recording its position in the pick
    list. Optionally writes a metrics file when ``--meta`` is given.

    :raises ValueError: if the descriptor maps to ``None``, or if neither
        ``--num`` nor ``--threshold`` is specified
    """

    ### command line args defintions #########################################

    parser = argparse.ArgumentParser(description='RDKit Butina Cluster')
    parser.add_argument('-t', '--threshold', type=float, default=0.0, help='similarity threshold (1.0 means identical)')
    # The help text previously claimed the default was rdkit; it is morgan2.
    parser.add_argument('-d', '--descriptor', type=str.lower, choices=list(descriptors.keys()), default='morgan2', help='descriptor or fingerprint type (default morgan2)')
    parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')
    parser.add_argument('-n', '--num', type=int, help='maximum number to pick for diverse subset selection')
    parser.add_argument('-s', '--seed-molecules', help='optional file containing any seed molecules that have already been picked')
    parser.add_argument('--fragment-method', choices=['hac', 'mw'], default='hac', help='Approach to find biggest fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)')
    parser.add_argument('--output-fragment', action='store_true', help='Output the biggest fragment rather than the original molecule')
    utils.add_default_io_args(parser)

    args = parser.parse_args()
    utils.log("MaxMinPicker Args: ", args)

    descriptor = descriptors[args.descriptor]
    if descriptor is None:
        raise ValueError('No descriptor specified')

    if not args.num and not args.threshold:
        raise ValueError('--num or --threshold arguments must be specified, or both')

    # handle metadata
    source = "max_min_picker.py"
    datasetMetaProps = {"source": source, "description": "MaxMinPicker using RDKit " + rdBase.rdkitVersion}

    ### generate fingerprints
    fps = []
    mols = []
    errors = 0

    # first the initial seeds, if specified
    firstPicks = []
    num_seeds = 0
    if args.seed_molecules:
        seedsInput, seedsSuppl = utils.default_open_input(args.seed_molecules, None)
        start = time.time()
        errors += mol_utils.fragmentAndFingerprint(seedsSuppl, mols, fps, descriptor, fragmentMethod=args.fragment_method, outputFragment=args.output_fragment, quiet=args.quiet)
        end = time.time()
        seedsInput.close()
        num_seeds = len(fps)
        utils.log("Read", len(fps), "fingerprints for seeds in", end-start, "secs,", errors, "errors")
        # The seeds occupy the first num_seeds fingerprint slots.
        firstPicks = list(range(num_seeds))

    # now the molecules to pick from
    # NOTE(review): the default output name 'cluster_butina' looks copied from
    # another script - confirm before changing, as it may affect file names.
    input, output, suppl, writer, output_base = utils.default_open_input_output(args.input, args.informat, args.output, 'cluster_butina',
                                                                                args.outformat, datasetMetaProps=datasetMetaProps)
    # reset the mols list as we don't need the seeds, only the candidates
    mols = []
    start = time.time()
    errs = mol_utils.fragmentAndFingerprint(suppl, mols, fps, descriptor, fragmentMethod=args.fragment_method, outputFragment=args.output_fragment, quiet=args.quiet)
    end = time.time()
    errors += errs

    input.close()
    num_fps = len(fps)
    num_candidates = num_fps - num_seeds
    utils.log("Read", num_candidates, "fingerprints for candidates in", end-start, "secs,", errs, "errors")

    if not args.num:
        num_to_pick = num_candidates
    elif args.num > num_candidates:
        num_to_pick = num_candidates
        utils.log("WARNING: --num argument (", args.num, ") is larger than the total number of candidates (", num_candidates, ") - resetting to", num_candidates)
    else:
        num_to_pick = args.num

    ### do picking
    utils.log("MaxMinPicking with descriptor", args.descriptor, "and threshold", args.threshold, ",", num_seeds, "seeds,", num_candidates, "candidates", num_fps, "total")
    start = time.time()
    picks, thresh = performPick(fps, num_to_pick + num_seeds, args.threshold, firstPicks)
    end = time.time()
    num_picks = len(picks)

    utils.log("Found", num_picks, "molecules in", end-start, "secs, final threshold", thresh)
    # Drop the leading seed entries; only candidate picks are reported and written.
    candidate_picks = list(picks[num_seeds:])
    utils.log("Picks:", candidate_picks)
    del fps

    # we want to return the results in the order they were in the input so
    # first we record the (1-based) position of each index in the pick list
    pick_position = {idx: pos for pos, idx in enumerate(candidate_picks, start=1)}
    # now write out the mols in input order recording the position in the
    # pick list as the PickIndex property
    for idx in sorted(candidate_picks):
        mol = mols[idx - num_seeds]  # mols array only contains the candidates
        mol.SetIntProp("PickIndex", pick_position[idx])
        writer.write(mol)
    i = len(candidate_picks)
    utils.log("Output", i, "molecules")

    writer.flush()
    writer.close()
    output.close()

    if args.meta:
        metrics = {}
        status_str = "{} compounds picked. Final threshold was {}.".format(i, thresh)
        if errors > 0:
            metrics['__ErrorCount__'] = errors
            status_str = status_str + " {} errors.".format(errors)

        metrics['__StatusMessage__'] = status_str
        metrics['__InputCount__'] = num_fps
        metrics['__OutputCount__'] = i
        metrics['RDKitMaxMinPicker'] = num_picks

        utils.write_metrics(output_base, metrics)
Пример #10
0
def main():
    """Command-line entry point for the RDKit molecule filter (gzip SDF output).

    Reads molecules from the input, optionally reduces each to a single
    fragment, applies the heavy-atom-count / molecular-weight filter and
    writes the survivors as SDF to STDOUT, to one gzipped file, or to
    consecutively numbered gzipped chunk files. Optionally writes a metrics
    file when ``--meta`` is given.

    NOTE(review): ``--thin`` is parsed but not used anywhere in this function.
    """

    ### command line args defintions #########################################

    parser = argparse.ArgumentParser(description='RDKit filter')
    parser.add_argument('-f', '--fragment', choices=['hac', 'mw'], help='Find single fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight )')
    parser.add_argument('--hacmin', type=int, help='Min heavy atom count')
    parser.add_argument('--hacmax', type=int, help='Max heavy atom count')
    parser.add_argument('--mwmin', type=float, help='Min mol weight')
    parser.add_argument('--mwmax', type=float, help='Max mol weight')
    parser.add_argument('-l', '--limit', type=int, help='Limit output to this many records')
    parser.add_argument('-c', '--chunksize', type=int, help='Split output into chunks of size c. Output will always be files. Names like filter01.sdf.gz, filter02.sdf.gz ...')
    # WARNING: thin output is not appropriate when using --fragment
    parser.add_argument('--thin', action='store_true', help='Thin output mode')
    parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode - suppress reporting reason for filtering')
    utils.add_default_io_args(parser)
    args = parser.parse_args()
    utils.log("Filter Args: ", args)

    input, suppl = utils.default_open_input(args.input, args.informat)
    # Chunked output always goes to files; fall back to a fixed base name.
    if args.output:
        output_base = args.output
    else:
        output_base = 'filter'

    chunkCount = 1
    if args.chunksize:
        output = gzip.open(output_base + str(chunkCount) + '.sdf.gz', 'w+')
    elif args.output:
        output = gzip.open(output_base + '.sdf.gz', 'w+')
    else:
        output = sys.stdout

    writer = Chem.SDWriter(output)

    i = 0
    count = 0
    for mol in suppl:
        if args.limit and count >= args.limit:
            break
        i += 1
        if mol is None:
            continue
        if args.fragment:
            mol = fragment(mol, args.fragment, quiet=args.quiet)
        if not filter(mol, minHac=args.hacmin, maxHac=args.hacmax, minMw=args.mwmin, maxMw=args.mwmax, quiet=args.quiet):
            continue
        # Roll over to the next chunk file once the current one is full.
        if args.chunksize and count > 0 and count % args.chunksize == 0:
            writer.close()
            output.close()
            chunkCount += 1
            output = gzip.open(output_base + str(chunkCount) + '.sdf.gz', 'w+')
            writer = Chem.SDWriter(output)

        count += 1
        writer.write(mol)

    utils.log("Filtered", i, "down to", count, "molecules")
    if args.chunksize:
        utils.log("Wrote", chunkCount, "chunks")

    writer.flush()
    writer.close()
    input.close()
    # Never close the process-wide STDOUT; just flush what was written to it.
    if output is sys.stdout:
        output.flush()
    else:
        output.close()

    if args.meta:
        utils.write_metrics(output_base, {'__InputCount__': i, '__OutputCount__': count, 'RDKitFilter': i})