示例#1
0
def generate_sdf(filename):
    """Collect molecules, energies, coordinates and representations of an SDF.

    The atom list is taken from the first record only and is reused when
    building the representation of every following record.

    Returns four parallel lists:
    ``(molobjs, energies, coordinates, representations)``.
    """
    reader = cheminfo.read_sdffile(filename)

    # The first record supplies the atoms shared by all later records.
    head = next(reader)
    atoms, head_coord = cheminfo.molobj_to_xyz(head)
    head_energy = worker.get_energy(head)
    head_repr = sim.get_representation(atoms, head_coord)

    molobjs, energies = [head], [head_energy]
    coordinates, representations = [head_coord], [head_repr]

    # Remaining records only contribute new coordinates and energies.
    for mol in reader:
        mol_energy = worker.get_energy(mol)
        mol_coord = cheminfo.molobj_get_coordinates(mol)
        mol_repr = sim.get_representation(atoms, mol_coord)

        molobjs.append(mol)
        energies.append(mol_energy)
        coordinates.append(mol_coord)
        representations.append(mol_repr)

    return molobjs, energies, coordinates, representations
示例#2
0
def conformation(filename, procs=0):
    """Run ``get_conformations`` on every molecule of an SDF file.

    With ``procs == 0`` the molecules are handled one after another in this
    process; otherwise the ``(index, molecule)`` pairs are handed to
    ``misc.parallel`` together with the scratch directory.
    """
    scratch_dir = "_tmp_ensemble_/"

    # Each work package is an (index, molecule) tuple.
    packages = enumerate(cheminfo.read_sdffile(filename))

    if procs != 0:
        outcome = misc.parallel(packages,
                                get_conformations, [], {"scr": scratch_dir},
                                procs=procs)
        # Iterate over whatever misc.parallel returns; values are discarded.
        for _ in outcome:
            pass
        return

    for package in packages:
        get_conformations(package, scr=scratch_dir)

    return
def get_avg_repr(idx, scr="_tmp_ensemble_/", **kwargs):
    """Boltzmann-average the representations of one conformer ensemble.

    Loads ``<scr><idx>.energies`` and ``<scr><idx>.sdf``, converts the
    molecules to representations and averages them with weights
    ``exp(-E) / sum(exp(-E))``.

    Parameters
    ----------
    idx:
        Identifier used to build the input file names and, when an output
        array is supplied, the row that is written.
    scr:
        Directory prefix of the input files.
    **kwargs:
        Forwarded unchanged to ``xyzs_to_representations``. If it contains
        the key ``"array"``, the average is stored in ``array[idx, :]``.

    Returns
    -------
    ``None`` when ``"array"`` was supplied, otherwise ``(idx, avgrep)``.
    """
    prefix = scr + str(idx)

    energies = misc.load_npy(prefix + ".energies")
    molobjs = list(cheminfo.read_sdffile(prefix + ".sdf"))

    xyzs = molobjs_to_xyzs(molobjs)
    reprs = xyzs_to_representations(*xyzs, **kwargs)

    # Boltzmann factors. Shifting by the minimum energy leaves the
    # normalised weights mathematically unchanged but keeps exp() from
    # overflowing (very negative energies) or underflowing to an all-zero
    # sum (very positive energies).
    energies = np.asarray(energies, dtype=float)
    shift = energies.min() if energies.size else 0.0
    factors = np.exp(-(energies - shift))
    factors /= np.sum(factors)

    avgrep = np.zeros(reprs.shape[1])

    for rep, factor in zip(reprs, factors):
        avgrep += factor * rep

    print(idx, avgrep.shape)

    if "array" in kwargs:
        # Caller-provided output buffer: fill the row in place.
        results = kwargs["array"]
        results[idx, :] = avgrep
        return None

    return idx, avgrep
示例#4
0
def main():
    """Write a molecule/property subset and copy its conformer files.

    Reads the full structure and property files, lets ``search_molcules``
    pick a subset, writes that subset to ``data/sdf/subset_*`` and copies the
    matching ``.sdf`` / ``.energies.npy`` files from ``_tmp_ensemble_/`` to
    ``_tmp_subset_/conformers/``, renumbered consecutively.

    The command line is parsed (so ``--help`` works) but none of the parsed
    options is used by the code below.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch',
                        action='store',
                        help='',
                        metavar="dir",
                        default="tmp2/")
    parser.add_argument('--randomseed',
                        action='store',
                        help='random seed',
                        metavar="int",
                        default=666)
    parser.add_argument('--sdf', action='store', help='', metavar="file")
    parser.add_argument('-j',
                        '--cpu',
                        action='store',
                        help='pararallize',
                        metavar="int",
                        default=0)

    parser.parse_args()

    molecules = cheminfo.read_sdffile('data/sdf/structures.sdf.gz')

    # Context managers guarantee the handles are closed even when the search
    # or one of the writes raises.
    with open('data/sdf/properties.csv', 'r') as properties:
        sub_mol, sub_prop, idxs = search_molcules(molecules, properties)

    with open('data/sdf/subset_structures.sdf', 'w') as fm, \
            open('data/sdf/subset_properties.csv', 'w') as fp:

        for mol, prop in zip(sub_mol, sub_prop):

            sdf = cheminfo.molobj_to_sdfstr(mol)
            fm.write(sdf)
            fm.write("$$$$\n")
            fp.write(str(prop) + "\n")

    from_dir = "_tmp_ensemble_/"
    to_dir = "_tmp_subset_/conformers/"

    for i, idx in enumerate(idxs):
        # Original index ``idx`` becomes consecutive index ``i`` in the subset.
        cmd = "cp {:}{:}.sdf {:}{:}.sdf".format(from_dir, str(idx), to_dir,
                                                str(i))
        list(misc.shell(cmd))
        cmd = "cp {:}{:}.energies.npy {:}{:}.energies.npy".format(
            from_dir, str(idx), to_dir, str(i))
        list(misc.shell(cmd))

        print(cmd)

    return
示例#5
0
def merge_results_cumulative(sdffile, filenames, debug=True, molid=0):
    """Merge result files one by one, keeping only entries not seen before.

    The first non-empty result file seeds the merged lists. For every later
    file ``merge_asymmetric`` returns, per new entry, the list of already
    merged entries it matches; entries with an empty match list are added.
    With ``debug`` the number of new and total entries is printed per file.

    Nothing is returned; ``molid`` is accepted but not used.
    """
    merged_energies = []
    merged_coordinates = []
    merged_representations = []
    n_total = 0

    molobjs = list(cheminfo.read_sdffile(sdffile[0]))
    atoms_list = [cheminfo.molobj_to_atoms(mol) for mol in molobjs]

    for filename in filenames:

        next_energies, next_coordinates, atoms = read_resulttxt(
            atoms_list, filename)
        next_representations = [
            sim.get_representation(atoms, xyz) for xyz in next_coordinates
        ]

        # Nothing merged yet: take this file as it is.
        if not merged_energies:
            merged_energies.extend(next_energies)
            merged_coordinates.extend(next_coordinates)
            merged_representations.extend(next_representations)
            n_total += len(next_energies)
            continue

        matches = merge_asymmetric(atoms, next_energies, merged_energies,
                                   next_representations,
                                   merged_representations)

        n_new = 0
        for i, hits in enumerate(matches):

            # Only entries without any match are new.
            if len(hits) == 0:
                merged_energies.append(next_energies[i])
                merged_coordinates.append(next_coordinates[i])
                merged_representations.append(next_representations[i])
                n_new += 1

        if debug:
            n_total += n_new
            print(" - new", n_new)
            print("total", n_total)

    return
示例#6
0
def main_redis(args):
    """Serve tasks from a redis task queue using the molecules of an SDF.

    The connection string comes from ``args`` directly or, failing that, from
    the file ``args.redis_connect_file``. Molecules, their torsions and their
    original coordinates are prepared once and handed to ``redis_worker`` for
    every task delivered by the queue's main loop.
    """
    redis_task = args.redis_task

    if args.redis_connect is not None:
        # NOTE(review): the condition tests ``redis_connect`` but the value is
        # read from ``redis_connection_str`` -- confirm both attributes exist.
        redis_connection = args.redis_connection_str

    elif os.path.exists(args.redis_connect_file):
        with open(args.redis_connect_file, 'r') as f:
            redis_connection = f.read().strip()

    else:
        print("error: redis connection not set and file does not exists")
        print("error: path", args.redis_connect_file)
        quit()

    if args.debug:
        print("redis: connecting to", redis_connection)

    tasks = rediscomm.Taskqueue(redis_connection, redis_task)

    # Molecule database
    molecules = list(cheminfo.read_sdffile(args.sdf))

    # Torsion database: read from file when given, otherwise derived
    if args.sdftor is not None:
        tordb = read_tordb(args.sdftor)
    else:
        tordb = [cheminfo.get_torsions(mol) for mol in molecules]

    # Original coordinates of every molecule
    origins = [cheminfo.molobj_get_coordinates(mol) for mol in molecules]

    # TODO if threads is > 0 then make more redis_workers

    def do_work(task):
        return redis_worker(origins, molecules, tordb, task, debug=args.debug)

    tasks.main_loop(do_work)

    return
示例#7
0
def main_file(args):
    """Run a job file, or generate jobs, for the molecules of ``args.sdf``.

    Torsions are read from ``args.sdftor`` when it is set and derived from
    the molecules otherwise.
    """
    molobjs = list(cheminfo.read_sdffile(args.sdf))

    tordb = (read_tordb(args.sdftor) if args.sdftor else
             [cheminfo.get_torsions(mol) for mol in molobjs])

    if not args.jobfile:
        # TODO Base on tordb
        generate_jobs(molobjs, args, tordb=tordb)
        return

    run_jobfile(molobjs, tordb, args.jobfile, threads=args.threads)

    return
示例#8
0
def get_sdfcontent(sdffile, rtn_atoms=False):
    """Return the energies and coordinates of all molecules in an SDF file.

    With ``rtn_atoms`` the first molecule object and the atoms of the last
    molecule read are returned in front:
    ``(molobjs[0], atoms, energies, coordinates)``.
    """
    molobjs = list(cheminfo.read_sdffile(sdffile))

    # Stays an empty string when the file holds no molecules.
    atoms = ""
    energies, coordinates = [], []

    for mol in molobjs:
        atoms, xyz = cheminfo.molobj_to_xyz(mol)
        mol_energy = get_energy(mol)

        coordinates.append(xyz)
        energies.append(mol_energy)

    if not rtn_atoms:
        return energies, coordinates

    return molobjs[0], atoms, energies, coordinates
示例#9
0
def parse_ochem(filename, procs=0, debug=False):
    """Parse the molecules of an SDF file and keep the successful ones.

    Parameters
    ----------
    filename:
        SDF file handed to ``cheminfo.read_sdffile``.
    procs:
        Number of worker processes. ``0`` parses serially with
        ``parse_molobj``; a positive value maps ``parse_molandprop`` over
        ``(molobj, props)`` pairs in a process pool.
    debug:
        Forwarded to ``parse_molobj`` (serial path only).

    Returns
    -------
    ``(success_molobjs, success_properties)``: parallel lists holding every
    result whose molecule is not ``None``.
    """
    molobjs = cheminfo.read_sdffile(filename)

    if procs > 0:

        def generate_input(molobjs):
            # Unreadable records come back as None and are skipped up front.
            for molobj in molobjs:
                if molobj is None:
                    continue
                props = molobj.GetPropsAsDict()
                yield molobj, props

        pool = mp.Pool(processes=procs)
        try:
            results = pool.map(parse_molandprop, generate_input(molobjs))
        finally:
            # Always release the worker processes, also on failure.
            pool.close()
            pool.join()

    else:
        # Lazy, so parsing and collecting stay interleaved as before.
        results = (parse_molobj(molobj, debug=debug) for molobj in molobjs)

    success_properties = []
    success_molobjs = []

    for molobj, value in results:
        if molobj is None:
            continue

        success_properties.append(value)
        success_molobjs.append(molobj)

    return success_molobjs, success_properties
示例#10
0
def test_kernel():
    """Time the Python and the compiled Jaccard kernel on the same data.

    Fingerprints are built for at most the first 5000 molecules of
    ``_tmp_bing_bp_/structures.sdf.gz``; both kernels and their wall-clock
    times are printed.
    """
    import itertools

    n_mols = 5000

    # islice stops quietly when the file holds fewer than n_mols records,
    # where repeated next() calls would raise StopIteration.
    reader = cheminfo.read_sdffile("_tmp_bing_bp_/structures.sdf.gz")
    molobjs = list(itertools.islice(reader, n_mols))

    init = time.time()
    vectors = molobjs_to_fps(molobjs)

    print("init", time.time() - init)

    time_pykernel = time.time()
    kernel = bitmap_jaccard_kernel(vectors)
    print("pykernel", time.time() - time_pykernel)
    print(kernel)

    # Drop the first kernel before the second one is allocated.
    del kernel

    n_items = vectors.shape[0]

    # The compiled kernel is given the transposed vectors as integers.
    vectors = vectors.T
    vectors = np.array(vectors, dtype=int)

    time_fkernel = time.time()
    kernel = bitmap_kernels.symmetric_jaccard_kernel(n_items, vectors)
    print("fokernel", time.time() - time_fkernel)
    print(kernel)

    return
示例#11
0
def main():
    """Print the torsions of every molecule in an SDF file.

    One line is printed per molecule: its index, a colon, and the torsions
    as comma-separated groups of four space-separated values.
    """
    import argparse
    import os

    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--version', action='version', version="1.0")
    parser.add_argument('--sdf',
                        type=str,
                        help='SDF file',
                        metavar='file',
                        default="~/db/qm9s.sdf.gz")
    parser.add_argument('--debug', action="store_true", help="", default=False)

    args = parser.parse_args()

    # "~" is only expanded by the shell; the default above is never seen by
    # one, so expand it here. Paths without "~" pass through unchanged.
    sdf_path = os.path.expanduser(args.sdf)

    molobjs = cheminfo.read_sdffile(sdf_path)

    tordb = [cheminfo.get_torsions(molobj) for molobj in molobjs]

    for i, torsions in enumerate(tordb):

        torsions_str = []
        for torsion in torsions:
            fmt = " ".join(["{:}"] * 4).format(*torsion)
            torsions_str.append(fmt)

        state = ",".join(torsions_str)

        print(i, ":", state)

    return
示例#12
0
def main():
    """Filter an SDF/property file pair and store the surviving entries.

    Molecules and property lines are read in lockstep. A pair is kept when
    both ``molobjfilter`` and ``valuefilter`` accept it; kept pairs are
    written to ``<scratch>structures.sdf.gz`` and ``<scratch>properties.csv``
    and collected in a SMILES -> values mapping that is saved as JSON and as
    an object file under ``<scratch>molecules``.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch',
                        action='store',
                        help='',
                        metavar="DIR",
                        default="_tmp_")
    parser.add_argument('--sdf', action='store', help='',
                        metavar="FILE")
    parser.add_argument('--properties',
                        action='store',
                        help='',
                        metavar="FILE")
    parser.add_argument('-j',
                        '--procs',
                        action='store',
                        help='pararallize',
                        metavar="int",
                        default=0,
                        type=int)

    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    moledict = {}

    # Context managers close all three handles even if a filter or a
    # conversion raises half-way through the data.
    with gzip.open(args.scratch + "structures.sdf.gz", 'w') as fsdf, \
            open(args.scratch + "properties.csv", 'w') as fprop, \
            open(args.properties, 'r') as properties:

        molecules = cheminfo.read_sdffile(args.sdf)

        for molobj, line in zip(molecules, properties):

            if not molobjfilter(molobj):
                continue

            if not valuefilter(line):
                continue

            smiles = cheminfo.molobj_to_smiles(molobj, remove_hs=True)

            print(smiles)

            sdfstr = cheminfo.molobj_to_sdfstr(molobj)
            sdfstr += "$$$$\n"
            # The gzip handle is opened in binary mode, hence the encode().
            fsdf.write(sdfstr.encode())
            fprop.write(line)

            # The first column is skipped; the rest are numeric values.
            values = [float(x) for x in line.split()[1:]]
            moledict[smiles] = values

    misc.save_json(args.scratch + "molecules", moledict)
    misc.save_obj(args.scratch + "molecules", moledict)

    return
示例#13
0
def main():
    """Plot Joback melting-point estimates against the stored properties.

    For every molecule in ``<scratch>structures.sdf.gz`` a Joback estimate is
    attempted. Molecules whose status reports unmatched atoms, or whose
    estimate raises ``TypeError``, count as errors and are left out. The RMSE
    of the remaining predictions is printed and a scatter plot, coloured by
    heavy-atom count, is saved as ``_fig_joback``.
    """
    import argparse
    cli = argparse.ArgumentParser()
    cli.add_argument('--scratch',
                     action='store',
                     help='',
                     metavar="dir",
                     default="_tmp_")
    cli.add_argument('-j',
                     '--procs',
                     action='store',
                     help='pararallize',
                     metavar="int",
                     default=0,
                     type=int)

    args = cli.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # Reference values and structures
    properties = misc.load_npy(args.scratch + "properties")
    molecules = list(
        cheminfo.read_sdffile(args.scratch + "structures.sdf.gz"))

    heavy_atoms = []
    predictions = []
    errors = []

    for mol, prop in zip(molecules, properties):

        smi = cheminfo.molobj_to_smiles(mol, remove_hs=True)
        joback = thermo.joback.Joback(smi)
        status = joback.status

        # Count the atoms whose atomic number is not 1 (non-hydrogens).
        atoms, coord = cheminfo.molobj_to_xyz(mol)
        heavy = atoms[np.where(atoms != 1)]
        heavy_atoms.append(len(heavy))

        failed = "Did not match all atoms present" in status

        if not failed:
            try:
                estimate = joback.estimate()
            except TypeError:
                failed = True

        if failed:
            errors.append(1)
            predictions.append(float("nan"))
            continue

        errors.append(0)

        boiling = estimate["Tb"]  # read like the original, not used further
        melting = estimate["Tm"]

        predictions.append(melting)

    errors = np.array(errors, dtype=int)

    idx_success, = np.where(errors == 0)

    # Keep only the molecules for which an estimate exists.
    heavy_atoms = np.array(heavy_atoms)[idx_success]
    predictions = np.array(predictions)[idx_success]
    properties = np.array(properties)[idx_success]

    print("total", errors.shape[0], "filter", idx_success.shape[0])
    print()
    print(rmse(properties, predictions))

    # Diagonal as the ideal line, then the actual predictions.
    plt.plot(properties, properties, "-k")
    plt.scatter(properties, predictions, s=0.95, alpha=0.8, c=heavy_atoms)

    plt.xlabel("True")
    plt.ylabel("Predicted")

    plt.savefig("_fig_joback")
    plt.clf()

    return
示例#14
0
def mergesdfs(sdflist):
    """Print FCHL kernels between the conformers of the first two SDF files.

    Three kernels are printed for the first 3 conformers of file one against
    the first 4 of file two: one from ``workkernel`` representation objects,
    one from ``get_kernel_fchl`` and one from ``get_kernel_fchl_``. The atoms
    of the very first molecule are used for every conformer.
    """
    # All molecules and their coordinates, one list per file
    molobjs_list = [list(cheminfo.read_sdffile(sdf)) for sdf in sdflist]
    coordinates_sdf = [[
        cheminfo.molobj_get_coordinates(mol) for mol in mols
    ] for mols in molobjs_list]

    atoms, _ = cheminfo.molobj_to_xyz(molobjs_list[0][0])

    coordinates_x = coordinates_sdf[0]
    coordinates_y = coordinates_sdf[1]

    # Kernel from representation objects
    parameters = {
        'alchemy': 'off',
        "cut_distance": 10**6,
        'kernel_args': {
            "sigma": [0.8]
        },
    }

    repObjs_x = []
    for xyz in coordinates_x[:3]:
        repObjs_x.append(
            workkernel.FchlRepresentation(atoms, xyz, **parameters))

    repObjs_y = []
    for xyz in coordinates_y[:4]:
        repObjs_y.append(
            workkernel.FchlRepresentation(atoms, xyz, **parameters))

    similarity = workkernel.get_kernel_from_objs(repObjs_x, repObjs_y,
                                                 **parameters)
    print("qml obj kernel:")
    print(similarity)

    # Same comparison with plain FCHL representations
    n_atoms = len(atoms)

    representations_x = np.asarray(
        get_representations_fchl(atoms, coordinates_x, max_size=n_atoms)[:3])
    representations_y = np.asarray(
        get_representations_fchl(atoms, coordinates_y, max_size=n_atoms)[:4])

    similarity = get_kernel_fchl(representations_x, representations_y)
    print("qml kernel:")
    print(similarity)

    similarity = get_kernel_fchl_(representations_x, representations_y)
    print("qml fst kernel:")
    print(similarity)

    return
示例#15
0
import qml
from chemhelp import cheminfo
import numpy as np
from rdkit import Chem

import sys

args = sys.argv[1:]

# The first command-line argument is the SDF file to check.
filename = args[0]

molobjs = cheminfo.read_sdffile(filename)

# Report every molecule that contains (nearly) overlapping atoms.
for i, molobj in enumerate(molobjs):

    # The loop variable already is the current molecule. Pulling another one
    # with next() here would skip every other record, pair indices with the
    # wrong molecule and raise StopIteration on an odd number of records.

    # The diagonal (distance of an atom to itself) is set to a large value so
    # that it never counts as the minimum.
    dist = Chem.rdmolops.Get3DDistanceMatrix(molobj)
    np.fill_diagonal(dist, 10.0)
    min_dist = np.min(dist)

    if min_dist < 0.01:
        print(i, min_dist)

        # Rebuild the geometry from SMILES with a conformational search.
        smi = cheminfo.molobj_to_smiles(molobj)
        molobj = cheminfo.conformationalsearch(smi)

        # NOTE(review): the matrix of the rebuilt molecule is computed but
        # not used within the visible part of this script.
        dist = Chem.rdmolops.Get3DDistanceMatrix(molobj)
        np.fill_diagonal(dist, 10.0)
示例#16
0
def mergesdfs(sdflist):
    """Print two kernels between the conformers of the first two SDF files.

    The first kernel uses ``sim.get_representation`` / ``sim.get_kernel``,
    the second ``get_representations_fchl`` / ``get_kernel_fchl``. Both are
    limited to the first 5 conformers per file, and the atoms of the very
    first molecule are used for every conformer.
    """
    # All molecules and their coordinates, one list per file
    molobjs_list = [list(cheminfo.read_sdffile(sdf)) for sdf in sdflist]
    coordinates_sdf = [[
        cheminfo.molobj_get_coordinates(mol) for mol in mols
    ] for mols in molobjs_list]

    atoms, _ = cheminfo.molobj_to_xyz(molobjs_list[0][0])

    coordinates_x = coordinates_sdf[0]
    coordinates_y = coordinates_sdf[1]

    # Kernel from sim representations (computed for all, then truncated)
    all_x = []
    for xyz in coordinates_x:
        all_x.append(sim.get_representation(atoms, xyz))

    all_y = []
    for xyz in coordinates_y:
        all_y.append(sim.get_representation(atoms, xyz))

    representations19_x = np.asarray(all_x[:5])
    representations19_y = np.asarray(all_y[:5])

    nx = len(representations19_x)
    ny = len(representations19_y)

    similarity = sim.get_kernel(representations19_x, representations19_y,
                                [atoms] * nx, [atoms] * ny)
    print(similarity)

    # fchl18
    n_atoms = len(atoms)

    representations_x = np.asarray(
        get_representations_fchl(atoms, coordinates_x, max_size=n_atoms)[:5])
    representations_y = np.asarray(
        get_representations_fchl(atoms, coordinates_y, max_size=n_atoms)[:5])

    similarity = get_kernel_fchl(representations_x, representations_y)
    print("qml kernel:")
    print(similarity)

    return
示例#17
0
def overview_properties_pca():
    """Plot a kernel PCA of the subset molecules, coloured by property.

    At most 500 molecules are read from the subset SDF. Their representations
    are built with ``generate_fchl_acsf``, a local kernel is computed, and a
    two-panel figure (PCA scatter and kernel image) is saved as
    ``_tmp_pca_prop.png``.
    """
    with open('data/sdf/subset_properties.csv', 'r') as f:
        properties = np.array([float(row) for row in f.readlines()])

    molobjs = cheminfo.read_sdffile("data/sdf/subset_structures.sdf")

    elements = []
    mols_atoms = []
    mols_coord = []

    n_atoms = 0
    n_items = 500

    for count, molobj in enumerate(molobjs, start=1):

        atoms, coord = cheminfo.molobj_to_xyz(molobj)

        mols_atoms.append(atoms)
        mols_coord.append(coord)

        # Running list of the unique elements seen so far
        elements = list(np.unique(elements + list(np.unique(atoms))))

        # Size of the largest molecule, used for padding below
        n_atoms = max(n_atoms, len(atoms))

        if count == n_items:
            break

    properties = properties[:n_items]

    print(elements)
    print(n_atoms)
    print(len(mols_atoms))

    distance_cut = 20.0
    parameters = {
        "pad": n_atoms,
        'nRs2': 22,
        'nRs3': 17,
        'eta2': 0.41,
        'eta3': 0.97,
        'three_body_weight': 45.83,
        'three_body_decay': 2.39,
        'two_body_decay': 2.39,
        "rcut": distance_cut,
        "acut": distance_cut,
        "elements": elements
    }

    representations = np.array([
        generate_fchl_acsf(atoms, coord, **parameters)
        for atoms, coord in zip(mols_atoms, mols_coord)
    ])

    sigma = 10.

    kernel = qml.kernels.get_local_kernel(representations, representations,
                                          mols_atoms, mols_atoms, sigma)

    print(kernel.shape)

    pca = kpca(kernel, n=2)

    # Top panel: PCA scatter; bottom panel: the kernel matrix itself.
    fig, (ax_pca, ax_kernel) = plt.subplots(2, 1, figsize=(5, 10))
    sc = ax_pca.scatter(*pca, c=properties)
    fig.colorbar(sc, ax=ax_pca)
    im = ax_kernel.imshow(kernel)
    fig.colorbar(im, ax=ax_kernel)
    fig.savefig("_tmp_pca_prop.png")

    return
def main():
    """Fit the properties against the mean interatomic distance.

    For each molecule the non-hydrogen atom count, the convex-hull volume and
    the mean pairwise distance are computed. A degree-3 polynomial of the
    property over the mean distance is fitted, its RMSE is printed and the
    fit is saved as the figure ``i_can_member_it``.
    """
    import argparse
    cli = argparse.ArgumentParser()
    cli.add_argument('--scratch',
                     action='store',
                     help='',
                     metavar="dir",
                     default="_tmp_")
    cli.add_argument('-j',
                     '--procs',
                     action='store',
                     help='pararallize',
                     metavar="int",
                     default=0,
                     type=int)

    args = cli.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    properties = misc.load_npy(args.scratch + "properties")
    molecules = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")

    heavy_atoms = []
    distances = []
    volumes = []

    for mol in molecules:

        atoms, coord = cheminfo.molobj_to_xyz(mol)

        # Atoms whose atomic number is not 1
        heavy = atoms[np.where(atoms != 1)]
        heavy_atoms.append(len(heavy))

        # Hull and distances are computed over all atoms of the molecule.
        hull = ConvexHull(coord, qhull_options="QJ")
        volumes.append(hull.volume)

        pairwise = distance.pdist(coord)
        distances.append(np.mean(pairwise))

    heavy_atoms = np.array(heavy_atoms)
    volumes = np.array(volumes)
    distances = np.array(distances)

    # Descriptor the fit is based on
    representation = distances

    # Polynomial fit of degree 3
    coefficients = np.polyfit(representation, properties, 3)
    poly = np.poly1d(coefficients)

    results = poly(representation)
    rmse_error = rmse(results, properties)

    print(rmse_error)

    plt.scatter(representation, properties, c=heavy_atoms, s=0.8)
    x_prop = np.linspace(min(representation), max(representation), 80)
    plt.plot(x_prop, poly(x_prop), "k-")

    plt.savefig("i_can_member_it")
    plt.clf()

    return
示例#19
0
def main():
    """Parse result files for one molecule id or a range of ids.

    ``--molid`` is either a single index (``"7"``) or an inclusive range
    (``"3-9"``). With ``--procs > 0`` the work goes to
    ``parallel_parse_results`` together with the file-name template;
    otherwise ``parse_results`` is called per index with that index's own
    dump file.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--version', action='version', version="1.0")

    parser.add_argument('--method',
                        action='store',
                        help='What QM program to use',
                        metavar='str',
                        default="PM6")

    parser.add_argument('--txtfmt',
                        help='format for cost mergeing',
                        metavar="STR")
    parser.add_argument('--sdf',
                        nargs="+",
                        action='store',
                        help='',
                        metavar='FILE')
    parser.add_argument('--molid',
                        action='store',
                        help='What molid from sdf should be used for txt',
                        metavar='int(s)')

    parser.add_argument('--debug',
                        action='store_true',
                        help='debug statements')

    parser.add_argument('-j',
                        '--procs',
                        type=int,
                        help='Use multiple processes (default=0)',
                        default=0)

    args = parser.parse_args()

    if args.sdf is None:
        print("error: actually we need sdfs information")
        quit()

    # Without this check a missing --molid fails with an AttributeError on
    # None.split below.
    if args.molid is None:
        print("error: --molid is required")
        quit()

    # "a" -> [a]; "a-b" -> a, a+1, ..., b (inclusive)
    bounds = [int(part) for part in args.molid.split("-")]
    if len(bounds) == 1:
        molidxs = bounds
    else:
        molidxs = range(bounds[0], bounds[1] + 1)

    # molobj db
    molobjs = [molobj for molobj in cheminfo.read_sdffile(args.sdf[0])]

    dump_results = "_tmp_results_data1/{:}.results"

    if args.procs > 0:
        parallel_parse_results(args.txtfmt,
                               molobjs,
                               molidxs,
                               dump_results,
                               procs=args.procs)
        return

    for idx in molidxs:
        # One dump file per index. Formatting with the raw --molid string
        # ("3-9") made every index of a range write to the same file.
        parse_results(idx,
                      args.txtfmt,
                      molobjs,
                      dump_results=dump_results.format(idx))

    return
示例#20
0
def main():
    """Merge molecule dictionaries and draw a Venn diagram of the databases.

    Every ``--sdf`` file and every ``--dict`` file contributes one set of
    SMILES. The dictionaries are merged key by key (value lists are
    concatenated), optionally filtered, and saved under
    ``<scratch>molecule_data``. When ``--name`` is given, a Venn diagram of
    two or three databases is saved as ``<scratch>venndiagram``.
    """
    import argparse
    cli = argparse.ArgumentParser()
    cli.add_argument('--scratch',
                     action='store',
                     help='',
                     metavar="DIR",
                     default="_tmp_")
    cli.add_argument('--sdf',
                     action='store',
                     help='',
                     metavar="FILE",
                     nargs="+",
                     default=[])
    cli.add_argument('--dict',
                     action='store',
                     help='',
                     metavar="FILE",
                     nargs="+",
                     default=[])
    cli.add_argument('--name',
                     action='store',
                     help='',
                     metavar="STR",
                     nargs="+")
    cli.add_argument('--filename', action='store', help='', metavar="STR")
    cli.add_argument('--filter', action='store_true', help='')
    cli.add_argument('-j',
                     '--procs',
                     action='store',
                     help='pararallize',
                     metavar="int",
                     default=0,
                     type=int)

    args = cli.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    print()
    databases_set = []
    databases_dict = []

    # SMILES sets of the structure files
    for sdf in args.sdf:
        mols = list(cheminfo.read_sdffile(sdf))
        unique_smiles = {
            cheminfo.molobj_to_smiles(mol, remove_hs=True) for mol in mols
        }
        databases_set.append(unique_smiles)
        print(sdf, len(unique_smiles))

    # SMILES sets (keys) of the stored dictionaries
    for filename in args.dict:
        data = misc.load_obj(filename)
        unique_smiles = set(data.keys())
        databases_set.append(unique_smiles)
        databases_dict.append(data)
        print(filename, len(unique_smiles))

    if args.scratch is not None:

        # Merge databases: concatenate the value lists of equal keys
        everything = {}

        for data in databases_dict:
            for key in data.keys():
                everything.setdefault(key, []).extend(data[key])

        if args.filter:
            everything = filter_dict(everything)

        print("n items", len(everything.keys()))

        # Save
        misc.save_json(args.scratch + "molecule_data", everything)
        misc.save_obj(args.scratch + "molecule_data", everything)

    if args.name is not None:

        # Only two or three databases can be drawn.
        n_db = len(databases_set)

        if n_db == 2:
            venn2(databases_set, set_labels=args.name)
        elif n_db == 3:
            venn3(databases_set, set_labels=args.name)

        plt.savefig(args.scratch + "venndiagram")

    return
示例#21
0
def main():
    """Compute learning curves of the feature-based model with 5-fold CV.

    Features are loaded from ``<scratch>repr.ols.pkl`` when that file exists
    and are extracted (and stored) otherwise. For every fold and every
    training-set size in ``<scratch>n_train`` the test RMSE is computed from
    the signed differences returned by ``fit_model``. The mean over the folds
    is printed and all scores are saved to ``<scratch>score.ols``.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch',
                        action='store',
                        help='',
                        metavar="dir",
                        default="_tmp_")
    # type=int: without it a seed given on the command line arrives as a
    # string and is passed to np.random.seed as such.
    parser.add_argument('--randomseed',
                        action='store',
                        help='random seed',
                        type=int,
                        metavar="int",
                        default=1)
    parser.add_argument('-j',
                        '--procs',
                        action='store',
                        help='pararallize',
                        type=int,
                        metavar="int",
                        default=0)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # Not that random
    np.random.seed(args.randomseed)

    # Get properties
    properties = misc.load_npy(args.scratch + "properties")
    molobjs = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")

    # Get features, from the cache file when available
    filename = "repr.ols"
    if os.path.exists(args.scratch + filename + ".pkl"):
        features = misc.load_obj(args.scratch + filename)

    else:
        features = extract_features(properties, molobjs, procs=args.procs)
        features = pd.DataFrame(features)
        features = features.fillna(0)
        misc.save_obj(args.scratch + filename, features)

    n_items = len(features)
    X = np.arange(n_items)

    assert len(properties) == n_items

    # Train
    n_splits = 5
    n_train = misc.load_npy(args.scratch + "n_train")

    fold_five = sklearn.model_selection.KFold(n_splits=n_splits,
                                              random_state=45,
                                              shuffle=True)

    scores = []

    for i, (idxs_train, idxs_test) in enumerate(fold_five.split(X)):

        # Shuffle the training indices with a fixed per-fold seed so that the
        # prefixes taken below are reproducible.
        np.random.seed(45 + i)
        np.random.shuffle(idxs_train)

        learning_curve = []

        for n in n_train:
            idxs = idxs_train[:n]

            # signed difference
            sign_diff = fit_model(features, idxs, idxs_test)

            # rmse
            diff = sign_diff**2
            rmse_test = np.sqrt(diff.mean())

            # save
            learning_curve.append(rmse_test)

        scores.append(learning_curve)

    # Rows: training-set sizes, columns: folds
    scores = np.array(scores)
    scores = scores.T

    mean_score = np.mean(scores, axis=1)
    print(mean_score)
    misc.save_npy(args.scratch + "score.ols", scores)

    return
示例#22
0
def main():
    """Compute cross-validated learning curves for the ``get_best_rfr`` model.

    Reads ``properties``, ``structures.sdf.gz`` and ``n_train`` from the
    scratch directory. The feature matrix is loaded from the cached
    ``repr.rdkitfp`` file when that succeeds; otherwise one RDKit fingerprint
    is computed per molecule. For every split yielded by ``cv.cross_view``
    and every training size in ``n_train`` a model is fitted on the first
    ``n`` training indices and the RMSE on the test indices is recorded.

    The score matrix (one row per training size, one column per split) is
    written to ``score.rfr`` in the scratch directory and the per-size mean
    is printed.
    """

    # L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001.

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch',
                        action='store',
                        help='',
                        metavar="dir",
                        default="_tmp_")
    # type=int: without it a seed given on the command line arrives as a
    # string, which np.random.seed does not accept.
    parser.add_argument('--randomseed',
                        action='store',
                        help='random seed',
                        type=int,
                        metavar="int",
                        default=1)
    parser.add_argument('-j',
                        '--procs',
                        action='store',
                        help='pararallize',
                        type=int,
                        metavar="int",
                        default=0)
    args = parser.parse_args()

    # Normalise to a trailing slash; an empty value is left alone so that it
    # keeps meaning "current directory" instead of raising or becoming "/".
    if args.scratch and not args.scratch.endswith("/"):
        args.scratch += "/"

    # Not that random
    np.random.seed(args.randomseed)

    # Get properties
    properties = misc.load_npy(args.scratch + "properties")
    molobjs = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")

    # Use the cached fingerprints when they can be loaded, otherwise fall
    # back to computing them. Exception (not a bare except) keeps Ctrl-C and
    # SystemExit working while still treating any load failure as a miss.
    try:
        X = misc.load_npy(args.scratch + "repr.rdkitfp")
    except Exception:
        X = [fingerprints.get_rdkitfp(molobj) for molobj in molobjs]
    else:
        print("loaded")

    X = np.asarray(X)
    y = properties

    # load predefined training points
    n_train = misc.load_npy(args.scratch + "n_train")

    # CV
    all_idxs = np.arange(len(properties), dtype=int)
    scores = []

    for idxs_train, idxs_test in cv.cross_view(all_idxs):

        learning_curve = []

        for n in n_train:
            # First n training indices of this split
            idxs_subset = idxs_train[:n]

            clf = get_best_rfr(X[idxs_subset], y[idxs_subset])

            # Test-set RMSE for this training size
            residuals = clf.predict(X[idxs_test]) - y[idxs_test]
            rmse_test = np.sqrt((residuals**2).mean())

            learning_curve.append(rmse_test)
            print(n, rmse_test)

        scores.append(learning_curve)

    # Rows: training sizes, columns: cross-validation splits
    scores = np.array(scores).T

    mean_score = np.mean(scores, axis=1)
    print(mean_score)
    misc.save_npy(args.scratch + "score.rfr", scores)
def main():
    """Generate molecular representations for an SDF file and store them.

    Reads all molecules from ``--sdf``, converts them to atoms/coordinates
    with ``molobjs_to_xyzs`` and writes into the scratch directory:

    * ``atoms``         -- the per-molecule atom lists (``misc.save_obj``)
    * ``n_items``       -- the number of molecules (``misc.save_txt``)
    * ``repr.<name>``   -- one file per requested representation, stored
      with ``misc.save_npy`` for numpy arrays and ``misc.save_obj`` otherwise

    Coordinate-based representations are generated first, then the
    molecule-based ones. Requested names that belong to neither group are
    skipped.
    """

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch',
                        action='store',
                        help='',
                        metavar="dir",
                        default="_tmp_")
    parser.add_argument('--conformers', action='store_true', help='')
    parser.add_argument('--sdf', action='store', help='', metavar="file")
    parser.add_argument('-j',
                        '--procs',
                        action='store',
                        help='pararallize',
                        metavar="int",
                        default=0,
                        type=int)

    parser.add_argument('-r',
                        '--representations',
                        action='store',
                        help='',
                        metavar="STR",
                        nargs="+")

    args = parser.parse_args()

    # Fail early with a usage message instead of passing None to the reader
    if args.sdf is None:
        parser.error("--sdf is required")

    # Normalise to a trailing slash; leave an empty value untouched
    if args.scratch and not args.scratch.endswith("/"):
        args.scratch += "/"

    if args.procs == -1:
        # os.cpu_count() may return None when the count is undetermined
        args.procs = os.cpu_count() or 1
        print("set procs", args.procs)

    representation_names_coordbased = [
        "cm", "fchl18", "fchl19", "slatm", "bob"
    ]
    representation_names_molbased = ["morgan", "rdkitfp"]

    if args.representations is None:
        representation_names = ["slatm", "bob", "cm", "rdkitfp", "morgan"]
    else:
        representation_names = args.representations

    molobjs = list(cheminfo.read_sdffile(args.sdf))

    # Everything below needs at least one molecule (max() over sizes)
    if not molobjs:
        parser.error("no molecules could be read from " + str(args.sdf))

    mol_atoms, mol_coords = molobjs_to_xyzs(molobjs)
    misc.save_obj(args.scratch + "atoms", mol_atoms)

    # Collect the atom types occurring anywhere in the data set
    unique_atoms = []
    for atoms in mol_atoms:
        unique_atoms.extend(np.unique(atoms))
    unique_atoms = np.unique(unique_atoms)

    # Size of the largest molecule, passed on to the representation code
    max_atoms = max(len(atoms) for atoms in mol_atoms)

    n_items = len(mol_coords)

    print("total mols:", n_items)
    print("atom types:", unique_atoms)
    print("max atoms: ", max_atoms)
    print()
    print("representations:", representation_names)
    print()

    misc.save_txt(args.scratch + "n_items", n_items)

    def save_representations(name, representations):
        # numpy data goes through save_npy, everything else through save_obj
        target = args.scratch + "repr." + name
        if isinstance(representations, (np.ndarray, np.generic)):
            misc.save_npy(target, representations)
        else:
            misc.save_obj(target, representations)

    # Gas phase, coordinate-based representations
    for name in representation_names:

        if name not in representation_names_coordbased:
            continue

        representations = xyzs_to_representations(mol_atoms,
                                                  mol_coords,
                                                  name=name,
                                                  scr=args.scratch,
                                                  max_atoms=max_atoms,
                                                  procs=args.procs)
        save_representations(name, representations)

        # Drop the reference before the next representation is generated
        del representations

    # Molecule-based representations
    for name in representation_names:

        if name not in representation_names_molbased:
            continue

        representations = molobjs_to_representations(molobjs,
                                                     name=name,
                                                     procs=args.procs)
        save_representations(name, representations)

        del representations

    # NOTE(review): --conformers is parsed but not acted on; the ensemble
    # step (generate_conformer_representation) is currently disabled.
    return
示例#24
0
def main():
    """Command-line entry point for merging txt results onto SDF molecules.

    The molecules are read from the first ``--sdf`` file. Depending on the
    options given:

    * ``--txtstdin`` / ``--txt``: the txt filenames (from stdin or from the
      command line) are passed to ``merge_individual``.
    * ``--txtfmt`` together with ``--molid``: ``merge_cost_multiple`` is
      called for the selected molecule indices. ``--molid`` is either a
      single index (``"3"``) or an inclusive range (``"3-7"``).

    Invalid or missing arguments are reported through ``parser.error``,
    which prints the usage message and exits with a non-zero status.
    """

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--version', action='version', version="1.0")

    parser.add_argument('--txt',
                        nargs="+",
                        help='Read results from txt file (require sdf)',
                        metavar="FILE")
    parser.add_argument('--txtfmt',
                        help='format for cost mergeing',
                        metavar="STR")
    parser.add_argument('--sdf',
                        nargs="+",
                        action='store',
                        help='',
                        metavar='FILE')
    parser.add_argument('--sdfstdin',
                        action='store_true',
                        help='Read sdf files from stdin')
    parser.add_argument('--txtstdin',
                        action='store_true',
                        help='Read txt files from stdin')
    parser.add_argument('--molid',
                        action='store',
                        help='What molid from sdf should be used for txt',
                        metavar='int')

    parser.add_argument('--format',
                        action='store',
                        help='What output format? (sdf, txt)',
                        metavar='str')
    parser.add_argument('--dump',
                        action='store_true',
                        help='dump sdf str to stdout')

    parser.add_argument('--debug',
                        action='store_true',
                        help='debug statements')
    parser.add_argument('-j',
                        '--procs',
                        type=int,
                        help='Merge using multiprocessing',
                        default=0)

    args = parser.parse_args()

    # Validate everything up front so no work is done on a doomed run.
    # parser.error exits with a non-zero status.
    if args.sdf is None:
        parser.error("actually we need sdfs to merge")

    molidxs = None
    if args.txtfmt:
        if args.molid is None:
            parser.error("--txtfmt requires --molid")

        # "N" selects one molecule, "A-B" the inclusive range A..B
        try:
            bounds = [int(part) for part in args.molid.split("-")]
        except ValueError:
            parser.error("--molid must be an int or a range like 3-7")

        if len(bounds) == 1:
            molidxs = bounds
        elif len(bounds) == 2:
            molidxs = range(bounds[0], bounds[1] + 1)
        else:
            parser.error("--molid must be an int or a range like 3-7")

    # Only the first sdf file is used as the molecule source
    molobjs = list(cheminfo.read_sdffile(args.sdf[0]))

    # Parse txt files individually
    if args.txtstdin:
        filenames = easyusage.stdin()
        merge_individual(molobjs, filenames, procs=args.procs)
    elif args.txt:
        filenames = list(args.txt)
        merge_individual(molobjs, filenames, procs=args.procs)

    # Cost merge using txtfilenames format. Should have three {:} expandables
    if molidxs is not None:
        merge_cost_multiple(molidxs, molobjs, args.txtfmt)

    # NOTE(review): --format, --dump, --sdfstdin and --debug are accepted but
    # not acted on. The dump_sdf/dump_txt output step and the sdf-only
    # merge_sdfs path are not wired up because nothing in this function
    # produces the molobj/energies/coordinates/costs they need -- TODO
    # reconnect once the merge functions return their results.
    return