Пример #1
0
def get_conformations(line, scr="_tmp_ensemble_/", **kwargs):
    """Generate conformer energies for one molecule and dump results to disk.

    Parameters
    ----------
    line : tuple
        Pair ``(im, molecule)`` where ``im`` is an integer-like index used
        to name the output files and ``molecule`` is an RDKit-style mol
        object (it is passed to ``generate_conformers`` and
        ``cheminfo.molobj_to_sdfstr``).
    scr : str
        Scratch-directory prefix (expected to end with "/") where
        ``<im>.energies`` and ``<im>.sdf`` are written.
    **kwargs
        Ignored; accepted so the function can be used with generic
        parallel-map drivers that pass extra keyword arguments.
    """
    im, molecule = line

    energies = generate_conformers(molecule)

    misc.save_npy(scr + str(im) + ".energies", energies)

    txtsdf = cheminfo.molobj_to_sdfstr(molecule)

    # with-statement guarantees the handle is closed even if write() raises
    # (the original open/close pair leaked the descriptor on error)
    with open(scr + str(im) + ".sdf", 'w') as fsdf:
        fsdf.write(txtsdf)

    print(im, "{:} {:5.2f} {:5.2f}".format("smi", energies.mean(),
                                           energies.std()))

    return
Пример #2
0
def dump_distances_and_kernels(scr, name, procs=0):
    """Compute and store the kernel or distance matrix for one representation.

    Reads ``repr.<name>`` from the scratch directory and writes either a
    kernel (``kernels.<name>`` plus sigmas for FCHL) or an L2 distance
    matrix (``dist.<name>``).

    Parameters
    ----------
    scr : str
        Scratch-directory prefix (expected to end with "/").
    name : str
        One of "fchl18", "fchl19", a coordinate-based representation
        ("cm", "slatm", "bob") or a fingerprint ("morgan", "rdkitfp").
    procs : int
        If non-zero, exported as OMP_NUM_THREADS for the kernel routines.
    """
    # TODO Properties should be read by scr!!

    representation_names_coordbased = ["cm", "slatm", "bob"]
    representation_names_molbased = ["morgan", "rdkitfp"]

    if procs != 0:
        os.environ["OMP_NUM_THREADS"] = str(procs)

    # Prepare fchl kernels.
    # Bug fix: the original tested name == "fclh18" (typo), making this
    # branch unreachable for the intended "fchl18"; accept both spellings
    # so any caller that adapted to the typo keeps working.
    if name in ("fchl18", "fclh18"):
        print("Generating fchl18 kernel")
        start = time.time()
        reps = misc.load_npy(scr + "repr." + "fchl18")
        print("shape:", reps.shape)
        sigmas, kernels = get_fchl18_kernels(reps, return_sigmas=True)
        end = time.time()
        print("time:", end - start)
        misc.save_npy(scr + "fchl18." + "sigmas", sigmas)
        misc.save_npy(scr + "kernels." + "fchl18", kernels)

        # release the large arrays before returning
        reps = None
        del reps
        kernels = None
        del kernels

    elif name == "fchl19":
        print("Generating fchl19 kernel")
        reps = misc.load_npy(scr + "repr." + "fchl19")
        print("shape:", reps.shape)
        atoms = misc.load_obj(scr + "atoms")
        start = time.time()
        sigmas, kernels = get_fchl19_kernels(reps, atoms, return_sigmas=True)
        end = time.time()
        print("time:", end - start)
        misc.save_npy(scr + "fchl19." + "sigmas", sigmas)
        misc.save_npy(scr + "kernels." + "fchl19", kernels)

    elif name in representation_names_coordbased:
        print("Distance", name)
        representations = misc.load_npy(scr + "repr." + name)
        print(representations.shape)
        dist = generate_l2_distances(representations)
        misc.save_npy(scr + "dist." + name, dist)

        dist = None
        del dist

    elif name in representation_names_molbased:
        print("Generating fingerprint kernel", name)
        representations_fp = misc.load_npy(scr + "repr." + name)
        # np.float was removed in NumPy 1.24; the builtin float is equivalent
        representations_fp = np.asarray(representations_fp, dtype=float)

        # .get avoids a KeyError when procs == 0 and the variable is unset
        print(os.environ.get("OMP_NUM_THREADS", "<unset>"))

        # DISTANCE
        print("make dist")
        dist = generate_l2_distances(representations_fp)
        print("save dist")
        misc.save_npy(scr + "dist." + name, dist)
        print("saved")

        print(dist.shape)

        # free the large distance matrix (the original deleted a `kernel`
        # variable that was never populated in this branch)
        dist = None
        del dist

    else:
        print("error: unknown representation", name)
        quit()

    return
Пример #3
0
def dump_distances_and_kernels(scr):
    """Dump properties, L2 distance matrices and the fingerprint kernel.

    Reads ``properties.csv`` and the precomputed representations from the
    scratch directory, then writes ``properties``, ``dist.<name>`` for each
    coordinate-based representation, and ``kernel.fp``.

    Parameters
    ----------
    scr : str
        Scratch-directory prefix (expected to end with "/").
    """
    # TODO Properties should be read by scr!!

    # properties: first whitespace-separated column of each line
    print("Saving properties")
    with open(scr + 'properties.csv', 'r') as f:
        lines = f.readlines()
    properties = np.array([float(x.split()[0]) for x in lines])

    print("properties", properties.shape)

    misc.save_npy(scr + "properties", properties)

    # Prepare distances
    representation_names = ["cm", "bob", "slatm"] # + ["avgslatm"]
    for name in representation_names:
        print("Distance", name)
        representations = misc.load_npy(scr + "repr." + name)
        print(representations.shape)
        dist = generate_l2_distances(representations)
        misc.save_npy(scr + "dist." + name, dist)

        # release the large matrix before loading the next representation
        dist = None
        del dist

    # NOTE(review): the original wrapped fchl18/fchl19 kernel generation in
    # `if False:` blocks (dead code, removed here) and the fingerprint
    # kernel in `if True:` (unwrapped here); behavior is unchanged.
    print("Generating fingerprint kernel")
    representations_fp = misc.load_obj(scr + "repr.fp")
    kernel = get_fp_kernel(representations_fp)
    misc.save_npy(scr + "kernel.fp", kernel)

    return
Пример #4
0
def dump_kernel_scores(scr, names=None):
    """Cross-validate distance-based kernel models and dump scores/parameters.

    For each selected representation, builds Gaussian kernels from the
    stored L2 distance matrix over a sigma/lambda grid, cross-validates
    them over increasing training-set sizes, and writes ``score.<name>``
    and ``parameters.<name>`` to the scratch directory.

    Parameters
    ----------
    scr : str
        Scratch-directory prefix (expected to end with "/").
    names : list of str or None
        Representations to score (e.g. "rdkitfp", "slatm", "cm", "bob").
        Defaults to no models.  (Was a mutable default ``[]``.)
    """
    if names is None:
        names = []

    # Predefined regularization strengths: 1e-1, 1e-3, 1e-5 and exactly 0
    l2regs = [10**-x for x in range(1, 6, 2)] + [0.0]
    n_l2regs = len(l2regs)

    # Training-set sizes: powers of two, capped below 4/5 of the dataset
    # (the remaining 1/5 is the test fold)
    n_trains = [2**x for x in range(4, 17)]
    n_trains = np.array(n_trains, dtype=int)
    n_items = misc.load_txt(scr + "n_items")

    n_train_idx, = np.where(n_trains < n_items*4.0/5.0)
    n_trains = n_trains[n_train_idx]
    n_trains = list(n_trains) # + [-1]

    print("Assume total items", n_items,
            "N train", "{:5.1f}".format(np.floor(n_items*4/5)),
            "N test", "{:5.1f}".format(np.ceil(n_items*1/5)))
    print("Training:", list(n_trains))
    misc.save_npy(scr + "n_train", n_trains)

    # Load properties; fall back to parsing the csv, taking the median over
    # the replicate values on each line (first column is skipped)
    try:
        properties = misc.load_npy(scr + "properties")
    except Exception:
        # was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt
        with open(scr + "properties.csv", 'r') as f:
            lines = f.readlines()
            properties = []
            for line in lines:

                values = [float(x) for x in line.split()]
                values = values[1:]
                value = np.median(values)
                properties.append(value)

            properties = np.array(properties)
            misc.save_npy(scr + "properties", properties)

    print(n_items, "==", len(properties))
    assert n_items == len(properties)

    # NOTE(review): the original contained two scoring loops here — one for
    # precomputed single kernels ("rdkitfp"/"morgan") and one for
    # multi-sigma kernels ("fchl18"/"fchl19") — both disabled by an
    # unconditional `break` on their first line.  They were dead code and
    # have been removed; behavior is unchanged.

    # Distance-based models: sigma/lambda grids per representation
    models = []
    parameters = {
        "name": "rdkitfp",
        "sigma": [2**x for x in range(1, 12, 2)],
        "lambda":  [10.0**-6],
    }
    models.append(parameters)
    parameters = {
        "name": "slatm",
        "sigma": [2**x for x in range(1, 12, 2)],
        "lambda":  [10.0**-6],
    }
    models.append(parameters)
    parameters = {
        "name": "cm",
        "sigma": [2**x for x in range(1, 12, 2)],
        "lambda": l2regs,
    }
    models.append(parameters)
    parameters = {
        "name": "bob",
        "sigma": [2**x for x in range(1, 12, 2)],
        "lambda": l2regs,
    }
    models.append(parameters)
    # NOTE(review): an "avgslatm" parameter dict was built in the original
    # but never appended (append was commented out) — removed as dead code.

    for model in models:
        name = model["name"]

        if name not in names:
            continue

        print("scoring", name)

        parameters = model

        n_sigma = len(parameters["sigma"])
        n_lambda = len(parameters["lambda"])

        print("parameter range")
        print("sigma", min(parameters["sigma"]), max(parameters["sigma"]))

        dist = misc.load_npy(scr + "dist." + name)
        kernels = get_kernels_l2distance(dist, parameters)

        # Cross validate over all (sigma, lambda) combinations
        idx_winners, scores = cross_validation(kernels, properties, training_points=n_trains)

        # Save scores
        misc.save_npy(scr + "score."+name, scores)
        scores = np.around(np.mean(scores, axis=1), decimals=2)

        # Save winning parameters per training size
        winner_parameters = {}
        for ni, index in enumerate(idx_winners):

            # convert linear index to multi-dimensions
            idx_parameters = np.unravel_index([index], (n_sigma, n_lambda))
            i, j = idx_parameters
            i = int(i[0])
            j = int(j[0])

            n = n_trains[ni]
            sigma = parameters["sigma"][i]
            l2reg = parameters["lambda"][j]

            this_parameters = {
                "sigma": str(sigma),
                "reg": str(l2reg),
            }

            winner_parameters[str(n)] = this_parameters

        print(name, scores)
        misc.save_json(scr + "parameters."+name, winner_parameters)

    # NOTE(review): quit() terminates the whole process here, making the
    # trailing return unreachable.  Kept to preserve original behavior —
    # confirm callers do not expect control back before removing.
    quit()

    return
Пример #5
0
def main():
    """Fit an OLS-style model on extracted features and dump learning curves.

    Reads properties and structures from the scratch directory, builds (or
    loads a cached pickle of) the feature table, runs 5-fold CV over
    increasing training sizes, and writes per-fold RMSE curves to
    ``score.ols``.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch',
                        action='store',
                        help='',
                        metavar="dir",
                        default="_tmp_")
    # type=int: without it a CLI-supplied seed arrives as a str and
    # np.random.seed() raises TypeError (the default 1 masked the bug)
    parser.add_argument('--randomseed',
                        action='store',
                        help='random seed',
                        type=int,
                        metavar="int",
                        default=1)
    parser.add_argument('-j',
                        '--procs',
                        action='store',
                        help='pararallize',
                        type=int,
                        metavar="int",
                        default=0)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # Not that random
    np.random.seed(args.randomseed)

    # Get properties
    properties = misc.load_npy(args.scratch + "properties")
    molobjs = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")

    # Get features (cached as a pickle after the first run)
    filename = "repr.ols"
    if os.path.exists(args.scratch + filename + ".pkl"):
        features = misc.load_obj(args.scratch + filename)

    else:
        features = extract_features(properties, molobjs, procs=args.procs)
        features = pd.DataFrame(features)
        features = features.fillna(0)
        misc.save_obj(args.scratch + filename, features)

    n_items = len(features)
    X = np.arange(n_items)

    assert len(properties) == n_items

    # Train: 5-fold CV with fixed seeds for reproducibility
    n_splits = 5
    n_train = misc.load_npy(args.scratch + "n_train")

    fold_five = sklearn.model_selection.KFold(n_splits=n_splits,
                                              random_state=45,
                                              shuffle=True)

    scores = []

    for i, (idxs_train, idxs_test) in enumerate(fold_five.split(X)):

        # un-ordered idxs_train; per-fold seed keeps shuffles reproducible
        np.random.seed(45 + i)
        np.random.shuffle(idxs_train)

        learning_curve = []

        for n in n_train:
            idxs = idxs_train[:n]

            # signed difference on the test fold
            sign_diff = fit_model(features, idxs, idxs_test)

            # rmse
            diff = sign_diff**2
            rmse_test = np.sqrt(diff.mean())

            # save
            learning_curve.append(rmse_test)

        scores.append(learning_curve)

    # rows = training size, columns = fold
    scores = np.array(scores)
    scores = scores.T

    mean_score = np.mean(scores, axis=1)
    print(mean_score)
    misc.save_npy(args.scratch + "score.ols", scores)

    return
Пример #6
0
def main():
    """Score a random-forest regressor on RDKit fingerprints via CV.

    Loads (or rebuilds) the fingerprint matrix, runs cross-validation over
    increasing training sizes and writes per-fold RMSE curves to
    ``score.rfr``.
    """
    # L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001.

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='', metavar="dir", default="_tmp_")
    # type=int: without it a CLI-supplied seed arrives as a str and
    # np.random.seed() raises TypeError (the default 1 masked the bug)
    parser.add_argument('--randomseed', action='store', help='random seed', type=int, metavar="int", default=1)
    parser.add_argument('-j', '--procs', action='store', help='pararallize', type=int, metavar="int", default=0)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # Not that random
    np.random.seed(args.randomseed)

    # Get properties
    properties = misc.load_npy(args.scratch + "properties")
    molobjs = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")

    X = []

    try:
        X = misc.load_npy(args.scratch + "repr.rdkitfp")
        print("loaded")
    except Exception:
        # cache miss: rebuild fingerprints from the molecules
        # (was a bare `except:`, which also swallowed KeyboardInterrupt)
        for molobj in molobjs:
            bitmap = fingerprints.get_rdkitfp(molobj)
            X.append(bitmap)

    X = np.asarray(X)
    y = properties

    # load predefined training points
    n_train = misc.load_npy(args.scratch + "n_train")

    # CV
    idxs = np.array(list(range(len(properties))), dtype=int)
    scores = []

    for idxs_train, idxs_test in cv.cross_view(idxs):

        learning_curve = []

        for n in n_train:
            idxs = idxs_train[:n]

            clf = get_best_rfr(X[idxs], y[idxs])

            # predictions on the held-out fold
            predictions = clf.predict(X[idxs_test])
            diff = predictions-y[idxs_test]
            diff = diff**2
            rmse_test = np.sqrt(diff.mean())
            learning_curve.append(rmse_test)
            print(n, rmse_test)

        scores.append(learning_curve)

    # rows = training size, columns = fold
    scores = np.array(scores)
    scores = scores.T

    mean_score = np.mean(scores, axis=1)
    print(mean_score)
    misc.save_npy(args.scratch + "score.rfr", scores)
Пример #7
0
def main():
    """Score the null model (predict the training-set mean) via 5-fold CV.

    Writes per-fold RMSE learning curves to ``score.null`` — the baseline
    the other models are compared against.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch',
                        action='store',
                        help='',
                        metavar="dir",
                        default="_tmp_")
    # type=int: without it a CLI-supplied seed would arrive as a str
    # (latent here since the seed call below is commented out)
    parser.add_argument('--randomseed',
                        action='store',
                        help='random seed',
                        type=int,
                        metavar="int",
                        default=1)
    parser.add_argument('-j',
                        '--procs',
                        action='store',
                        help='pararallize',
                        type=int,
                        metavar="int",
                        default=0)
    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # Not that random
    # np.random.seed(args.randomseed)

    # Get properties
    properties = misc.load_npy(args.scratch + "properties")

    n_items = len(properties)
    X = np.arange(n_items)

    # Train: 5-fold CV with fixed seeds for reproducibility
    n_splits = 5
    n_train = misc.load_npy(args.scratch + "n_train")

    fold_five = sklearn.model_selection.KFold(n_splits=n_splits,
                                              random_state=45,
                                              shuffle=True)

    scores = []

    for i, (idxs_train, idxs_test) in enumerate(fold_five.split(X)):

        # un-ordered idxs_train; per-fold seed keeps shuffles reproducible
        np.random.seed(45 + i)
        np.random.shuffle(idxs_train)

        learning_curve = []

        for n in n_train:
            idxs = idxs_train[:n]

            # the "model" is just the training-set mean
            train = properties[idxs]
            model = train.mean()

            test = properties[idxs_test]

            # predict
            sign_diff = model - test

            # rmse
            diff = sign_diff**2
            rmse_test = np.sqrt(diff.mean())

            # save
            learning_curve.append(rmse_test)

        scores.append(learning_curve)

    # rows = training size, columns = fold
    scores = np.array(scores)
    scores = scores.T

    mean_score = np.mean(scores, axis=1)
    print(mean_score)
    misc.save_npy(args.scratch + "score.null", scores)

    return
def main():
    """CLI entry point: compute molecular representations from an SDF file.

    Reads molecules from ``--sdf``, computes the selected coordinate- and
    molecule-based representations, and saves each as ``repr.<name>`` in
    the scratch directory (npy for arrays, pickle otherwise), together
    with the atoms list and the item count.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch',
                        action='store',
                        help='',
                        metavar="dir",
                        default="_tmp_")
    parser.add_argument('--conformers', action='store_true', help='')
    parser.add_argument('--sdf', action='store', help='', metavar="file")
    parser.add_argument('-j',
                        '--procs',
                        action='store',
                        help='pararallize',
                        metavar="int",
                        default=0,
                        type=int)

    parser.add_argument('-r',
                        '--representations',
                        action='store',
                        help='',
                        metavar="STR",
                        nargs="+")

    args = parser.parse_args()

    if args.scratch[-1] != "/":
        args.scratch += "/"

    # -j -1 means "use every core"
    if args.procs == -1:
        args.procs = int(os.cpu_count())
        print("set procs", args.procs)

    representation_names_coordbased = [
        "cm", "fchl18", "fchl19", "slatm", "bob"
    ]
    representation_names_molbased = ["morgan", "rdkitfp"]

    if args.representations is None:
        representation_names = ["slatm", "bob", "cm", "rdkitfp", "morgan"]
    else:
        representation_names = args.representations

    # materialize the generator: molobjs is iterated more than once below
    molobjs = list(cheminfo.read_sdffile(args.sdf))

    mol_atoms, mol_coords = molobjs_to_xyzs(molobjs)
    misc.save_obj(args.scratch + "atoms", mol_atoms)

    # Collect unique atom types across all molecules
    unique_atoms = []
    for atoms in mol_atoms:
        unique_atoms += list(np.unique(atoms))

    unique_atoms = np.array(unique_atoms)
    unique_atoms = unique_atoms.flatten()
    unique_atoms = np.unique(unique_atoms)

    # Largest molecule size, needed by fixed-size representations
    max_atoms = max(len(atoms) for atoms in mol_atoms)

    n_items = len(mol_coords)

    print("total mols:", n_items)
    print("atom types:", unique_atoms)
    print("max atoms: ", max_atoms)
    print()
    print("representations:", representation_names)
    print()

    misc.save_txt(args.scratch + "n_items", n_items)

    def save_representation(name, representations):
        # npy for plain arrays, pickle for anything else (e.g. lists)
        if isinstance(representations, (np.ndarray, np.generic)):
            misc.save_npy(args.scratch + "repr." + name, representations)
        else:
            misc.save_obj(args.scratch + "repr." + name, representations)

    # Coordinate-based representations (gas phase)
    for name in representation_names:

        if name not in representation_names_coordbased: continue

        representations = xyzs_to_representations(mol_atoms,
                                                  mol_coords,
                                                  name=name,
                                                  scr=args.scratch,
                                                  max_atoms=max_atoms,
                                                  procs=args.procs)

        save_representation(name, representations)

        # release before computing the next large representation
        representations = None
        del representations

    # Molecule-based (graph/fingerprint) representations
    for name in representation_names:

        if name not in representation_names_molbased: continue

        representations = molobjs_to_representations(molobjs,
                                                     name=name,
                                                     procs=args.procs)

        save_representation(name, representations)

        representations = None
        del representations

    # NOTE(review): quit() terminates the process here, so the conformer
    # ensemble step below never runs.  Kept to preserve original behavior.
    quit()

    # Ensemble
    # if args.conformers:
    #     generate_conformer_representation(scr=args.scratch, procs=args.procs)

    return
def generate_conformer_representation(scr="_tmp_ensemble_/", procs=0,
                                      n_total=3456, pool_workers=32):
    """Build per-molecule average SLATM representations over conformers.

    Parameters
    ----------
    scr : str
        Scratch-directory prefix holding ``slatm.mbtypes`` and ``atoms``;
        the result is written there as ``repr.avgslatm``.
    procs : int
        0 runs serially; any other value uses a process pool writing into
        a shared array managed by ``MyManager``.
    n_total : int
        Number of molecules to process (was hard-coded to 3456).
    pool_workers : int
        Worker-process count for the parallel path (was hard-coded to 32).
    """
    name = "slatm"

    mbtypes = misc.load_npy(scr + "slatm.mbtypes")

    # Largest molecule size, needed by the fixed-size representation
    mol_atoms = misc.load_obj(scr + "atoms")
    max_atoms = max(len(atoms) for atoms in mol_atoms)

    kwargs = {
        "name": name,
        "mbtypes": mbtypes,
        "debug": False,
        "max_atoms": max_atoms,
    }

    idxs = range(n_total)

    avgreps = [0] * n_total

    if procs == 0:

        # serial path: fill results one molecule at a time
        for idx in idxs:

            idx, avgrep = get_avg_repr(idx, **kwargs)
            avgreps[idx] = avgrep

    else:

        # probe one representation to learn the vector length
        idx, rep = get_avg_repr(0, **kwargs)
        rep_size = rep.shape[0]
        print("rep size", rep_size)

        m = MyManager()
        m.start()

        # shared array the workers write into directly
        results = m.np_zeros((n_total, rep_size))

        pool = Pool(pool_workers)

        kwargs["array"] = results
        func = partial(get_avg_repr, **kwargs)
        pool.map(func, idxs)
        avgreps = results

    avgreps = np.array(avgreps)
    misc.save_npy(scr + "repr.avgslatm", avgreps)

    return