Пример #1
0
 def __init__(self, final_matrix_name, word_dict_name, train_test_split):
     """Load the encoded matrix and the vocabulary, then split the rows.

     :param final_matrix_name: name handed to ``load_obj`` to fetch the
         matrix; the loaded object is cast with ``.astype('float')``
     :param word_dict_name: name handed to ``load_obj`` to fetch the
         vocabulary
     :param train_test_split: row index separating the training rows
         (before it) from the test rows (from it onwards)
     """
     matrix = load_obj(final_matrix_name).astype('float')
     vocabulary = load_obj(word_dict_name)

     self.final_matrix = matrix
     self.word_dict = vocabulary
     self.num_words = len(vocabulary)
     self.train_test_split = train_test_split

     # Rows before the split index form the training set, the rest the test set.
     self.X_train = matrix[:train_test_split, :]
     self.X_test = matrix[train_test_split:, :]
Пример #2
0
    def create_indices_matrix(self, is_dict_existing):
        """Encode the sentence table as vocabulary indices and save the result.

        :param is_dict_existing: if a dictionary already exists (from previous
            calls), load it instead of creating a new one
        :return: the column-wise concatenation of the encoded name words, item
            condition, encoded category, encoded brand, price and shipping flag
        """
        # Reuse the stored vocabulary when there is one, otherwise build it.
        if is_dict_existing:
            word_dict = load_obj(self.dict_name)
        else:
            word_dict = self.voc2index()

        table = self.short_sentences_table

        def encode_words(cell):
            # Replace every whitespace-separated word with its vocab number.
            return [word_dict[token] for token in cell.split()]

        def encode_whole(cell):
            # Treat the whole cell as one string and look it up as a single
            # vocabulary entry.
            return [word_dict[cell]]

        def as_float_column(col):
            # Cast the column to float and add a second axis so it can be
            # concatenated next to the index columns.
            return np.expand_dims(table[:, col].astype('float'), axis=1)

        # For each column of the table, replace (if needed) the words /
        # sentence with the matching index from the vocabulary.
        names_indices = np.array([encode_words(cell) for cell in table[:, 0]])
        item_conditions = as_float_column(1)
        category_names_indices = np.array(
            [encode_whole(cell) for cell in table[:, 2]])
        brand_names_indices = np.array(
            [encode_whole(cell) for cell in table[:, 3]])
        price = as_float_column(4)
        is_shipping = as_float_column(5)

        columns = (names_indices, item_conditions, category_names_indices,
                   brand_names_indices, price, is_shipping)
        indices_matrix = np.concatenate(columns, axis=1)

        save_obj(indices_matrix, self.final_matrix_name)
        return indices_matrix
Пример #3
0
def dump_distances_and_kernels(scr, name, procs=0):
    """Compute and save the distance matrix or kernel for one representation.

    Args:
        scr: Scratch path prefix. File names are appended by plain string
            concatenation, so it is expected to end with a separator.
        name: Representation to process: "fchl18", "fchl19", a
            coordinate-based name ("cm", "slatm", "bob") or a fingerprint
            name ("morgan", "rdkitfp").
        procs: When non-zero, exported as ``OMP_NUM_THREADS`` before any
            work is done.

    Outputs are written next to the inputs under ``scr``:
    ``kernels.<name>`` and ``<name>.sigmas`` for the fchl variants,
    ``dist.<name>`` for everything else. An unknown ``name`` prints an error
    and terminates the interpreter via ``quit()``.
    """

    # TODO Properties should be read by scr!!

    representation_names_coordbased = ["cm", "slatm", "bob"]
    representation_names_molbased = ["morgan", "rdkitfp"]

    if procs != 0:
        os.environ["OMP_NUM_THREADS"] = str(procs)

    # "fclh18" is the misspelling this branch originally tested for; it is
    # still accepted so existing callers keep working, while the correctly
    # spelled "fchl18" no longer falls through to the error branch.
    if name in ("fchl18", "fclh18"):
        print("Generating fchl18 kernel")
        start = time.time()
        reps = misc.load_npy(scr + "repr." + "fchl18")
        print("shape:", reps.shape)
        sigmas, kernels = get_fchl18_kernels(reps, return_sigmas=True)
        end = time.time()
        print("time:", end - start)
        misc.save_npy(scr + "fchl18." + "sigmas", sigmas)
        misc.save_npy(scr + "kernels." + "fchl18", kernels)

    elif name == "fchl19":
        print("Generating fchl19 kernel")
        reps = misc.load_npy(scr + "repr." + "fchl19")
        print("shape:", reps.shape)
        atoms = misc.load_obj(scr + "atoms")
        start = time.time()
        sigmas, kernels = get_fchl19_kernels(reps, atoms, return_sigmas=True)
        end = time.time()
        print("time:", end - start)
        misc.save_npy(scr + "fchl19." + "sigmas", sigmas)
        misc.save_npy(scr + "kernels." + "fchl19", kernels)

    elif name in representation_names_coordbased:
        print("Distance", name)
        representations = misc.load_npy(scr + "repr." + name)
        print(representations.shape)
        dist = generate_l2_distances(representations)
        misc.save_npy(scr + "dist." + name, dist)

    elif name in representation_names_molbased:
        print("Generating fingerprint kernel", name)
        representations_fp = misc.load_npy(scr + "repr." + name)
        # The `np.float` alias was removed from NumPy; the builtin `float`
        # selects the same float64 dtype.
        representations_fp = np.asarray(representations_fp, dtype=float)

        # OMP_NUM_THREADS is only guaranteed to be set when procs != 0, so
        # read it without raising when it is absent.
        print(os.environ.get("OMP_NUM_THREADS"))

        # Only L2 distances are produced for fingerprints; the Jaccard
        # kernel variants (NumPy and Fortran) are currently disabled.
        print("make dist")
        dist = generate_l2_distances(representations_fp)
        print("save dist")
        misc.save_npy(scr + "dist." + name, dist)
        print("saved")

        print(dist.shape)

    else:
        print("error: unknown representation", name)
        quit()

    return
Пример #4
0
from showing import ploting
from misc import load_obj
# dict = load_obj("all_favourite_methods_x1000", resolution=0.7)
# dict = load_obj("exp-ep-greedy-Dolinar_x500", resolution=0.7)
# Identifier of the stored result set; also reused as the output name of the
# figure produced below.
name = "all_methods_x12_ep100"

# Bound to `runs` instead of `dict` so the builtin type is not shadowed for
# the rest of the module.
runs = load_obj(name, resolution=0.1, layers=2)

# Alternative result set and a quick way to list the available labels:
# runs = load_obj("ep-greedy-Dolinar_x1", resolution=0.33)
# for key in runs.keys():
#     print(key, runs[key]["label"])

# Runs to include in the figure. Other selections used previously:
# interesting = ["run_10", "run_16", "run_2"]
# interesting = runs.keys()
interesting = ["run_1", "run_2", "run_3", "run_4", "run_5"]

# Keep only the selected runs, preserving the order given above.
dict_plot = {key: runs[key] for key in interesting}

ploting(dict_plot,
        mode_log="off",
        save=True,
        show=True,
        particular_name=name,
        mode="stds")
Пример #5
0
def dump_distances_and_kernels(scr, *, fchl18=False, fchl19=False,
                               fingerprints=True):
    """Save properties, L2 distances and the selected kernels under ``scr``.

    Args:
        scr: Scratch path prefix. File names are appended by plain string
            concatenation, so it is expected to end with a separator.
        fchl18: Also generate and save the fchl18 kernels and sigmas.
        fchl19: Also generate and save the fchl19 kernels and sigmas.
        fingerprints: Generate and save the fingerprint kernel.

    The three keyword-only switches replace toggles that used to be
    hard-coded in the body; their defaults reproduce the previous behaviour
    (fingerprint kernel only).
    """

    # TODO Properties should be read by scr!!

    # properties: the first whitespace-separated field of every line
    print("Saving properties")
    with open(scr + 'properties.csv', 'r') as f:
        properties = np.array([float(line.split()[0]) for line in f])

    print("properties", properties.shape)

    misc.save_npy(scr + "properties", properties)

    # Prepare distances
    representation_names = ["cm", "bob", "slatm"]  # + ["avgslatm"]
    for name in representation_names:
        print("Distance", name)
        representations = misc.load_npy(scr + "repr." + name)
        print(representations.shape)
        dist = generate_l2_distances(representations)
        misc.save_npy(scr + "dist." + name, dist)

        # Drop the reference before the next representation is loaded.
        del dist

    # Prepare fchl kernels
    if fchl18:
        print("Generating fchl18 kernel")
        start = time.time()
        reps = misc.load_npy(scr + "repr." + "fchl18")
        print("shape:", reps.shape)
        sigmas, kernels = get_fchl18_kernels(reps, return_sigmas=True)
        end = time.time()
        print("time:", end - start)
        misc.save_npy(scr + "fchl18." + "sigmas", sigmas)
        misc.save_npy(scr + "kernels." + "fchl18", kernels)

        # Release the large arrays before the next kernel is built.
        del reps, kernels

    if fchl19:
        print("Generating fchl19 kernel")
        reps = misc.load_npy(scr + "repr." + "fchl19")
        print("shape:", reps.shape)
        atoms = misc.load_obj(scr + "atoms")
        start = time.time()
        sigmas, kernels = get_fchl19_kernels(reps, atoms, return_sigmas=True)
        end = time.time()
        print("time:", end - start)
        misc.save_npy(scr + "fchl19." + "sigmas", sigmas)
        misc.save_npy(scr + "kernels." + "fchl19", kernels)

    if fingerprints:
        print("Generating fingerprint kernel")
        representations_fp = misc.load_obj(scr + "repr.fp")
        kernel = get_fp_kernel(representations_fp)
        misc.save_npy(scr + "kernel.fp", kernel)

    return
Пример #6
0
def main():
    """Merge molecule databases and optionally draw their Venn diagram.

    Command-line options:
        --scratch DIR   output directory prefix (default "_tmp_")
        --sdf FILE...   SDF files; only their SMILES sets are collected
        --dict FILE...  stored dictionaries keyed by SMILES; these are merged
        --name STR...   set labels; when given, a Venn diagram is drawn
        --filter        pass the merged dictionary through ``filter_dict``
        --filename, -j/--procs  parsed but not used by this function

    The merged dictionary is written to ``<scratch>molecule_data`` with both
    ``misc.save_json`` and ``misc.save_obj``. A Venn diagram is only drawn
    and saved when exactly two or three databases were read.
    """
    import argparse
    import os

    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch',
                        action='store',
                        help='',
                        metavar="DIR",
                        default="_tmp_")
    parser.add_argument('--sdf',
                        action='store',
                        help='',
                        metavar="FILE",
                        nargs="+",
                        default=[])
    parser.add_argument('--dict',
                        action='store',
                        help='',
                        metavar="FILE",
                        nargs="+",
                        default=[])
    parser.add_argument('--name',
                        action='store',
                        help='',
                        metavar="STR",
                        nargs="+")
    parser.add_argument('--filename', action='store', help='', metavar="STR")
    parser.add_argument('--filter', action='store_true', help='')
    parser.add_argument('-j',
                        '--procs',
                        action='store',
                        help='pararallize',
                        metavar="int",
                        default=0,
                        type=int)

    args = parser.parse_args()

    # Joining with "" appends a trailing separator only when it is missing
    # and leaves an empty path empty, where indexing [-1] used to raise
    # IndexError.
    args.scratch = os.path.join(args.scratch, "")

    print()
    databases_set = []
    databases_dict = []

    for sdf in args.sdf:
        molobjs = list(cheminfo.read_sdffile(sdf))
        smiles = {
            cheminfo.molobj_to_smiles(molobj, remove_hs=True)
            for molobj in molobjs
        }
        databases_set.append(smiles)
        print(sdf, len(smiles))

    for filename in args.dict:
        data = misc.load_obj(filename)
        smiles = set(data.keys())
        databases_set.append(smiles)
        databases_dict.append(data)
        print(filename, len(smiles))

    # Merge databases: values stored under the same key are concatenated in
    # the order the files were given.
    everything = {}
    for data in databases_dict:
        for key in data.keys():
            everything.setdefault(key, []).extend(data[key])

    if args.filter:
        everything = filter_dict(everything)

    print("n items", len(everything.keys()))

    # Save
    misc.save_json(args.scratch + "molecule_data", everything)
    misc.save_obj(args.scratch + "molecule_data", everything)

    if args.name is not None:

        n_db = len(databases_set)

        if n_db == 2:
            venn2(databases_set, set_labels=args.name)
        elif n_db == 3:
            venn3(databases_set, set_labels=args.name)
        else:
            # Nothing was drawn, so do not write a blank figure.
            print("venn diagram needs 2 or 3 databases, got", n_db)
            return

        plt.savefig(args.scratch + "venndiagram")

    return
Пример #7
0
def main():
    """Compute 5-fold learning curves for ``fit_model`` and save the scores.

    Command-line options:
        --scratch dir     directory prefix holding inputs and outputs
                          (default "_tmp_")
        --randomseed int  seed for NumPy's global random state (default 1)
        -j/--procs int    forwarded to ``extract_features`` (default 0)

    Reads ``properties``, ``structures.sdf.gz`` and ``n_train`` from the
    scratch directory, caches the extracted features as ``repr.ols`` and
    writes the per-fold RMSE values to ``score.ols``.
    """

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch',
                        action='store',
                        help='',
                        metavar="dir",
                        default="_tmp_")
    # type=int: values given on the command line arrive as strings, which
    # np.random.seed does not accept.
    parser.add_argument('--randomseed',
                        action='store',
                        help='random seed',
                        metavar="int",
                        type=int,
                        default=1)
    parser.add_argument('-j',
                        '--procs',
                        action='store',
                        help='pararallize',
                        type=int,
                        metavar="int",
                        default=0)
    args = parser.parse_args()

    # Joining with "" appends a trailing separator only when it is missing
    # and leaves an empty path empty, where indexing [-1] used to raise
    # IndexError.
    args.scratch = os.path.join(args.scratch, "")

    # Not that random
    np.random.seed(args.randomseed)

    # Get properties
    properties = misc.load_npy(args.scratch + "properties")
    molobjs = cheminfo.read_sdffile(args.scratch + "structures.sdf.gz")

    # Get features, reusing the cached copy when it exists
    filename = "repr.ols"
    if os.path.exists(args.scratch + filename + ".pkl"):
        features = misc.load_obj(args.scratch + filename)

    else:
        features = extract_features(properties, molobjs, procs=args.procs)
        features = pd.DataFrame(features)
        features = features.fillna(0)
        misc.save_obj(args.scratch + filename, features)

    n_items = len(features)
    X = np.arange(n_items)

    assert len(properties) == n_items

    # Train
    n_splits = 5
    n_train = misc.load_npy(args.scratch + "n_train")

    fold_five = sklearn.model_selection.KFold(n_splits=n_splits,
                                              random_state=45,
                                              shuffle=True)

    scores = []

    for i, (idxs_train, idxs_test) in enumerate(fold_five.split(X)):

        # Shuffle the training indices with a per-fold seed so that every
        # prefix idxs_train[:n] is an unordered but reproducible subset.
        np.random.seed(45 + i)
        np.random.shuffle(idxs_train)

        learning_curve = []

        for n in n_train:
            idxs = idxs_train[:n]

            # signed difference
            sign_diff = fit_model(features, idxs, idxs_test)

            # rmse
            rmse_test = np.sqrt((sign_diff**2).mean())

            learning_curve.append(rmse_test)

        scores.append(learning_curve)

    # One row per training-set size, one column per fold.
    scores = np.array(scores).T

    mean_score = np.mean(scores, axis=1)
    print(mean_score)
    misc.save_npy(args.scratch + "score.ols", scores)

    return
Пример #8
0
def main(datafile, procs=0, scr="_tmp_"):
    """Convert a stored SMILES database into an SDF file plus a property table.

    Args:
        datafile: Name passed to ``misc.load_obj``; the loaded object maps
            each key (used as a SMILES string) to its stored values.
        procs: ``0`` processes the entries serially with
            ``prepare_sdf_and_csv``; any other value dispatches
            ``prepare_sdf_and_csv_procs`` through ``misc.parallel``.
        scr: Currently unused; the output paths are hard-coded.

    Writes ``data/sdf/structures.sdf.gz`` and ``data/sdf/properties.csv``.
    Each property line is ``<index> <values...>``, where ``index`` is the
    position of the molecule among the records actually written.
    """

    db = misc.load_obj(datafile)

    keys = db.keys()

    print("total keys:", len(keys))

    if procs == 0:

        # Lazily convert one entry at a time.
        results = (prepare_sdf_and_csv(smi, db[smi]) for smi in keys)

    else:

        lines = ((smi, db[smi]) for smi in keys)

        results = misc.parallel(lines,
                                prepare_sdf_and_csv_procs, [], {},
                                procs=procs)

        print("streaming results")

    # Drop failed entries before numbering, so that the serial and parallel
    # paths produce the same contiguous indices.
    results = (result for result in results if result is not None)

    # Write results; the context managers close both files even when a
    # conversion raises part-way through.
    with gzip.open("data/sdf/structures.sdf.gz", 'w') as fsdf, \
            open("data/sdf/properties.csv", 'w') as fprop:

        for i, (molobj, values) in enumerate(results):

            sdfstr = cheminfo.molobj_to_sdfstr(molobj)
            fsdf.write(sdfstr.encode())

            # str() leaves string values unchanged and also allows numeric
            # ones to be joined.
            valuesstr = " ".join(str(value) for value in values)

            # One record per line.
            fprop.write(f"{i} {valuesstr}\n")

    return
Пример #9
0
if __name__ == "__main__":

    # Command-line entry point. Each of --datadict, --data and --sdf enables
    # one independent step; any combination may be given.
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--datadict', action='store', help='', metavar='FILE')
    parser.add_argument('--data', action='store', help='', metavar='FILE')
    parser.add_argument('--sdf', action='store', help='', metavar='FILE')
    # The default mirrors main()'s own `scr` default. Without one, omitting
    # --scratch left args.scratch as None and the trailing-slash check below
    # raised TypeError.
    parser.add_argument('--scratch',
                        action='store',
                        help='',
                        metavar='DIR',
                        default="_tmp_")
    parser.add_argument('-j',
                        '--procs',
                        action='store',
                        help='',
                        type=int,
                        metavar='int',
                        default=0)

    args = parser.parse_args()

    # Ensure a trailing separator; an empty path is left untouched.
    if args.scratch and not args.scratch.endswith("/"):
        args.scratch += "/"

    if args.datadict:
        data = misc.load_obj(args.datadict)
        set_structures(data, args.scratch, procs=args.procs)

    if args.data:
        main(args.data, procs=args.procs)

    if args.sdf:
        # NOTE(review): the value of --sdf is only used as a switch; the
        # path below is hard-coded. Confirm whether args.sdf should be
        # passed instead.
        conformation("data/sdf/structures.sdf.gz", procs=args.procs)
def get_representations_slatm(atoms,
                              structures,
                              scr="_tmp_/",
                              mbtypes=None,
                              debug=True,
                              procs=0,
                              **kwargs):
    """
    Generate one SLATM representation per molecule.

    atoms -- list of molecule atoms
    structures -- list of coordinates, paired element-wise with atoms
    scr -- scratch prefix used to load or store "slatm.mbtypes" when
        mbtypes is not supplied
    mbtypes -- many-body types passed to the SLATM generator; when None they
        are loaded from scratch, or generated from atoms and saved there if
        the file does not exist
    debug -- print a progress message before generating
    procs -- values above 1 spread the work over that many processes
    kwargs -- accepted and ignored

    Returns the representations stacked with np.array.

    """

    if mbtypes is None:

        filename_mbtypes = scr + "slatm.mbtypes"

        try:
            mbtypes = misc.load_obj(filename_mbtypes)
        except FileNotFoundError:

            print("Generate slatm mbtypes")
            mbtypes = qml.representations.get_slatm_mbtypes(atoms)
            misc.save_obj(filename_mbtypes, mbtypes)

    if debug:
        print("Generate slatm representations")

    if procs > 1:
        # Set OMP -- presumably so that every worker process stays
        # single-threaded; note the variable is not restored afterwards.
        os.environ["OMP_NUM_THREADS"] = "1"

        workargs = list(zip(structures, atoms))
        funcname = partial(procs_representation_slatm, mbtypes=mbtypes)

        pool = Pool(processes=procs)
        try:
            replist = pool.map(funcname, workargs)
        finally:
            # Release the worker processes; previously the pool was left
            # open for the lifetime of the interpreter.
            pool.close()
            pool.join()

    else:
        replist = [
            qml.representations.generate_slatm(coord, atom, mbtypes)
            for coord, atom in zip(structures, atoms)
        ]

    return np.array(replist)
def generate_conformer_representation(scr="_tmp_ensemble_/",
                                      procs=0,
                                      n_total=3456):
    """Build the averaged SLATM representation of every item and save it.

    scr -- scratch prefix; "slatm.mbtypes" and "atoms" are read from it and
        "repr.avgslatm" is written to it
    procs -- 0 runs serially; any other value is used as the size of the
        worker pool
    n_total -- number of items to process (indices 0 .. n_total - 1); the
        default is the value that used to be hard-coded
    """

    name = "slatm"

    # NOTE(review): get_representations_slatm stores "slatm.mbtypes" with
    # misc.save_obj, while this reads it with misc.load_npy -- confirm that
    # both refer to the same file.
    mbtypes = misc.load_npy(scr + "slatm.mbtypes")

    # TODO Calculate max_size
    mol_atoms = misc.load_obj(scr + "atoms")
    max_atoms = max(len(atoms) for atoms in mol_atoms)

    kwargs = {
        "name": name,
        "mbtypes": mbtypes,
        "debug": False,
        "max_atoms": max_atoms,
    }

    idxs = range(n_total)

    if procs == 0:

        avgreps = [0] * n_total

        for idx in idxs:
            # get_avg_repr hands back the index it processed; store the
            # result at that position.
            position, avgrep = get_avg_repr(idx, **kwargs)
            avgreps[position] = avgrep

    else:

        # Probe a single item to learn the representation length.
        _, rep = get_avg_repr(0, **kwargs)
        rep_size = rep.shape[0]
        print("rep size", rep_size)

        m = MyManager()
        m.start()

        # Array obtained from the manager; it is handed to every worker
        # through the "array" keyword.
        results = m.np_zeros((n_total, rep_size))

        kwargs["array"] = results
        func = partial(get_avg_repr, **kwargs)

        # Size the pool from `procs` instead of a hard-coded 32 workers and
        # release it once the work is done.
        pool = Pool(procs)
        try:
            pool.map(func, idxs)
        finally:
            pool.close()
            pool.join()

        avgreps = results

    avgreps = np.array(avgreps)
    misc.save_npy(scr + "repr.avgslatm", avgreps)

    return