Exemplo n.º 1
0
def main():
    """Benchmark a file-backed LSH forest on random binary data.

    Generates ``n`` random binary vectors of length ``d``, MinHash-encodes
    each of them and adds it to the forest, builds the index and finally
    constructs the 10-nearest-neighbour graph. The wall-clock time of each
    phase is printed in milliseconds.
    """

    # NOTE(review): the encoder is created with 1024 while the forest is
    # created with 128 -- confirm that this size mismatch is intended.
    enc = tm.Minhash(1024)
    lf = tm.LSHForest(128, file_backed=True)

    # Alternative benchmark size: d = 1000, n = 1000000
    d = 10000
    n = 1000

    # Generate the random data and add it to the forest entry by entry
    start = timer()
    for _ in range(n):
        # The random vector is a dense 0/1 array, so it is encoded with
        # from_binary_array. Passing it to from_sparse_binary_array would
        # treat the values 0 and 1 as set-bit indices and make every entry
        # hash identically.
        lf.add(
            enc.from_binary_array(
                tm.VectorUchar(np.random.randint(0, high=2, size=d))))

    print(f"Generating the data took {(timer() - start) * 1000}ms.")

    # Index the added data
    start = timer()
    lf.index()
    print(f"Indexing took {(timer() - start) * 1000}ms.")

    # Construct the k-nearest neighbour graph (k = 10); the edges are
    # written into the three output vectors.
    start = timer()
    knng_from = tm.VectorUint()
    knng_to = tm.VectorUint()
    knng_weight = tm.VectorFloat()

    _ = lf.get_knn_graph(knng_from, knng_to, knng_weight, 10)
    print(f"The kNN search took {(timer() - start) * 1000}ms.")
Exemplo n.º 2
0
    def test_knn_graph(self):
        """Check edge count and first targets of the kNN graph on seeded data."""
        random.seed(42)
        # 100 rows of 10 random integers in [0, 20], drawn row by row
        data = [
            tm.VectorUint([random.randint(0, 20) for _ in range(10)])
            for _ in range(100)
        ]

        encoder = tm.Minhash()
        forest = tm.LSHForest()
        forest.batch_add(encoder.batch_from_sparse_binary_array(data))
        forest.index()

        # Output vectors filled by get_knn_graph
        f, t, w = tm.VectorUint(), tm.VectorUint(), tm.VectorFloat()
        forest.get_knn_graph(f, t, w, 10)

        # 100 entries with k = 10 neighbours each
        assert len(f) == 1000
        for position, expected_target in enumerate([0, 26, 36, 67, 33, 83]):
            assert t[position] == expected_target
Exemplo n.º 3
0
 def test_from_sparse_binary_array(self):
     """Check MinHash length and distance for two sparse index arrays."""
     encoder = tm.Minhash(8)
     # The two index arrays differ only in their fifth element (626 vs 262)
     first = tm.VectorUint([6, 22, 26, 62, 626, 226622])
     second = tm.VectorUint([6, 22, 26, 62, 262, 226622])
     a = encoder.from_sparse_binary_array(first)
     b = encoder.from_sparse_binary_array(second)

     assert len(a) == 8
     distance = encoder.get_distance(a, b)
     assert round(distance, 2) == 0.25
Exemplo n.º 4
0
def MolsToLSHForest(mol_list,
                    save_path="./",
                    worker=os.cpu_count() - 1,
                    batch_size=None):
    """Convert molecules to fingerprints, index them in an LSH forest and store it.

    Arguments:
        mol_list -- sized collection of molecules handed to the converter
        save_path -- output directory; created if it does not exist
        worker -- number of CPU cores reported in the log output
                  (it is not forwarded to the converters here)
        batch_size -- if truthy, ``batch_convert`` is used with this batch
                      size, otherwise ``single_convert``

    Returns:
        tuple -- the LSH forest and the props returned by the converter
    """
    # The body previously read an undefined ``outpath`` and ignored the
    # ``save_path`` parameter; bind the name to the parameter.
    outpath = save_path

    print('Available CPU Cores =', os.cpu_count())
    print('Number of CPU Core used =', worker)
    print('\nTotal Number of Mols =', len(mol_list))
    if batch_size:
        print('Batch Size =', batch_size)
    os.makedirs(outpath, exist_ok=True)
    print('Saving Files at', outpath)
    sys.stdout.flush()

    # NOTE(review): ``Props`` is not defined in this function; presumably a
    # module-level named tuple type -- confirm.
    if batch_size:
        fps, props = batch_convert(mol_list,
                                   batch_size,
                                   outpath,
                                   props_named_tuple=Props)
    else:
        fps, props = single_convert(mol_list, outpath, props_named_tuple=Props)

    print("Loading data and converting to LSH Forest data")
    print('Converting MinHash Fingerprints to Vectors')
    sys.stdout.flush()
    fps = [tm.VectorUint(fp) for fp in fps]
    print(len(fps), 'Fingerprints Converted')
    sys.stdout.flush()

    # NOTE(review): ``lf`` is not created in this function; presumably a
    # module-level tm.LSHForest -- confirm.
    lf.batch_add(fps)
    lf.index()
    lf.store(os.path.join(outpath, "lf.dat"))

    return lf, props
Exemplo n.º 5
0
def calc_mhfp(ENC, mol):
    """Calculate the minhashed fingerprint of a molecule.

    Arguments:
        ENC -- encoder whose ``encode`` method is called with the SMILES string
        mol -- molecule that is converted to SMILES via ``Chem.MolToSmiles``

    Returns:
        tmap VectorUint -- minhashed fingerprint
    """
    return tm.VectorUint(ENC.encode(Chem.MolToSmiles(mol)))
Exemplo n.º 6
0
def main():
    """Time batch insertion, indexing and kNN graph construction on random data."""

    # Encoder and forest are both created with 128
    enc = tm.Minhash(128)
    lf = tm.LSHForest(128)

    d = 1000
    n = 10000

    # Generate n random binary vectors of length d
    t_start = timer()
    data = [
        tm.VectorUchar(np.random.randint(0, high=2, size=d)) for _ in range(n)
    ]
    print(f"Generating the data took {(timer() - t_start) * 1000}ms.")

    # Encode all vectors in one batch and add them to the forest
    t_start = timer()
    hashes = enc.batch_from_binary_array(data)
    lf.batch_add(hashes)
    print(f"Adding the data took {(timer() - t_start) * 1000}ms.")

    # Build the index over the added data
    t_start = timer()
    lf.index()
    print(f"Indexing took {(timer() - t_start) * 1000}ms.")

    # Construct the k-nearest neighbour graph (k = 10); the edges are
    # written into the three output vectors.
    t_start = timer()
    knng_from, knng_to = tm.VectorUint(), tm.VectorUint()
    knng_weight = tm.VectorFloat()

    _ = lf.get_knn_graph(knng_from, knng_to, knng_weight, 10)
    print(f"The kNN search took {(timer() - t_start) * 1000}ms.")
Exemplo n.º 7
0
    def test_lf_layout(self):
        """Check the sizes returned by the layout of a 100-entry forest."""
        random.seed(42)
        # 100 rows of 10 random integers in [0, 20], drawn row by row
        data = [
            tm.VectorUint([random.randint(0, 20) for _ in range(10)])
            for _ in range(100)
        ]

        encoder = tm.Minhash()
        forest = tm.LSHForest()
        forest.batch_add(encoder.batch_from_sparse_binary_array(data))
        forest.index()

        layout = tm.layout_from_lsh_forest(forest)
        x, _, s, _, _ = layout
        # One coordinate per entry, one edge fewer than entries
        assert len(x) == 100
        assert len(s) == 99
Exemplo n.º 8
0
    def test_query(self):
        """Check forest size and the first two linear-scan results for entry 0."""
        random.seed(42)
        # 100 rows of 10 random integers in [0, 20], drawn row by row
        data = [
            tm.VectorUint([random.randint(0, 20) for _ in range(10)])
            for _ in range(100)
        ]

        encoder = tm.Minhash()
        forest = tm.LSHForest()
        forest.batch_add(encoder.batch_from_sparse_binary_array(data))
        forest.index()

        assert forest.size() == len(data)

        # The second element of each result tuple is compared against the
        # expected ids below.
        results = forest.query_linear_scan_by_id(0, 10)
        best, runner_up = results[0], results[1]
        assert best[1] == 0
        assert runner_up[1] == 26
Exemplo n.º 9
0
def LSH_Convert(mols, outpath, num_workers):
    """Fingerprint the molecules, index them in an LSH forest and save both.

    Writes ``fps.pickle`` (the raw fingerprints) and ``lf.dat`` (the stored
    forest) into ``outpath`` and returns the forest.
    """
    # MHFP encoder and LSH forest, both created with 1024
    encoder = MHFPEncoder(1024)
    forest = tm.LSHForest(1024, 64)

    print("Number of mols to be hashed:", len(mols))
    # Encode the molecules in chunks of 100 across the worker pool
    fps = process_map(encoder.encode_mol,
                      mols,
                      chunksize=100,
                      max_workers=num_workers)

    forest.batch_add([tm.VectorUint(fp) for fp in fps])
    forest.index()

    # Persist the fingerprints first, then the forest
    fps_path = os.path.join(outpath, "fps.pickle")
    with open(fps_path, "wb") as fps_file:
        pickle.dump(fps, fps_file)

    forest_path = os.path.join(outpath, "lf.dat")
    forest.store(forest_path)
    print('LSH data files saved!')
    return forest
Exemplo n.º 10
0
def main():
    """Lay out the papers from ``papers.tar.xz`` with tmap and plot them with Faerun.

    Each paper text is tokenised, the ``n`` most common words are discarded,
    and the remaining vocabulary indices form a sparse fingerprint per paper.
    Two scatter layers are drawn: one coloured by whether the text contains
    the word "deep", one coloured by the ``year`` column.
    """
    df = pd.read_csv("papers.tar.xz")
    df.drop(df.tail(1).index, inplace=True)
    df["title"] = df["title"].apply(lambda title: title.replace("'", '"'))
    enc = tm.Minhash()
    lf = tm.LSHForest()

    # Tokenise every paper: keep letters and hyphens only, drop tokens of
    # two characters or fewer, lowercase the rest.
    word_counts = Counter()
    texts = []
    for _, row in df.iterrows():
        cleaned = re.sub(r"[^a-zA-Z-]+", " ", row["paper_text"])
        tokens = [tok.lower() for tok in cleaned.split(" ") if len(tok) > 2]
        word_counts.update(tokens)
        texts.append(tokens)

    # Remove the top n words; the reversed slice walks the ranking from
    # its least common end.
    n = 6000
    ranking = word_counts.most_common()
    kept_words = ranking[: -(len(word_counts) - n) - 1 : -1]

    # Word -> index lookup map for fast fingerprint construction
    all_words = {word: index for index, (word, _) in enumerate(kept_words)}

    # Flag per paper: 1 if the word "deep" occurs in it, else 0
    has_word = [1 if "deep" in tokens else 0 for tokens in texts]

    # One sparse fingerprint per paper, built from the vocabulary indices
    fingerprints = [
        tm.VectorUint([all_words[tok] for tok in tokens if tok in all_words])
        for tokens in texts
    ]

    # Index the article fingerprints
    lf.batch_add(enc.batch_from_sparse_binary_array(fingerprints))
    lf.index()

    # Create the tmap
    config = tm.LayoutConfiguration()
    config.k = 100
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, config=config)

    faerun = Faerun(
        view="front", coords=False, legend_title="", legend_number_format="{:.0f}"
    )

    # Larger circles coloured by the "deep" flag; the smaller year-coloured
    # circles added afterwards are drawn with a lower point scale.
    word_layer = {"x": x, "y": y, "c": has_word, "labels": df["title"]}
    faerun.add_scatter(
        "NIPS_word",
        word_layer,
        colormap="Set1",
        point_scale=7.5,
        max_point_size=25,
        shader="smoothCircle",
        has_legend=True,
        categorical=True,
        legend_title="Contains word<br/>'deep'",
        legend_labels=[(0, "No"), (1, "Yes")],
        interactive=False,
    )

    # Smaller circles coloured by publication year
    year_layer = {"x": x, "y": y, "c": df["year"], "labels": df["title"]}
    faerun.add_scatter(
        "NIPS",
        year_layer,
        colormap="gray",
        point_scale=5.0,
        max_point_size=20,
        shader="smoothCircle",
        has_legend=True,
        legend_title="Year of<br/>Publication",
    )

    # Tree edges, attached to the "NIPS" scatter
    tree_edges = {"from": s, "to": t}
    faerun.add_tree("NIPS_tree", tree_edges, point_helper="NIPS", color="#666666")

    faerun.plot("nips_papers")
import pickle
import random
import sys

import numpy as np
import tmap as tm

# Script: read fingerprints from the file given as the first command line
# argument, pickle a row-index -> metadata dictionary next to it and store
# an LSH forest built from the fingerprints.

# Fingerprints in input order; the list index matches the key in findCID
fps = []
# Row index -> the first three space-separated fields of that row
findCID = {}

with open(sys.argv[1]) as inFile:
    for i, line in enumerate(inFile):
        line = line.strip().split(' ')
        findCID[i] = [line[0], line[1], line[2]]
        # The fourth field holds the fingerprint as ';'-separated integers
        fp = tm.VectorUint(np.array(list(map(int, line[3].split(';')))))
        fps.append(fp)

# Use a context manager so the dictionary file is flushed and closed
with open('{}_dictionary'.format(sys.argv[1]), 'wb') as dictFile:
    pickle.dump(findCID, dictFile)

lf = tm.LSHForest(512, 32)
lf.batch_add(fps)

lf.index()
lf.store('{}_LSHforest'.format(sys.argv[1]))
Exemplo n.º 12
0
def main():
    """Build a tmap of the molecules in ``drugbank.csv`` and plot it with Faerun.

    Rows without a ``SMILES`` value are dropped. For every molecule that
    parses, has more than five atoms and whose SMILES contains fewer than
    two ``.`` characters, an MHFP fingerprint and a set of descriptors are
    collected. The fingerprints are indexed in an LSH forest, laid out with
    tmap and drawn as one scatter with eleven colour series plus the tree.

    Columns are read by position: 0 and 1 go into the label (shown as
    "Drugbank ID" and "Name"), 3 is the ``;``-separated group field and 6
    is used as the SMILES string.
    """
    df = pd.read_csv("drugbank.csv").dropna(subset=["SMILES"]).reset_index(
        drop=True)
    enc = MHFPEncoder()
    lf = tm.LSHForest(2048, 128)

    fps = []
    labels = []
    groups = []
    tpsa = []
    logp = []
    mw = []
    h_acceptors = []
    h_donors = []
    ring_count = []
    is_lipinski = []
    has_coc = []
    has_sa = []
    has_tz = []

    # Substructure queries: ether, sulfonamide, tetrazole
    substruct_coc = AllChem.MolFromSmiles("COC")
    substruct_sa = AllChem.MolFromSmiles("NS(=O)=O")
    substruct_tz = AllChem.MolFromSmiles("N1N=NN=C1")

    total = len(df)
    for i, row in df.iterrows():
        if i % 1000 == 0 and i > 0:
            print(f"{round(100 * (i / total))}% done ...")

        # Use .iloc for positional access: integer keys on a string-labelled
        # Series rely on a deprecated positional fallback in pandas.
        drugbank_id = row.iloc[0]
        name = row.iloc[1]
        smiles = row.iloc[6]
        mol = AllChem.MolFromSmiles(smiles)

        if mol and mol.GetNumAtoms() > 5 and smiles.count(".") < 2:
            fps.append(tm.VectorUint(enc.encode_mol(mol, min_radius=0)))
            labels.append(
                f'{smiles}__<a href="https://www.drugbank.ca/drugs/{drugbank_id}" target="_blank">{drugbank_id}</a>__{name}'
                .replace("'", ""))
            # Only the first of the ';'-separated groups is used
            groups.append(row.iloc[3].split(";")[0])
            tpsa.append(Descriptors.TPSA(mol))
            logp.append(Descriptors.MolLogP(mol))
            mw.append(Descriptors.MolWt(mol))
            h_acceptors.append(Descriptors.NumHAcceptors(mol))
            h_donors.append(Descriptors.NumHDonors(mol))
            ring_count.append(Descriptors.RingCount(mol))
            is_lipinski.append(lipinski_pass(mol))
            has_coc.append(mol.HasSubstructMatch(substruct_coc))
            has_sa.append(mol.HasSubstructMatch(substruct_sa))
            has_tz.append(mol.HasSubstructMatch(substruct_tz))

    # Create the labels and the integer encoded array for the groups,
    # as they're categorical
    labels_groups, groups = Faerun.create_categories(groups)
    # Rank-transform the continuous descriptors and scale the ranks by the
    # number of molecules
    tpsa_ranked = ss.rankdata(np.array(tpsa) / max(tpsa)) / len(tpsa)
    logp_ranked = ss.rankdata(np.array(logp) / max(logp)) / len(logp)
    mw_ranked = ss.rankdata(np.array(mw) / max(mw)) / len(mw)
    h_acceptors_ranked = ss.rankdata(
        np.array(h_acceptors) / max(h_acceptors)) / len(h_acceptors)
    h_donors_ranked = ss.rankdata(
        np.array(h_donors) / max(h_donors)) / len(h_donors)
    ring_count_ranked = ss.rankdata(
        np.array(ring_count) / max(ring_count)) / len(ring_count)

    lf.batch_add(fps)
    lf.index()
    cfg = tm.LayoutConfiguration()
    cfg.k = 100
    cfg.sl_repeats = 2
    cfg.mmm_repeats = 2
    cfg.node_size = 2
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, config=cfg)

    # Colormap for the categorical group series
    custom_cmap = ListedColormap(
        [
            "#2ecc71", "#9b59b6", "#ecf0f1", "#e74c3c", "#e67e22", "#f1c40f",
            "#95a5a6"
        ],
        name="custom",
    )

    # Two-colour map for the yes/no series
    bin_cmap = ListedColormap(["#e74c3c", "#2ecc71"], name="bin_cmap")

    f = Faerun(
        clear_color="#222222",
        coords=False,
        view="front",
        impress=
        'made with <a href="http://tmap.gdb.tools" target="_blank">tmap</a><br />and <a href="https://github.com/reymond-group/faerun-python" target="_blank">faerun</a><br /><a href="https://gist.github.com/daenuprobst/5cddd0159c0cf4758fb16b4b4acbef89">source</a>',
    )

    # Eleven series: five categorical followed by six ranked continuous
    # ones. The per-series lists below follow the same order.
    f.add_scatter(
        "Drugbank",
        {
            "x": x,
            "y": y,
            "c": [
                groups,
                is_lipinski,
                has_coc,
                has_sa,
                has_tz,
                tpsa_ranked,
                logp_ranked,
                mw_ranked,
                h_acceptors_ranked,
                h_donors_ranked,
                ring_count_ranked,
            ],
            "labels": labels,
        },
        shader="smoothCircle",
        colormap=[
            custom_cmap,
            bin_cmap,
            bin_cmap,
            bin_cmap,
            bin_cmap,
            "viridis",
            "viridis",
            "viridis",
            "viridis",
            "viridis",
            "viridis",
        ],
        point_scale=2.5,
        # One flag per series; the list previously had only ten entries
        # for the eleven series.
        categorical=[
            True, True, True, True, True, False, False, False, False, False,
            False
        ],
        has_legend=True,
        legend_labels=[
            labels_groups,
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
        ],
        selected_labels=["SMILES", "Drugbank ID", "Name"],
        series_title=[
            "Group",
            "Lipinski",
            "Ethers",
            "Sulfonamides",
            "Tetrazoles",
            "TPSA",
            "logP",
            "Mol Weight",
            "H Acceptors",
            "H Donors",
            "Ring Count",
        ],
        max_legend_label=[
            None,
            None,
            None,
            None,
            None,
            str(round(max(tpsa))),
            str(round(max(logp))),
            str(round(max(mw))),
            str(round(max(h_acceptors))),
            str(round(max(h_donors))),
            str(round(max(ring_count))),
        ],
        min_legend_label=[
            None,
            None,
            None,
            None,
            None,
            str(round(min(tpsa))),
            str(round(min(logp))),
            str(round(min(mw))),
            str(round(min(h_acceptors))),
            str(round(min(h_donors))),
            str(round(min(ring_count))),
        ],
        title_index=2,
        legend_title="",
    )

    f.add_tree("drugbanktree", {"from": s, "to": t}, point_helper="Drugbank")

    f.plot("drugbank", template="smiles")
Exemplo n.º 13
0
def main():
    """Lay out the fingerprints in ``DATA`` with tmap and plot them with Faerun."""

    # Encoder and forest share the same dimensionality
    dims = 2048
    enc = tm.Minhash(dims)
    lf = tm.LSHForest(dims, 128, store=True)

    # One sparse fingerprint per row of DATA
    fps = [tm.VectorUint(list(row)) for row in DATA]

    lf.batch_add(enc.batch_from_sparse_binary_array(fps))
    lf.index()

    x_tmap, y_tmap, s, t, _ = tm.layout_from_lsh_forest(lf, CFG_TMAP)
    # The forest is no longer needed once the layout is computed
    lf.clear()

    # Custom colour map: dark gray followed by the first five tab10 colours
    tab10 = plt.get_cmap("tab10").colors
    colors_gray = [(0.2, 0.2, 0.2), *tab10[:5]]
    custom_cm_gray = LinearSegmentedColormap.from_list(
        "custom_cm_gray", colors_gray, N=len(colors_gray))

    legend_labels = [
        (1, "Rudyard Kipling"),
        (2, "Herbert George Wells"),
        (3, "Charles Darwin"),
        (4, "George Bernard Shaw"),
        (5, "William Wymark Jacobs"),
        (0, "Other"),
    ]

    faerun = Faerun(
        clear_color="#111111",
        view="front",
        coords=False,
        alpha_blending=True,
        legend_title="",
    )

    # Scatter layer coloured by LABELS
    scatter_data = {
        "x": x_tmap,
        "y": y_tmap,
        "c": LABELS,
        "labels": FAERUN_LABELS,
    }
    faerun.add_scatter(
        "gutenberg",
        scatter_data,
        colormap=custom_cm_gray,
        point_scale=4.2,
        max_point_size=10,
        has_legend=True,
        categorical=True,
        legend_title="Authors",
        legend_labels=legend_labels,
        shader="smoothCircle",
        selected_labels=["Author", "Title"],
    )

    # Tree edges, attached to the scatter layer above
    tree_data = {"from": s, "to": t}
    faerun.add_tree(
        "gutenberg_tree",
        tree_data,
        point_helper="gutenberg",
        color="#222222",
    )
    faerun.plot("gutenberg", template="default")