from timeit import default_timer as timer

import numpy as np
import matplotlib.pyplot as plt
import tmap as tm


def main():
    """ Main function """

    # Use 128 permutations to create the MinHash
    enc = tm.Minhash(128)
    lf = tm.LSHForest(128)

    d = 10000
    n = 1000

    data = []

    # Generating some random data
    start = timer()
    for i in range(n):
        data.append(tm.VectorUchar(np.random.randint(0, high=2, size=d)))
    print(f"Generating the data took {(timer() - start) * 1000}ms.")

    # Use batch_add to parallelize the insertion of the arrays
    start = timer()
    lf.batch_add(enc.batch_from_binary_array(data))
    print(f"Adding the data took {(timer() - start) * 1000}ms.")

    # Index the added data
    start = timer()
    lf.index()
    print(f"Indexing took {(timer() - start) * 1000}ms.")

    # The configuration for the MST plot
    # Distribute the tree more evenly
    cfg = tm.LayoutConfiguration()
    cfg.sl_scaling_min = 1
    cfg.sl_scaling_max = 1
    cfg.node_size = 1 / 50

    # Construct the k-nearest neighbour graph and compute the MST layout
    start = timer()
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, config=cfg)
    print(f"layout_from_lsh_forest took {(timer() - start) * 1000}ms.")

    # Plot spanning tree layout
    start = timer()
    for i in range(len(s)):
        plt.plot(
            [x[s[i]], x[t[i]]],
            [y[s[i]], y[t[i]]],
            "r-",
            linewidth=1.0,
            alpha=0.5,
            zorder=1,
        )

    plt.scatter(x, y, s=0.1, zorder=2)
    plt.tight_layout()
    plt.savefig("lsh_forest_knng_mpl.png")
    print(f"Plotting using matplotlib took {(timer() - start) * 1000}ms.")
Example #2
import numpy as np
import matplotlib.pyplot as plt
import tmap as tm


def tree_coords(lf, node_size=1 / 20, k=20, mmm_rps=2):
    print('Converting to tmap coordinates')
    # Create a LayoutConfiguration instance
    cfg = tm.LayoutConfiguration()
    cfg.node_size = node_size
    cfg.mmm_repeats = mmm_rps
    cfg.sl_extra_scaling_steps = 5
    cfg.k = k
    cfg.sl_scaling_type = tm.RelativeToAvgLength

    # Create the minimum spanning tree from the LSHForest and the
    # LayoutConfiguration instance. layout_from_lsh_forest returns the x and y
    # coordinates of the vertices and the ids of the vertices spanning the
    # edges; the additional graph information is ignored here.

    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, cfg)
    return list(x), list(y), list(s), list(t)
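
# An illustrative usage sketch for tree_coords (not part of the original
# example): the MinHash/LSHForest dimensions and the random binary data below
# are assumptions chosen only to show the call pattern.
def tree_coords_usage_sketch():
    enc = tm.Minhash(128)
    lf = tm.LSHForest(128)
    data = [
        tm.VectorUchar(np.random.randint(0, high=2, size=256)) for _ in range(100)
    ]
    lf.batch_add(enc.batch_from_binary_array(data))
    lf.index()
    return tree_coords(lf, node_size=1 / 20, k=20)
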
def main():
    """ Main function """

    n = 1000
    edge_list = []
    weights = {}

    # Create a random graph
    for i in range(n):
        for j in np.random.randint(0, high=n, size=2):
            # Skip parallel edges so that every edge ends up with a single,
            # well-defined weight for the plot below
            if (i in weights and j in weights[i]) or (j in weights and i in weights[j]):
                continue

            # Draw a random scalar weight in [0, 1)
            weight = np.random.rand()
            edge_list.append([i, j, weight])

            # Store the weights in a 2D map for easy access
            if i not in weights:
                weights[i] = {}
            if j not in weights:
                weights[j] = {}

            # Invert weights to make lower ones more visible in the plot
            weights[i][j] = 1.0 - weight
            weights[j][i] = 1.0 - weight

    # Set the initial randomized positioning to True
    # Otherwise, OGDF tends to segfault
    cfg = tm.LayoutConfiguration()
    cfg.fme_randomize = True

    # The configuration for the MST plot
    # Distribute the tree more evenly
    cfg_mst = tm.LayoutConfiguration()
    cfg_mst.sl_scaling_min = 1
    cfg_mst.sl_scaling_max = 1

    # Compute the layout
    x, y, s, t, _ = tm.layout_from_edge_list(n, edge_list, config=cfg, create_mst=False)
    x_mst, y_mst, s_mst, t_mst, _ = tm.layout_from_edge_list(
        n, edge_list, config=cfg_mst, create_mst=True
    )

    _, (ax1, ax2) = plt.subplots(ncols=2, sharey=True)

    # Plot graph layout with spanning tree superimposed in red
    for i in range(len(s)):
        ax1.plot(
            [x[s[i]], x[t[i]]],
            [y[s[i]], y[t[i]]],
            "k-",
            linewidth=weights[s[i]][t[i]],
            alpha=0.5,
            zorder=1,
        )

    for i in range(len(s_mst)):
        ax1.plot(
            [x[s_mst[i]], x[t_mst[i]]],
            [y[s_mst[i]], y[t_mst[i]]],
            "r-",
            linewidth=weights[s_mst[i]][t_mst[i]],
            alpha=0.5,
            zorder=2,
        )

    ax1.scatter(x, y, s=0.1, zorder=3)

    # Plot spanning tree layout
    for i in range(len(s_mst)):
        ax2.plot(
            [x_mst[s_mst[i]], x_mst[t_mst[i]]],
            [y_mst[s_mst[i]], y_mst[t_mst[i]]],
            "r-",
            linewidth=weights[s_mst[i]][t_mst[i]],
            alpha=0.5,
            zorder=1,
        )

    ax2.scatter(x_mst, y_mst, s=0.1, zorder=2)

    plt.tight_layout()
    plt.savefig("spanning_tree_big.png")
Example #4
import os
from io import BytesIO

import numpy as np
from tqdm import tqdm
from mnist import MNIST
from PIL import Image

import tmap as tm

# Load the data
MN = MNIST("./data/mnist-data")
IMAGES_TRAIN, LABELS_TRAIN = MN.load_training()
IMAGES_TEST, LABELS_TEST = MN.load_testing()

IMAGES = np.concatenate((IMAGES_TRAIN, IMAGES_TEST))
LABELS = np.concatenate((LABELS_TRAIN, LABELS_TEST))
IMAGE_LABELS = []

# Configuration for the tmap layout
CFG = tm.LayoutConfiguration()
CFG.node_size = 1 / 50


def main():
    """ Main function """

    # Initialize and configure tmap
    dims = 1024
    enc = tm.Minhash(dims)
    lf = tm.LSHForest(dims, 128)

    print("Converting images ...")
    for image in tqdm(IMAGES):
        img = Image.fromarray(np.uint8(np.split(np.array(image), 28)))
        buffered = BytesIO()
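        # The original snippet is truncated at this point. The continuation
        # below is a hedged sketch of the usual remaining steps (encoding each
        # image as a data-URL label, binarizing the pixels, MinHashing and
        # indexing them, then computing the layout); the image format and the
        # binarization threshold are assumptions, not taken from the original.
        import base64  # local import, only needed by this sketch

        img.save(buffered, format="JPEG")
        img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
        IMAGE_LABELS.append("data:image/jpeg;base64," + img_str)

    # Binarize the pixel values and MinHash the resulting binary vectors
    fps = [
        tm.VectorUchar([1 if p > 0 else 0 for p in image]) for image in IMAGES
    ]
    lf.batch_add(enc.batch_from_binary_array(fps))
    lf.index()

    # Compute the layout from the indexed LSH forest
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, CFG)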
Example #5
import re
from collections import Counter

import pandas as pd
from faerun import Faerun
import tmap as tm


def main():
    """ The main function """
    df = pd.read_csv("papers.tar.xz")
    df.drop(df.tail(1).index, inplace=True)
    df["title"] = df["title"].apply(lambda t: t.replace("'", '"'))
    enc = tm.Minhash()
    lf = tm.LSHForest()

    ctr = Counter()
    texts = []
    for _, row in df.iterrows():
        text = re.sub(r"[^a-zA-Z-]+", " ", row["paper_text"])
        text = [t.lower() for t in text.split(" ") if len(t) > 2]
        ctr.update(text)
        texts.append(text)

    # Remove the top n words
    n = 6000
    ctr = ctr.most_common()[: -(len(ctr) - n) - 1 : -1]

    # Build a word-to-index lookup map so membership checks are fast
    all_words = {}
    for i, (key, _) in enumerate(ctr):
        all_words[key] = i

    # Create the fingerprints and also check whether the word
    # "deep" is found in the document
    fingerprints = []
    has_word = []
    for text in texts:
        if "deep" in text:
            has_word.append(1)
        else:
            has_word.append(0)

        fingerprint = []
        for t in text:
            if t in all_words:
                fingerprint.append(all_words[t])
        fingerprints.append(tm.VectorUint(fingerprint))

    # Index the article fingerprints
    lf.batch_add(enc.batch_from_sparse_binary_array(fingerprints))
    lf.index()

    # Create the tmap
    config = tm.LayoutConfiguration()
    config.k = 100
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, config=config)

    faerun = Faerun(
        view="front", coords=False, legend_title="", legend_number_format="{:.0f}"
    )

    # Add a scatter with larger points than the year-colored one added below
    # (which is rendered on top), so that it shows as colored circles around
    # the points.
    faerun.add_scatter(
        "NIPS_word",
        {"x": x, "y": y, "c": has_word, "labels": df["title"]},
        colormap="Set1",
        point_scale=7.5,
        max_point_size=25,
        shader="smoothCircle",
        has_legend=True,
        categorical=True,
        legend_title="Contains word<br/>'deep'",
        legend_labels=[(0, "No"), (1, "Yes")],
        interactive=False,
    )

    # Add a scatter that is colored by year on top
    faerun.add_scatter(
        "NIPS",
        {"x": x, "y": y, "c": df["year"], "labels": df["title"]},
        colormap="gray",
        point_scale=5.0,
        max_point_size=20,
        shader="smoothCircle",
        has_legend=True,
        legend_title="Year of<br/>Publication",
    )

    faerun.add_tree(
        "NIPS_tree", {"from": s, "to": t}, point_helper="NIPS", color="#666666"
    )

    faerun.plot("nips_papers")
Example #6
import pickle
from timeit import default_timer as timer

import numpy as np
from faerun import Faerun
import tmap as tm


def main():
    """ Main function """

    dims = 512
    lf = tm.LSHForest(dims, 128, store=True)

    # Due to the large data size (> 1GB) the following files are not provided directly
    smiles, target_class, activity, chembl_id = pickle.load(
        open("chembl.pickle", "rb"))

    labels = []
    for i, s in enumerate(smiles):
        labels.append(
            s + "__" + chembl_id[i] + "__" +
            f'<a target="_blank" href="https://www.ebi.ac.uk/chembl/compound_report_card/{chembl_id[i]}">{chembl_id[i]}</a>'
        )

    lf.restore("chembl.dat")

    target_class_map = dict([(y, x + 1)
                             for x, y in enumerate(sorted(set(target_class)))])

    classes = [
        "enzyme",
        "kinase",
        "protease",
        "cytochrome p450",
        "ion channel",
        "transporter",
        "transcription factor",
        "membrane receptor",
        "epigenetic regulator",
    ]

    i = 0
    for key, value in target_class_map.items():
        if key not in classes:
            target_class_map[key] = 7
        else:
            target_class_map[key] = i
            i += 1
            if i == 7:
                i = 8

    cfg = tm.LayoutConfiguration()
    cfg.node_size = 1 / 70
    cfg.mmm_repeats = 2
    cfg.sl_repeats = 2

    start = timer()
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, cfg)
    print(f"layout_from_lsh_forest took {timer() - start}s.")

    activity = np.array(activity)
    activity = np.maximum(0.0, activity)
    activity = np.minimum(100.0, activity)
    activity = 10.0 - activity

    legend_labels = [
        (0, "Cytochrome p450"),
        (1, "Other Enzyme"),
        (2, "Epigenetic Regulator"),
        (3, "Ion Channel"),
        (4, "Kinase"),
        (5, "Membrane Receptor"),
        (6, "Protease"),
        (8, "Transcription Factor"),
        (9, "Transporter"),
        (7, "Other"),
    ]

    vals = [int(target_class_map[x]) for x in target_class]

    faerun = Faerun(view="front", coords=False)
    faerun.add_scatter(
        "chembl",
        {
            "x": x,
            "y": y,
            "c": vals,
            "labels": labels
        },
        colormap="tab10",
        point_scale=1.0,
        max_point_size=10,
        has_legend=True,
        categorical=True,
        shader="smoothCircle",
        legend_labels=legend_labels,
        title_index=1,
    )
    faerun.add_tree(
        "chembl_tree", {"from": s, "to": t}, point_helper="chembl", color="#222222"
    )

    faerun.plot("chembl", template="smiles")
Example #7
"""
Visualizing RNA sequencing data using tmap.

Data Source:
https://gdc.cancer.gov/about-data/publications/pancanatlas
"""

import numpy as np
import pandas as pd
from faerun import Faerun
import tmap as tm

# Configuration for the tmap layout
CFG_TMAP = tm.LayoutConfiguration()
CFG_TMAP.k = 50
CFG_TMAP.kc = 50
CFG_TMAP.node_size = 1 / 20

DATA = pd.read_csv("data.csv.xz", index_col=0, sep=",")
LABELS = pd.read_csv("labels.csv", index_col=0, sep=",")

LABELMAP = {"PRAD": 1, "LUAD": 2, "BRCA": 3, "KIRC": 4, "COAD": 5}
LABELS = np.array([int(LABELMAP[v]) for v in LABELS["Class"]], dtype=int)


def main():
    """ Main function """

    # Initialize and configure tmap
    dims = 256
    enc = tm.Minhash(len(DATA.columns), 42, dims)
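    # The original snippet is truncated at this point. The continuation below
    # is a hedged sketch of the usual remaining steps; the LSHForest size, the
    # weighted-MinHash encoding of the expression values and the Faerun plot
    # settings are assumptions, not taken from the original.
    lf = tm.LSHForest(dims, 128)

    # Encode each sample's expression vector with the weighted MinHash
    fps = [tm.VectorFloat(row.tolist()) for _, row in DATA.iterrows()]
    lf.batch_add(enc.batch_from_weight_array(fps))
    lf.index()

    # Compute the layout and plot, coloring the samples by cancer type
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, CFG_TMAP)
    faerun = Faerun(view="front", coords=False)
    faerun.add_scatter(
        "rnaseq",
        {"x": x, "y": y, "c": list(LABELS), "labels": [str(l) for l in LABELS]},
        colormap="tab10",
        point_scale=5.0,
        categorical=True,
        has_legend=True,
    )
    faerun.add_tree("rnaseq_tree", {"from": s, "to": t}, point_helper="rnaseq")
    faerun.plot("rnaseq")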
Example #8
import numpy as np
import pandas as pd
import scipy.stats as ss
from matplotlib.colors import ListedColormap
from mhfp.encoder import MHFPEncoder
from rdkit.Chem import AllChem, Descriptors
from faerun import Faerun
import tmap as tm


def main():
    """ The main function """
    df = pd.read_csv("drugbank.csv").dropna(subset=["SMILES"]).reset_index(
        drop=True)
    enc = MHFPEncoder()
    lf = tm.LSHForest(2048, 128)

    fps = []
    labels = []
    groups = []
    tpsa = []
    logp = []
    mw = []
    h_acceptors = []
    h_donors = []
    ring_count = []
    is_lipinski = []
    has_coc = []
    has_sa = []
    has_tz = []

    substruct_coc = AllChem.MolFromSmiles("COC")
    substruct_sa = AllChem.MolFromSmiles("NS(=O)=O")
    substruct_tz = AllChem.MolFromSmiles("N1N=NN=C1")

    total = len(df)
    for i, row in df.iterrows():
        if i % 1000 == 0 and i > 0:
            print(f"{round(100 * (i / total))}% done ...")

        smiles = row[6]
        mol = AllChem.MolFromSmiles(smiles)

        if mol and mol.GetNumAtoms() > 5 and smiles.count(".") < 2:
            fps.append(tm.VectorUint(enc.encode_mol(mol, min_radius=0)))
            labels.append(
                f'{smiles}__<a href="https://www.drugbank.ca/drugs/{row[0]}" target="_blank">{row[0]}</a>__{row[1]}'
                .replace("'", ""))
            groups.append(row[3].split(";")[0])
            tpsa.append(Descriptors.TPSA(mol))
            logp.append(Descriptors.MolLogP(mol))
            mw.append(Descriptors.MolWt(mol))
            h_acceptors.append(Descriptors.NumHAcceptors(mol))
            h_donors.append(Descriptors.NumHDonors(mol))
            ring_count.append(Descriptors.RingCount(mol))
            is_lipinski.append(lipinski_pass(mol))
            has_coc.append(mol.HasSubstructMatch(substruct_coc))
            has_sa.append(mol.HasSubstructMatch(substruct_sa))
            has_tz.append(mol.HasSubstructMatch(substruct_tz))

    # Create the labels and the integer encoded array for the groups,
    # as they're categorical
    labels_groups, groups = Faerun.create_categories(groups)
    tpsa_ranked = ss.rankdata(np.array(tpsa) / max(tpsa)) / len(tpsa)
    logp_ranked = ss.rankdata(np.array(logp) / max(logp)) / len(logp)
    mw_ranked = ss.rankdata(np.array(mw) / max(mw)) / len(mw)
    h_acceptors_ranked = ss.rankdata(
        np.array(h_acceptors) / max(h_acceptors)) / len(h_acceptors)
    h_donors_ranked = ss.rankdata(
        np.array(h_donors) / max(h_donors)) / len(h_donors)
    ring_count_ranked = ss.rankdata(
        np.array(ring_count) / max(ring_count)) / len(ring_count)

    lf.batch_add(fps)
    lf.index()
    cfg = tm.LayoutConfiguration()
    cfg.k = 100
    # cfg.sl_extra_scaling_steps = 1
    cfg.sl_repeats = 2
    cfg.mmm_repeats = 2
    cfg.node_size = 2
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, config=cfg)

    # Define a colormap highlighting approved vs non-approved
    custom_cmap = ListedColormap(
        [
            "#2ecc71", "#9b59b6", "#ecf0f1", "#e74c3c", "#e67e22", "#f1c40f",
            "#95a5a6"
        ],
        name="custom",
    )

    bin_cmap = ListedColormap(["#e74c3c", "#2ecc71"], name="bin_cmap")

    f = Faerun(
        clear_color="#222222",
        coords=False,
        view="front",
        impress=
        'made with <a href="http://tmap.gdb.tools" target="_blank">tmap</a><br />and <a href="https://github.com/reymond-group/faerun-python" target="_blank">faerun</a><br /><a href="https://gist.github.com/daenuprobst/5cddd0159c0cf4758fb16b4b4acbef89">source</a>',
    )

    f.add_scatter(
        "Drugbank",
        {
            "x": x,
            "y": y,
            "c": [
                groups,
                is_lipinski,
                has_coc,
                has_sa,
                has_tz,
                tpsa_ranked,
                logp_ranked,
                mw_ranked,
                h_acceptors_ranked,
                h_donors_ranked,
                ring_count_ranked,
            ],
            "labels": labels,
        },
        shader="smoothCircle",
        colormap=[
            custom_cmap,
            bin_cmap,
            bin_cmap,
            bin_cmap,
            bin_cmap,
            "viridis",
            "viridis",
            "viridis",
            "viridis",
            "viridis",
            "viridis",
        ],
        point_scale=2.5,
        categorical=[
            True, True, True, True, True, False, False, False, False, False
        ],
        has_legend=True,
        legend_labels=[
            labels_groups,
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
        ],
        selected_labels=["SMILES", "Drugbank ID", "Name"],
        series_title=[
            "Group",
            "Lipinski",
            "Ethers",
            "Sulfonamides",
            "Tetrazoles",
            "TPSA",
            "logP",
            "Mol Weight",
            "H Acceptors",
            "H Donors",
            "Ring Count",
        ],
        max_legend_label=[
            None,
            None,
            None,
            None,
            None,
            str(round(max(tpsa))),
            str(round(max(logp))),
            str(round(max(mw))),
            str(round(max(h_acceptors))),
            str(round(max(h_donors))),
            str(round(max(ring_count))),
        ],
        min_legend_label=[
            None,
            None,
            None,
            None,
            None,
            str(round(min(tpsa))),
            str(round(min(logp))),
            str(round(min(mw))),
            str(round(min(h_acceptors))),
            str(round(min(h_donors))),
            str(round(min(ring_count))),
        ],
        title_index=2,
        legend_title="",
    )

    f.add_tree("drugbanktree", {"from": s, "to": t}, point_helper="Drugbank")

    f.plot("drugbank", template="smiles")