def main():
    """Lay out random binary vectors as a tree and plot it with matplotlib.

    Random 0/1 vectors are MinHash-encoded, added to an LSH forest, indexed
    and passed to ``tm.layout_from_lsh_forest``. The resulting edges and
    points are drawn with matplotlib and saved to
    ``lsh_forest_knng_mpl.png``. The duration of every stage is printed in
    milliseconds.
    """
    # Use 128 permutations to create the MinHash
    encoder = tm.Minhash(128)
    forest = tm.LSHForest(128)

    dimensions = 10000
    sample_count = 1000

    # Generating some random data
    start = timer()
    data = [
        tm.VectorUchar(np.random.randint(0, high=2, size=dimensions))
        for _ in range(sample_count)
    ]
    print(f"Generating the data took {(timer() - start) * 1000}ms.")

    # Use batch_add to parallelize the insertion of the arrays
    start = timer()
    forest.batch_add(encoder.batch_from_binary_array(data))
    print(f"Adding the data took {(timer() - start) * 1000}ms.")

    # Index the added data
    start = timer()
    forest.index()
    print(f"Indexing took {(timer() - start) * 1000}ms.")

    # The configuration for the MST plot
    # Distribute the tree more evenly
    layout_cfg = tm.LayoutConfiguration()
    layout_cfg.sl_scaling_min = 1
    layout_cfg.sl_scaling_max = 1
    layout_cfg.node_size = 1 / 50

    # Construct the k-nearest neighbour graph
    start = timer()
    xs, ys, sources, targets, _ = tm.layout_from_lsh_forest(forest, config=layout_cfg)
    print(f"layout_from_lsh_forest took {(timer() - start) * 1000}ms.")

    # Plot spanning tree layout: one red segment per edge, points on top
    start = timer()
    for src, dst in zip(sources, targets):
        plt.plot(
            [xs[src], xs[dst]],
            [ys[src], ys[dst]],
            "r-",
            linewidth=1.0,
            alpha=0.5,
            zorder=1,
        )

    plt.scatter(xs, ys, s=0.1, zorder=2)
    plt.tight_layout()
    plt.savefig("lsh_forest_knng_mpl.png")
    print(f"Plotting using matplotlib took {(timer() - start) * 1000}ms.")
def tree_coords(lf, node_size=1 / 20, k=20, mmm_rps=2):
    """Compute tmap layout coordinates and edges for an LSH forest.

    Args:
        lf: Forest passed to ``tm.layout_from_lsh_forest``.
        node_size: Value assigned to ``LayoutConfiguration.node_size``.
        k: Value assigned to ``LayoutConfiguration.k``.
        mmm_rps: Value assigned to ``LayoutConfiguration.mmm_repeats``.

    Returns:
        A tuple of four lists ``(x, y, s, t)``: the x and y coordinates of
        the vertices and the ids of the vertices spanning each edge.
    """
    print('Converting to tmap coordinates')

    # Create and fill a LayoutConfiguration instance
    layout_cfg = tm.LayoutConfiguration()
    settings = {
        "node_size": node_size,
        "mmm_repeats": mmm_rps,
        "sl_extra_scaling_steps": 5,
        "k": k,
        "sl_scaling_type": tm.RelativeToAvgLength,
    }
    for attribute, value in settings.items():
        setattr(layout_cfg, attribute, value)

    # Create the minimum spanning tree layout from the forest; the fifth
    # return value (information on the graph) is ignored.
    xs, ys, sources, targets, _ = tm.layout_from_lsh_forest(lf, layout_cfg)

    return list(xs), list(ys), list(sources), list(targets)
def main():
    """Draw a random graph beside the layout of its spanning tree.

    A random weighted edge list over ``n`` vertices is laid out twice with
    ``tm.layout_from_edge_list``: once as the full graph
    (``create_mst=False``) and once as a spanning tree (``create_mst=True``).
    The left axis shows the graph in black with the tree edges overlaid in
    red; the right axis shows the tree in its own layout. The figure is
    saved to ``spanning_tree_big.png``.
    """
    n = 1000
    edge_list = []
    weights = {}

    # Create a random graph
    for i in range(n):
        for j in np.random.randint(0, high=n, size=2):
            j = int(j)

            # Do not add parallel edges here, to be sure to have the right
            # weight later. The map is kept symmetric, so looking up one
            # direction is sufficient.
            if j in weights.get(i, {}):
                continue

            # Draw a plain float; np.random.rand(1) would yield a 1-element
            # array and rely on implicit array-to-scalar conversion later.
            weight = float(np.random.rand())
            edge_list.append([i, j, weight])

            # Store the weights in a 2d map for easy access.
            # Invert weights to make lower ones more visible in the plot.
            inverted = 1.0 - weight
            weights.setdefault(i, {})[j] = inverted
            weights.setdefault(j, {})[i] = inverted

    # Set the initial randomized positioning to True
    # Otherwise, OGDF tends to segfault
    cfg = tm.LayoutConfiguration()
    cfg.fme_randomize = True

    # The configuration for the MST plot
    # Distribute the tree more evenly
    cfg_mst = tm.LayoutConfiguration()
    cfg_mst.sl_scaling_min = 1
    cfg_mst.sl_scaling_max = 1

    # Compute the layouts
    x, y, s, t, _ = tm.layout_from_edge_list(
        n, edge_list, config=cfg, create_mst=False
    )
    x_mst, y_mst, s_mst, t_mst, _ = tm.layout_from_edge_list(
        n, edge_list, config=cfg_mst, create_mst=True
    )

    _, (ax1, ax2) = plt.subplots(ncols=2, sharey=True)

    # Plot graph layout with spanning tree superimposed in red
    for src, dst in zip(s, t):
        ax1.plot(
            [x[src], x[dst]],
            [y[src], y[dst]],
            "k-",
            linewidth=weights[src][dst],
            alpha=0.5,
            zorder=1,
        )

    for src, dst in zip(s_mst, t_mst):
        ax1.plot(
            [x[src], x[dst]],
            [y[src], y[dst]],
            "r-",
            linewidth=weights[src][dst],
            alpha=0.5,
            zorder=2,
        )

    ax1.scatter(x, y, s=0.1, zorder=3)

    # Plot spanning tree layout
    for src, dst in zip(s_mst, t_mst):
        ax2.plot(
            [x_mst[src], x_mst[dst]],
            [y_mst[src], y_mst[dst]],
            "r-",
            linewidth=weights[src][dst],
            alpha=0.5,
            zorder=1,
        )

    ax2.scatter(x_mst, y_mst, s=0.1, zorder=2)

    plt.tight_layout()
    plt.savefig("spanning_tree_big.png")
from tqdm import tqdm
import os
import tmap as tm

# Load the data: training and test sets are merged into single arrays
MN = MNIST("./data/mnist-data")
IMAGES_TRAIN, LABELS_TRAIN = MN.load_training()
IMAGES_TEST, LABELS_TEST = MN.load_testing()
IMAGES = np.concatenate((IMAGES_TRAIN, IMAGES_TEST))
LABELS = np.concatenate((LABELS_TRAIN, LABELS_TEST))
IMAGE_LABELS = []

# Configuration for the tmap layout
CFG = tm.LayoutConfiguration()
CFG.node_size = 1 / 50


def main():
    """ Main function """

    # Initialize and configure tmap
    dims = 1024
    enc = tm.Minhash(dims)
    lf = tm.LSHForest(dims, 128)

    print("Converting images ...")
    for image in tqdm(IMAGES):
        # Split the flat pixel sequence into 28 equal rows before building the image
        img = Image.fromarray(np.uint8(np.split(np.array(image), 28)))
        buffered = BytesIO()
def main():
    """Build a Faerun plot of the papers listed in ``papers.tar.xz``.

    Each paper text is tokenised into lower-cased words, the ``n`` most
    common words are discarded, and the remaining vocabulary is used to build
    sparse fingerprints. These are MinHash-encoded, indexed in an LSH forest
    and laid out with tmap. The plot, created under the name
    ``nips_papers``, has two scatter layers (presence of the word "deep",
    and publication year) plus the tree edges.
    """
    df = pd.read_csv("papers.tar.xz")
    df.drop(df.tail(1).index, inplace=True)
    df["title"] = df["title"].apply(lambda t: t.replace("'", '"'))

    enc = tm.Minhash()
    lf = tm.LSHForest()

    ctr = Counter()
    texts = []

    for paper_text in df["paper_text"]:
        text = re.sub(r"[^a-zA-Z-]+", " ", paper_text)
        text = [t.lower() for t in text.split(" ") if len(t) > 2]
        ctr.update(text)
        texts.append(text)

    # Remove the top n words. Slicing from n onwards stays correct (empty)
    # when fewer than n distinct words exist; the reversal keeps the word
    # ids identical to the previous negative-step slice.
    n = 6000
    remaining = ctr.most_common()[n:][::-1]

    # Make it fast using a lookup map
    all_words = {word: i for i, (word, _) in enumerate(remaining)}

    # Create the fingerprints and also check whether the word
    # "deep" is found in the document
    fingerprints = []
    has_word = []

    for text in texts:
        has_word.append(1 if "deep" in text else 0)
        fingerprint = [all_words[t] for t in text if t in all_words]
        fingerprints.append(tm.VectorUint(fingerprint))

    # Index the article fingerprints
    lf.batch_add(enc.batch_from_sparse_binary_array(fingerprints))
    lf.index()

    # Create the tmap
    config = tm.LayoutConfiguration()
    config.k = 100
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, config=config)

    faerun = Faerun(
        view="front", coords=False, legend_title="", legend_number_format="{:.0f}"
    )

    # Add a scatter with bigger points than the year layer added below,
    # to add colored circles.
    faerun.add_scatter(
        "NIPS_word",
        {"x": x, "y": y, "c": has_word, "labels": df["title"]},
        colormap="Set1",
        point_scale=7.5,
        max_point_size=25,
        shader="smoothCircle",
        has_legend=True,
        categorical=True,
        legend_title="Contains word<br/>'deep'",
        legend_labels=[(0, "No"), (1, "Yes")],
        interactive=False,
    )

    # Add a scatter that is colored by year on top
    faerun.add_scatter(
        "NIPS",
        {"x": x, "y": y, "c": df["year"], "labels": df["title"]},
        colormap="gray",
        point_scale=5.0,
        max_point_size=20,
        shader="smoothCircle",
        has_legend=True,
        legend_title="Year of<br/>Publication",
    )

    faerun.add_tree(
        "NIPS_tree", {"from": s, "to": t}, point_helper="NIPS", color="#666666"
    )

    faerun.plot("nips_papers")
def main():
    """Plot a previously stored ChEMBL LSH forest, colored by target class.

    Compound data is read from ``chembl.pickle`` and the forest is restored
    from ``chembl.dat``. Target classes are mapped to the integer ids used by
    the legend, the forest is laid out with tmap (the layout time in seconds
    is printed), and a Faerun plot named ``chembl`` is created with the
    ``smiles`` template.
    """
    dims = 512

    lf = tm.LSHForest(dims, 128, store=True)

    # Due to the large data size (> 1GB) the following files are not provided directly
    # NOTE(review): pickle.load can execute arbitrary code; only use this
    # with a pickle file from a trusted source.
    with open("chembl.pickle", "rb") as handle:
        # The activity values are part of the pickle but are not used here.
        smiles, target_class, _activity, chembl_id = pickle.load(handle)

    labels = [
        smi
        + "__"
        + cid
        + "__"
        + f'<a target="_blank" href="https://www.ebi.ac.uk/chembl/compound_report_card/{cid}">{cid}</a>'
        for smi, cid in zip(smiles, chembl_id)
    ]

    lf.restore("chembl.dat")

    classes = [
        "enzyme",
        "kinase",
        "protease",
        "cytochrome p450",
        "ion channel",
        "transporter",
        "transcription factor",
        "membrane receptor",
        "epigenetic regulator",
    ]

    # Number the known classes in alphabetical order, skipping id 7, which
    # is reserved for every class that is not listed above ("Other").
    target_class_map = {}
    next_id = 0
    for name in sorted(set(target_class)):
        if name not in classes:
            target_class_map[name] = 7
            continue
        target_class_map[name] = next_id
        next_id += 1
        if next_id == 7:
            next_id = 8

    cfg = tm.LayoutConfiguration()
    cfg.node_size = 1 / 70
    cfg.mmm_repeats = 2
    cfg.sl_repeats = 2

    start = timer()
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, cfg)
    end = timer()
    print(end - start)

    legend_labels = [
        (0, "Cytochrome p450"),
        (1, "Other Enzyme"),
        (2, "Epigenetic Regulator"),
        (3, "Ion Channel"),
        (4, "Kinase"),
        (5, "Membrane Receptor"),
        (6, "Protease"),
        (8, "Transcription Factor"),
        (9, "Transporter"),
        (7, "Other"),
    ]

    vals = [int(target_class_map[name]) for name in target_class]

    faerun = Faerun(view="front", coords=False)
    faerun.add_scatter(
        "chembl",
        {"x": x, "y": y, "c": vals, "labels": labels},
        colormap="tab10",
        point_scale=1.0,
        max_point_size=10,
        has_legend=True,
        categorical=True,
        shader="smoothCircle",
        legend_labels=legend_labels,
        title_index=1,
    )
    faerun.add_tree(
        "chembl_tree", {"from": s, "to": t}, point_helper="chembl", color="#222222"
    )

    faerun.plot("chembl", template="smiles")
""" Visualizing RNA sequencing data using tmap. Data Source: https://gdc.cancer.gov/about-data/publications/pancanatlas """ import numpy as np import pandas as pd from faerun import Faerun import tmap as tm # Coniguration for the tmap layout CFG_TMAP = tm.LayoutConfiguration() CFG_TMAP.k = 50 CFG_TMAP.kc = 50 CFG_TMAP.node_size = 1 / 20 DATA = pd.read_csv("data.csv.xz", index_col=0, sep=",") LABELS = pd.read_csv("labels.csv", index_col=0, sep=",") LABELMAP = {"PRAD": 1, "LUAD": 2, "BRCA": 3, "KIRC": 4, "COAD": 5} LABELS = np.array([int(LABELMAP[v]) for v in LABELS["Class"]], dtype=np.int) def main(): """ Main function """ # Initialize and configure tmap dims = 256 enc = tm.Minhash(len(DATA.columns), 42, dims)
def _rank_normalise(values):
    """Return the ranks of *values* divided by their count.

    The division by ``max(values)`` before ranking is retained so results
    are unchanged; it does not alter the ranking while the maximum is
    positive.
    """
    return ss.rankdata(np.array(values) / max(values)) / len(values)


def main():
    """Build a Faerun plot of the molecules listed in ``drugbank.csv``.

    Rows without a SMILES string are dropped. Every molecule that parses,
    has more than five atoms and at most two dot-separated fragments is
    MHFP-encoded and annotated with descriptors, a Lipinski flag and three
    substructure flags. The fingerprints are indexed in an LSH forest, laid
    out with tmap and plotted under the name ``drugbank`` with the
    ``smiles`` template.
    """
    df = pd.read_csv("drugbank.csv").dropna(subset=["SMILES"]).reset_index(
        drop=True)
    enc = MHFPEncoder()
    lf = tm.LSHForest(2048, 128)

    fps = []
    labels = []
    groups = []
    tpsa = []
    logp = []
    mw = []
    h_acceptors = []
    h_donors = []
    ring_count = []
    is_lipinski = []
    has_coc = []
    has_sa = []
    has_tz = []

    substruct_coc = AllChem.MolFromSmiles("COC")
    substruct_sa = AllChem.MolFromSmiles("NS(=O)=O")
    substruct_tz = AllChem.MolFromSmiles("N1N=NN=C1")

    total = len(df)
    for i, row in df.iterrows():
        if i % 1000 == 0 and i > 0:
            print(f"{round(100 * (i / total))}% done ...")

        # Columns are addressed by position via .iloc; plain integer keys on
        # a string-labelled Series are deprecated in pandas.
        # NOTE(review): positions 0, 1, 3 and 6 presumably hold the DrugBank
        # id, name, groups and SMILES - confirm against the CSV header.
        drug_id = row.iloc[0]
        name = row.iloc[1]
        smiles = row.iloc[6]
        mol = AllChem.MolFromSmiles(smiles)

        if not (mol and mol.GetNumAtoms() > 5 and smiles.count(".") < 2):
            continue

        fps.append(tm.VectorUint(enc.encode_mol(mol, min_radius=0)))
        labels.append(
            f'{smiles}__<a href="https://www.drugbank.ca/drugs/{drug_id}" target="_blank">{drug_id}</a>__{name}'
            .replace("'", ""))
        groups.append(row.iloc[3].split(";")[0])
        tpsa.append(Descriptors.TPSA(mol))
        logp.append(Descriptors.MolLogP(mol))
        mw.append(Descriptors.MolWt(mol))
        h_acceptors.append(Descriptors.NumHAcceptors(mol))
        h_donors.append(Descriptors.NumHDonors(mol))
        ring_count.append(Descriptors.RingCount(mol))
        is_lipinski.append(lipinski_pass(mol))
        has_coc.append(mol.HasSubstructMatch(substruct_coc))
        has_sa.append(mol.HasSubstructMatch(substruct_sa))
        has_tz.append(mol.HasSubstructMatch(substruct_tz))

    # Create the labels and the integer encoded array for the groups,
    # as they're categorical
    labels_groups, groups = Faerun.create_categories(groups)

    tpsa_ranked = _rank_normalise(tpsa)
    logp_ranked = _rank_normalise(logp)
    mw_ranked = _rank_normalise(mw)
    h_acceptors_ranked = _rank_normalise(h_acceptors)
    h_donors_ranked = _rank_normalise(h_donors)
    ring_count_ranked = _rank_normalise(ring_count)

    lf.batch_add(fps)
    lf.index()

    cfg = tm.LayoutConfiguration()
    cfg.k = 100
    cfg.sl_repeats = 2
    cfg.mmm_repeats = 2
    cfg.node_size = 2
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, config=cfg)

    # Define a colormap highlighting approved vs non-approved
    custom_cmap = ListedColormap(
        [
            "#2ecc71", "#9b59b6", "#ecf0f1", "#e74c3c", "#e67e22", "#f1c40f",
            "#95a5a6"
        ],
        name="custom",
    )

    bin_cmap = ListedColormap(["#e74c3c", "#2ecc71"], name="bin_cmap")

    yes_no = [(0, "No"), (1, "Yes")]

    f = Faerun(
        clear_color="#222222",
        coords=False,
        view="front",
        impress=
        'made with <a href="http://tmap.gdb.tools" target="_blank">tmap</a><br />and <a href="https://github.com/reymond-group/faerun-python" target="_blank">faerun</a><br /><a href="https://gist.github.com/daenuprobst/5cddd0159c0cf4758fb16b4b4acbef89">source</a>',
    )

    f.add_scatter(
        "Drugbank",
        {
            "x": x,
            "y": y,
            "c": [
                groups,
                is_lipinski,
                has_coc,
                has_sa,
                has_tz,
                tpsa_ranked,
                logp_ranked,
                mw_ranked,
                h_acceptors_ranked,
                h_donors_ranked,
                ring_count_ranked,
            ],
            "labels": labels,
        },
        shader="smoothCircle",
        colormap=[
            custom_cmap,
            bin_cmap,
            bin_cmap,
            bin_cmap,
            bin_cmap,
            "viridis",
            "viridis",
            "viridis",
            "viridis",
            "viridis",
            "viridis",
        ],
        point_scale=2.5,
        # One flag per series: five categorical series followed by six
        # continuous ones (the flag for the last series used to be missing).
        categorical=[True] * 5 + [False] * 6,
        has_legend=True,
        legend_labels=[
            labels_groups,
            list(yes_no),
            list(yes_no),
            list(yes_no),
            list(yes_no),
        ],
        selected_labels=["SMILES", "Drugbank ID", "Name"],
        series_title=[
            "Group",
            "Lipinski",
            "Ethers",
            "Sulfonamides",
            "Tetrazoles",
            "TPSA",
            "logP",
            "Mol Weight",
            "H Acceptors",
            "H Donors",
            "Ring Count",
        ],
        max_legend_label=[
            None,
            None,
            None,
            None,
            None,
            str(round(max(tpsa))),
            str(round(max(logp))),
            str(round(max(mw))),
            str(round(max(h_acceptors))),
            str(round(max(h_donors))),
            str(round(max(ring_count))),
        ],
        min_legend_label=[
            None,
            None,
            None,
            None,
            None,
            str(round(min(tpsa))),
            str(round(min(logp))),
            str(round(min(mw))),
            str(round(min(h_acceptors))),
            str(round(min(h_donors))),
            str(round(min(ring_count))),
        ],
        title_index=2,
        legend_title="",
    )

    f.add_tree("drugbanktree", {"from": s, "to": t}, point_helper="Drugbank")

    f.plot("drugbank", template="smiles")