def main():
    """ Time MinHash encoding, LSH forest indexing and kNN graph creation. """
    # NOTE(review): the encoder is created with 1024 while the forest is
    # created with 128 -- confirm that the two sizes are meant to differ.
    encoder = tm.Minhash(1024)
    forest = tm.LSHForest(128, file_backed=True)

    dims = 10000
    n_samples = 1000

    # Encode every random 0/1 vector and add it to the forest immediately,
    # so no intermediate list of vectors is kept around.
    t0 = timer()
    for _ in range(n_samples):
        bits = np.random.randint(0, high=2, size=dims)
        forest.add(encoder.from_sparse_binary_array(tm.VectorUint(bits)))
    print(f"Generating the data took {(timer() - t0) * 1000}ms.")

    # Index the added data
    t0 = timer()
    forest.index()
    print(f"Indexing took {(timer() - t0) * 1000}ms.")

    # Build the kNN graph (k = 10); the three vectors are filled in place
    t0 = timer()
    edges_from = tm.VectorUint()
    edges_to = tm.VectorUint()
    edge_weights = tm.VectorFloat()
    forest.get_knn_graph(edges_from, edges_to, edge_weights, 10)
    print(f"The kNN search took {(timer() - t0) * 1000}ms.")
def test_knn_graph(self):
    """Check size and first targets of the kNN graph for a seeded data set."""
    random.seed(42)
    # 100 rows of 10 random indices in [0, 20]; the draw order matches a
    # nested append loop, so the seeded values are the same.
    data = [
        tm.VectorUint([random.randint(0, 20) for _ in range(10)])
        for _ in range(100)
    ]

    minhash = tm.Minhash()
    forest = tm.LSHForest()
    forest.batch_add(minhash.batch_from_sparse_binary_array(data))
    forest.index()

    sources = tm.VectorUint()
    targets = tm.VectorUint()
    weights = tm.VectorFloat()
    forest.get_knn_graph(sources, targets, weights, 10)

    # 100 points with k = 10 neighbours each
    assert len(sources) == 1000

    expected_targets = (0, 26, 36, 67, 33, 83)
    for position, expected in enumerate(expected_targets):
        assert targets[position] == expected
def test_from_sparse_binary_array(self):
    """Check hash length and distance for two index sets differing in one entry."""
    minhash = tm.Minhash(8)

    # The two index lists differ only in the fifth element (626 vs 262)
    indices_a = tm.VectorUint([6, 22, 26, 62, 626, 226622])
    indices_b = tm.VectorUint([6, 22, 26, 62, 262, 226622])

    a = minhash.from_sparse_binary_array(indices_a)
    b = minhash.from_sparse_binary_array(indices_b)

    assert len(a) == 8

    distance = minhash.get_distance(a, b)
    assert round(distance, 2) == 0.25
def MolsToLSHForest(mol_list, save_path="./", worker=os.cpu_count() - 1, batch_size=None):
    """Convert molecules to fingerprints, index them in an LSH forest and store it.

    Arguments:
        mol_list -- sized collection of molecules handed to the converters
        save_path -- directory the output files are written to (created if missing)
        worker -- number of CPU cores to report; only printed in this function
        batch_size -- when truthy, ``batch_convert`` is used with this batch
            size, otherwise ``single_convert`` is used

    Returns:
        tuple -- ``(lf, props)``: the indexed LSH forest and the properties
        returned by the converter
    """
    # The body previously read an undefined name `outpath` while `save_path`
    # was never used; bind it once so the rest of the function is unchanged.
    outpath = save_path

    print('Available CPU Cores =', os.cpu_count())
    print('Number of CPU Core used =', worker)
    print('\nTotal Number of Mols =', len(mol_list))
    if batch_size:
        print('Batch Size =', batch_size)

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(outpath, exist_ok=True)
    print('Saving Files at', outpath)
    sys.stdout.flush()

    # NOTE(review): `Props` is not defined in this function -- presumably a
    # module-level named tuple; confirm.
    if batch_size:
        fps, props = batch_convert(mol_list, batch_size, outpath, props_named_tuple=Props)
    else:
        fps, props = single_convert(mol_list, outpath, props_named_tuple=Props)

    print("Loading data and converting to LSH Forest data")
    print('Converting MinHash Fingerprints to Vectors')
    sys.stdout.flush()
    fps = [tm.VectorUint(fp) for fp in fps]
    print(len(fps), 'Fingerprints Converted')
    sys.stdout.flush()

    # NOTE(review): `lf` is not created in this function -- presumably a
    # module-level tm.LSHForest instance; confirm it exists before calling.
    lf.batch_add(fps)
    lf.index()
    lf.store(os.path.join(outpath, "lf.dat"))
    return lf, props
def calc_mhfp(ENC, mol):
    """Calculate the minhashed fingerprint of a molecule.

    Arguments:
        ENC -- encoder object whose ``encode`` method is called with the SMILES
        mol -- molecule passed to ``Chem.MolToSmiles``

    Returns:
        tmap VectorUint -- minhashed fingerprint
    """
    return tm.VectorUint(ENC.encode(Chem.MolToSmiles(mol)))
def main():
    """ Time batch MinHash encoding, indexing and kNN graph creation. """
    dims = 1000
    n_samples = 10000

    # Both the encoder and the forest are created with 128
    encoder = tm.Minhash(128)
    forest = tm.LSHForest(128)

    # Generating some random 0/1 vectors
    t0 = timer()
    data = [
        tm.VectorUchar(np.random.randint(0, high=2, size=dims))
        for _ in range(n_samples)
    ]
    print(f"Generating the data took {(timer() - t0) * 1000}ms.")

    # Encode and insert all vectors with the batch variants of the API
    t0 = timer()
    hashes = encoder.batch_from_binary_array(data)
    forest.batch_add(hashes)
    print(f"Adding the data took {(timer() - t0) * 1000}ms.")

    # Index the added data
    t0 = timer()
    forest.index()
    print(f"Indexing took {(timer() - t0) * 1000}ms.")

    # Construct the k-nearest neighbour graph (k = 10); the three vectors
    # are filled in place
    t0 = timer()
    edges_from = tm.VectorUint()
    edges_to = tm.VectorUint()
    edge_weights = tm.VectorFloat()
    forest.get_knn_graph(edges_from, edges_to, edge_weights, 10)
    print(f"The kNN search took {(timer() - t0) * 1000}ms.")
def test_lf_layout(self):
    """Check the number of coordinates and edges of a layout for 100 points."""
    random.seed(42)
    # 100 rows of 10 random indices in [0, 20]; the draw order matches a
    # nested append loop, so the seeded values are the same.
    data = [
        tm.VectorUint([random.randint(0, 20) for _ in range(10)])
        for _ in range(100)
    ]

    minhash = tm.Minhash()
    forest = tm.LSHForest()
    forest.batch_add(minhash.batch_from_sparse_binary_array(data))
    forest.index()

    layout = tm.layout_from_lsh_forest(forest)
    x, _y, s, _t, _gp = layout

    # One coordinate per point, and one edge fewer than there are points
    assert len(x) == 100
    assert len(s) == 99
def test_query(self):
    """Check forest size and the first two linear-scan results for entry 0."""
    random.seed(42)
    # 100 rows of 10 random indices in [0, 20]; the draw order matches a
    # nested append loop, so the seeded values are the same.
    data = [
        tm.VectorUint([random.randint(0, 20) for _ in range(10)])
        for _ in range(100)
    ]

    minhash = tm.Minhash()
    forest = tm.LSHForest()
    forest.batch_add(minhash.batch_from_sparse_binary_array(data))
    forest.index()

    assert forest.size() == len(data)

    results = forest.query_linear_scan_by_id(0, 10)
    # The second element of each result is compared against the expected id
    first_hit, second_hit = results[0], results[1]
    assert first_hit[1] == 0
    assert second_hit[1] == 26
def LSH_Convert(mols, outpath, num_workers):
    """Fingerprint molecules, index them in an LSH forest and save both to disk.

    Arguments:
        mols -- sized collection of molecules passed to ``encode_mol``
        outpath -- directory receiving ``fps.pickle`` and ``lf.dat``
        num_workers -- ``max_workers`` value handed to ``process_map``

    Returns:
        the indexed ``tm.LSHForest``
    """
    # MinHash fingerprint (mhfp) encoder and the LSH forest holding the result
    encoder = MHFPEncoder(1024)
    forest = tm.LSHForest(1024, 64)

    print("Number of mols to be hashed:", len(mols))
    fps = process_map(
        encoder.encode_mol,
        mols,
        chunksize=100,
        max_workers=num_workers,
    )

    forest.batch_add([tm.VectorUint(fp) for fp in fps])
    forest.index()

    # Persist the raw fingerprints first, then the forest itself
    fps_path = os.path.join(outpath, "fps.pickle")
    with open(fps_path, "wb") as fpfile:
        pickle.dump(fps, fpfile)

    forest_path = os.path.join(outpath, "lf.dat")
    forest.store(forest_path)
    print('LSH data files saved!')
    return forest
def main():
    """ Build and plot a tmap of the papers found in papers.tar.xz """
    df = pd.read_csv("papers.tar.xz")
    df.drop(df.tail(1).index, inplace=True)
    df["title"] = df["title"].apply(lambda title: title.replace("'", '"'))

    # Tokenise every paper: keep letter/hyphen runs, lower-case them and
    # drop tokens of two characters or fewer
    word_counts = Counter()
    documents = []
    for paper_text in df["paper_text"]:
        cleaned = re.sub(r"[^a-zA-Z-]+", " ", paper_text)
        tokens = [tok.lower() for tok in cleaned.split(" ") if len(tok) > 2]
        word_counts.update(tokens)
        documents.append(tokens)

    # Remove the top n words: the reversed slice walks the frequency-sorted
    # list from its least common end
    n = 6000
    ranked = word_counts.most_common()
    kept = ranked[: -(len(word_counts) - n) - 1 : -1]

    # Make it fast using a lookup map from word to fingerprint index
    vocabulary = {word: idx for idx, (word, _) in enumerate(kept)}

    # Flag the documents containing the word "deep" and create the
    # fingerprints from the indices of the known words
    has_word = [1 if "deep" in tokens else 0 for tokens in documents]
    fingerprints = [
        tm.VectorUint([vocabulary[tok] for tok in tokens if tok in vocabulary])
        for tokens in documents
    ]

    # Index the article fingerprints
    encoder = tm.Minhash()
    forest = tm.LSHForest()
    forest.batch_add(encoder.batch_from_sparse_binary_array(fingerprints))
    forest.index()

    # Create the tmap
    config = tm.LayoutConfiguration()
    config.k = 100
    x, y, s, t, _ = tm.layout_from_lsh_forest(forest, config=config)

    faerun = Faerun(
        view="front",
        coords=False,
        legend_title="",
        legend_number_format="{:.0f}",
    )

    titles = df["title"]

    # This scatter uses a bigger point scale than the year scatter added
    # after it, to add colored circles.
    faerun.add_scatter(
        "NIPS_word",
        {"x": x, "y": y, "c": has_word, "labels": titles},
        colormap="Set1",
        point_scale=7.5,
        max_point_size=25,
        shader="smoothCircle",
        has_legend=True,
        categorical=True,
        legend_title="Contains word<br/>'deep'",
        legend_labels=[(0, "No"), (1, "Yes")],
        interactive=False,
    )

    # Add a scatter that is colored by year on top
    faerun.add_scatter(
        "NIPS",
        {"x": x, "y": y, "c": df["year"], "labels": titles},
        colormap="gray",
        point_scale=5.0,
        max_point_size=20,
        shader="smoothCircle",
        has_legend=True,
        legend_title="Year of<br/>Publication",
    )

    faerun.add_tree(
        "NIPS_tree",
        {"from": s, "to": t},
        point_helper="NIPS",
        color="#666666",
    )

    faerun.plot("nips_papers")
import pickle
import random
import sys

import numpy as np
import tmap as tm

# Script: read the fingerprint file named by the first command line argument,
# store a line-number -> [field0, field1, field2] dictionary next to it and
# build and store an LSH forest from the fingerprints.
#
# Each input line is split on single spaces; the fourth field holds the
# fingerprint as ';'-separated integers.

fps = []
findCID = {}

with open(sys.argv[1]) as inFile:
    for i, line in enumerate(inFile):
        line = line.strip()
        line = line.split(' ')
        findCID[i] = [line[0], line[1], line[2]]
        fp = tm.VectorUint(np.array(list(map(int, line[3].split(';')))))
        fps.append(fp)

# Write the dictionary inside a context manager so the file handle is closed
# (and the data flushed) deterministically rather than left to the garbage
# collector.
with open('{}_dictionary'.format(sys.argv[1]), 'wb') as dict_file:
    pickle.dump(findCID, dict_file)

lf = tm.LSHForest(512, 32)
lf.batch_add(fps)
lf.index()
lf.store('{}_LSHforest'.format(sys.argv[1]))
def main():
    """ Build and plot a tmap of the molecules listed in drugbank.csv.

    Reads the CSV, fingerprints every accepted molecule, computes a set of
    descriptors per molecule, lays the fingerprints out with tmap and writes
    the "drugbank" faerun plot.
    """
    df = pd.read_csv("drugbank.csv").dropna(subset=["SMILES"]).reset_index(
        drop=True)
    enc = MHFPEncoder()
    lf = tm.LSHForest(2048, 128)

    fps = []
    labels = []
    groups = []
    tpsa = []
    logp = []
    mw = []
    h_acceptors = []
    h_donors = []
    ring_count = []
    is_lipinski = []
    has_coc = []
    has_sa = []
    has_tz = []

    substruct_coc = AllChem.MolFromSmiles("COC")
    substruct_sa = AllChem.MolFromSmiles("NS(=O)=O")
    substruct_tz = AllChem.MolFromSmiles("N1N=NN=C1")

    total = len(df)
    for i, row in df.iterrows():
        if i % 1000 == 0 and i > 0:
            print(f"{round(100 * (i / total))}% done ...")

        # The columns are addressed by position. `.iloc` makes that explicit;
        # plain `row[6]` on a label-indexed Series relies on a positional
        # fallback that pandas has deprecated.
        smiles = row.iloc[6]
        mol = AllChem.MolFromSmiles(smiles)

        # Skip unparsable molecules, molecules with 5 atoms or fewer and
        # SMILES containing two or more '.' separators
        if mol and mol.GetNumAtoms() > 5 and smiles.count(".") < 2:
            drug_id = row.iloc[0]
            drug_name = row.iloc[1]

            fps.append(tm.VectorUint(enc.encode_mol(mol, min_radius=0)))
            labels.append(
                f'{smiles}__<a href="https://www.drugbank.ca/drugs/{drug_id}" target="_blank">{drug_id}</a>__{drug_name}'
                .replace("'", ""))
            # Only the first of the ';'-separated groups is used
            groups.append(row.iloc[3].split(";")[0])
            tpsa.append(Descriptors.TPSA(mol))
            logp.append(Descriptors.MolLogP(mol))
            mw.append(Descriptors.MolWt(mol))
            h_acceptors.append(Descriptors.NumHAcceptors(mol))
            h_donors.append(Descriptors.NumHDonors(mol))
            ring_count.append(Descriptors.RingCount(mol))
            is_lipinski.append(lipinski_pass(mol))
            has_coc.append(mol.HasSubstructMatch(substruct_coc))
            has_sa.append(mol.HasSubstructMatch(substruct_sa))
            has_tz.append(mol.HasSubstructMatch(substruct_tz))

    # Create the labels and the integer encoded array for the groups,
    # as they're categorical
    labels_groups, groups = Faerun.create_categories(groups)

    # Rank-transform the continuous descriptors and scale by the sample count
    tpsa_ranked = ss.rankdata(np.array(tpsa) / max(tpsa)) / len(tpsa)
    logp_ranked = ss.rankdata(np.array(logp) / max(logp)) / len(logp)
    mw_ranked = ss.rankdata(np.array(mw) / max(mw)) / len(mw)
    h_acceptors_ranked = ss.rankdata(
        np.array(h_acceptors) / max(h_acceptors)) / len(h_acceptors)
    h_donors_ranked = ss.rankdata(
        np.array(h_donors) / max(h_donors)) / len(h_donors)
    ring_count_ranked = ss.rankdata(
        np.array(ring_count) / max(ring_count)) / len(ring_count)

    lf.batch_add(fps)
    lf.index()

    cfg = tm.LayoutConfiguration()
    cfg.k = 100
    cfg.sl_repeats = 2
    cfg.mmm_repeats = 2
    cfg.node_size = 2
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, config=cfg)

    # Seven-colour map used for the group series
    custom_cmap = ListedColormap(
        [
            "#2ecc71", "#9b59b6", "#ecf0f1", "#e74c3c", "#e67e22", "#f1c40f",
            "#95a5a6"
        ],
        name="custom",
    )

    # Two-colour map used for the yes/no series
    bin_cmap = ListedColormap(["#e74c3c", "#2ecc71"], name="bin_cmap")

    f = Faerun(
        clear_color="#222222",
        coords=False,
        view="front",
        impress=
        'made with <a href="http://tmap.gdb.tools" target="_blank">tmap</a><br />and <a href="https://github.com/reymond-group/faerun-python" target="_blank">faerun</a><br /><a href="https://gist.github.com/daenuprobst/5cddd0159c0cf4758fb16b4b4acbef89">source</a>',
    )

    f.add_scatter(
        "Drugbank",
        {
            "x": x,
            "y": y,
            "c": [
                groups,
                is_lipinski,
                has_coc,
                has_sa,
                has_tz,
                tpsa_ranked,
                logp_ranked,
                mw_ranked,
                h_acceptors_ranked,
                h_donors_ranked,
                ring_count_ranked,
            ],
            "labels": labels,
        },
        shader="smoothCircle",
        colormap=[
            custom_cmap,
            bin_cmap,
            bin_cmap,
            bin_cmap,
            bin_cmap,
            "viridis",
            "viridis",
            "viridis",
            "viridis",
            "viridis",
            "viridis",
        ],
        point_scale=2.5,
        # One flag per series: five categorical series followed by six
        # continuous ones. The list previously had only ten entries for the
        # eleven series; the last flag is now spelled out explicitly.
        categorical=[
            True, True, True, True, True,
            False, False, False, False, False, False
        ],
        has_legend=True,
        legend_labels=[
            labels_groups,
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
        ],
        selected_labels=["SMILES", "Drugbank ID", "Name"],
        series_title=[
            "Group",
            "Lipinski",
            "Ethers",
            "Sulfonamides",
            "Tetrazoles",
            "TPSA",
            "logP",
            "Mol Weight",
            "H Acceptors",
            "H Donors",
            "Ring Count",
        ],
        # The legends of the ranked series show the raw (unranked) extremes
        max_legend_label=[
            None,
            None,
            None,
            None,
            None,
            str(round(max(tpsa))),
            str(round(max(logp))),
            str(round(max(mw))),
            str(round(max(h_acceptors))),
            str(round(max(h_donors))),
            str(round(max(ring_count))),
        ],
        min_legend_label=[
            None,
            None,
            None,
            None,
            None,
            str(round(min(tpsa))),
            str(round(min(logp))),
            str(round(min(mw))),
            str(round(min(h_acceptors))),
            str(round(min(h_donors))),
            str(round(min(ring_count))),
        ],
        title_index=2,
        legend_title="",
    )

    f.add_tree("drugbanktree", {"from": s, "to": t}, point_helper="Drugbank")
    f.plot("drugbank", template="smiles")
def main():
    """ Lay out the DATA fingerprints with tmap and plot them with faerun. """
    # Initialize and configure tmap; encoder and forest share the same size
    dims = 2048
    encoder = tm.Minhash(dims)
    forest = tm.LSHForest(dims, 128, store=True)

    fingerprints = [tm.VectorUint(list(row)) for row in DATA]
    forest.batch_add(encoder.batch_from_sparse_binary_array(fingerprints))
    forest.index()

    layout = tm.layout_from_lsh_forest(forest, CFG_TMAP)
    x_tmap, y_tmap, s, t, _ = layout
    # The forest is cleared as soon as the layout has been computed
    forest.clear()

    # Prepare custom color map: a dark gray followed by the first five
    # tab10 colors
    tab10 = plt.get_cmap("tab10").colors
    colors_gray = [(0.2, 0.2, 0.2)]
    colors_gray.extend(tab10[idx] for idx in range(5))
    custom_cm_gray = LinearSegmentedColormap.from_list(
        "custom_cm_gray", colors_gray, N=len(colors_gray))

    legend_labels = [
        (1, "Rudyard Kipling"),
        (2, "Herbert George Wells"),
        (3, "Charles Darwin"),
        (4, "George Bernard Shaw"),
        (5, "William Wymark Jacobs"),
        (0, "Other"),
    ]

    faerun = Faerun(
        clear_color="#111111",
        view="front",
        coords=False,
        alpha_blending=True,
        legend_title="",
    )

    scatter_data = {
        "x": x_tmap,
        "y": y_tmap,
        "c": LABELS,
        "labels": FAERUN_LABELS,
    }
    faerun.add_scatter(
        "gutenberg",
        scatter_data,
        colormap=custom_cm_gray,
        point_scale=4.2,
        max_point_size=10,
        has_legend=True,
        categorical=True,
        legend_title="Authors",
        legend_labels=legend_labels,
        shader="smoothCircle",
        selected_labels=["Author", "Title"],
    )

    tree_data = {"from": s, "to": t}
    faerun.add_tree(
        "gutenberg_tree",
        tree_data,
        point_helper="gutenberg",
        color="#222222",
    )

    faerun.plot("gutenberg", template="default")