def test_workflow(self):
    for backend in (Annoy, BruteForcePurePython, Sklearn):
        sim = self.make_sim(backend)
        self.workflow(sim)
        sim.save(opj(self.tmpdir, 'neighbortest'))
        sim2 = SimpleNeighbors.load(opj(self.tmpdir, 'neighbortest'))
        self.workflow(sim2)
import pickle
import random

import numpy as np
from simpleneighbors import SimpleNeighbors


class SemanticSimilarityChatbot:
    def __init__(self, nlp, dims):
        self.nns = SimpleNeighbors(dims)
        self.id_pairs = {}
        self.vocab = []
        self.dims = dims
        self.nlp = nlp

    def add_to_vocab(self, item):
        cur_id = len(self.vocab)
        self.vocab.append(item)
        return cur_id

    def add_pair(self, first, second):
        first_id = self.add_to_vocab(first)
        second_id = self.add_to_vocab(second)
        self.id_pairs[first_id] = second_id
        vec = self.vectorize(first)
        self.nns.add_one(first_id, vec)

    def vectorize(self, s):
        if s == "":
            s = " "
        doc = self.nlp(s, disable=['tagger', 'parser'])
        mean = np.mean(np.array([w.vector for w in doc]), axis=0)
        return mean

    def build(self, n=50):
        self.nns.build(n)

    def response_for(self, s, n=10):
        vec = self.vectorize(s)
        nearest_ids = self.nns.nearest(vec, n)
        picked = random.choice(nearest_ids)
        return self.vocab[self.id_pairs[picked]]

    def save(self, prefix):
        data = {
            'id_pairs': self.id_pairs,
            'vocab': self.vocab,
            'dims': self.dims
        }
        with open(prefix + "-chatbot.pkl", "wb") as fh:
            pickle.dump(data, fh)
        self.nns.save(prefix)

    @classmethod
    def load(cls, prefix, nlp):
        with open(prefix + "-chatbot.pkl", "rb") as fh:
            data = pickle.load(fh)
        newobj = cls(nlp, data['dims'])
        newobj.id_pairs = data['id_pairs']
        newobj.vocab = data['vocab']
        newobj.nns = SimpleNeighbors.load(prefix)
        return newobj
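# Usage sketch for the class above (illustrative; the spaCy model name and the
# toy `pairs` list are assumptions, not part of the original code):
import spacy

nlp = spacy.load('en_core_web_lg')  # any spaCy model with word vectors
bot = SemanticSimilarityChatbot(nlp, 300)
pairs = [("how are you?", "fine, thanks"),
         ("what time is it?", "almost noon")]
for prompt, reply in pairs:
    bot.add_pair(prompt, reply)
bot.build()
print(bot.response_for("how's it going?"))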
def init_nlp(**kwargs):
    global nlp, vocab_forest
    nlp = nlp or spacy.load(kwargs.get('model', 'en_vectors_web_lg'))
    # stop words from spacy.en:
    stop_words = [
        'other', 'she', 'alone', 'hers', 'enough', 'becoming', 'amount',
        'himself', 'such', 'sometime', 'noone', 'though', 'thereupon',
        'wherever', 'will', 'now', 'therefore', 'forty', 'name', 'whom',
        'often', 'unless', 'this', 'whether', 'nothing', 'well', 'along',
        'from', 'on', 'should', 'hundred', 'much', 'seems', 'wherein',
        'beyond', 'used', 'you', 'except', 'so', 'top', 'even', 'without',
        'give', 'and', 'whoever', 'about', 'nor', 'which', 'together', 'an',
        'everyone', 'below', 'itself', 'doing', 'mostly', 'many', 'else',
        'already', 'elsewhere', 'whereupon', 'were', 'using', 'until',
        'mine', 'made', 'nobody', 'some', 'down', 'toward', 'with', 'out',
        'has', 'although', 'their', 'sixty', 'somehow', 'full', 'next',
        'between', 'by', 'yourselves', 'throughout', 'few', 'own',
        'hereafter', 'up', 'done', 'indeed', 'anywhere', 'then', 'latter',
        'our', 'same', 'over', 're', 'not', 'regarding', 'nowhere', 'really',
        'former', 'any', 'through', 'they', 'whole', 'becomes', 'around',
        'yet', 'less', 'is', 'these', 'whatever', 'otherwise', 'as',
        'anything', 'among', 'have', 'however', 'go', 'afterwards', 'since',
        'still', 'can', 'beforehand', 'everywhere', 'why', 'seem', 'because',
        'last', 'due', 'had', 'get', 'while', 'all', 'him', 'who', 'most',
        'to', 'only', 'serious', 'meanwhile', 'are', 'show', 'several', 'at',
        'might', 'onto', 'anyone', 'her', 'hereby', 'seemed', 'am', 'again',
        'move', 'therein', 'than', 'did', 'very', 'it', 'anyhow', 'both',
        'please', 'i', 'make', 'more', 'no', 'off', 'various', 'been',
        'thereby', 'against', 'whence', 'third', 'there', 'ever',
        'sometimes', 'every', 'take', 'we', 'say', 'each', 'also', 'what',
        'me', 'us', 'anyway', 'none', 'per', 'thru', 'his', 'moreover', 'a',
        'perhaps', 'how', 'yours', 'besides', 'whenever', 'empty', 'least',
        'under', 'he', 'back', 'myself', 'namely', 'first', 'herself',
        'into', 'someone', 'quite', 'never', 'always', 'here', 'via',
        'cannot', 'must', 'ca', 'would', 'nevertheless', 'above', 'front',
        'part', 'became', 'yourself', 'after', 'everything', 'your',
        'somewhere', 'before', 'too', 'the', 'those', 'once', 'does', 'do',
        'towards', 'could', 'keep', 'them', 'for', 'twenty', 'something',
        'but', 'my', 'see', 'that', 'in', 'others', 'side', 'of', 'further',
        'during', 'upon', 'behind', 'become', 'almost', 'whose', 'another',
        'its', 'within', 'thereafter', 'bottom', 'whereas', 'when',
        'seeming', 'just', 'either', 'put', 'or', 'call', 'being', 'be',
        'fifty', 'beside', 'across', 'may', 'whereby', 'neither', 'was',
        'rather', 'if', 'formerly', 'amongst', 'where', 'thus', 'ourselves',
        'themselves', 'hence', 'ours'
    ]
    # custom stop words:
    stop_words += ["'s", 'St.', 'tabu']
    for stop_word in stop_words:
        nlp.vocab[stop_word].is_stop = True
    print("loading up the prepared Annoy object...")
    vocab_forest = SimpleNeighbors.load('vocab_forest')
    return nlp
def create_bert_embeddings(stories):
    sentences = []
    embedding_dimensions = 768
    single_index = SimpleNeighbors(embedding_dimensions)
    for story in stories.values():
        sentence = TreebankWordDetokenizer().detokenize(story['story'][0])
        sentences.append(sentence)
    sbert_model = SentenceTransformer('stsb-roberta-base')
    sentence_embeddings = sbert_model.encode(sentences, show_progress_bar=True)
    for embedding, key in zip(sentence_embeddings, stories.values()):
        single_index.add_one(key['movie_id'], embedding)
    return single_index
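# Query sketch (illustrative; the query string and tree count are
# assumptions). Note the returned index still needs build() before nearest():
index = create_bert_embeddings(stories)
index.build(n=40)
query_vec = SentenceTransformer('stsb-roberta-base').encode(["a heist gone wrong"])[0]
print(index.nearest(query_vec, 5))  # nearest movie_ids to the query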
def create_doc2vec_embeddings(stories):
    sentences = []
    tags = {}
    embedding_dimensions = 160
    single_index = SimpleNeighbors(embedding_dimensions)
    for i, story in enumerate(stories.values()):
        dfs_doc = TaggedDocument(words=story['story'][0], tags=[story['story'][1][0]])
        sentences.append(dfs_doc)
        tags[i] = story['story'][1][0]
    model = NEBULA_DOC_MODEL(dimensions=embedding_dimensions, epochs=400)
    model.fit(sentences, tags)
    print(len(sentences), " ", len(tags))
    sentence_embeddings = model._get_embeddings(tags)
    for embedding, key in zip(sentence_embeddings, stories.values()):
        single_index.add_one(key['movie_id'], embedding)
    return single_index
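# Illustrative follow-up (an assumption: `some_movie_id` is one of the keys
# added above). As with the BERT variant, build() must run before queries:
idx = create_doc2vec_embeddings(stories)
idx.build(n=40)
print(idx.neighbors(some_movie_id, 10))  # movies near an indexed movie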
def load_doc2vec_embeddings(stories):
    sentences = []
    tags = {}
    embedding_dimensions = 160
    single_index = SimpleNeighbors(embedding_dimensions)
    for i, story in enumerate(stories.values()):
        dfs_doc = TaggedDocument(words=story['story'][0], tags=[story['story'][1][0]])
        sentences.append(dfs_doc)
        tags[i] = story['story'][1][0]
    model = Doc2Vec.load("nebula_model_doc.dat")
    # sentence_embeddings = np.array([model.docvecs[tags[i]] for i, _ in enumerate(documents)])
    print(len(sentences), " ", len(tags))
    sentence_embeddings = _get_embeddings(model, tags)
    for embedding, key in zip(sentence_embeddings, stories.values()):
        single_index.add_one(key['movie_id'], embedding)
    return single_index
def getLookUp(words):
    lookUp = SimpleNeighbors(numDimensions())
    for w in words:
        # .corpus on the lookup lets us check whether the word was already added
        if w.text.lower() not in lookUp.corpus:
            lookUp.add_one(w.text.lower(), w.vector)
    lookUp.build()
    return lookUp
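# Usage sketch (illustrative; assumes a spaCy `nlp` with word vectors and
# that numDimensions() returns that model's vector width):
doc = nlp("The quick brown fox jumps over the lazy dog")
lookUp = getLookUp([w for w in doc if w.has_vector])
print(lookUp.nearest(nlp("fast")[0].vector, 5))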
def benchmark(n=10000, dims=300, query_count=10, metric='angular'):
    import numpy as np
    from time import time
    data = np.random.randn(n, dims)
    for backend in available():
        start = time()
        print("benchmarking", backend, "at", start)
        sim = SimpleNeighbors(dims, metric, backend=backend)
        labels = list(range(n))
        print("feeding data")
        sim.feed(zip(labels, data))
        print("building index")
        sim.build(50)
        to_build = time()
        print("querying")
        for i in range(query_count):
            sim.nearest(np.random.randn(dims))
        nearest_query = time()
        # query time is measured from the end of the build, not from start
        print(backend,
              "%0.2f sec to build, %0.2f sec to query %d items" %
              (to_build - start, nearest_query - to_build, query_count))
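# Quick smoke run of the benchmark (parameter values are illustrative):
benchmark(n=2000, dims=64, query_count=5)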
def get_vectors_and_build_index(collection, num_index_trees=40):
    items = list(
        collection.find({"vector": {"$exists": 1}},
                        projection={"text": 1, "vector": 1}))
    sentences = [it['text'] for it in items]
    embeddings = np.array([it['vector'] for it in items])
    embedding_dimensions = embeddings.shape[1]
    print(f'\nAdding {len(embeddings)} embeddings to index')
    index = SimpleNeighbors(embedding_dimensions, metric='dot')
    for i in trange(embeddings.shape[0]):
        index.add_one(sentences[i], embeddings[i])
    print(f'Building index with {num_index_trees} trees...')
    index.build(n=num_index_trees)
    return index
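# Query sketch (illustrative; assumes `vectorize` is the same embedding
# function that produced the stored "vector" fields, and that `collection`
# is a pymongo collection):
index = get_vectors_and_build_index(collection)
query_vec = vectorize(["some query text"])[0]
for text in index.nearest(query_vec, 5):
    print(text)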
def make_sim(self, backend=None):
    sim = SimpleNeighbors(3, metric='angular', backend=backend)
    sim.feed(data)
    sim.add_one(*one_more)
    sim.build(20)
    return sim
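# The module-level test fixtures `data` and `one_more` are not shown in the
# snippet; a plausible shape (an assumption, for illustration only):
data = [('a', (4, 5, 6)), ('b', (1, 2, 3)), ('c', (6, 5, 4))]
one_more = ('d', (3, 2, 1))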
def process_trends(collection_in, query, collection_out):
    cur = collection_in.find(query, projection={"articles": 1})
    for item in tqdm(cur, total=collection_in.count_documents(query)):
        articles = item['articles']
        trend_snippet = max(articles, key=lambda x: len(x['snippet']))['snippet']
        trend_snippet = clean_text(trend_snippet)
        vector = vectorize([clean_text(trend_snippet)])[0]
        search_results, dists = query_trend_text(clean_text(trend_snippet), index)
        post_ids = []
        for search_result in search_results:
            post_id_res = db.raw_posts.find_one({"text": search_result})
            if post_id_res:
                post_id = post_id_res['id']
                post_ids.append(post_id)
        if len(post_ids) > 5:
            collection_out.insert_one(
                {"trend_snippet": trend_snippet,
                 "post_ids": post_ids,
                 "result_texts": search_results,
                 "dists": dists})
            collection_in.update_one({"_id": item['_id']},
                                     {"$set": {"processed": 1}},
                                     upsert=True)
        else:
            print(len(post_ids))


if __name__ == "__main__":
    db = MongoClient(host=mongo_host, port=mongo_port,
                     username=mongo_user, password=mongo_pass)[mongo_db]
    index = SimpleNeighbors.load(index_prefix)
    query = {"processed": {"$exists": 0}}
    process_trends(db.raw_trends, query, db.trends)
def main():
    # three positional arguments are required, so argv must have length >= 4
    if len(sys.argv) < 4:
        print("Usage:", sys.argv[0],
              "db_name algo <NEBULA_DOC/NEBULA_WORD/NEBULA_INDEX/NEBULA_MIX> movie_id")
        exit()
    db_name = sys.argv[1]
    algo = sys.argv[2]
    movie_id = sys.argv[3]
    db = connect_db(db_name)
    if algo == "NEBULA_DOC" or algo == "NEBULA_WORD":
        f_vec = get_requested_movie(db, movie_id, algo)
        embeddings = get_embeddings_from_db(db, algo)
        nebula_check_distance(embeddings, f_vec, algo)
    if algo == "NEBULA_INDEX":
        num_index_trees = 512
        embedding_dimensions = 100
        single_index = SimpleNeighbors(embedding_dimensions)
        for _algo in ["NEBULA_DOC"]:
            embeddings = get_embeddings_from_db(db, _algo)
            for embedding in embeddings.values():
                movie = embedding['movie_id']
                vec = embedding['embeddings']
                single_index.add_one(movie + "_" + _algo, vec)
            print("Index for: " + _algo + " is added" + " index size: ",
                  len(embeddings))
    if algo == "NEBULA_MIX":
        num_index_trees = 512
        embedding_dimensions = 100
        mix_index = SimpleNeighbors(embedding_dimensions * 2)
        doc_embeddings = get_embeddings_from_db(db, "NEBULA_DOC")
        #kmeans_clusters(doc_embeddings)
        word_embeddings = get_embeddings_from_db(db, "NEBULA_WORD")
        #kmeans_clusters(word_embeddings)
        for doc, word in zip(doc_embeddings.values(), word_embeddings.values()):
            movie = doc['movie_id']
            # interleave doc and word embeddings into a single 200-dim vector
            vec = []
            for d, w in zip(doc['embeddings'], word['embeddings']):
                vec.append(d)
                vec.append(w)
            mix_index.add_one(movie, vec)
        print("Mixed Index is added, index size: ",
              str(len(word_embeddings)) + " " + str(len(doc_embeddings)))
    if algo == "NEBULA_INDEX":
        print('Building multi-algo index with {} trees...'.format(num_index_trees))
        single_index.build(n=num_index_trees)
        single_index.save("nebula_index_single")
        for _algo in ["NEBULA_DOC"]:
            _key = movie_id + "_" + _algo
            sims = single_index.neighbors(_key, n=20)
            print("----------------------")
            print("Top 20 " + _algo + " Positive Cosines for Movie: " + movie_id)
            for sim in sims:
                print(sim.split("_" + _algo)[0])
            print("----------------------")
        while True:
            mv = input("Enter movie id: ")
            for _algo in ["NEBULA_DOC"]:
                _key = mv + "_" + _algo
                sims = single_index.neighbors(_key, n=20)
                print("----------------------")
                print("Top 20 " + _algo + " Positive Cosines for Movie: " + mv)
                for sim in sims:
                    print(sim.split("_" + _algo)[0])
                print("----------------------")
    if algo == "NEBULA_MIX":
        print('Building mixed-algo index with {} trees...'.format(num_index_trees))
        mix_index.build(n=num_index_trees)
        mix_index.save("nebula_index_mix_")
        _key = movie_id
        sims = mix_index.neighbors(_key, n=20)
        print("----------------------")
        print("Top 20 MIX Positive Cosines for Movie: " + movie_id)
        for sim in sims:
            print(sim)
        print("----------------------")
        while True:
            mv = input("Enter movie id: ")
            _key = mv
            sims = mix_index.neighbors(_key, n=20)
            print("----------------------")
            print("Top 20 MIX Positive Cosines for Movie: " + mv)
            for sim in sims:
                print(sim)
            print("----------------------")
#!/usr/bin/env python3
import os, sys, random, json, re
import datetime

import wordfilter
import spacy
from simpleneighbors import SimpleNeighbors
from random import choice, sample

global nlp, vocab_forest, all_motifs
nlp = None
vocab_forest = SimpleNeighbors(300)
all_motifs = None


def populate_motifs(infile="motifs.txt"):
    global all_motifs
    with open(infile) as f:
        all_motifs = list(l.strip() for l in f.readlines())
    return all_motifs
# sanity check on the sentence vector shape (notebook cell)
sentence_mean(nlp, "This... is a test.").shape


def sentence_vector_trnf(embedder, s):
    if s == "":
        s = " "
    sentence_embedding = embedder.encode([s])
    return sentence_embedding[0]


from simpleneighbors import SimpleNeighbors

response_sample = random.sample(list(responses.keys()), 10000)

# Using GLOVE
nns = SimpleNeighbors(300)
for i, line_id in enumerate(response_sample):
    # show progress
    if i % 1000 == 0:
        print(i, line_id, movie_lines[line_id])
    line_text = movie_lines[line_id]
    summary_vector = sentence_mean(nlp, line_text)
    if np.any(summary_vector):
        nns.add_one(line_id, summary_vector)
nns.build()

# Predictions Using Glove; only the last assignment below is actually queried
sentence = "how are you doing?"
sentence = "How's the weather?"
sentence = "Did you eat today?"
picked = nns.nearest(sentence_mean(nlp, sentence), 5)[0]
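# Retrieving the reply text (an assumption: `responses` maps a prompt line id
# to its reply line id, as the sampling above implies):
print(movie_lines[responses[picked]])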
import numpy as np
import spacy

from paths import *  # file handles (desire, eyes, memory, ...) come from paths.py
from simpleneighbors import SimpleNeighbors


def vec(s):
    return nlp.vocab[s].vector


def meanv(vecs):
    total = np.sum(vecs, axis=0)
    return total / len(vecs)


nlp = spacy.load('en_core_web_lg')

lookup = SimpleNeighbors(300)
for item in nlp.vocab:
    if item.has_vector and item.prob > -15 and item.is_lower:
        lookup.add_one(item.text, item.vector)
lookup.build()

desire_text = nlp(open(desire).read())
eyes_text = nlp(open(eyes).read())
memory_text = nlp(open(memory).read())
names_text = nlp(open(names).read())
signs_text = nlp(open(signs).read())
dead_text = nlp(open(dead).read())
sky_text = nlp(open(sky).read())
continuous_text = nlp(open(continuous).read())
hidden_text = nlp(open(hidden).read())
thin_text = nlp(open(thin).read())
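# Illustrative query (not in the original): vocabulary words near the average
# vector of one of the loaded texts.
desire_mean = meanv([w.vector for w in desire_text if w.has_vector])
print(lookup.nearest(desire_mean, 10))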
# all_words = list(nlp.vocab.strings)


def vec(word):
    return nlp(word, disable=["parser", "tagger", "ner"]).vector


print("getting embeddings...")
embeddings = [vec(w) for w in all_words]
print("getting done")

print("building simpleneighbors...")
lookup = SimpleNeighbors(300)
for v, w in zip(embeddings, all_words):
    lookup.add_one(w, v)
lookup.build()
print("building done")


def nearest_words(word, used_words):
    ws = [w for w in lookup.nearest(vec(word), 156) if w not in used_words][:5]
    used_words.extend(ws)
    return ws


def get_words(word):
    words = dict()