def test_workflow(self):
    for backend in (Annoy, BruteForcePurePython, Sklearn):
        sim = self.make_sim(backend)
        self.workflow(sim)
        sim.save(opj(self.tmpdir, 'neighbortest'))
        sim2 = SimpleNeighbors.load(opj(self.tmpdir, 'neighbortest'))
        self.workflow(sim2)
@classmethod
def load(cls, prefix, nlp):
    import pickle
    with open(prefix + "-chatbot.pkl", "rb") as fh:
        data = pickle.load(fh)
        newobj = cls(nlp, data['dims'])
        newobj.id_pairs = data['id_pairs']
        newobj.vocab = data['vocab']
        newobj.nns = SimpleNeighbors.load(prefix)
    return newobj
Example #3
import random

import numpy as np
from simpleneighbors import SimpleNeighbors


class SemanticSimilarityChatbot:
    
    def __init__(self, nlp, dims):
        self.nns = SimpleNeighbors(dims)
        self.id_pairs = {}
        self.vocab = []
        self.dims = dims
        self.nlp = nlp
        
    def add_to_vocab(self, item):
        cur_id = len(self.vocab)
        self.vocab.append(item)
        return cur_id
    
    def add_pair(self, first, second):
        first_id = self.add_to_vocab(first)
        second_id = self.add_to_vocab(second)
        self.id_pairs[first_id] = second_id
        vec = self.vectorize(first)
        self.nns.add_one(first_id, vec)
        
    def vectorize(self, s):
        if s == "":
            s = " "
        doc = self.nlp(s, disable=['tagger', 'parser'])
        mean = np.mean(np.array([w.vector for w in doc]), axis=0)
        return mean
    
    def build(self, n=50):
        self.nns.build(n)
        
    def response_for(self, s, n=10):
        vec = self.vectorize(s)
        nearest_ids = self.nns.nearest(vec, n)
        picked = random.choice(nearest_ids)
        return self.vocab[self.id_pairs[picked]]
    
    def save(self, prefix):
        import pickle
        data = {
            'id_pairs': self.id_pairs,
            'vocab': self.vocab,
            'dims': self.dims
        }
        with open(prefix + "-chatbot.pkl", "wb") as fh:
            pickle.dump(data, fh)
        self.nns.save(prefix)
        
    @classmethod
    def load(cls, prefix, nlp):
        import pickle
        with open(prefix + "-chatbot.pkl", "rb") as fh:
            data = pickle.load(fh)
            newobj = cls(nlp, data['dims'])
            newobj.id_pairs = data['id_pairs']
            newobj.vocab = data['vocab']
            newobj.nns = SimpleNeighbors.load(prefix)
        return newobj
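
A minimal usage sketch for the class above. The spaCy model name, the toy (prompt, reply) pairs, and the 300-dimension figure are assumptions, not part of the original example:

# Hypothetical end-to-end use of SemanticSimilarityChatbot
import spacy

nlp = spacy.load('en_core_web_lg')  # assumed model with 300-dim word vectors
bot = SemanticSimilarityChatbot(nlp, 300)

# toy (prompt, reply) pairs; a real corpus would supply many more
for prompt, reply in [("hi there", "hello!"), ("how was your day?", "pretty good, thanks")]:
    bot.add_pair(prompt, reply)

bot.build(50)                       # build the index before querying
print(bot.response_for("hello!"))   # reply paired with the nearest stored prompt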
Example #4
def init_nlp(**kwargs):
    global nlp, vocab_forest
    nlp = nlp or spacy.load(kwargs.get('model', 'en_vectors_web_lg'))

    # stop words from spacy.en:
    stop_words = [
        'other', 'she', 'alone', 'hers', 'enough', 'becoming', 'amount',
        'himself', 'such', 'sometime', 'noone', 'though', 'thereupon',
        'wherever', 'will', 'now', 'therefore', 'forty', 'name', 'whom',
        'often', 'unless', 'this', 'whether', 'nothing', 'well', 'along',
        'from', 'on', 'should', 'hundred', 'much', 'seems', 'wherein',
        'beyond', 'used', 'you', 'except', 'so', 'top', 'even', 'without',
        'give', 'and', 'whoever', 'about', 'nor', 'which', 'together', 'an',
        'everyone', 'below', 'itself', 'doing', 'mostly', 'many', 'else',
        'already', 'elsewhere', 'whereupon', 'were', 'using', 'until', 'mine',
        'made', 'nobody', 'some', 'down', 'toward', 'with', 'out', 'has',
        'although', 'their', 'sixty', 'somehow', 'full', 'next', 'between',
        'by', 'yourselves', 'throughout', 'few', 'own', 'hereafter', 'up',
        'done', 'indeed', 'anywhere', 'then', 'latter', 'our', 'same', 'over',
        're', 'not', 'regarding', 'nowhere', 'really', 'former', 'any',
        'through', 'they', 'whole', 'becomes', 'around', 'yet', 'less', 'is',
        'these', 'whatever', 'otherwise', 'as', 'anything', 'among', 'have',
        'however', 'go', 'afterwards', 'since', 'still', 'can', 'beforehand',
        'everywhere', 'why', 'seem', 'because', 'last', 'due', 'had', 'get',
        'while', 'all', 'him', 'who', 'most', 'to', 'only', 'serious',
        'meanwhile', 'are', 'show', 'several', 'at', 'might', 'onto', 'anyone',
        'her', 'hereby', 'seemed', 'am', 'again', 'move', 'therein', 'than',
        'did', 'very', 'it', 'anyhow', 'both', 'please', 'i', 'make', 'more',
        'no', 'off', 'various', 'been', 'thereby', 'against', 'whence',
        'third', 'there', 'ever', 'sometimes', 'every', 'take', 'we', 'say',
        'each', 'also', 'what', 'me', 'us', 'anyway', 'none', 'per', 'thru',
        'his', 'moreover', 'a', 'perhaps', 'how', 'yours', 'besides',
        'whenever', 'empty', 'least', 'under', 'he', 'back', 'myself',
        'namely', 'first', 'herself', 'into', 'someone', 'quite', 'never',
        'always', 'here', 'via', 'cannot', 'must', 'ca', 'would',
        'nevertheless', 'above', 'front', 'part', 'became', 'yourself',
        'after', 'everything', 'your', 'somewhere', 'before', 'too', 'the',
        'those', 'once', 'does', 'do', 'towards', 'could', 'keep', 'them',
        'for', 'twenty', 'something', 'but', 'my', 'see', 'that', 'in',
        'others', 'side', 'of', 'further', 'during', 'upon', 'behind',
        'become', 'almost', 'whose', 'another', 'its', 'within', 'thereafter',
        'bottom', 'whereas', 'when', 'seeming', 'just', 'either', 'put', 'or',
        'call', 'being', 'be', 'fifty', 'beside', 'across', 'may', 'whereby',
        'neither', 'was', 'rather', 'if', 'formerly', 'amongst', 'where',
        'thus', 'ourselves', 'themselves', 'hence', 'ours'
    ]
    # custom stop words:
    stop_words += ['\'s', 'St.', 'tabu']
    for stop_word in stop_words:
        nlp.vocab[stop_word].is_stop = True

    print("loading up the prepared Annoy object...")
    vocab_forest = SimpleNeighbors.load('vocab_forest')

    return nlp
Example #5
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sentence_transformers import SentenceTransformer
from simpleneighbors import SimpleNeighbors


def create_bert_embeddings(stories):
    sentences = []

    embedding_dimensions = 768
    single_index = SimpleNeighbors(embedding_dimensions)
    for story in stories.values():
        sentence = TreebankWordDetokenizer().detokenize(story['story'][0])
        #print(sentence)
        sentences.append(sentence)

    sbert_model = SentenceTransformer('stsb-roberta-base')
    sentence_embeddings = sbert_model.encode(sentences, show_progress_bar=True)
    for embedding, key in zip(sentence_embeddings, stories.values()):
        single_index.add_one(key['movie_id'], embedding)

        # print("Key: ", key['movie_id'])
        # print("Embedding: ", embedding)
        # print("Vector Len: ", len(embedding))
        # input("Press any key...")
    return (single_index)
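
A hedged follow-up showing how the index returned above might be queried. The `stories` layout is inferred from the function body, and the query sentence is made up:

# Hypothetical query against the returned index
index = create_bert_embeddings(stories)
index.build(50)  # the index must be built before nearest-neighbor queries

sbert_model = SentenceTransformer('stsb-roberta-base')
query_vec = sbert_model.encode(["two friends go on a road trip"])[0]
print(index.nearest(query_vec, 5))  # five closest movie_ids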
Example #6
def create_doc2vec_embeddings(stories):
    sentences = []
    tags = {}
    embedding_dimensions = 160
    single_index = SimpleNeighbors(embedding_dimensions)
    for i, story in enumerate(stories.values()):
        dfs_doc = TaggedDocument(words=story['story'][0],
                                 tags=[story['story'][1][0]])
        sentences.append(dfs_doc)
        tags[i] = story['story'][1][0]
        # print(story['movie_id'])
        # print(dfs_doc)
        # input("Press....")

    model = NEBULA_DOC_MODEL(dimensions=embedding_dimensions, epochs=400)
    model.fit(sentences, tags)
    print(len(sentences), " ", len(tags))
    sentence_embeddings = model._get_embeddings(tags)
    for embedding, key in zip(sentence_embeddings, stories.values()):
        single_index.add_one(key['movie_id'], embedding)
    return (single_index)
Example #7
def load_doc2vec_embeddings(stories):
    sentences = []
    tags = {}
    embedding_dimensions = 160
    single_index = SimpleNeighbors(embedding_dimensions)
    for i, story in enumerate(stories.values()):
        dfs_doc = TaggedDocument(words=story['story'][0],
                                 tags=[story['story'][1][0]])
        sentences.append(dfs_doc)
        tags[i] = story['story'][1][0]
        # print(story['movie_id'])
        # print(dfs_doc)
        # input("Press....")

    model = Doc2Vec.load("nebula_model_doc.dat")
    #sentence_embeddings = np.array([model.docvecs[tags[i]] for i, _ in enumerate(documents)])
    print(len(sentences), " ", len(tags))
    sentence_embeddings = _get_embeddings(model, tags)
    for embedding, key in zip(sentence_embeddings, stories.values()):
        single_index.add_one(key['movie_id'], embedding)
    return (single_index)
Example #8
def getLookUp(words):
    lookUp = SimpleNeighbors(numDimensions())
    for w in words:
        # .corpus of the lookUp lets us determine if the word has already been added
        if w.text.lower() not in lookUp.corpus:
            lookUp.add_one(w.text.lower(), w.vector)
    lookUp.build()
    return lookUp
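
A small usage sketch for getLookUp, assuming `words` is an iterable of spaCy tokens from a pipeline with word vectors and that the numDimensions() helper it relies on is defined elsewhere in the project. The model name and sample text are placeholders:

# Hypothetical call (model name and sentences are assumptions)
import spacy

nlp = spacy.load('en_core_web_lg')
lookUp = getLookUp(nlp("the quick brown fox jumps over the lazy dog"))
print(lookUp.nearest(nlp("cat")[0].vector, 5))  # five nearest lowercase words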
Example #9
def benchmark(n=10000, dims=300, query_count=10, metric='angular'):
    import numpy as np
    from time import time
    data = np.random.randn(n, dims)
    for backend in available():
        start = time()
        print("benchmarking", backend, "at", start)
        sim = SimpleNeighbors(dims, metric, backend=backend)
        labels = list(range(n))
        print("feeding data")
        sim.feed(zip(labels, data))
        print("building index")
        sim.build(50)
        to_build = time()
        print("querying")
        for i in range(query_count):
            sim.nearest(np.random.randn(dims))
        nearest_query = time()
        print(
            backend, "%0.2f sec to build, %0.2f sec to query %d items" %
            (to_build - start, nearest_query - to_build, query_count))
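
For reference, a minimal way to exercise the benchmark above; the sizes are arbitrary and smaller than the defaults:

# Run a quick benchmark across all available backends
if __name__ == "__main__":
    benchmark(n=2000, dims=100, query_count=5)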
Example #10
def get_vectors_and_build_index(collection, num_index_trees=40):
    items = list(
        collection.find({"vector": {
            "$exists": 1
        }},
                        projection={
                            "text": 1,
                            "vector": 1
                        }))
    sentences = [it['text'] for it in items]
    embeddings = np.array([it['vector'] for it in items])
    embedding_dimensions = embeddings.shape[1]

    print(f'\nAdding {len(embeddings)} embeddings to index')
    index = SimpleNeighbors(embedding_dimensions, metric='dot')
    for i in trange(embeddings.shape[0]):
        index.add_one(sentences[i], embeddings[i])
    print(f'Building index with {num_index_trees} trees...')
    index.build(n=num_index_trees)
    return index
def make_sim(self, backend=None):
    sim = SimpleNeighbors(3, metric='angular', backend=backend)
    sim.feed(data)
    sim.add_one(*one_more)
    sim.build(20)
    return sim
def process_trends(collection_in, query, collection_out):
    cur = collection_in.find(query, projection={"articles": 1})
    for item in tqdm(cur, total=collection_in.count_documents(query)):
        articles = item['articles']
        trend_snippet = max(articles, key=lambda x: len(x['snippet']))['snippet']
        trend_snippet = clean_text(trend_snippet)
        vector = vectorize([clean_text(trend_snippet)])[0]
        search_results, dists = query_trend_text(clean_text(trend_snippet), index)
        post_ids = []
        for search_result in search_results:
            post_id_res = db.raw_posts.find_one({"text": search_result})
            if post_id_res:
                post_id = post_id_res['id']
                post_ids.append(post_id)
        if len(post_ids) > 5:
            collection_out.insert_one(
                {"trend_snippet": trend_snippet, "post_ids": post_ids, "result_texts": search_results, "dists": dists}
            )
            collection_in.update_one({"_id": item['_id']}, {"$set": {"processed": 1}}, upsert=True)
        else:
            print(len(post_ids))

if __name__ == "__main__":
    db = MongoClient(host=mongo_host, port=mongo_port, username=mongo_user, password=mongo_pass)[mongo_db]
    index = SimpleNeighbors.load(index_prefix)
    
    query = {"processed": {"$exists": 0}}
    process_trends(db.raw_trends, query, db.trends)
    
    
Example #13
def main():
    if len(sys.argv) < 4:
        print("Usage:", sys.argv[0], "db_name algo <NEBULA_DOC/NEBULA_WORD/NEBULA_INDEX/NEBULA_MIX> movie_id")
        exit()
    db_name = sys.argv[1]
    movie_id = sys.argv[3]
    algo = sys.argv[2]
    db = connect_db(db_name)
    if algo == "NEBULA_DOC" or algo == "NEBULA_WORD":
        f_vec = get_requested_movie(db, movie_id, algo)
        embeddings = get_embeddings_from_db(db, algo)  
        nebula_check_distance(embeddings,  f_vec, algo)
    if algo == "NEBULA_INDEX":
        num_index_trees = 512
        embedding_dimensions = 100
        single_index = SimpleNeighbors(embedding_dimensions)
        for _algo in ["NEBULA_DOC"]:
            embeddings = get_embeddings_from_db(db, _algo)
            for embedding in embeddings.values():
                movie = embedding['movie_id'] 
                #_algo = embedding['algo']
                vec = embedding['embeddings']
                #print(movie, _algo)
                #annotated_sentence = '({}) {}'.format(language_name, language_to_sentences[language_code][i])
                single_index.add_one(movie + "_" + _algo, vec)
            print("Index for: " + _algo + " is added"  + " index size: ", len(embeddings))
    
    if algo == "NEBULA_MIX":
        num_index_trees = 512
        embedding_dimensions = 100
        mix_index = SimpleNeighbors(embedding_dimensions * 2)
        doc_embeddings =  get_embeddings_from_db(db, "NEBULA_DOC")
        #kmeans_clusters(doc_embeddings)
        word_embeddings =  get_embeddings_from_db(db, "NEBULA_WORD")
        #kmeans_clusters(word_embeddings)
        for doc, word in zip(doc_embeddings.values(), word_embeddings.values()):
            #print(doc)
            #print(word)
            movie = doc['movie_id'] 
            #_algo = embedding['algo']
            vec = []
            for d,w in zip(doc['embeddings'], word['embeddings']):
                vec.append(d)
                vec.append(w)
            #vec = doc['embeddings'] + word['embeddings']
            mix_index.add_one(movie, vec)
        print("Mixed Index is added, index size: ", str(len(word_embeddings)) + " " + str(len(doc_embeddings)))

    if algo == "NEBULA_INDEX":
        print('Building multi-algo index with {} trees...'.format(num_index_trees))
        single_index.build(n=num_index_trees) 
        single_index.save("nebula_index_single")
        for _algo in ["NEBULA_DOC"]:     
            _key = movie_id + "_" + _algo
            sims = single_index.neighbors(_key, n=20)
            print("----------------------") 
            print("Top 20 "+ _algo + " Positive Cosines for Movie: " + movie_id)
            #print(nb)
            for sim in sims:
                print(sim.split("_"+_algo)[0])
            print("----------------------")
        while (True):
            mv = input("Enter movie id: ")
            #al = input("Enter algo <NEBULA_DOC/NEBULA_WORD>: ")
            for _algo in ["NEBULA_DOC"]:     
                _key = mv + "_" + _algo
                sims = single_index.neighbors(_key, n=20)
                print("----------------------") 
                print("Top 10 "+ _algo + " Positive Cosines for Movie: " + mv)
                for sim in sims:
                    print(sim.split("_"+_algo)[0])
                    #print(sim)
                print("----------------------")  

    if algo == "NEBULA_MIX":
        print('Building mixed-algo index with {} trees...'.format(num_index_trees))
        mix_index.build(n=num_index_trees) 
        mix_index.save("nebula_index_mix_")           
        _key = movie_id
        sims = mix_index.neighbors(_key, n=20)
        print("----------------------") 
        print("Top 20 MIX" + " Positive Cosines for Movie: " + movie_id)
        #print(nb)
        for sim in sims:
            print(sim)
        print("----------------------")
        while (True):
            mv = input("Enter movie id: ")
            #al = input("Enter algo <NEBULA_DOC/NEBULA_WORD>: ")    
            _key = mv
            sims = mix_index.neighbors(_key, n=20)
            print("----------------------") 
            print("Top 20 MIX Positive Cosines for Movie: " + mv)
            for sim in sims:
                print(sim)
            print("----------------------")
Example #14
#!/usr/bin/env python3

import os, sys, random, json, re
import datetime
import wordfilter
import spacy
from simpleneighbors import SimpleNeighbors
from random import choice, sample

global nlp, vocab_forest, all_motifs
nlp = None
vocab_forest = SimpleNeighbors(300)
all_motifs = None


def populate_motifs(infile="motifs.txt"):
    global all_motifs
    with open(infile) as f:
        all_motifs = list(l.strip() for l in f.readlines())
    return all_motifs


def init_nlp(**kwargs):
    global nlp, vocab_forest
    nlp = nlp or spacy.load(kwargs.get('model', 'en_vectors_web_lg'))

    # stop words from spacy.en:
    stop_words = [
        'other', 'she', 'alone', 'hers', 'enough', 'becoming', 'amount',
        'himself', 'such', 'sometime', 'noone', 'though', 'thereupon',
        'wherever', 'will', 'now', 'therefore', 'forty', 'name', 'whom',
Example #15
def __init__(self, nlp, dims):
    self.nns = SimpleNeighbors(dims)
    self.id_pairs = {}
    self.vocab = []
    self.dims = dims
    self.nlp = nlp
Example #16
sentence_mean(nlp, "This... is a test.").shape


def sentence_vector_trnf(embedder, s):
    if s == "":
        s = " "
    sentence_embedding = embedder.encode([s])
    return sentence_embedding[0]


from simpleneighbors import SimpleNeighbors

response_sample = random.sample(list(responses.keys()), 10000)
#  Using GLOVE
nns = SimpleNeighbors(300)
for i, line_id in enumerate(response_sample):
    # show progress
    if i % 1000 == 0: print(i, line_id, movie_lines[line_id])
    line_text = movie_lines[line_id]
    summary_vector = sentence_mean(nlp, line_text)
    if np.any(summary_vector):
        nns.add_one(line_id, summary_vector)
nns.build()

#  Predictions Using Glove
sentence = "how are you doing?"
sentence = "How's the weather?"
sentence = "Did you eat today?"

picked = nns.nearest(sentence_mean(nlp, sentence), 5)[0]
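
`picked` holds the id of the movie line whose mean GloVe vector is closest to the query. A hedged follow-up, assuming (as in the surrounding notebook) that `responses` maps a line id to the id of the line that followed it in the corpus:

# Hypothetical: print the reply paired with the nearest line
print(movie_lines[responses[picked]])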
Example #17
import numpy as np
import spacy

from paths import *
from simpleneighbors import SimpleNeighbors


def vec(s):
    return nlp.vocab[s].vector


def meanv(vecs):
    total = np.sum(vecs, axis=0)
    return total / len(vecs)


nlp = spacy.load('en_core_web_lg')

lookup = SimpleNeighbors(300)
for item in nlp.vocab:
    if item.has_vector and item.prob > -15 and item.is_lower:
        lookup.add_one(item.text, item.vector)
lookup.build()

desire_text = nlp(open(desire).read())
eyes_text = nlp(open(eyes).read())
memory_text = nlp(open(memory).read())
names_text = nlp(open(names).read())
signs_text = nlp(open(signs).read())
dead_text = nlp(open(dead).read())
sky_text = nlp(open(sky).read())
continuous_text = nlp(open(continuous).read())
hidden_text = nlp(open(hidden).read())
thin_text = nlp(open(thin).read())
Example #18
# all_words = list(nlp.vocab.strings)


def vec(word):
    return nlp(word, disable=["parser", "tagger", "ner"]).vector


print("getting embeddings...")

embeddings = [vec(w) for w in all_words]

print("getting done")

print("building simpleneibors...")

lookup = SimpleNeighbors(300)
for v, w in zip(embeddings, all_words):
    lookup.add_one(w, v)
lookup.build()

print("building done")


def nearest_words(word, used_words):
    ws = [w for w in lookup.nearest(vec(word), 156) if w not in used_words][:5]
    used_words.extend(ws)
    return ws


def get_words(word):
    words = dict()