Example #1
from simpleneighbors import SimpleNeighbors


def getLookUp(words):
    lookUp = SimpleNeighbors(numDimensions())  # numDimensions() is defined elsewhere in the source project
    for w in words:
        # lookUp.corpus tracks what has already been added, so duplicates are skipped
        if w.text.lower() not in lookUp.corpus:
            lookUp.add_one(w.text.lower(), w.vector)
    lookUp.build()
    return lookUp
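A minimal usage sketch for the helper above; it assumes numDimensions() returns the width of the model's word vectors (300 for en_core_web_md) and that the input text file exists.

import spacy

nlp = spacy.load("en_core_web_md")       # any spaCy model that ships word vectors
doc = nlp(open("corpus.txt").read())     # hypothetical input file
lookup = getLookUp(doc)                  # a Doc iterates over its tokens
print(lookup.nearest(nlp.vocab["ocean"].vector, 10))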
Example #2
import random

import numpy as np
from simpleneighbors import SimpleNeighbors


class SemanticSimilarityChatbot:
    
    def __init__(self, nlp, dims):
        self.nns = SimpleNeighbors(dims)
        self.id_pairs = {}
        self.vocab = []
        self.dims = dims
        self.nlp = nlp
        
    def add_to_vocab(self, item):
        cur_id = len(self.vocab)
        self.vocab.append(item)
        return cur_id
    
    def add_pair(self, first, second):
        first_id = self.add_to_vocab(first)
        second_id = self.add_to_vocab(second)
        self.id_pairs[first_id] = second_id
        vec = self.vectorize(first)
        self.nns.add_one(first_id, vec)
        
    def vectorize(self, s):
        if s == "":
            s = " "
        doc = self.nlp(s, disable=['tagger', 'parser'])
        mean = np.mean(np.array([w.vector for w in doc]), axis=0)
        return mean
    
    def build(self, n=50):
        self.nns.build(n)
        
    def response_for(self, s, n=10):
        vec = self.vectorize(s)
        nearest_ids = self.nns.nearest(vec, n)
        picked = random.choice(nearest_ids)
        return self.vocab[self.id_pairs[picked]]
    
    def save(self, prefix):
        import pickle
        data = {
            'id_pairs': self.id_pairs,
            'vocab': self.vocab,
            'dims': self.dims
        }
        with open(prefix + "-chatbot.pkl", "wb") as fh:
            pickle.dump(data, fh)
        self.nns.save(prefix)
        
    @classmethod
    def load(cls, prefix, nlp):
        import pickle
        with open(prefix + "-chatbot.pkl", "rb") as fh:
            data = pickle.load(fh)
            newobj = cls(nlp, data['dims'])
            newobj.id_pairs = data['id_pairs']
            newobj.vocab = data['vocab']
            newobj.nns = SimpleNeighbors.load(prefix)
        return newobj
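A short, hypothetical train-and-query sketch for the class above; the turn/response pairs are invented and the dimensionality matches spaCy's large English vectors.

import spacy

nlp = spacy.load("en_core_web_lg")
bot = SemanticSimilarityChatbot(nlp, 300)
# each pair is (utterance, reply); a real corpus would supply many of these
bot.add_pair("how are you doing?", "pretty well, thanks for asking")
bot.add_pair("what time is it?", "almost noon, I think")
bot.build(n=50)
print(bot.response_for("how's it going?"))
bot.save("demo")                                    # persists the pair data and the Annoy index
restored = SemanticSimilarityChatbot.load("demo", nlp)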
Example #3
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sentence_transformers import SentenceTransformer
from simpleneighbors import SimpleNeighbors


def create_bert_embeddings(stories):
    sentences = []

    embedding_dimensions = 768
    single_index = SimpleNeighbors(embedding_dimensions)
    for story in stories.values():
        sentence = TreebankWordDetokenizer().detokenize(story['story'][0])
        #print(sentence)
        sentences.append(sentence)

    sbert_model = SentenceTransformer('stsb-roberta-base')
    sentence_embeddings = sbert_model.encode(sentences, show_progress_bar=True)
    for embedding, key in zip(sentence_embeddings, stories.values()):
        single_index.add_one(key['movie_id'], embedding)

        # print("Key: ", key['movie_id'])
        # print("Embedding: ", embedding)
        # print("Vector Len: ", len(embedding))
        # input("Press any key...")
    return single_index
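Note that this function never calls .build(), so the caller has to do that before querying; a hypothetical follow-up, reusing the same stories dict and the same SBERT model for the query:

index = create_bert_embeddings(stories)
index.build(n=40)
query = SentenceTransformer('stsb-roberta-base').encode(["two strangers meet on a train"])[0]
print(index.nearest(query, 5))   # five closest movie_ids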
Example #4
import numpy as np
from simpleneighbors import SimpleNeighbors
from tqdm import trange


def get_vectors_and_build_index(collection, num_index_trees=40):
    # fetch every document that has a precomputed vector, keeping only its text and vector
    items = list(collection.find({"vector": {"$exists": 1}},
                                 projection={"text": 1, "vector": 1}))
    sentences = [it['text'] for it in items]
    embeddings = np.array([it['vector'] for it in items])
    embedding_dimensions = embeddings.shape[1]

    print(f'\nAdding {len(embeddings)} embeddings to index')
    index = SimpleNeighbors(embedding_dimensions, metric='dot')
    for i in trange(embeddings.shape[0]):
        index.add_one(sentences[i], embeddings[i])
    print(f'Building index with {num_index_trees} trees...')
    index.build(n=num_index_trees)
    return index
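A hypothetical caller for the function above; the connection string, database, and collection names are assumptions, and the documents are expected to carry the text and vector fields queried above.

from pymongo import MongoClient

collection = MongoClient("mongodb://localhost:27017")["demo_db"]["sentences"]
index = get_vectors_and_build_index(collection, num_index_trees=40)
# .neighbors() looks up an item already in the index by its key (here, its text)
print(index.neighbors("Some sentence that was indexed.", n=5))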
Example #5
def create_doc2vec_embeddings(stories):
    sentences = []
    tags = {}
    embedding_dimensions = 160
    single_index = SimpleNeighbors(embedding_dimensions)
    for i, story in enumerate(stories.values()):
        dfs_doc = TaggedDocument(words=story['story'][0],
                                 tags=[story['story'][1][0]])
        sentences.append(dfs_doc)
        tags[i] = story['story'][1][0]
        # print(story['movie_id'])
        # print(dfs_doc)
        # input("Press....")

    model = NEBULA_DOC_MODEL(dimensions=embedding_dimensions, epochs=400)
    model.fit(sentences, tags)
    print(len(sentences), " ", len(tags))
    sentence_embeddings = model._get_embeddings(tags)
    for embedding, key in zip(sentence_embeddings, stories.values()):
        single_index.add_one(key['movie_id'], embedding)
    return single_index
Example #6
def load_doc2vec_embeddings(stories):
    sentences = []
    tags = {}
    embedding_dimensions = 160
    single_index = SimpleNeighbors(embedding_dimensions)
    for i, story in enumerate(stories.values()):
        dfs_doc = TaggedDocument(words=story['story'][0],
                                 tags=[story['story'][1][0]])
        sentences.append(dfs_doc)
        tags[i] = story['story'][1][0]
        # print(story['movie_id'])
        # print(dfs_doc)
        # input("Press....")

    model = Doc2Vec.load("nebula_model_doc.dat")
    #sentence_embeddings = np.array([model.docvecs[tags[i]] for i, _ in enumerate(documents)])
    print(len(sentences), " ", len(tags))
    sentence_embeddings = _get_embeddings(model, tags)
    for embedding, key in zip(sentence_embeddings, stories.values()):
        single_index.add_one(key['movie_id'], embedding)
    return single_index
Example #7
def make_sim(self, backend=None):
    # `data` and `one_more` are (item, vector) fixtures defined elsewhere in the test module
    sim = SimpleNeighbors(3, metric='angular', backend=backend)
    sim.feed(data)
    sim.add_one(*one_more)
    sim.build(20)
    return sim
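Since data and one_more come from the surrounding test module, a self-contained variant of the same calls, with made-up 3-d coordinates, could look like this:

from simpleneighbors import SimpleNeighbors

data = [("a", (4, 5, 6)), ("b", (1, 2, 3)), ("c", (7, 8, 9))]
one_more = ("d", (10, 11, 12))

sim = SimpleNeighbors(3, metric='angular')
sim.feed(data)           # bulk-add (item, vector) pairs
sim.add_one(*one_more)   # add a single item
sim.build(20)
print(sim.neighbors("b", n=2))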
Example #8
def main():
    if len(sys.argv) < 4:
        print("Usage:", sys.argv[0], "<db_name> <algo: NEBULA_DOC/NEBULA_WORD/NEBULA_INDEX/NEBULA_MIX> <movie_id>")
        sys.exit(1)
    db_name = sys.argv[1]
    movie_id = sys.argv[3]
    algo = sys.argv[2]
    db = connect_db(db_name)
    if algo == "NEBULA_DOC" or algo == "NEBULA_WORD":
        f_vec = get_requested_movie(db, movie_id, algo)
        embeddings = get_embeddings_from_db(db, algo)  
        nebula_check_distance(embeddings,  f_vec, algo)
    if algo == "NEBULA_INDEX":
        num_index_trees = 512
        embedding_dimensions = 100
        single_index = SimpleNeighbors(embedding_dimensions)
        for _algo in ["NEBULA_DOC"]:
            embeddings = get_embeddings_from_db(db, _algo)
            for embedding in embeddings.values():
                movie = embedding['movie_id'] 
                #_algo = embedding['algo']
                vec = embedding['embeddings']
                #print(movie, _algo)
                #annotated_sentence = '({}) {}'.format(language_name, language_to_sentences[language_code][i])
                single_index.add_one(movie + "_" + _algo, vec)
            print("Index for: " + _algo + " is added"  + " index size: ", len(embeddings))
    
    if algo == "NEBULA_MIX":
        num_index_trees = 512
        embedding_dimensions = 100
        mix_index = SimpleNeighbors(embedding_dimensions * 2)
        doc_embeddings =  get_embeddings_from_db(db, "NEBULA_DOC")
        #kmeans_clusters(doc_embeddings)
        word_embeddings =  get_embeddings_from_db(db, "NEBULA_WORD")
        #kmeans_clusters(word_embeddings)
        for doc, word in zip(doc_embeddings.values(), word_embeddings.values()):
            #print(doc)
            #print(word)
            movie = doc['movie_id'] 
            #_algo = embedding['algo']
            vec = []
            for d,w in zip(doc['embeddings'], word['embeddings']):
                vec.append(d)
                vec.append(w)
            #vec = doc['embeddings'] + word['embeddings']
            mix_index.add_one(movie, vec)
        print("Mixed Index is added, index size: ", str(len(word_embeddings)) + " " + str(len(doc_embeddings)))

    if algo == "NEBULA_INDEX":
        print('Building multi-algo index with {} trees...'.format(num_index_trees))
        single_index.build(n=num_index_trees) 
        single_index.save("nebula_index_single")
        for _algo in ["NEBULA_DOC"]:     
            _key = movie_id + "_" + _algo
            sims = single_index.neighbors(_key, n=20)
            print("----------------------") 
            print("Top 20 "+ _algo + " Positive Cosines for Movie: " + movie_id)
            #print(nb)
            for sim in sims:
                print(sim.split("_"+_algo)[0])
            print("----------------------")
        while True:
            mv = input("Enter movie id: ")
            #al = input("Enter algo <NEBULA_DOC/NEBULA_WORD>: ")
            for _algo in ["NEBULA_DOC"]:     
                _key = mv + "_" + _algo
                sims = single_index.neighbors(_key, n=20)
                print("----------------------") 
                print("Top 10 "+ _algo + " Positive Cosines for Movie: " + mv)
                for sim in sims:
                    print(sim.split("_"+_algo)[0])
                    #print(sim)
                print("----------------------")  

    if algo == "NEBULA_MIX":
        print('Building mixed-algo index with {} trees...'.format(num_index_trees))
        mix_index.build(n=num_index_trees) 
        mix_index.save("nebula_index_mix_")           
        _key = movie_id
        sims = mix_index.neighbors(_key, n=20)
        print("----------------------") 
        print("Top 20 MIX" + " Positive Cosines for Movie: " + movie_id)
        #print(nb)
        for sim in sims:
            print(sim)
        print("----------------------")
        while True:
            mv = input("Enter movie id: ")
            #al = input("Enter algo <NEBULA_DOC/NEBULA_WORD>: ")    
            _key = mv
            sims = mix_index.neighbors(_key, n=20)
            print("----------------------") 
            print("Top 20 MIX Positive Cosines for Movie: " + mv)
            for sim in sims:
                print(sim)
            print("----------------------")
Example #9
import numpy as np
import spacy
from simpleneighbors import SimpleNeighbors


def vec(s):
    return nlp.vocab[s].vector


def meanv(vecs):
    total = np.sum(vecs, axis=0)
    return total / len(vecs)


nlp = spacy.load('en_core_web_lg')

lookup = SimpleNeighbors(300)
for item in nlp.vocab:
    if item.has_vector and item.prob > -15 and item.is_lower:
        lookup.add_one(item.text, item.vector)
lookup.build()

desire_text = nlp(open(desire).read())
eyes_text = nlp(open(eyes).read())
memory_text = nlp(open(memory).read())
names_text = nlp(open(names).read())
signs_text = nlp(open(signs).read())
dead_text = nlp(open(dead).read())
sky_text = nlp(open(sky).read())
continuous_text = nlp(open(continuous).read())
hidden_text = nlp(open(hidden).read())
thin_text = nlp(open(thin).read())
trading_text = nlp(open(trading).read())
narrative_text = nlp(open(narrative).read())
full_text = nlp(open(full).read())
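One way the pieces above are typically combined; using desire_text as the query is an assumption.

# words whose vectors lie closest to the average vector of one of the loaded texts
center = meanv([w.vector for w in desire_text if w.has_vector])
print(lookup.nearest(center, 10))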
Example #10
def vec(word):
    return nlp(word, disable=["parser", "tagger", "ner"]).vector


print("getting embeddings...")

embeddings = [vec(w) for w in all_words]

print("getting done")

print("building simpleneibors...")

lookup = SimpleNeighbors(300)
for v, w in zip(embeddings, all_words):
    lookup.add_one(w, v)
lookup.build()

print("building done")


def nearest_words(word, used_words):
    ws = [w for w in lookup.nearest(vec(word), 156) if w not in used_words][:5]
    used_words.extend(ws)
    return ws


def get_words(word):
    words = dict()
    used_words = list()
    used_words.append(word)
Example #11
def embed(s):
    # assumed wrapper name; the source snippet begins mid-function
    sentence_embedding = embedder.encode([s])
    return sentence_embedding[0]


from simpleneighbors import SimpleNeighbors

response_sample = random.sample(list(responses.keys()), 10000)
#  Using GLOVE
nns = SimpleNeighbors(300)
for i, line_id in enumerate(response_sample):
    # show progress
    if i % 1000 == 0: print(i, line_id, movie_lines[line_id])
    line_text = movie_lines[line_id]
    summary_vector = sentence_mean(nlp, line_text)
    if np.any(summary_vector):
        nns.add_one(line_id, summary_vector)
nns.build()

#  Predictions Using Glove
sentence = "how are you doing?"
sentence = "How's the weather?"
sentence = "Did you eat today?"

picked = nns.nearest(sentence_mean(nlp, sentence), 5)[0]
response_line_id = responses[picked]

print("Your line:\n\t", sentence)
print("Most similar turn:\n\t", movie_lines[picked])
print("Response to most similar turn:\n\t", movie_lines[response_line_id])

#  Predictions Using Sentence Embedder