Example #1
def main():
    try:
        raw_df = dataframe_opening(use_both_cols=False)
        logging.info('opened raw dataframe')
        del raw_df
        preproc_df = preproc_opening()
        tf_idf_index = tf_idf_indexing(list(preproc_df.question1))
        logging.info('made tf-idf index')
        del tf_idf_index
        bm25_index = bm25_indexing(list(preproc_df.question1))
        logging.info('made bm25 index')
        del bm25_index
        fasttext_index = fasttext_indexing(preproc_df)
        logging.info('made fasttext index')
        del fasttext_index
        batcher, sentence_character_ids, elmo_sentence_input = load_elmo_embeddings(
            elmo_path)
        cleaned, id_to_text, query_to_id = get_data_elmo(
            preproc_df.question1.tolist(), stop=5000)
        elmo_index = elmo_indexing(cleaned, batcher, sentence_character_ids,
                                   elmo_sentence_input)
        logging.info('made ELMo index')

    except Exception as e:
        logging.exception('%r raised in main()', e)
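
All of the examples here share the same pipeline: load the model once with load_elmo_embeddings, run get_elmo_vectors inside a TF1 session, then crop the zero padding and mean-pool the token vectors into one vector per sentence. A minimal self-contained sketch of that pattern (the function name embed_sentences is mine, not from the snippets; elmo_helpers is the module the examples import from):

import numpy as np
import tensorflow as tf
from elmo_helpers import tokenize, get_elmo_vectors, load_elmo_embeddings

def embed_sentences(raw_sentences, elmo_path='elmo'):
    # get_elmo_vectors expects lists of tokens, not raw strings.
    sentences = [tokenize(s) for s in raw_sentences]
    tf.reset_default_graph()
    batcher, sentence_character_ids, elmo_sentence_input = \
        load_elmo_embeddings(elmo_path)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        elmo_vectors = get_elmo_vectors(sess, sentences, batcher,
                                        sentence_character_ids,
                                        elmo_sentence_input)
    # Crop the padding rows, then average tokens into one vector per sentence.
    return np.vstack([np.mean(vect[:len(sent), :], axis=0)
                      for vect, sent in zip(elmo_vectors, sentences)])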
Example #2
def __init__(self, corpus, elmo_path):
    LOGGER.info('Initializing ELMo...')
    DataSet.__init__(self, corpus)
    self.elmo_path = elmo_path
    tf.reset_default_graph()
    self.batcher, self.sentence_character_ids, self.elmo_sentence_input = \
        load_elmo_embeddings(elmo_path)
    self.vectors = self.fit()
    LOGGER.info('ELMo initialized!')
Example #3
def __init__(self, index=False):
    batcher, sentence_character_ids, elmo_sentence_input = load_elmo_embeddings(
        elmo_path)
    self.model = (batcher, sentence_character_ids, elmo_sentence_input)
    self.matrix = None

    if index:
        log.info('Indexing starts!')
        cleaned = get_data_elmo(corpus)
        indexed = self.indexing(cleaned)
        with open('ELMO_matrix.pickle', 'wb') as f:
            pickle.dump(indexed, f)
        self.matrix = indexed
        log.info('Indexing finished')

    # `self.matrix = None` above prevents an AttributeError when index=False;
    # comparing with `is None` also avoids NumPy's ambiguous truth-value error.
    if self.matrix is None:
        with open('ELMO_matrix.pickle', 'rb') as f:
            self.matrix = pickle.load(f)
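
A hedged usage sketch of this cache-or-load pattern (the class name ElmoIndex is an assumption; the snippet does not show it):

# Hypothetical usage; ElmoIndex stands in for the snippet's actual class.
engine = ElmoIndex(index=True)   # builds and pickles ELMO_matrix.pickle
engine = ElmoIndex()             # later runs load the cached matrix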
Example #4
def search(self, query):
    tf.reset_default_graph()
    batcher, sentence_character_ids, elmo_sentence_input = \
        load_elmo_embeddings(self.elmo_path)
    # get_elmo_vectors expects tokenized sentences; a raw string would be
    # treated as a sequence of characters (tokenize comes from elmo_helpers).
    query_tokens = tokenize(query)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        elmo_vectors = get_elmo_vectors(sess, [query_tokens], batcher,
                                        sentence_character_ids,
                                        elmo_sentence_input)
        # Crop the zero padding and mean-pool tokens into one query vector.
        query_vec = [np.mean(elmo_vectors[0][:len(query_tokens), :], axis=0)]
    res = cosine_similarity(query_vec, self.vectors)
    docs = sorted(enumerate(res[0]), key=lambda x: x[1], reverse=True)
    return [(idx, score, self.corpus['question2'][idx])
            for idx, score in docs[:10]]
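
For context, a usage sketch of this method, assuming the class from Example #2 (the name ElmoSearcher and the query are placeholders):

# Hypothetical usage; ElmoSearcher stands in for the actual class.
searcher = ElmoSearcher(corpus, elmo_path='elmo')
for idx, score, answer in searcher.search('пример запроса'):
    print(idx, round(score, 3), answer)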
Example #5
import tensorflow as tf
import numpy as np
from numpy import dot
from numpy.linalg import norm
import pickle
import os
import csv
from operator import itemgetter
from elmo_helpers import tokenize, get_elmo_vectors, load_elmo_embeddings

elmo_path = 'elmo'

batcher, sentence_character_ids, elmo_sentence_input = load_elmo_embeddings(elmo_path)


with open('quora_question_pairs_rus.csv', 'r', encoding='utf-8') as q:
    str_corpus = csv.reader(q)
    file = list(str_corpus)

if not os.path.exists('documents.pickle'):
    docs = []
    for idx, line in enumerate(file):
        if idx != 0 and idx < 5002:
            docs.append(line[2])
    with open("documents.pickle", "wb") as c:
        pickle.dump(docs, c)
else:
    with open("documents.pickle", "rb") as c:
        docs = pickle.load(c)

if not os.path.exists('elmo_corpus.pickle'):
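    # The listing is truncated here; the block below is a hedged
    # reconstruction following the pattern of Examples #4 and #8,
    # not the original continuation.
    sentences = [tokenize(d) for d in docs]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # For a corpus this size, batching as in Example #8 would be kinder
        # to memory; a single call keeps the sketch short.
        corpus_vectors = get_elmo_vectors(sess, sentences, batcher,
                                          sentence_character_ids,
                                          elmo_sentence_input)
        pooled = [np.mean(vect[:len(sent), :], axis=0)
                  for vect, sent in zip(corpus_vectors, sentences)]
    with open('elmo_corpus.pickle', 'wb') as c:
        pickle.dump(pooled, c)
else:
    with open('elmo_corpus.pickle', 'rb') as c:
        pooled = pickle.load(c)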
Example #6
def __init__(self):
    tf.reset_default_graph()
    self.batcher, self.sentence_character_ids, self.elmo_sentence_input = \
        load_elmo_embeddings(elmo_path)
    self.vectors = []
    self.collection = []
Example #7
def __init__(self, elmo_path='./models/elmo'):
    tf.reset_default_graph()
    self.batcher, self.sentence_character_ids, self.elmo_sentence_input = \
        load_elmo_embeddings(elmo_path)
Example #8
def make_elmo_vectors_ruwordnet(data_path, model_directory, batch_size=25):
    model_name = os.path.basename(model_directory)
    data_name = os.path.basename(data_path).split('.')[0]
    data_dir = os.path.dirname(data_path)

    raw_sentences = []
    with open(data_path, 'r') as f:
        for line in f:
            res = line.strip()
            raw_sentences.append(res)
    sentences = [tokenize(s) for s in raw_sentences]
    print('=====')
    print('%d sentences total' % len(sentences))
    print('=====')

    batcher, sentence_character_ids, elmo_sentence_input = load_elmo_embeddings(
        model_directory)

    cropped_vectors = list()
    averaged_vectors = list()
    # Actually producing ELMo embeddings for our data:
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession(config=config)
    # It is necessary to initialize variables once before running inference.
    sess.run(tf.global_variables_initializer())

    for batch in [
            sentences[i * batch_size:(i + 1) * batch_size]
            for i in range((len(sentences) + batch_size - 1) // batch_size)
    ]:
        elmo_vectors_batch = get_elmo_vectors(sess, batch, batcher,
                                              sentence_character_ids,
                                              elmo_sentence_input)

        # print('ELMo embeddings for your input are ready')
        # print('Tensor shape:', elmo_vectors.shape)

        # Due to batch processing, the above code produces for each sentence
        # the same number of token vectors, equal to the length of the longest sentence
        # (the 2nd dimension of the elmo_vector tensor).
        # If a sentence is shorter, the vectors for non-existent words are filled with zeroes.
        # Let's make a version without these redundant vectors:
        cropped_vectors_batch = []
        # NB: zip with the current batch, not the full sentence list;
        # otherwise every batch after the first is cropped to the wrong
        # sentence lengths.
        for vect, sent in zip(elmo_vectors_batch, batch):
            cropped_vector = vect[:len(sent), :]
            cropped_vectors_batch.append(cropped_vector)
            averaged_vectors.append(np.mean(cropped_vector, axis=0))

        cropped_vectors.extend(cropped_vectors_batch)

    sess.close()
    averaged_vectors_np = np.vstack(averaged_vectors)

    out_filename_pckl = os.path.join(
        data_dir, '_'.join([data_name, 'elmo_vectors', model_name]) + '.pkl')
    out_filename_npy = os.path.join(
        data_dir,
        '_'.join([data_name, 'elmo_avg_vectors', model_name]) + '.npy')

    with open(out_filename_pckl, 'wb') as f:
        pickle.dump(cropped_vectors, f)

    with open(out_filename_npy, 'wb') as f:
        np.save(f, averaged_vectors_np)
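
Given the path handling above, a hypothetical invocation (both paths are placeholders) would write senses_elmo_vectors_elmo.pkl and senses_elmo_avg_vectors_elmo.npy next to the input file:

# Hypothetical call; the paths are placeholders, not from the original.
make_elmo_vectors_ruwordnet('data/senses.txt', 'models/elmo', batch_size=25)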
Example #9
with open("docs.pkl", "rb") as file:
    docs = pickle.load(file)

with open("vec.pkl", "rb") as file:
    vec = pickle.load(file)

m = MorphAnalyzer()


def lemmatize(text):
    return [
        m.parse(word)[0].normal_form for word in simple_word_tokenize(text)
    ]


batcher, ids, elmo_input = load_elmo_embeddings(".")


class SearchELMO:
    def __init__(self, docs, vec):
        self.texts = np.array(docs)
        self.vec = vec

    def search(self, query, n=5):
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            # Mean-pool the token vectors into a single query vector.
            query_vec = np.transpose(
                np.mean(get_elmo_vectors(sess, [lemmatize(query)], batcher,
                                         ids, elmo_input),
                        axis=1)).flatten()
            result = np.matmul(self.vec, query_vec)
        # Hedged completion: the original snippet is truncated after the
        # matmul; returning the n best-scoring documents is one plausible
        # ending, not the original code.
        top = np.argsort(result)[::-1][:n]
        return list(zip(top, result[top], self.texts[top]))
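
A hedged usage sketch (the query string is a placeholder; the corpus is Russian):

# Hypothetical usage of the class above.
searcher = SearchELMO(docs, vec)
for idx, score, text in searcher.search('пример запроса', n=5):
    print(idx, round(float(score), 3), text)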