def main():
    try:
        raw_df = dataframe_opening(use_both_cols=False)
        logging.info('made preprocessed dataframe')
        del raw_df
        preproc_df = preproc_opening()
        tf_idf_index = tf_idf_indexing(list(preproc_df.question1))
        logging.info('made tf-idf dataframe')
        del tf_idf_index
        bm25_index = bm25_indexing(list(preproc_df.question1))
        logging.info('made bm25 dataframe')
        del bm25_index
        fasttext_index = fasttext_indexing(preproc_df)
        logging.info('made fasttext dataframe')
        del fasttext_index
        batcher, sentence_character_ids, elmo_sentence_input = load_elmo_embeddings(
            elmo_path)
        cleaned, id_to_text, query_to_id = get_data_elmo(
            preproc_df.question1.tolist(), stop=5000)
        elmo_index = elmo_indexing(cleaned, batcher, sentence_character_ids,
                                   elmo_sentence_input)
        logging.info('made ELMo dataframe')
    except Exception:
        # logging.exception already records the traceback, so repr(e) is redundant.
        logging.exception('failed while building an index')
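# A minimal sketch of what elmo_indexing() above could look like, assuming it
# averages the cropped per-token ELMo vectors of each cleaned question into one
# matrix row (the same crop-and-average pattern the other snippets in this
# section use). This is an illustration, not the original implementation;
# batch_size is an assumed parameter.
import numpy as np
import tensorflow as tf

from elmo_helpers import get_elmo_vectors


def elmo_indexing(cleaned, batcher, sentence_character_ids,
                  elmo_sentence_input, batch_size=25):
    vectors = []
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for start in range(0, len(cleaned), batch_size):
            batch = cleaned[start:start + batch_size]
            elmo_vectors = get_elmo_vectors(sess, batch, batcher,
                                            sentence_character_ids,
                                            elmo_sentence_input)
            for vect, sent in zip(elmo_vectors, batch):
                # Crop zero padding before averaging token vectors.
                vectors.append(np.mean(vect[:len(sent), :], axis=0))
    return np.array(vectors)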
def __init__(self, corpus, elmo_path):
    LOGGER.info('Starting ELMo initialization!')
    DataSet.__init__(self, corpus)
    self.elmo_path = elmo_path
    tf.reset_default_graph()
    self.batcher, self.sentence_character_ids, self.elmo_sentence_input = \
        load_elmo_embeddings(elmo_path)
    self.vectors = self.fit()
    LOGGER.info('ELMo initialized!')
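# Hypothetical usage of the class above (the class name ElmoSearch and the
# search() call are assumptions; only the __init__ signature comes from the
# snippet): initialize once, then reuse the precomputed self.vectors.
#
# searcher = ElmoSearch(corpus, elmo_path='./models/elmo')
# top_hits = searcher.search('some user query')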
def __init__(self, index=False):
    batcher, sentence_character_ids, elmo_sentence_input = load_elmo_embeddings(
        elmo_path)
    self.model = (batcher, sentence_character_ids, elmo_sentence_input)
    # Initialize the attribute up front: the original `if not self.matrix`
    # raised AttributeError whenever index was False.
    self.matrix = None
    if index:
        log.info('Indexing starts!')
        cleaned = get_data_elmo(corpus)
        indexed = self.indexing(cleaned)
        with open('ELMO_matrix.pickle', 'wb') as f:
            pickle.dump(indexed, f)
        self.matrix = indexed
        log.info('Indexing finished')
    if self.matrix is None:
        # No freshly built index: fall back to the pickled one.
        with open('ELMO_matrix.pickle', 'rb') as f:
            self.matrix = pickle.load(f)
def search(self, query):
    tf.reset_default_graph()
    batcher, sentence_character_ids, elmo_sentence_input = \
        load_elmo_embeddings(self.elmo_path)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        elmo_vectors = get_elmo_vectors(sess, [query], batcher,
                                        sentence_character_ids,
                                        elmo_sentence_input)
    # Crop padding and average token vectors into a single query embedding.
    results = []
    for vect, sent in zip(elmo_vectors, [query]):
        results.append(np.mean(vect[:len(sent), :], axis=0))
    res = cosine_similarity(results, self.vectors)
    docs = sorted(enumerate(res[0]), key=lambda x: x[1], reverse=True)
    return [(idx, score, self.corpus['question2'][idx])
            for idx, score in docs[:10]]
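# Consuming the (index, score, text) triples that search() returns; `searcher`
# is assumed to be an instance of the class the method above belongs to.
#
# for doc_id, score, text in searcher.search('how to learn python'):
#     print('%.3f\t%s' % (score, text))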
import tensorflow
import numpy as np
from numpy import dot
from numpy.linalg import norm
import pickle
import os
import csv
from operator import itemgetter

from elmo_helpers import tokenize, get_elmo_vectors, load_elmo_embeddings

elmo_path = 'elmo'
batcher, sentence_character_ids, elmo_sentence_input = load_elmo_embeddings(elmo_path)

with open('quora_question_pairs_rus.csv', 'r', encoding='utf-8') as q:
    str_corpus = csv.reader(q)
    file = list(str_corpus)

# Cache the document column (header row skipped, first ~5000 rows) as a pickle.
if not os.path.exists('documents.pickle'):
    docs = []
    for idx, line in enumerate(file):
        if idx != 0 and idx < 5002:
            docs.append(line[2])
    with open('documents.pickle', 'wb') as c:
        pickle.dump(docs, c)
else:
    with open('documents.pickle', 'rb') as c:
        docs = pickle.load(c)

if not os.path.exists('elmo_corpus.pickle'):
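# The snippet above is cut off inside the last `if`. A hedged sketch of the
# likely continuation, mirroring the documents.pickle caching branch (variable
# names are assumptions; in practice the corpus would be embedded in batches,
# as in make_elmo_vectors_ruwordnet below):
#
#     with tensorflow.Session() as sess:
#         sess.run(tensorflow.global_variables_initializer())
#         elmo_corpus = get_elmo_vectors(sess, [tokenize(d) for d in docs],
#                                        batcher, sentence_character_ids,
#                                        elmo_sentence_input)
#     with open('elmo_corpus.pickle', 'wb') as c:
#         pickle.dump(elmo_corpus, c)
# else:
#     with open('elmo_corpus.pickle', 'rb') as c:
#         elmo_corpus = pickle.load(c)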
def __init__(self):
    tf.reset_default_graph()
    self.batcher, self.sentence_character_ids, self.elmo_sentence_input = \
        load_elmo_embeddings(elmo_path)
    self.vectors = []
    self.collection = []
def __init__(self, elmo_path='./models/elmo'):
    tf.reset_default_graph()
    self.batcher, self.sentence_character_ids, self.elmo_sentence_input = \
        load_elmo_embeddings(elmo_path)
import os
import pickle

import numpy as np
import tensorflow as tf

from elmo_helpers import tokenize, get_elmo_vectors, load_elmo_embeddings


def make_elmo_vectors_ruwordnet(data_path, model_directory, batch_size=25):
    model_name = os.path.basename(model_directory)
    data_name = os.path.basename(data_path).split('.')[0]
    data_dir = os.path.dirname(data_path)

    raw_sentences = []
    with open(data_path, 'r') as f:
        for line in f:
            raw_sentences.append(line.strip())
    sentences = [tokenize(s) for s in raw_sentences]
    print('=====')
    print('%d sentences total' % len(sentences))
    print('=====')

    batcher, sentence_character_ids, elmo_sentence_input = load_elmo_embeddings(
        model_directory)

    cropped_vectors = []
    averaged_vectors = []

    # Actually producing ELMo embeddings for our data:
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession(config=config)
    # It is necessary to initialize variables once before running inference.
    sess.run(tf.global_variables_initializer())

    n_batches = (len(sentences) + batch_size - 1) // batch_size
    for batch in [sentences[i * batch_size:(i + 1) * batch_size]
                  for i in range(n_batches)]:
        elmo_vectors_batch = get_elmo_vectors(sess, batch, batcher,
                                              sentence_character_ids,
                                              elmo_sentence_input)
        # Due to batch processing, every sentence gets the same number of token
        # vectors, equal to the length of the longest sentence in the batch
        # (the 2nd dimension of the tensor); shorter sentences are padded with
        # zero vectors. Crop that padding away. Note: zip against the current
        # batch, not the full sentence list, or the crop lengths are wrong.
        cropped_vectors_batch = []
        for vect, sent in zip(elmo_vectors_batch, batch):
            cropped_vector = vect[:len(sent), :]
            cropped_vectors_batch.append(cropped_vector)
            averaged_vectors.append(np.mean(cropped_vector, axis=0))
        cropped_vectors.extend(cropped_vectors_batch)

    averaged_vectors_np = np.ndarray(
        (len(averaged_vectors), averaged_vectors[0].shape[0]),
        averaged_vectors[0].dtype)
    for i, avg_vector in enumerate(averaged_vectors):
        averaged_vectors_np[i] = avg_vector

    out_filename_pckl = os.path.join(
        data_dir, '_'.join([data_name, 'elmo_vectors', model_name]) + '.pkl')
    out_filename_npy = os.path.join(
        data_dir, '_'.join([data_name, 'elmo_avg_vectors', model_name]) + '.npy')
    with open(out_filename_pckl, 'wb') as f:
        pickle.dump(cropped_vectors, f)
    with open(out_filename_npy, 'wb') as f:
        np.save(f, averaged_vectors_np)
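# Example invocation, assuming a plain-text file with one sentence per line and
# a downloaded ELMo model directory (both paths are placeholders):
#
# make_elmo_vectors_ruwordnet('data/senses.txt', 'models/elmo_ru')
#
# The call writes data/senses_elmo_vectors_elmo_ru.pkl (per-token vectors)
# and data/senses_elmo_avg_vectors_elmo_ru.npy (one averaged vector per line).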
import pickle

import numpy as np
import tensorflow as tf
from pymorphy2 import MorphAnalyzer
from pymorphy2.tokenizers import simple_word_tokenize

from elmo_helpers import get_elmo_vectors, load_elmo_embeddings

with open('docs.pkl', 'rb') as file:
    docs = pickle.load(file)
with open('vec.pkl', 'rb') as file:
    vec = pickle.load(file)

m = MorphAnalyzer()


def lemmatize(text):
    return [m.parse(word)[0].normal_form
            for word in simple_word_tokenize(text)]


batcher, ids, elmo_input = load_elmo_embeddings('.')


class SearchELMO:
    def __init__(self, docs, vec):
        self.texts = np.array(docs)
        self.vec = vec

    def search(self, query, n=5):
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            # Average the token vectors of the lemmatized query into a single
            # vector, then score every document by dot product.
            query_vec = np.transpose(
                np.mean(get_elmo_vectors(sess, [lemmatize(query)], batcher,
                                         ids, elmo_input), axis=1)).flatten()
        result = np.matmul(self.vec, query_vec)
        # The snippet was truncated after the line above; returning the n
        # best-scoring documents is the natural completion.
        top = np.argsort(result)[::-1][:n]
        return [(int(i), float(result[i]), self.texts[i]) for i in top]
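# Hypothetical usage of SearchELMO, reusing the docs and vec loaded above
# (the Russian query string is just an example):
#
# searcher = SearchELMO(docs, vec)
# for doc_id, score, text in searcher.search('лучший язык программирования'):
#     print(doc_id, round(score, 3), text)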