def get_elmo_vectors_per_text(self, texts_in, layers="average"):
    """
    Runs inference over each sentence separately,
    warming up the model anew before every sentence.
    :param texts_in: list of sentences (lists of words)
    :param layers: ["top", "average", "all"].
    Yield the top ELMo layer, the average of all layers, or all layers as they are.
    :return: list of embedding tensors, one per sentence
    (number of used layers by max word count by vector size)
    """
    result = []
    max_text_length = max(len(t) for t in texts_in)
    with tf.compat.v1.Session() as sess:
        # Get an op to compute ELMo vectors (a function of the internal biLM layers):
        self.elmo_sentence_input = weight_layers(
            "input", self.sentence_embeddings_op, use_layers=layers
        )
        # It is necessary to initialize variables once before running inference:
        sess.run(tf.compat.v1.global_variables_initializer())
        for text in texts_in:
            texts = [text]
            # Warm up the model on the current sentence:
            for _ in range(10):
                self.warmup(sess, texts)
            # The matrix which will eventually contain all embeddings for this sentence.
            # It is allocated anew on every iteration, so that the slices appended to
            # `result` do not alias (and silently overwrite) each other:
            if layers == "all":
                final_vectors = np.zeros(
                    (len(texts), self.n_layers, max_text_length, self.vector_size)
                )
            else:
                final_vectors = np.zeros((len(texts), max_text_length, self.vector_size))
            # Running batches:
            chunk_counter = 0
            for chunk in divide_chunks(texts, self.batch_size):
                # Converting sentences to character ids:
                sentence_ids = self.batcher.batch_sentences(chunk)
                self.logger.info(f"Texts in the current batch: {len(chunk)}")
                # Compute ELMo representations:
                elmo_vectors = sess.run(
                    self.elmo_sentence_input["weighted_op"],
                    feed_dict={self.sentence_character_ids: sentence_ids},
                )
                # Updating the full matrix:
                first_row = self.batch_size * chunk_counter
                last_row = first_row + elmo_vectors.shape[0]
                if layers == "all":
                    final_vectors[first_row:last_row, :, : elmo_vectors.shape[2], :] = elmo_vectors
                else:
                    final_vectors[first_row:last_row, : elmo_vectors.shape[1], :] = elmo_vectors
                chunk_counter += 1
            result.append(final_vectors[0])
    return result
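# A minimal usage sketch for the method above (hypothetical: it assumes these
# methods live on a simple_elmo-style `ElmoModel` class and that a trained model
# has been loaded with `load()` first; the model path is a placeholder):
#
#   model = ElmoModel()
#   model.load("path/to/elmo_model")
#   sentences = [["the", "cat", "sat"], ["dogs", "bark", "loudly", "outside"]]
#   per_text = model.get_elmo_vectors_per_text(sentences, layers="average")
#   # `per_text` is a list with one (max_word_count, vector_size) matrix per sentence.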
def load(self, directory, top=False, max_batch_size=96):
    """
    Loads a pre-trained ELMo model.
    Call load with top=True to use only the top ELMo layer.
    :param directory: directory or a ZIP archive with an ELMo model
    ('model.hdf5', 'options.json' and 'vocab.txt*' files must be present)
    :param top: use only the top ELMo layer
    :param max_batch_size: the maximum allowed batch size during inference
    :return: ELMo batcher, character id placeholder, op object, batch size
    """
    if not os.path.exists(directory):
        raise SystemExit("Error: model not found!")
    self.batch_size = max_batch_size
    if os.path.isfile(directory) and directory.endswith(".zip"):
        message = """
        Assuming the model is a ZIP archive downloaded from the NLPL vector repository.
        Loading a model from a ZIP archive directly is slower than from the extracted files,
        but does not require additional disk space
        and allows loading from directories without write permissions.
        """
        self.logger.info(message)
        if sys.version_info < (3, 7):
            raise SystemExit(
                "Error: loading ELMo from ZIP archives requires Python >= 3.7."
            )
        zf = zipfile.ZipFile(directory)
        vocab_file = zf.open("vocab.txt")
        options_file = zf.open("options.json")
        weight_file = zf.open("model.hdf5")
        m_options = json.load(options_file)
        options_file.seek(0)
    else:
        # We have all the files already extracted in a separate directory:
        if os.path.isfile(os.path.join(directory, "vocab.txt.gz")):
            vocab_file = os.path.join(directory, "vocab.txt.gz")
        elif os.path.isfile(os.path.join(directory, "vocab.txt")):
            vocab_file = os.path.join(directory, "vocab.txt")
        else:
            raise SystemExit("Error: no vocabulary file found in the model.")
        options_file = os.path.join(directory, "options.json")
        weight_file = os.path.join(directory, "model.hdf5")
        with open(options_file, "r") as of:
            m_options = json.load(of)
    self.logger.info(f"Loading model from {directory}...")
    max_chars = m_options["char_cnn"]["max_characters_per_token"]
    self.max_chars = max_chars
    # Create a Batcher to map text to character ids:
    self.batcher = Batcher(vocab_file, max_chars)
    # Input placeholders to the biLM:
    self.sentence_character_ids = tf.compat.v1.placeholder(
        "int32", shape=(None, None, max_chars)
    )
    # Build the biLM graph:
    bilm = BidirectionalLanguageModel(
        options_file, weight_file, max_batch_size=max_batch_size
    )
    self.vector_size = int(bilm.options["lstm"]["projection_dim"] * 2)
    # The inference methods need the total number of ELMo layers when
    # layers == "all": the LSTM layers plus the character CNN (token) layer.
    # Assumption: the standard ELMo options.json layout with an 'lstm' section.
    self.n_layers = m_options["lstm"]["n_layers"] + 1
    # Get ops to compute the LM embeddings (stored, since the inference
    # methods above reuse them as self.sentence_embeddings_op):
    self.sentence_embeddings_op = bilm(self.sentence_character_ids)
    # Get an op to compute ELMo (weighted average of the internal biLM layers):
    self.elmo_sentence_input = weight_layers(
        "input", self.sentence_embeddings_op, use_top_only=top
    )
    return self.batcher, self.sentence_character_ids, self.elmo_sentence_input, self.batch_size
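# Usage sketch for `load()` (hypothetical class name and paths; the method itself
# is defined above). Both an extracted model directory and a ZIP archive work:
#
#   model = ElmoModel()
#   batcher, ids_placeholder, elmo_op, batch_size = model.load("path/to/model")
#   # or, directly from an NLPL-style archive (requires Python >= 3.7):
#   # model.load("path/to/model.zip", top=True, max_batch_size=32)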
def get_elmo_vector_average(self, texts, warmup=True, layers="average"):
    """
    :param texts: list of sentences (lists of words)
    :param warmup: warm up the model before actual inference (by running it over the 1st batch)
    :param layers: ["top", "average", "all"].
    Yield the top ELMo layer, the average of all layers, or all layers as they are.
    :return: matrix of averaged embeddings for all sentences
    """
    if layers == "all":
        average_vectors = np.zeros((len(texts), self.n_layers, self.vector_size))
    else:
        average_vectors = np.zeros((len(texts), self.vector_size))
    counter = 0
    with tf.compat.v1.Session() as sess:
        # Get an op to compute ELMo vectors (a function of the internal biLM layers):
        self.elmo_sentence_input = weight_layers(
            "input", self.sentence_embeddings_op, use_layers=layers
        )
        # It is necessary to initialize variables once before running inference:
        sess.run(tf.compat.v1.global_variables_initializer())
        if warmup:
            self.warmup(sess, texts)
        # Running batches:
        for chunk in divide_chunks(texts, self.batch_size):
            # Converting sentences to character ids:
            sentence_ids = self.batcher.batch_sentences(chunk)
            self.logger.info(f"Texts in the current batch: {len(chunk)}")
            # Compute ELMo representations:
            elmo_vectors = sess.run(
                self.elmo_sentence_input["weighted_op"],
                feed_dict={self.sentence_character_ids: sentence_ids},
            )
            self.logger.debug(f"ELMo sentence input shape: {elmo_vectors.shape}")
            if layers == "all":
                # Move the layer axis after the word axis:
                # (batch, layers, words, dims) -> (batch, words, layers, dims).
                # np.transpose is used instead of reshape, since a plain reshape
                # would scramble the correspondence between words and layers:
                elmo_vectors = np.transpose(elmo_vectors, (0, 2, 1, 3))
            for sentence in range(len(chunk)):
                # Average the word vectors into a single sentence fingerprint
                # (note: padded positions are included in the average):
                semantic_fingerprint = np.mean(elmo_vectors[sentence], axis=0)
                # Normalize the fingerprint to unit length:
                query_vec = semantic_fingerprint / np.linalg.norm(semantic_fingerprint)
                average_vectors[counter] = query_vec
                counter += 1
    return average_vectors
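# Usage sketch for `get_elmo_vector_average()` (hypothetical setup, continuing
# the examples above):
#
#   avg = model.get_elmo_vector_average(sentences, warmup=True, layers="average")
#   # avg.shape == (len(sentences), model.vector_size); each row is unit-normalized,
#   # so cosine similarity between two sentences reduces to a dot product:
#   sim = avg[0] @ avg[1]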