# Imports assumed by the snippets below (the methods belong to an ELMo wrapper
# class, e.g. ElmoModel, whose full definition is not shown here):
import json
import os
import sys
import zipfile

import numpy as np
import tensorflow as tf
from bilm import Batcher, BidirectionalLanguageModel, weight_layers


    def get_elmo_vectors_per_text(self, texts_in, layers="average"):
        """
        :param texts: list of sentences (lists of words)
        :param warmup: warm up the model before actual inference (by running it over the 1st batch)
        :param layers: ["top", "average", "all"].
        Yield the top ELMo layer, the average of all layers, or all layers as they are.
        :return: embedding tensor for all sentences
        (number of used layers by max word count by vector size)
        """
        result = []
        max_text_length = max(len(t) for t in texts_in)

        # Creating the matrix which will eventually contain all embeddings from all batches:
        if layers == "all":
            final_vectors = np.zeros((len(texts_in), self.n_layers, max_text_length, self.vector_size))
        else:
            final_vectors = np.zeros((len(texts_in), max_text_length, self.vector_size))

        with tf.compat.v1.Session() as sess:
            # Get an op to compute ELMo vectors (a function of the internal biLM layers)
            self.elmo_sentence_input = weight_layers("input", self.sentence_embeddings_op,
                                                     use_layers=layers)

            # It is necessary to initialize variables once before running inference.
            sess.run(tf.compat.v1.global_variables_initializer())
            # Processing each sentence separately, warming the model up on it first:
            for texts_pre in texts_in:
                texts = [texts_pre]
                for _ in range(10):
                    self.warmup(sess, texts)
                # Running batches:
                chunk_counter = 0
                for chunk in divide_chunks(texts, self.batch_size):
                    # Converting sentences to character ids:
                    sentence_ids = self.batcher.batch_sentences(chunk)
                    self.logger.info(f"Texts in the current batch: {len(chunk)}")

                    # Compute ELMo representations.
                    elmo_vectors = sess.run(
                        self.elmo_sentence_input["weighted_op"],
                        feed_dict={self.sentence_character_ids: sentence_ids},
                    )
                    # Updating the full matrix:
                    first_row = self.batch_size * chunk_counter
                    last_row = first_row + elmo_vectors.shape[0]
                    if layers == "all":
                        final_vectors[first_row:last_row, :, : elmo_vectors.shape[2], :] = elmo_vectors
                    else:
                        final_vectors[first_row:last_row, : elmo_vectors.shape[1], :] = elmo_vectors
                    chunk_counter += 1

                # Each run processes a single sentence, so its embeddings sit in row 0:
                result.append(final_vectors[0])
        return result
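
# A minimal sketch of the divide_chunks helper used above; the helper is assumed
# rather than shown in this snippet, and simply slices a list into consecutive
# batches of at most n sentences:
def divide_chunks(data, n):
    for i in range(0, len(data), n):
        yield data[i:i + n]

# The warmup method is likewise assumed; a hypothetical sketch that runs one
# inference pass over the first batch to warm the graph up:
#
#     def warmup(self, sess, texts):
#         for chunk0 in divide_chunks(texts, self.batch_size):
#             sentence_ids = self.batcher.batch_sentences(chunk0)
#             _ = sess.run(self.elmo_sentence_input["weighted_op"],
#                          feed_dict={self.sentence_character_ids: sentence_ids})
#             break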
Example #2
    def load(self, directory, top=False, max_batch_size=96):
        # Loading a pre-trained ELMo model:
        # You can call load with top=True to use only the top ELMo layer
        """
        :param directory: directory or a ZIP archive with an ELMo model
        ('model.hdf5', 'options.json' and 'vocab.txt*' files must be present)
        :param top: use only top ELMo layer
        :param max_batch_size: the maximum allowable batch size during inference
        :return: ELMo batcher, character id placeholders, op object
        """
        if not os.path.exists(directory):
            raise SystemExit("Error: model not found!")
        self.batch_size = max_batch_size
        if os.path.isfile(directory) and directory.endswith(".zip"):
            message = """
            Assuming the model is a ZIP archive downloaded from the NLPL vector repository.
            Loading a model from a ZIP archive directly is slower than from the extracted files,
            but does not require additional disk space
            and allows to load from directories without write permissions.
            """
            self.logger.info(message)
            if sys.version_info < (3, 7):
                raise SystemExit(
                    "Error: loading ELMo from ZIP archives requires Python >= 3.7."
                )
            zf = zipfile.ZipFile(directory)
            vocab_file = zf.open("vocab.txt")
            options_file = zf.open("options.json")
            weight_file = zf.open("model.hdf5")
            m_options = json.load(options_file)
            options_file.seek(0)
        else:
            # We have all the files already extracted in a separate directory
            if os.path.isfile(os.path.join(directory, "vocab.txt.gz")):
                vocab_file = os.path.join(directory, "vocab.txt.gz")
            elif os.path.isfile(os.path.join(directory, "vocab.txt")):
                vocab_file = os.path.join(directory, "vocab.txt")
            else:
                raise SystemExit(
                    "Error: no vocabulary file found in the model.")
            options_file = os.path.join(directory, "options.json")
            weight_file = os.path.join(directory, "model.hdf5")
            with open(options_file, 'r') as of:
                m_options = json.load(of)

        self.logger.info(f"Loading model from {directory}...")
        max_chars = m_options['char_cnn']['max_characters_per_token']
        self.max_chars = max_chars

        # Create a Batcher to map text to character ids.
        self.batcher = Batcher(vocab_file, max_chars)

        # Input placeholders to the biLM.
        self.sentence_character_ids = tf.compat.v1.placeholder(
            'int32', shape=(None, None, max_chars))

        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(options_file,
                                          weight_file,
                                          max_batch_size=max_batch_size)
        self.vector_size = int(bilm.options['lstm']['projection_dim'] * 2)
        # The get_elmo_* methods also need the number of biLM output layers; for a
        # standard biLM this is the number of LSTM layers plus the token embedding
        # layer (an assumption, since the original snippet never sets this attribute):
        self.n_layers = m_options['lstm']['n_layers'] + 1

        # Get ops to compute the LM embeddings.
        self.sentence_embeddings_op = bilm(self.sentence_character_ids)

        # Get an op to compute ELMo (weighted average of the internal biLM layers)
        self.elmo_sentence_input = weight_layers('input', self.sentence_embeddings_op,
                                                 use_top_only=top)

        return self.batcher, self.sentence_character_ids, self.elmo_sentence_input, self.batch_size
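
    # A minimal usage sketch for load (assumptions: the surrounding wrapper class
    # is called ElmoModel, and "path/to/elmo_model" is a hypothetical directory
    # containing model.hdf5, options.json and vocab.txt):
    #
    #     model = ElmoModel()
    #     batcher, ids_placeholder, elmo_op, batch_size = model.load("path/to/elmo_model")
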
    def get_elmo_vector_average(self, texts, warmup=True, layers="average"):
        """
        :param texts: list of sentences (lists of words)
        :param warmup: warm up the model before actual inference (by running it over the 1st batch)
        :param layers: ["top", "average", "all"].
        Yield the top ELMo layer, the average of all layers, or all layers as they are.
        :return: matrix of averaged embeddings for all sentences
        """

        if layers == "all":
            average_vectors = np.zeros((len(texts), self.n_layers, self.vector_size))
        else:
            average_vectors = np.zeros((len(texts), self.vector_size))

        counter = 0

        with tf.compat.v1.Session() as sess:
            # Get an op to compute ELMo vectors (a function of the internal biLM layers)
            self.elmo_sentence_input = weight_layers("input", self.sentence_embeddings_op,
                                                     use_layers=layers)

            # It is necessary to initialize variables once before running inference.
            sess.run(tf.compat.v1.global_variables_initializer())

            if warmup:
                self.warmup(sess, texts)

            # Running batches:
            for chunk in divide_chunks(texts, self.batch_size):
                # Converting sentences to character ids:
                sentence_ids = self.batcher.batch_sentences(chunk)
                self.logger.info(f"Texts in the current batch: {len(chunk)}")

                # Compute ELMo representations.
                elmo_vectors = sess.run(
                    self.elmo_sentence_input["weighted_op"],
                    feed_dict={self.sentence_character_ids: sentence_ids},
                )

                self.logger.debug(f"ELMo sentence input shape: {elmo_vectors.shape}")

                if layers == "all":
                    elmo_vectors = elmo_vectors.reshape((len(chunk), elmo_vectors.shape[2],
                                                         self.n_layers, self.vector_size))
                for sentence in range(len(chunk)):
                    # Average the word vectors over the sentence and normalize
                    # the result to unit length:
                    sent_vec = elmo_vectors[sentence]
                    semantic_fingerprint = np.mean(sent_vec, axis=0)
                    query_vec = semantic_fingerprint / np.linalg.norm(semantic_fingerprint)

                    average_vectors[counter] = query_vec
                    counter += 1

        return average_vectors
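
# A minimal end-to-end sketch (assumptions: the surrounding wrapper class is
# called ElmoModel and "path/to/elmo_model" is a hypothetical model directory):
#
#     model = ElmoModel()
#     model.load("path/to/elmo_model")
#     sentences = [["Hello", "world"], ["ELMo", "embeddings", "are", "contextual"]]
#     vectors = model.get_elmo_vector_average(sentences)
#     # vectors.shape == (2, model.vector_size)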