Exemplo n.º 1
0
 def test_embedding_to_matrix(self):
     """Check the matrix built by ``embedding_to_matrix`` for a tiny vocabulary.

     Rows whose token exists in the embedding ('a' -> 1, 'b' -> 2) must
     equal the corresponding vectors. Index 0 and the token missing from
     the embedding ('d' -> 3) are expected to hold random values, which
     are only checked to lie strictly between -1 and 1.
     """
     vectors = {'a': np.array(2), 'b': np.array(3), 'c': np.array(4)}
     indices = {'a': 1, 'b': 2, 'd': 3}
     result = embedding_to_matrix(vectors, indices, 1)

     # Known tokens: the row must carry the embedding value unchanged.
     for row, expected in ((1, 2), (2, 3)):
         np.testing.assert_array_equal(result[row], np.array(expected))

     # random values for zero index and tokens not in embedding
     for row in (0, 3):
         value = float(result[row])
         self.assertTrue(-1 < value < 1)
Exemplo n.º 2
0
    def _init_model(self, summarizer: Summarizer,
                    train_data: Iterable[Tuple[str, str]]) -> None:
        """Prepare tokenizers, vectorizer and embeddings, then init the model.

        Tokenizers are created from ``train_data`` and wrapped in a
        ``Vectorizer`` bounded by ``self.max_input_len`` and
        ``self.max_output_len``. For each of the encoder and decoder, an
        embedding matrix is built only when the matching
        ``self.embedding_path_*`` attribute is not ``None``; otherwise
        ``None`` is passed on. Everything is finally handed to
        ``summarizer.init_model``.

        Args:
            summarizer: Model wrapper; its ``embedding_size`` is used when
                reading embeddings and its ``init_model`` is called.
            train_data: Iterable of string pairs passed to
                ``self._create_tokenizers``.
        """

        def load_weights(message, path, tokenizer):
            # No path configured: leave the weights unset.
            if path is None:
                return None
            self.logger.info(message.format(path))
            vectors = read_embedding(path, summarizer.embedding_size)
            # token_index is only touched once a path is actually set.
            return embedding_to_matrix(
                embedding=vectors,
                token_index=tokenizer.token_index,
                embedding_dim=summarizer.embedding_size)

        enc_tokenizer, dec_tokenizer = self._create_tokenizers(train_data)
        vocab_message = (
            'vocab encoder: {vocab_enc}, vocab decoder: {vocab_dec}'.format(
                vocab_enc=enc_tokenizer.vocab_size,
                vocab_dec=dec_tokenizer.vocab_size))
        self.logger.info(vocab_message)

        vectorizer = Vectorizer(enc_tokenizer,
                                dec_tokenizer,
                                max_input_len=self.max_input_len,
                                max_output_len=self.max_output_len)

        # Encoder first, then decoder, matching the logging order.
        weights_encoder = load_weights('loading encoder embedding from {}',
                                       self.embedding_path_encoder,
                                       enc_tokenizer)
        weights_decoder = load_weights('loading decoder embedding from {}',
                                       self.embedding_path_decoder,
                                       dec_tokenizer)

        summarizer.init_model(
            preprocessor=self.preprocessor,
            vectorizer=vectorizer,
            embedding_weights_encoder=weights_encoder,
            embedding_weights_decoder=weights_decoder)