示例#1
0
    def _init_model(self, summarizer: Summarizer,
                    train_data: Iterable[Tuple[str, str]]) -> None:
        """Prepare the vectorizer and embeddings, then initialise the model.

        Tokenizers for the encoder and decoder sides are created from
        ``train_data`` via ``self._create_tokenizers`` and wrapped in a
        ``Vectorizer`` bounded by ``self.max_input_len`` and
        ``self.max_output_len``. For each side whose embedding path
        attribute is not ``None``, an embedding is read from that path and
        converted to a weight matrix; otherwise ``None`` is passed on.
        Finally ``summarizer.init_model`` is called with the preprocessor,
        the vectorizer and both (possibly ``None``) weight matrices.

        Args:
            summarizer: Object whose ``embedding_size`` is read and whose
                ``init_model`` is invoked.
            train_data: Data handed to ``self._create_tokenizers``.
        """
        enc_tokenizer, dec_tokenizer = self._create_tokenizers(train_data)

        vocab_message = (
            'vocab encoder: {vocab_enc}, vocab decoder: {vocab_dec}'.format(
                vocab_enc=enc_tokenizer.vocab_size,
                vocab_dec=dec_tokenizer.vocab_size))
        self.logger.info(vocab_message)

        vectorizer = Vectorizer(
            enc_tokenizer,
            dec_tokenizer,
            max_input_len=self.max_input_len,
            max_output_len=self.max_output_len,
        )

        # Encoder side: weights stay None unless a path is configured.
        enc_weights = None
        if self.embedding_path_encoder is not None:
            enc_message = 'loading encoder embedding from {}'.format(
                self.embedding_path_encoder)
            self.logger.info(enc_message)
            enc_embedding = read_embedding(
                self.embedding_path_encoder, summarizer.embedding_size)
            enc_weights = embedding_to_matrix(
                embedding=enc_embedding,
                token_index=enc_tokenizer.token_index,
                embedding_dim=summarizer.embedding_size,
            )

        # Decoder side: same procedure with the decoder path and tokenizer.
        dec_weights = None
        if self.embedding_path_decoder is not None:
            dec_message = 'loading decoder embedding from {}'.format(
                self.embedding_path_decoder)
            self.logger.info(dec_message)
            dec_embedding = read_embedding(
                self.embedding_path_decoder, summarizer.embedding_size)
            dec_weights = embedding_to_matrix(
                embedding=dec_embedding,
                token_index=dec_tokenizer.token_index,
                embedding_dim=summarizer.embedding_size,
            )

        summarizer.init_model(
            preprocessor=self.preprocessor,
            vectorizer=vectorizer,
            embedding_weights_encoder=enc_weights,
            embedding_weights_decoder=dec_weights,
        )
示例#2
0
 def test_read_embedding(self):
     """Check the vector ``read_embedding`` returns for the token 'a'.

     Reads ``resources/small_glove.txt`` (resolved relative to this
     file's directory) with ``vector_dim=3`` and expects the entry for
     'a' to equal ``[1, 2, 3]``.
     """
     this_file = os.path.abspath(__file__)
     embedding_file = os.path.join(
         os.path.dirname(this_file), 'resources/small_glove.txt')
     vectors = read_embedding(embedding_file, vector_dim=3)
     expected = array([1, 2, 3])
     assert_array_equal(expected, vectors['a'])