def _init_model(self,
                summarizer: Summarizer,
                train_data: Iterable[Tuple[str, str]]) -> None:
    """Fits tokenizers and a vectorizer on the training data, optionally loads
    pre-trained embeddings, and initializes the summarizer model."""
    tokenizer_encoder, tokenizer_decoder = self._create_tokenizers(train_data)
    self.logger.info(
        'vocab encoder: {vocab_enc}, vocab decoder: {vocab_dec}'.format(
            vocab_enc=tokenizer_encoder.vocab_size,
            vocab_dec=tokenizer_decoder.vocab_size))
    vectorizer = Vectorizer(tokenizer_encoder,
                            tokenizer_decoder,
                            max_input_len=self.max_input_len,
                            max_output_len=self.max_output_len)
    # Embedding weights stay None unless a pre-trained embedding path is given.
    embedding_weights_encoder, embedding_weights_decoder = None, None
    if self.embedding_path_encoder is not None:
        self.logger.info('loading encoder embedding from {}'.format(
            self.embedding_path_encoder))
        embedding = read_embedding(self.embedding_path_encoder,
                                   summarizer.embedding_size)
        embedding_weights_encoder = embedding_to_matrix(
            embedding=embedding,
            token_index=tokenizer_encoder.token_index,
            embedding_dim=summarizer.embedding_size)
    if self.embedding_path_decoder is not None:
        self.logger.info('loading decoder embedding from {}'.format(
            self.embedding_path_decoder))
        embedding = read_embedding(self.embedding_path_decoder,
                                   summarizer.embedding_size)
        embedding_weights_decoder = embedding_to_matrix(
            embedding=embedding,
            token_index=tokenizer_decoder.token_index,
            embedding_dim=summarizer.embedding_size)
    summarizer.init_model(
        preprocessor=self.preprocessor,
        vectorizer=vectorizer,
        embedding_weights_encoder=embedding_weights_encoder,
        embedding_weights_decoder=embedding_weights_decoder)
def test_read_embedding(self):
    # Read a small GloVe-style fixture and check the vector parsed for token 'a'.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(current_dir, 'resources/small_glove.txt')
    glove = read_embedding(file_path, vector_dim=3)
    assert_array_equal(array([1, 2, 3]), glove['a'])