def __init__(self, hidden_size: int):
    super(BahdanauAttention, self).__init__()
    self.hidden_size = hidden_size
    self.attention = nn.Linear(hidden_size * 2, hidden_size).to(get_device())
    # Note: moving the parameter with ``.to()`` returns a plain tensor on GPU,
    # so ``v`` is not captured by the module ``state_dict`` and is saved /
    # loaded separately (see ``save_model`` / ``load_model``).
    self.v = nn.Parameter(torch.rand(hidden_size)).to(get_device())
    stdv = 1. / math.sqrt(self.v.size(0))
    self.v.data.uniform_(-stdv, stdv)
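# A minimal sketch (an assumption, not the project's actual ``forward``) of how
# the layers above are typically combined in additive (Bahdanau) attention:
# the decoder hidden state is concatenated with each encoder output, projected
# through ``attention`` and ``tanh``, and scored against ``v``. Shapes and the
# helper name ``_additive_score_sketch`` are illustrative only.
def _additive_score_sketch(attn, hidden, encoder_outputs):
    # hidden: [batch, hidden_size]; encoder_outputs: [seq_len, batch, hidden_size]
    seq_len = encoder_outputs.size(0)
    h = hidden.unsqueeze(0).repeat(seq_len, 1, 1)
    energy = torch.tanh(attn.attention(torch.cat([h, encoder_outputs], dim=2)))
    scores = energy.matmul(attn.v)          # [seq_len, batch]
    return torch.softmax(scores, dim=0)     # attention weights over the source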
def get_text_summary_from_batch(
        self, batch) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Extracts the original text and target summary index tensors from a
    batch and transfers them to the GPU.

    :param batch: Batch containing text and summary index tensors
    :type batch: torchtext.data.batch.Batch
    :return: Text and summary indices for model
    :rtype: tuple
    """
    text = batch.text[0].to(get_device())
    summary = batch.summary[0].to(get_device())
    return text, summary
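# Hypothetical usage inside a training loop (``trainer`` and ``train_iterator``
# are assumed names, not part of the listing above):
#
#     for batch in train_iterator:
#         text, summary = trainer.get_text_summary_from_batch(batch)
#         outputs = trainer.seq2seq(text, summary)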
def predict(
        self,
        text: str,
        length_of_original_text: float = 0.25) -> Tuple[str, torch.Tensor]:
    """
    Predicts model output / summarizes the given text. The summary length
    is capped at the defined percentage of the original text length.
    Returns the summary and the attention weights used to plot an
    attention heatmap.

    :param text: Original text to summarize
    :type text: str
    :param length_of_original_text: Maximum ratio of summary length
        compared to the original text length
    :type length_of_original_text: float
    :return: Summary text and attention weights
    :rtype: tuple
    """
    with torch.no_grad():
        sequence = self.vocab_config.indices_from_text(text).unsqueeze(0)
        sequence_length = sequence.size(1)
        encoder_outputs, encoder_hidden = self.encoder(
            sequence.transpose(0, 1))
        # Start decoding from the <sos> token.
        decoder_input = torch.LongTensor([
            self.vocab_config.indices_from_text(
                Token.StartOfSequence.value)
        ]).to(get_device())
        hidden = encoder_hidden[:self.decoder.n_layers]
        summary_words = [Token.StartOfSequence.value]
        max_summary_length = int(sequence_length * length_of_original_text)
        decoder_attentions = torch.zeros(max_summary_length,
                                         sequence_length)

        # Greedy decoding: pick the most probable token at each step and
        # stop early when <eos> is produced.
        for idx in range(max_summary_length):
            output, hidden, decoder_attention = self.decoder(
                decoder_input,
                hidden,
                encoder_outputs,
            )
            decoder_attentions[idx, :decoder_attention.size(2)] += \
                decoder_attention.squeeze(0).squeeze(0).cpu().data
            top_v, top_i = output.data.topk(1)
            ni = top_i[0]
            if ni == self.vocab_config.indices_from_text(
                    Token.EndOfSequence.value):
                break
            else:
                summary_words.append(
                    self.vocab_config.text_from_indices(ni))
                decoder_input = torch.LongTensor([ni]).to(get_device())

        summary_words.append(Token.EndOfSequence.value)
        summary = " ".join(summary_words).lstrip()
        return summary, decoder_attentions
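# Hypothetical call (``model`` and ``article_text`` are assumed names):
# summarize to at most 30% of the input length and keep the attention
# weights for a heatmap.
#
#     summary, attentions = model.predict(article_text,
#                                         length_of_original_text=0.3)
#     # attentions: [max_summary_length, input_length]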
def __init__(self,
             input_size: int,
             embedding_size: int,
             hidden_size: int,
             n_layers: int = 1,
             dropout: float = 0.1):
    super(EncoderRNN, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.embedding_size = embedding_size
    # Embedding over the input vocabulary; index 1 is the padding token.
    self.embedding = nn.Embedding(input_size,
                                  embedding_size,
                                  padding_idx=1).to(get_device())
    # Bidirectional GRU encoder.
    self.gru = nn.GRU(embedding_size,
                      hidden_size,
                      n_layers,
                      dropout=dropout,
                      bidirectional=True).to(get_device())
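# Illustrative construction (the numbers are placeholders). Because the GRU is
# bidirectional, its per-step outputs have ``hidden_size * 2`` features and its
# hidden state stacks ``n_layers * 2`` direction layers, unless the (not shown)
# forward pass combines the two directions.
#
#     encoder = EncoderRNN(input_size=30000, embedding_size=128,
#                          hidden_size=256, n_layers=2, dropout=0.5)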
def indices_from_text(self, text: str) -> torch.Tensor:
    """
    Converts text to a tensor of corresponding vocabulary indices.
    Words missing from the vocabulary are mapped to the <unk> token index.

    :param text: Text to convert
    :type text: str
    :return: Tensor with indices
    :rtype: torch.Tensor
    """
    indices = [
        self.stoi.get(word, self.stoi.get(Token.Unknown.value))
        for word in text.strip().split(' ')
    ]
    return torch.LongTensor(indices).to(get_device())
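# Hypothetical usage (``vocab_config`` is an assumed instance of this class);
# the actual index values depend on the fitted vocabulary.
#
#     indices = vocab_config.indices_from_text("the cat sat on the mat")
#     indices.shape   # torch.Size([6]) -- one index per whitespace token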
def create_optimizers_and_loss(self) -> None:
    """
    Initializes the Adam optimizer for the Seq2Seq model and the learning
    rate scheduler as specified in the config file. Initializes
    CrossEntropyLoss that ignores the padding token <pad> in the sequence.
    """
    self.optimizer = optim.Adam(self.seq2seq.parameters(),
                                lr=self.config['learning_rate'])
    self.scheduler = optim.lr_scheduler.StepLR(
        self.optimizer,
        step_size=self.config['scheduler_step_size'],
        gamma=self.config['scheduler_gamma'],
    )
    self.criterion = nn.CrossEntropyLoss(
        ignore_index=self.vocab_config.stoi[Token.Padding.value]).to(
            get_device())
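# Illustrative config entries for the keys read above (the values are
# placeholders, not the project's defaults):
#
#     config = {
#         'learning_rate': 1e-3,
#         'scheduler_step_size': 5,   # decay the LR every 5 scheduler steps
#         'scheduler_gamma': 0.5,     # multiply the LR by 0.5 at each step
#     }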
def __init__(
    self,
    embedding_size: int,
    hidden_size: int,
    output_size: int,
    n_layers: int = 1,
    dropout: float = 0.1,
):
    super(DecoderRNN, self).__init__()
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.n_layers = n_layers
    self.dropout_p = dropout  # keep the ratio; ``self.dropout`` holds the module below
    # Embedding over the output vocabulary; index 1 is the padding token.
    self.embedding = nn.Embedding(output_size,
                                  embedding_size,
                                  padding_idx=1).to(get_device())
    self.dropout = nn.Dropout(dropout, inplace=True).to(get_device())
    self.attention = BahdanauAttention(hidden_size).to(get_device())
    # GRU input is the embedded token concatenated with the attention context.
    self.gru = nn.GRU(hidden_size + embedding_size,
                      hidden_size,
                      n_layers).to(get_device())
    self.classifier = nn.Linear(hidden_size * 2,
                                output_size).to(get_device())
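# Hypothetical construction (numbers are placeholders): ``output_size`` should
# equal the vocabulary size so the classifier can score every token, and
# ``hidden_size`` should match the encoder's, since the decoder hidden state is
# initialised from the encoder hidden state in ``Seq2Seq.forward``.
#
#     decoder = DecoderRNN(embedding_size=128, hidden_size=256,
#                          output_size=30000, n_layers=1, dropout=0.5)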
def save_model(self, model_path: str, model_epoch: int) -> None:
    """
    Saves trained model weights after an epoch, transferred to CPU.
    The attention ``v`` parameter is saved separately, because it is not
    captured in the model ``state_dict``.

    :param model_path: Path to save model
    :type model_path: str
    :param model_epoch: Model epoch
    :type model_epoch: int
    """
    torch.save(self.seq2seq.cpu().state_dict(),
               model_path + f'_{model_epoch}.pt')
    torch.save(self.seq2seq.decoder.attention.v.cpu(),
               model_path + f'_att_param_{model_epoch}.pt')
    self.logger.info(f'Saved model {model_path}_{model_epoch}.pt')
    # Move the model back to the GPU after saving the CPU copy.
    self.seq2seq.to(get_device())
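# Hypothetical call (``trainer`` is an assumed object exposing this method);
# it writes two files, the state dict and the separately stored attention ``v``:
#
#     trainer.save_model('checkpoints/summarizer', model_epoch=3)
#     # -> checkpoints/summarizer_3.pt
#     # -> checkpoints/summarizer_att_param_3.pt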
def forward(self,
            text: torch.Tensor,
            summary: torch.Tensor,
            teacher_forcing_ratio: float = 0.5) -> torch.Tensor:
    """
    Defines the Seq2Seq structure and flow. The teacher forcing ratio
    specifies the probability of replacing the decoder output with the
    target summary token when generating the next word. Used to speed up
    model training.

    * Feeds the encoder with the input indices
    * Initializes the decoder hidden state from the encoder hidden state
    * Initializes the decoder output with the Start of Sequence <sos> token
    * Initializes the summary output vector
    * Until the maximum summary length is reached:

        * Feeds the decoder with the decoder output, hidden state and
          encoder output
        * Updates the decoder output and hidden state
        * Updates the summary output vector with the decoder output token
        * With probability ``teacher_forcing_ratio`` replaces the decoder
          output with the target token

    :param text: Indices of input text
    :type text: torch.Tensor
    :param summary: Indices of target / reference summary
    :type summary: torch.Tensor
    :param teacher_forcing_ratio: Probability of feeding the target token
        back into the decoder instead of its own prediction
    :type teacher_forcing_ratio: float
    :return: Output sequence / summary
    :rtype: torch.Tensor
    """
    batch_size = text.size(1)
    max_len = summary.size(0)
    vocab_size = self.decoder.output_size

    encoder_output, hidden = self.encoder(text)
    hidden = hidden[:self.decoder.n_layers]

    # First decoder input is the <sos> row of the target summary.
    output = summary.data[0, :]
    outputs = torch.FloatTensor(max_len, batch_size,
                                vocab_size).fill_(0).to(get_device())

    for t in range(1, max_len):
        output, hidden, attention_weights = self.decoder(
            output, hidden, encoder_output)
        outputs[t] = output
        # Teacher forcing: use the ground-truth token as the next input
        # with probability ``teacher_forcing_ratio``.
        is_teacher = random.random() < teacher_forcing_ratio
        top_first = output.data.max(1)[1]
        output = summary.data[t] if is_teacher else top_first

    return outputs
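# Hypothetical training-step sketch. Shapes follow the indexing above: tensors
# are time-major, i.e. ``text`` is [text_len, batch] and ``summary`` is
# [summary_len, batch]; ``criterion`` is the CrossEntropyLoss created in
# ``create_optimizers_and_loss``.
#
#     outputs = seq2seq(text, summary, teacher_forcing_ratio=0.5)
#     # outputs: [summary_len, batch, vocab_size]; row 0 stays zero, so the
#     # loss is computed from position 1 onwards:
#     loss = criterion(outputs[1:].reshape(-1, outputs.size(2)),
#                      summary[1:].reshape(-1))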
def create_model(self) -> None:
    """
    Initializes the full Seq2Seq model with encoder and decoder as
    specified in the config file.
    """
    self.encoder = EncoderRNN(
        input_size=self.config['text_size'],
        embedding_size=self.config['embed_size'],
        hidden_size=self.config['hidden_size'],
        n_layers=2,
        dropout=0.5,
    )
    self.decoder = DecoderRNN(
        embedding_size=self.config['embed_size'],
        hidden_size=self.config['hidden_size'],
        output_size=self.config['text_size'],
        n_layers=1,
        dropout=0.5,
    )
    self.seq2seq = Seq2Seq(encoder=self.encoder,
                           decoder=self.decoder).to(get_device())
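# Illustrative config entries for the keys read above (placeholder values),
# plus the assumed setup order: the model has to exist before
# ``create_optimizers_and_loss`` can read ``self.seq2seq.parameters()``.
#
#     config = {'text_size': 30000, 'embed_size': 128, 'hidden_size': 256}
#     trainer.create_model()
#     trainer.create_optimizers_and_loss()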
def load_model(self,
               model_path: str,
               attention_param_path: str = None) -> None:
    """
    Loads a trained model and transfers it to the GPU. The attention ``v``
    parameter is saved and loaded separately, because it is not captured
    in the model ``state_dict``.

    :param model_path: Path to trained model
    :type model_path: str
    :param attention_param_path: Path to trained attention parameter
    :type attention_param_path: str
    """
    if attention_param_path:
        self.seq2seq.load_state_dict(torch.load(model_path), strict=False)
        self.seq2seq.decoder.attention.v = nn.Parameter(
            torch.load(attention_param_path))
    else:
        self.seq2seq.load_state_dict(torch.load(model_path))
    self.seq2seq = self.seq2seq.to(get_device())
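# Hypothetical restore, using the file naming produced by ``save_model`` above:
#
#     trainer.load_model('checkpoints/summarizer_3.pt',
#                        attention_param_path='checkpoints/summarizer_att_param_3.pt')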