    def test_linear_similarity(self):
        linear = LinearAttention(3, 3, normalize=True)
        linear._weight_vector = Parameter(torch.FloatTensor([-.3, .5, 2.0, -1.0, 1, 1]))
        linear._bias = Parameter(torch.FloatTensor([.1]))
        output = linear(Variable(torch.FloatTensor([[-7, -8, -9]])),
                        Variable(torch.FloatTensor([[[1, 2, 3], [4, 5, 6]]])))

        assert_almost_equal(output.data.numpy(), numpy.array([[0.0474, 0.9526]]), decimal=2)
    def test_bidaf_trilinear_similarity(self):
        linear = LinearAttention(2, 2, combination='x,y,x*y', normalize=False)
        linear._weight_vector = Parameter(torch.FloatTensor([-.3, .5, 2.0, -1.0, 1, 1]))
        linear._bias = Parameter(torch.FloatTensor([.0]))
        output = linear(torch.FloatTensor([[4, 5]]),
                        torch.FloatTensor([[[1, 2], [4, 5], [7, 8], [10, 11]]]))

        assert_almost_equal(output.data.numpy(),
                            numpy.array([[-1.2 + 2.5 + 2 + -2 + 4 + 10,
                                          -1.2 + 2.5 + 8 + -5 + 16 + 25,
                                          -1.2 + 2.5 + 14 + -8 + 28 + 40,
                                          -1.2 + 2.5 + 20 + -11 + 40 + 55]]),
                            decimal=2)
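As a sanity check on the first test above, here is a minimal sketch (not part of the original test suite) that reproduces the expected values by hand: with the default combination 'x,y' and identity activation, the score for each matrix row y_i is w · [x; y_i] + b, and normalize=True applies a softmax over the rows.

import numpy as np

w = np.array([-.3, .5, 2.0, -1.0, 1, 1])
b = .1
x = np.array([-7, -8, -9])
ys = np.array([[1, 2, 3], [4, 5, 6]])

scores = np.array([w @ np.concatenate([x, y]) + b for y in ys])  # [-15.8, -12.8]
weights = np.exp(scores) / np.exp(scores).sum()                  # ~[0.0474, 0.9526]
print(weights)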
    def __init__(self, vocab: Vocabulary,
                 num_layers: int = 1,
                 input_dim: int = 100,
                 hidden_dim: int = 100,
                 bidirectional: bool = True,
                 batch_size: int = 100,
                 with_attention: bool = True,
                 dropout: float = 0.2):
        super(SeqEncoder, self).__init__(vocab)
        self.rnn = torch.nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=bidirectional,
            dropout=dropout,
            batch_first=True
        )
        self.num_bidirectional = 2 if bidirectional else 1
        self.hidden_dim = hidden_dim
        self.hidden = None
        self.with_attention = with_attention
        if with_attention:
            self.attention = LinearAttention(
                tensor_1_dim=hidden_dim * self.num_bidirectional,
                tensor_2_dim=hidden_dim * self.num_bidirectional)
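To illustrate how this attention module would typically be applied to the encoder's outputs, here is a small self-contained sketch; the tensor names and sizes are assumptions, not code from the class above.

import torch
from allennlp.modules.attention import LinearAttention

batch, seq_len, hidden_dim = 4, 12, 100                  # assumed sizes
outputs = torch.rand(batch, seq_len, hidden_dim * 2)     # e.g. bidirectional LSTM outputs
final = torch.rand(batch, hidden_dim * 2)                # e.g. concatenated final hidden states

attention = LinearAttention(tensor_1_dim=hidden_dim * 2,
                            tensor_2_dim=hidden_dim * 2)
weights = attention(final, outputs)                      # (batch, seq_len), softmax-normalized
context = torch.bmm(weights.unsqueeze(1), outputs).squeeze(1)   # attention-weighted sum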
    def __init__(self, training=False):
        self.training = training
        config = conf['seq2seq_allen']
        prefix = config['processed_data_prefix']
        train_file = config['train_data']
        valid_file = config['valid_data']
        src_embedding_dim = config['src_embedding_dim']
        hidden_dim = config['hidden_dim']
        batch_size = config['batch_size']
        epoch = config['epoch']
        self.model_path = config['model']

        if torch.cuda.is_available():
            cuda_device = 0
        else:
            cuda_device = -1

        # Define the dataset reader. WordTokenizer splits on whitespace, and giving the target
        # its own namespace keeps the output-layer vocabulary separate from the source vocabulary.
        self.reader = MySeqDatasetReader(
            source_tokenizer=WordTokenizer(),
            target_tokenizer=WordTokenizer(),
            source_token_indexers={'tokens': SingleIdTokenIndexer()},
            target_token_indexers={
                'tokens': SingleIdTokenIndexer(namespace='target_tokens')
            })

        if training and self.model_path is not None:
            # Read the data from files
            self.train_dataset = self.reader.read(
                os.path.join(prefix, train_file))
            self.valid_dataset = self.reader.read(
                os.path.join(prefix, valid_file))

            # Build the vocabulary
            self.vocab = Vocabulary.from_instances(self.train_dataset +
                                                   self.valid_dataset,
                                                   min_count={
                                                       'tokens': 3,
                                                       'target_tokens': 3
                                                   })
        elif not training:
            try:
                self.vocab = Vocabulary.from_files(self.model_path)
            except Exception as e:
                logger.exception('vocab file does not exist!')

                # Read the data from files
                self.train_dataset = self.reader.read(
                    os.path.join(prefix, train_file))
                self.valid_dataset = self.reader.read(
                    os.path.join(prefix, valid_file))

                # Build the vocabulary
                self.vocab = Vocabulary.from_instances(self.train_dataset +
                                                       self.valid_dataset,
                                                       min_count={
                                                           'tokens': 3,
                                                           'target_tokens': 3
                                                       })

        # Define the embedding layer
        src_embedding = Embedding(
            num_embeddings=self.vocab.get_vocab_size('tokens'),
            embedding_dim=src_embedding_dim)

        # Define the encoder; a bidirectional GRU (BiGRU) is used here
        encoder = PytorchSeq2SeqWrapper(
            torch.nn.GRU(src_embedding_dim,
                         hidden_dim // 2,
                         batch_first=True,
                         bidirectional=True))

        # Define the decoder, also a GRU; its input size must match the encoder's output size
        decoder = PytorchSeq2SeqWrapper(
            torch.nn.GRU(hidden_dim, hidden_dim, batch_first=True))
        # Map token indices to embeddings; the 'tokens' key matches the TokenIndexer used in the dataset reader
        source_embedder = BasicTextFieldEmbedder({"tokens": src_embedding})

        # Linear attention layer
        attention = LinearAttention(hidden_dim,
                                    hidden_dim,
                                    activation=Activation.by_name('tanh')())

        # Define the model
        self.model = Seq2SeqKnu(vocab=self.vocab,
                                source_embedder=source_embedder,
                                encoder=encoder,
                                target_namespace='target_tokens',
                                decoder=decoder,
                                attention=attention,
                                max_decoding_steps=20,
                                cuda_device=cuda_device)

        # Decide whether to train or to load an existing model
        if training and self.model_path is not None:
            optimizer = optim.Adam(self.model.parameters())
            # sorting_keys determines how instances are sorted when forming batches
            iterator = BucketIterator(batch_size=batch_size,
                                      sorting_keys=[("source_tokens",
                                                     "num_tokens")])
            # The iterator needs the vocabulary so it can index the data during training
            iterator.index_with(self.vocab)

            if cuda_device >= 0:
                self.model.cuda(cuda_device)

            # Define the trainer
            self.trainer = Trainer(model=self.model,
                                   optimizer=optimizer,
                                   iterator=iterator,
                                   patience=10,
                                   validation_metric="+accuracy",
                                   train_dataset=self.train_dataset,
                                   validation_dataset=self.valid_dataset,
                                   serialization_dir=self.model_path,
                                   num_epochs=epoch,
                                   cuda_device=cuda_device)
        elif not training:
            map_location = None if cuda_device >= 0 else 'cpu'
            with open(os.path.join(self.model_path, 'best.th'), 'rb') as f:
                self.model.load_state_dict(torch.load(f, map_location=map_location))
            if cuda_device >= 0:
                self.model.cuda(cuda_device)
            self.predictor = MySeqPredictor(self.model,
                                            dataset_reader=self.reader)
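A hedged usage sketch for the class whose __init__ is shown above; the class name Seq2SeqAllen and the JSON key passed to the predictor are assumptions, since neither appears in the snippet.

# Hypothetical usage; 'Seq2SeqAllen' and the 'source' key are assumed names.
seq2seq = Seq2SeqAllen(training=True)
seq2seq.trainer.train()                                   # fit on train/valid datasets

seq2seq = Seq2SeqAllen(training=False)
result = seq2seq.predictor.predict_json({'source': 'some input sentence'})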
vocab = Vocabulary.from_instances(train_dataset + valid_dataset,
                                  min_count={
                                      'tokens': 3,
                                      'target_tokens': 3
                                  })

src_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                          embedding_dim=src_embedding_dim)

encoder = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(src_embedding_dim, hidden_dim, batch_first=True))

source_embedder = BasicTextFieldEmbedder({"tokens": src_embedding})

attention = LinearAttention(hidden_dim,
                            hidden_dim,
                            activation=Activation.by_name('tanh')())

model = SimpleSeq2Seq(
    vocab,
    source_embedder,
    encoder,
    max_decoding_steps=20,
    target_embedding_dim=trg_embedding_dim,
    target_namespace='target_tokens',
    attention=attention,  # pass attention
    beam_size=8,
    use_bleu=True)

optimizer = optim.Adam(model.parameters())
iterator = BucketIterator(batch_size=32,
                          sorting_keys=[("source_tokens", "num_tokens")])
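The original snippet breaks off at this point; judging from the fuller variants later in this listing, the remaining setup would look roughly like the following sketch (not the original code).

iterator.index_with(vocab)                   # the iterator needs the vocab to index the data

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  patience=10,
                  train_dataset=train_dataset,
                  validation_dataset=valid_dataset,
                  num_epochs=1,
                  cuda_device=0 if torch.cuda.is_available() else -1)
trainer.train()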
import torch
from allennlp.modules.attention import DotProductAttention, BilinearAttention, LinearAttention
from allennlp.modules.matrix_attention import DotProductMatrixAttention
from allennlp.nn import Activation

# Sizes assumed for this snippet; the original excerpt does not show them.
batch_size = 2
sequence_length = 10
embedding_dim1 = 16
embedding_dim2 = 32

# dot product attention requires the vector and matrix to share the embedding size
vector = torch.rand((batch_size, embedding_dim1))
matrix = torch.rand((batch_size, sequence_length, embedding_dim1))
attention = DotProductAttention()
output = attention(vector, matrix)
print('Output from DotProductAttention:', output.size(), output)

# bilinear & linear attention allows inputs of different sizes
vector = torch.rand((batch_size, embedding_dim1,))
matrix = torch.rand((batch_size, sequence_length, embedding_dim2))
attention = BilinearAttention(vector_dim=embedding_dim1, matrix_dim=embedding_dim2)
output = attention(vector, matrix)
print('Output from BilinearAttention:', output.size(), output)

tanh = Activation.by_name('tanh')()
attention = LinearAttention(
    tensor_1_dim=embedding_dim1, tensor_2_dim=embedding_dim2,
    combination='x,y', activation=tanh)
output = attention(vector, matrix)
print('Output from LinearAttention:', output)

# MatrixAttention
sequence_length1 = 10
sequence_length2 = 15

# dot product attention only allows matrices of the same size
matrix1 = torch.rand((batch_size, sequence_length1, embedding_dim1))
matrix2 = torch.rand((batch_size, sequence_length2, embedding_dim1))

matrix_attention = DotProductMatrixAttention()
output = matrix_attention(matrix1, matrix2)
print('Output shape of DotProductMatrixAttention:', output.shape)
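When the two matrices have different embedding sizes, a parameterized matrix attention is needed instead of the dot product; a brief sketch assuming AllenNLP's LinearMatrixAttention, the matrix-level counterpart of the LinearAttention used above.

from allennlp.modules.matrix_attention import LinearMatrixAttention

# linear matrix attention allows matrices of different embedding sizes
matrix1 = torch.rand((batch_size, sequence_length1, embedding_dim1))
matrix2 = torch.rand((batch_size, sequence_length2, embedding_dim2))

matrix_attention = LinearMatrixAttention(embedding_dim1, embedding_dim2, combination='x,y')
output = matrix_attention(matrix1, matrix2)
print('Output shape of LinearMatrixAttention:', output.shape)   # (batch, seq_len1, seq_len2)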
    def __init__(self, input_embedding: InputEmbedding,
                 config: CopyNetConfig) -> None:
        super().__init__()

        self.data_len = config.data_len
        # Encoding modules.
        self._encoder = PytorchSeq2SeqWrapper(
            torch.nn.GRU(input_size=config.hidden,
                         hidden_size=config.encoder_GRU_hidden,
                         num_layers=config.encoder_layers,
                         bidirectional=True,
                         batch_first=True))
        # Embedding modules.
        self.input_embed = input_embedding
        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        # We arbitrarily set the decoder's input dimension to be the same as the output dimension.
        self.encoder_output_dim = config.encoder_GRU_hidden * 2
        self.decoder_input_dim = config.decoder_hidden_size
        self.decoder_output_dim = config.decoder_GRU_hidden

        # Reduce dimensionality of encoder output to reduce the number of decoder parameters.
        self.encoder_output_projection = Linear(self.encoder_output_dim,
                                                self.decoder_output_dim)

        # The decoder input will be a function of the embedding of the previous predicted token,
        # an attended encoder hidden state called the "attentive read", and another
        # weighted sum of the encoder hidden state called the "selective read".
        # While the weights for the attentive read are calculated by an `Attention` module,
        # the weights for the selective read are simply the predicted probabilities
        # corresponding to each token in the source sentence that matches the target
        # token from the previous timestep.
        self._attention = LinearAttention(
            self.decoder_output_dim,
            self.decoder_output_dim,
            activation=Activation.by_name('tanh')())
        # config.hidden * 2: bidirectional
        self._input_projection_layer = Linear(
            config.feature_dim + self.decoder_output_dim * 2,
            self.decoder_input_dim)

        # We then run the projected decoder input through an LSTM cell to produce
        # the next hidden state.
        self._decoder_cell = GRUCell(self.decoder_input_dim,
                                     self.decoder_output_dim)
        self._command_token_size = config.num_cmd_tokens

        # We create a "generation" score for each token in the target vocab
        # with a linear projection of the decoder hidden state.
        self._output_generation_layer_1 = Linear(self.decoder_output_dim,
                                                 self._command_token_size)
        self._output_generation_layer_2 = Linear(self.decoder_output_dim,
                                                 self._command_token_size)

        # We create a "copying" score for each source token by applying a non-linearity
        # (tanh) to a linear projection of the encoded hidden state for that token,
        # and then taking the dot product of the result with the decoder hidden state.
        self._output_copying_layer_1 = Linear(self.decoder_output_dim,
                                              self.decoder_output_dim)
        self._output_copying_layer_2 = Linear(self.decoder_output_dim,
                                              self.decoder_output_dim)

        self._softmax = nn.LogSoftmax(dim=-1)
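A minimal sketch (not from the original module) of the copy-score computation described in the comment above: each projected encoder state goes through tanh and is dotted with the decoder hidden state. The tensor names and sizes are assumptions.

import torch

batch, src_len, dim = 2, 7, 64                            # assumed sizes
encoder_outputs = torch.rand(batch, src_len, dim)         # projected encoder hidden states
decoder_hidden = torch.rand(batch, dim)
output_copying_layer = torch.nn.Linear(dim, dim)

projected = torch.tanh(output_copying_layer(encoder_outputs))               # (batch, src_len, dim)
copy_scores = projected.bmm(decoder_hidden.unsqueeze(-1)).squeeze(-1)       # (batch, src_len)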
    def __init__(self):
        config = conf['seq2seq_allen']
        prefix = config['processed_data_prefix']
        train_file = config['train_data']
        valid_file = config['valid_data']
        src_embedding_dim = config['src_embedding_dim']
        trg_embedding_dim = config['trg_embedding_dim']
        hidden_dim = config['hidden_dim']

        if torch.cuda.is_available():
            cuda_device = 0
        else:
            cuda_device = -1

        self.reader = Seq2SeqDatasetReader(
            source_tokenizer=WordTokenizer(),
            target_tokenizer=WordTokenizer(),
            source_token_indexers={'tokens': SingleIdTokenIndexer()},
            target_token_indexers={
                'tokens': SingleIdTokenIndexer(namespace='target_tokens')
            })

        self.train_dataset = self.reader.read(os.path.join(prefix, train_file))
        self.valid_dataset = self.reader.read(os.path.join(prefix, valid_file))

        vocab = Vocabulary.from_instances(self.train_dataset +
                                          self.valid_dataset,
                                          min_count={
                                              'tokens': 3,
                                              'target_tokens': 3
                                          })

        src_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=src_embedding_dim)

        encoder = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(src_embedding_dim, hidden_dim, batch_first=True))

        source_embedder = BasicTextFieldEmbedder({"tokens": src_embedding})

        attention = LinearAttention(hidden_dim,
                                    hidden_dim,
                                    activation=Activation.by_name('tanh')())

        self.model = SimpleSeq2Seq(
            vocab=vocab,
            source_embedder=source_embedder,
            encoder=encoder,
            max_decoding_steps=20,
            target_embedding_dim=trg_embedding_dim,
            target_namespace='target_tokens',
            attention=attention,  # pass attention
            use_bleu=True)

        optimizer = optim.Adam(self.model.parameters())
        iterator = BucketIterator(batch_size=32,
                                  sorting_keys=[("source_tokens", "num_tokens")
                                                ])
        # The iterator needs the vocabulary so it can index the data during training
        iterator.index_with(vocab)

        if cuda_device >= 0:
            self.model.cuda(cuda_device)

        self.trainer = Trainer(model=self.model,
                               optimizer=optimizer,
                               iterator=iterator,
                               patience=10,
                               validation_metric="+accuracy",
                               train_dataset=self.train_dataset,
                               validation_dataset=self.valid_dataset,
                               num_epochs=1,
                               cuda_device=cuda_device)
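Unlike the earlier variant, this one passes no serialization_dir to the Trainer, so the vocabulary and weights would have to be saved explicitly if they are to be reloaded for prediction. A hedged sketch, with assumed paths and with runner standing in for an instance of the class above.

# Hypothetical continuation outside the class; 'runner' and the paths are assumptions.
metrics = runner.trainer.train()                          # returns final train/validation metrics
runner.model.vocab.save_to_files('model/vocabulary')      # matches Vocabulary.from_files(...)
torch.save(runner.model.state_dict(), 'model/best.th')    # matches the 'best.th' loaded earlier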