Example #1
    def __init__(self,
                 pretrained_model_name: Optional[str] = None,
                 cache_dir: Optional[str] = None,
                 hparams=None):
        super().__init__(hparams=hparams)

        self.load_pretrained_config(pretrained_model_name, cache_dir)

        # Word embedding
        self.word_embedder = WordEmbedder(
            vocab_size=self._hparams.vocab_size,
            hparams=self._hparams.embed)

        # Segment embedding for each token type
        self.segment_embedder = WordEmbedder(
            vocab_size=self._hparams.type_vocab_size,
            hparams=self._hparams.segment_embed)

        # Position embedding
        self.position_embedder = PositionEmbedder(
            position_size=self._hparams.position_size,
            hparams=self._hparams.position_embed)

        # The BERT encoder (a TransformerEncoder)
        self.encoder = TransformerEncoder(hparams=self._hparams.encoder)

        self.pooler = nn.Sequential(
            nn.Linear(self._hparams.hidden_size, self._hparams.hidden_size),
            nn.Tanh())

        self.init_pretrained_weights()
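
The constructor above only builds the sub-modules; at call time a BERT-style model sums the word, segment, and position embeddings, feeds the result to the encoder, and pools the first token's state. Below is a minimal plain-PyTorch sketch of that pattern; the sizes are placeholders, not the module's actual hparams.

import torch
import torch.nn as nn

# Placeholder sizes standing in for the hparams used above.
vocab_size, type_vocab_size, position_size, hidden_size = 30522, 2, 512, 768

word_emb = nn.Embedding(vocab_size, hidden_size)
segment_emb = nn.Embedding(type_vocab_size, hidden_size)
position_emb = nn.Embedding(position_size, hidden_size)
pooler = nn.Sequential(nn.Linear(hidden_size, hidden_size), nn.Tanh())

input_ids = torch.randint(vocab_size, (2, 16))           # (batch, seq_len)
segment_ids = torch.zeros_like(input_ids)
positions = torch.arange(16).unsqueeze(0).expand(2, -1)

# BERT-style input representation: the three embeddings are summed.
embeds = word_emb(input_ids) + segment_emb(segment_ids) + position_emb(positions)
# The TransformerEncoder consumes `embeds`; the pooler takes the first token.
pooled = pooler(embeds[:, 0, :])
print(embeds.shape, pooled.shape)  # (2, 16, 768) and (2, 768)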
Example #2
    def test_infer_helpers(self):
        """Tests inference helpers.
        """
        def _test_fn(helper):
            _, next_inputs, _ = helper.next_inputs(
                time=1,
                outputs=tf.ones([self._batch_size,
                                 self._vocab_size]),  # Not used
                state=None,  # Not used
                sample_ids=tf.ones([self._batch_size], dtype=tf.int32))

            self.assertEqual(helper.sample_ids_shape, tf.TensorShape([]))
            self.assertEqual(next_inputs.get_shape(),
                             tf.TensorShape([self._batch_size, self._emb_dim]))

            # Test in an RNN decoder
            output_layer = tf.layers.Dense(self._vocab_size)
            decoder = BasicRNNDecoder(vocab_size=self._vocab_size,
                                      output_layer=output_layer)
            outputs, final_state, sequence_lengths = decoder(
                helper=helper, max_decoding_length=self._max_seq_length)

            cell_dim = decoder.hparams.rnn_cell.kwargs.num_units
            with self.test_session() as sess:
                sess.run(tf.global_variables_initializer())
                outputs_, final_state_, sequence_lengths_ = sess.run(
                    [outputs, final_state, sequence_lengths])
                max_length = max(sequence_lengths_)
                self.assertEqual(
                    outputs_.logits.shape,
                    (self._batch_size, max_length, self._vocab_size))
                self.assertEqual(outputs_.sample_id.shape,
                                 (self._batch_size, max_length))
                self.assertEqual(final_state_[0].shape,
                                 (self._batch_size, cell_dim))

        # case-(1)
        helper = GreedyEmbeddingHelper(self._embedding, self._start_tokens,
                                       self._end_token)
        _test_fn(helper)

        # case-(2)
        embedder = WordEmbedder(self._embedding)
        helper = GreedyEmbeddingHelper(embedder, self._start_tokens,
                                       self._end_token)
        _test_fn(helper)

        # case-(3)
        word_embedder = WordEmbedder(self._embedding)
        pos_embedder = PositionEmbedder(position_size=self._max_seq_length)

        def _emb_fn(ids, times):
            return word_embedder(ids) + pos_embedder(times)

        helper = GreedyEmbeddingHelper(_emb_fn, self._start_tokens,
                                       self._end_token)
        _test_fn(helper)
Example #3
    def test_embedder_multi_calls(self):
        """Tests embedders called by multiple times.
        """
        hparams = {
            "dim": 1024,
            "dropout_rate": 0.3,
            "dropout_strategy": "item"
        }
        embedder = WordEmbedder(vocab_size=100, hparams=hparams)
        inputs = torch.ones([64, 16], dtype=torch.int32)
        outputs = embedder(inputs)

        emb_dim = embedder.dim
        if not isinstance(emb_dim, (list, tuple)):
            emb_dim = [emb_dim]
        self.assertEqual(list(outputs.shape), [64, 16] + emb_dim)

        # Call with inputs in a different shape
        inputs = torch.ones([64, 10, 20], dtype=torch.int32)
        outputs = embedder(inputs)

        emb_dim = embedder.dim
        if not isinstance(emb_dim, (list, tuple)):
            emb_dim = [emb_dim]
        self.assertEqual(list(outputs.shape), [64, 10, 20] + emb_dim)
Example #4
    def _test_word_embedder(self, hparams):
        """Tests :class:`texar.modules.WordEmbedder`.
        """
        embedder = WordEmbedder(vocab_size=100, hparams=hparams)

        inputs = torch.randint(embedder.vocab_size, (64, 16), dtype=torch.long)
        outputs = embedder(inputs)

        inputs_soft = torch.randn((64, 16, embedder.vocab_size),
                                  dtype=torch.float32)
        outputs_soft = embedder(soft_ids=inputs_soft)

        if isinstance(embedder.dim, (list, tuple)):
            emb_dim = tuple(embedder.dim)
        else:
            emb_dim = (embedder.dim, )

        if isinstance(hparams["dim"], (list, tuple)):
            hparams_dim = tuple(hparams["dim"])
        else:
            hparams_dim = (hparams["dim"], )

        self.assertEqual(outputs.size(), (64, 16) + emb_dim)
        self.assertEqual(outputs_soft.size(), (64, 16) + emb_dim)
        self.assertEqual(emb_dim, hparams_dim)
        self.assertEqual(embedder.vocab_size, 100)
Example #5
    def _test_word_embedder(self, hparams):
        """Tests :class:`texar.modules.WordEmbedder`.
        """
        embedder = WordEmbedder(
            vocab_size=100, hparams=hparams)

        inputs = torch.ones([64, 16], dtype=torch.int32)
        outputs = embedder(inputs)

        inputs_soft = torch.ones(
            [64, 16, embedder.vocab_size], dtype=torch.float32)
        outputs_soft = embedder(soft_ids=inputs_soft)

        emb_dim = embedder.dim
        if isinstance(emb_dim, int):
            emb_dim = [emb_dim]
        if not isinstance(emb_dim, list):
            emb_dim = list(emb_dim)

        hparams_dim = hparams["dim"]
        if not isinstance(hparams["dim"], (list, tuple)):
            hparams_dim = [hparams["dim"]]

        self.assertEqual(list(outputs.shape), [64, 16] + emb_dim)
        self.assertEqual(list(outputs_soft.shape), [64, 16] + emb_dim)
        self.assertEqual(emb_dim, hparams_dim)
        self.assertEqual(embedder.vocab_size, 100)
        self.assertEqual(tuple(outputs.shape), (64, 16) + tuple(emb_dim))
        self.assertEqual(tuple(outputs_soft.shape), (64, 16) + tuple(emb_dim))
Example #6
    def __init__(self, config_model, config_data):
        ModuleBase.__init__(self)
        self.config_model = config_model
        self.config_data = config_data

        with open(config_data.vocab_file, "rb") as f:
            id2w = pickle.load(f)
        self.id2w = id2w
        self.vocab_size = len(id2w)
        self.pad_token_id, self.bos_token_id = (0, 1)
        self.eos_token_id, self.unk_token_id = (2, 3)

        self.word_embedder = WordEmbedder(vocab_size=self.vocab_size,
                                          hparams=config_model.emb)
        self.pos_embedder = SinusoidsPositionEmbedder(
            position_size=config_data.max_decoding_length,
            hparams=config_model.position_embedder_hparams,
        )

        self.encoder = TransformerEncoder(hparams=config_model.encoder)
        self.decoder = TransformerDecoder(
            vocab_size=self.vocab_size,
            output_layer=self.word_embedder.embedding,
            hparams=config_model.decoder,
        )

        self.smoothed_loss_func = LabelSmoothingLoss(
            label_confidence=self.config_model.loss_label_confidence,
            tgt_vocab_size=self.vocab_size,
            ignore_index=0,
        )
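
Note that `output_layer=self.word_embedder.embedding` ties the decoder's output projection to the input embedding table, so each output logit is the dot product of a decoder state with one embedding row. A rough illustration with made-up sizes:

import torch
import torch.nn as nn

vocab_size, dim = 1000, 64                    # hypothetical sizes
embedding = nn.Embedding(vocab_size, dim)     # shared input/output table

hidden = torch.randn(2, 7, dim)               # (batch, time, dim) decoder states
logits = hidden @ embedding.weight.t()        # (batch, time, vocab_size)
print(logits.shape)                           # torch.Size([2, 7, 1000])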
Example #7
    def __init__(self, hparams=None):
        super().__init__(hparams)
        self.word_embedder = WordEmbedder(vocab_size=self._hparams.vocab_size,
                                          hparams=self._hparams.embed)

        # Segment embedding for each token type
        self.segment_embedder = WordEmbedder(
            vocab_size=self._hparams.type_vocab_size,
            hparams=self._hparams.segment_embed)

        # Position embedding
        self.position_embedder = PositionEmbedder(
            position_size=self._hparams.position_size,
            hparams=self._hparams.position_embed)

        # The BERT encoder (a TransformerEncoder)
        self.encoder = TransformerEncoder(hparams=self._hparams.encoder)

        self.pooler = nn.Sequential(
            nn.Linear(self._hparams.hidden_size, self._hparams.hidden_size),
            nn.Tanh(), nn.Dropout(self._hparams.dropout))

        self._num_classes = self._hparams.num_classes

        if self._num_classes > 0:
            logit_kwargs = self._hparams.logit_layer_kwargs
            if logit_kwargs is None:
                logit_kwargs = {}
            elif not isinstance(logit_kwargs, HParams):
                raise ValueError("hparams['logit_layer_kwargs'] "
                                 "must be a dict.")
            else:
                logit_kwargs = logit_kwargs.todict()

            self.logits_layer = nn.Linear(self._hparams.hidden_size,
                                          self._num_classes, **logit_kwargs)

        else:
            self.logits_layer = None

        self.step_iteration = 0
Example #8
    def test_word_embedder_soft_ids(self):
        """Tests the correctness of using soft ids.
        """
        init_value = np.expand_dims(np.arange(5), 1)
        embedder = WordEmbedder(init_value=init_value)

        ids = np.array([3])
        soft_ids = np.array([[0, 0, 0, 1, 0]])

        outputs = embedder(ids=torch.from_numpy(ids))
        soft_outputs = embedder(soft_ids=torch.from_numpy(soft_ids))
        self.assertEqual(outputs, soft_outputs)
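
Soft ids replace an integer lookup with a probability-weighted mixture of embedding rows, so a one-hot soft id reproduces the ordinary lookup exactly, which is what this test asserts. In plain PyTorch terms:

import torch

emb = torch.arange(15, dtype=torch.float).reshape(5, 3)   # toy 5 x 3 table

hard = emb[torch.tensor([3])]                     # ordinary id lookup
soft_ids = torch.tensor([[0., 0., 0., 1., 0.]])   # one-hot "soft" ids
soft = soft_ids @ emb                             # weighted mixture of rows

print(torch.equal(hard, soft))                    # True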
Example #9
    def test_word_embedder_soft_ids(self):
        """Tests the correctness of using soft ids.
        """
        init_value = np.expand_dims(np.arange(5), 1)
        embedder = WordEmbedder(init_value=init_value)

        ids = torch.tensor([3])
        soft_ids = torch.tensor([0, 0, 0, 1, 0], dtype=torch.float)

        outputs = embedder(ids=ids)
        soft_outputs = embedder(soft_ids=soft_ids)
        self.assertEqual(outputs, soft_outputs)
Example #10
    def setUp(self):
        self._vocab_size = 4
        self._max_time = 8
        self._batch_size = 16
        self._emb_dim = 20
        self._inputs = torch.randint(self._vocab_size,
                                     size=(self._batch_size, self._max_time))
        embedding = torch.rand(self._vocab_size,
                               self._emb_dim,
                               dtype=torch.float)
        self._embedder = WordEmbedder(init_value=embedding)
        self._hparams = HParams(None, BasicRNNDecoder.default_hparams())
Example #11
    def test_encode_with_embedder(self):
        """Tests encoding companioned with :mod:`texar.modules.embedders`.
        """
        embedder = WordEmbedder(vocab_size=20, hparams={"dim": 100})
        inputs = tf.ones([64, 16], dtype=tf.int32)

        encoder = UnidirectionalRNNEncoder()
        outputs, state = encoder(embedder(inputs))

        cell_dim = encoder.hparams.rnn_cell.kwargs.num_units
        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            outputs_, state_ = sess.run([outputs, state])
            self.assertEqual(outputs_.shape, (64, 16, cell_dim))
            self.assertEqual(state_[0].shape, (64, cell_dim))
Example #12
    def setUp(self):
        self._vocab_size = 10
        self._max_time = 16
        self._batch_size = 8
        self._emb_dim = 20
        self._attention_dim = 256
        self._inputs = torch.randint(self._vocab_size,
                                     size=(self._batch_size, self._max_time))
        embedding = torch.rand(self._vocab_size,
                               self._emb_dim,
                               dtype=torch.float)
        self._embedder = WordEmbedder(init_value=embedding)
        self._encoder_output = torch.rand(self._batch_size, self._max_time, 64)

        self._test_hparams = {}  # (cell_type, is_multi) -> hparams
        for cell_type in ["RNNCell", "LSTMCell", "GRUCell"]:
            hparams = {
                "rnn_cell": {
                    'type': cell_type,
                    'kwargs': {
                        'num_units': 256,
                    },
                },
                "attention": {
                    "kwargs": {
                        "num_units": self._attention_dim
                    },
                }
            }
            self._test_hparams[(cell_type, False)] = HParams(
                hparams, AttentionRNNDecoder.default_hparams())

        hparams = {
            "rnn_cell": {
                'type': 'LSTMCell',
                'kwargs': {
                    'num_units': 256,
                },
                'num_layers': 3,
            },
            "attention": {
                "kwargs": {
                    "num_units": self._attention_dim
                },
            }
        }
        self._test_hparams[("LSTMCell", True)] = HParams(
            hparams, AttentionRNNDecoder.default_hparams())
Example #13
    def test_word_embedder_soft_ids(self):
        """Tests the correctness of using soft ids.
        """
        init_value = np.expand_dims(np.arange(5), 1)
        embedder = WordEmbedder(init_value=init_value)

        ids = np.array([3])
        soft_ids = np.array([[0, 0, 0, 1, 0]])

        outputs = embedder(ids=ids)
        soft_outputs = embedder(soft_ids=soft_ids)

        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            outputs_, soft_outputs_ = sess.run([outputs, soft_outputs])
            self.assertEqual(outputs_, soft_outputs_)
Example #14
    def __init__(self, gpt2_config, top_k, temperature):
        super().__init__()
        self.word_embedder = WordEmbedder(vocab_size=gpt2_config.vocab_size,
                                          hparams=gpt2_config.embed)

        self.pos_embedder = PositionEmbedder(
            position_size=gpt2_config.position_size,
            hparams=gpt2_config.pos_embed)

        self.decoder = TransformerDecoder(
            vocab_size=gpt2_config.vocab_size,
            output_layer=self.word_embedder.embedding,
            hparams=gpt2_config.decoder)

        self.top_k = top_k
        self.temperature = temperature

        self._embedding_fn = lambda x, y: (
            self.word_embedder(x) + self.pos_embedder(y))
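
The stored `top_k` and `temperature` are the usual sampling knobs for the decoder's output distribution. Below is a self-contained sketch of temperature-scaled top-k sampling; the helper name and sizes are illustrative, not part of the model above.

import torch

def sample_top_k(logits, top_k=40, temperature=0.7):
    # Scale logits, keep the top_k largest, renormalize, and sample one id.
    values, indices = torch.topk(logits / temperature, top_k, dim=-1)
    probs = torch.softmax(values, dim=-1)
    choice = torch.multinomial(probs, num_samples=1)
    return indices.gather(-1, choice)

logits = torch.randn(2, 50257)            # hypothetical (batch, vocab) logits
print(sample_top_k(logits).shape)         # torch.Size([2, 1])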
Example #15
    def test_embedder_multi_calls(self):
        """Tests embedders called by multiple times.
        """
        hparams = {"dim": 26, "dropout_rate": 0.3, "dropout_strategy": "item"}
        embedder = WordEmbedder(vocab_size=100, hparams=hparams)
        inputs = torch.randint(embedder.vocab_size, (64, 16), dtype=torch.long)
        outputs = embedder(inputs)

        if isinstance(embedder.dim, (list, tuple)):
            emb_dim = tuple(embedder.dim)
        else:
            emb_dim = (embedder.dim, )
        self.assertEqual(outputs.size(), (64, 16) + emb_dim)

        # Call with inputs in a different shape
        inputs = torch.randint(embedder.vocab_size, (64, 10, 20),
                               dtype=torch.long)
        outputs = embedder(inputs)

        self.assertEqual(outputs.size(), (64, 10, 20) + emb_dim)
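
These shape checks rely on embedding lookups preserving all leading dimensions of the id tensor and appending the embedding dimension, so the same embedder works on inputs of any rank. A quick plain-PyTorch check of the same behavior:

import torch
import torch.nn as nn

emb = nn.Embedding(100, 26)
ids = torch.randint(100, (64, 10, 20))
print(emb(ids).shape)   # torch.Size([64, 10, 20, 26])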
Example #16
    def _test_word_embedder(self, hparams):
        """Tests :class:`texar.modules.WordEmbedder`.
        """
        embedder = WordEmbedder(vocab_size=100, hparams=hparams)

        inputs = tf.ones([64, 16], dtype=tf.int32)
        outputs = embedder(inputs)

        inputs_soft = tf.ones([64, 16, embedder.vocab_size], dtype=tf.float32)
        outputs_soft = embedder(soft_ids=inputs_soft)

        emb_dim = embedder.dim
        if not isinstance(emb_dim, (list, tuple)):
            emb_dim = [emb_dim]

        hparams_dim = hparams["dim"]
        if not isinstance(hparams["dim"], (list, tuple)):
            hparams_dim = [hparams["dim"]]

        self.assertEqual(outputs.shape, [64, 16] + emb_dim)
        self.assertEqual(outputs_soft.shape, [64, 16] + emb_dim)
        self.assertEqual(emb_dim, hparams_dim)
        self.assertEqual(embedder.vocab_size, 100)
        self.assertEqual(len(embedder.trainable_variables), 1)

        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            outputs_, outputs_soft_ = sess.run(
                [outputs, outputs_soft],
                feed_dict={global_mode(): tf.estimator.ModeKeys.TRAIN})
            self.assertEqual(outputs_.shape, (64, 16) + tuple(emb_dim))
            self.assertEqual(outputs_soft_.shape, (64, 16) + tuple(emb_dim))

        # Tests unknown input shapes
        inputs = tf.placeholder(dtype=tf.int64, shape=[None, None])
        outputs = embedder(inputs)
        self.assertEqual(len(outputs.get_shape()), 2 + len(hparams_dim))

        inputs_soft = tf.placeholder(dtype=tf.float32, shape=[None, None, None])
        outputs_soft = embedder(soft_ids=inputs_soft)
        self.assertEqual(len(outputs_soft.get_shape()), 2 + len(hparams_dim))
Example #17
    def __init__(self,
                 pretrained_model_name: Optional[str] = None,
                 cache_dir: Optional[str] = None,
                 hparams=None):
        self.load_pretrained_config(pretrained_model_name, cache_dir, hparams)

        # Word embedding
        word_embedder = WordEmbedder(vocab_size=self._hparams.vocab_size,
                                     hparams=self._hparams.embed)

        # Position embedding
        position_embedder = PositionEmbedder(
            position_size=self._hparams.position_size,
            hparams=self._hparams.position_embed)

        # The GPT2 encoder (a TransformerEncoder)
        super().__init__(hparams=None)

        # Register modules after `__init__` is called.
        self.word_embedder = word_embedder
        self.position_embedder = position_embedder

        self.init_pretrained_weights(load_output_layer=False)
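
Building the embedders as locals and attaching them only after `super().__init__()` avoids `nn.Module`'s restriction that sub-modules cannot be assigned before its own `__init__` has run (assuming `ModuleBase` derives from `nn.Module`, as in texar-pytorch). A small demonstration of the error this ordering sidesteps:

import torch.nn as nn

class Bad(nn.Module):
    def __init__(self):
        self.layer = nn.Linear(4, 4)   # assigned before Module.__init__ runs
        super().__init__()

try:
    Bad()
except AttributeError as err:
    print(err)   # "cannot assign module before Module.__init__() call"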