Example #1
    def __init__(self, embedding=None, vocab_size=None, hparams=None):
        ModuleBase.__init__(self, hparams)
        self._vocab_size = vocab_size
        self._embedding = None
        self.sampling_method = self._hparams.sampling_method
        with tf.variable_scope(self.variable_scope):
            if self._hparams.initializer:
                tf.get_variable_scope().set_initializer(
                    layers.get_initializer(self._hparams.initializer))
            if self._hparams.position_embedder.name == 'sinusoids':
                self.position_embedder = \
                    position_embedders.SinusoidsSegmentalPositionEmbedder(
                        self._hparams.position_embedder.hparams)

        if self._hparams.use_embedding:
            if embedding is None and vocab_size is None:
                raise ValueError("""If 'embedding' is not provided,
                    'vocab_size' must be specified.""")
            if isinstance(embedding, (tf.Tensor, tf.Variable)):
                self._embedding = embedding
            else:
                self._embedding = embedder_utils.get_embedding(
                    self._hparams.embedding,
                    embedding,
                    vocab_size,
                    variable_scope=self.variable_scope)
                self._embed_dim = shape_list(self._embedding)[-1]
                if self._hparams.zero_pad:
                    self._embedding = tf.concat(
                        (tf.zeros(shape=[1, self._embed_dim]),
                         self._embedding[1:, :]), 0)
            if self._vocab_size is None:
                self._vocab_size = self._embedding.get_shape().as_list()[0]
        self.output_layer = \
            self.build_output_layer(shape_list(self._embedding)[-1])
Example #2
    def __init__(self, hparams=None):
        EncoderBase.__init__(self, hparams)

        with tf.variable_scope(self.variable_scope):
            if self._hparams.initializer:
                tf.get_variable_scope().set_initializer(
                    layers.get_initializer(self._hparams.initializer))

            self.multihead_attention_list = []
            self.poswise_networks = []
            for i in range(self._hparams.num_blocks):
                with tf.variable_scope("layer_{}".format(i)):

                    with tf.variable_scope('attention'):
                        mh_attn = MultiheadAttentionEncoder(
                            self._hparams.multihead_attention)
                        self.multihead_attention_list.append(mh_attn)

                        if self._hparams.dim != mh_attn.hparams.output_dim:
                            raise ValueError(
                                'The "dim" in the hparams of '
                                '"multihead_attention" should be equal to the '
                                '"dim" of TransformerEncoder')

                    pw_net = FeedForwardNetwork(
                        hparams=self._hparams['poswise_feedforward'])
                    final_dim = pw_net.hparams.layers[-1]['kwargs']['units']
                    if self._hparams.dim != final_dim:
                        raise ValueError(
                            'The output dimension of '
                            '"poswise_feedforward" should be equal '
                            'to the "dim" of TransformerEncoder.')
                    self.poswise_networks.append(pw_net)
Example #3
    def __init__(self, input_size: int, hparams=None):
        super().__init__(hparams=hparams)
        use_bias = self._hparams.use_bias

        self.Q_dense = nn.Linear(input_size,
                                 self._hparams.num_units,
                                 bias=use_bias)
        self.K_dense = nn.Linear(input_size,
                                 self._hparams.num_units,
                                 bias=use_bias)
        self.V_dense = nn.Linear(input_size,
                                 self._hparams.num_units,
                                 bias=use_bias)
        self.O_dense = nn.Linear(self._hparams.num_units,
                                 self._hparams.output_dim,
                                 bias=use_bias)

        if self._hparams.initializer:
            # TODO(haoransh): we may define kernel_initializer and bias
            #  initializer separately
            initialize = layers.get_initializer(self._hparams.initializer)
            assert initialize is not None
            for name, param in self.named_parameters():
                if name.split('.')[-1] == 'weight':
                    print('name:{}'.format(name))
                    initialize(param)
Example #4
    def reset_parameters(self):
        initialize = layers.get_initializer(self._hparams.initializer)
        if initialize is not None:
            # Do not re-initialize LayerNorm modules.
            for name, param in self.named_parameters():
                if name.split('.')[-1] == 'weight' and 'layer_norm' not in name:
                    initialize(param)
Example #5
    def __init__(self, vocab_size=None, output_layer=None, hparams=None):
        ModuleBase.__init__(self, hparams)

        with tf.variable_scope(self.variable_scope):
            if self._hparams.initializer:
                tf.get_variable_scope().set_initializer(
                    layers.get_initializer(self._hparams.initializer))

            # Make the output layer
            self._output_layer, self._vocab_size = _make_output_layer(
                output_layer, vocab_size, self._hparams.output_layer_bias,
                self.variable_scope)

            # Make attention and poswise networks
            self.multihead_attentions = {'self_att': [], 'encdec_att': []}
            self.poswise_networks = []
            for i in range(self._hparams.num_blocks):
                layer_name = 'layer_{}'.format(i)
                with tf.variable_scope(layer_name):
                    with tf.variable_scope("self_attention"):
                        multihead_attention = MultiheadAttentionEncoder(
                            self._hparams.multihead_attention)
                        self.multihead_attentions['self_att'].append(
                            multihead_attention)

                    if self._hparams.dim != \
                            multihead_attention.hparams.output_dim:
                        raise ValueError('The output dimension of '
                                         'MultiheadEncoder should be equal '
                                         'to the dim of TransformerDecoder')

                    with tf.variable_scope('encdec_attention'):
                        multihead_attention = MultiheadAttentionEncoder(
                            self._hparams.multihead_attention)
                        self.multihead_attentions['encdec_att'].append(
                            multihead_attention)

                    if self._hparams.dim != \
                            multihead_attention.hparams.output_dim:
                        raise ValueError('The output dimension of '
                                         'MultiheadEncoder should be equal '
                                         'to the dim of TransformerDecoder')

                    pw_net = FeedForwardNetwork(
                        hparams=self._hparams['poswise_feedforward'])
                    final_dim = pw_net.hparams.layers[-1]['kwargs']['units']
                    if self._hparams.dim != final_dim:
                        raise ValueError(
                            'The output dimension of '
                            '"poswise_feedforward" should be equal '
                            'to the "dim" of TransformerDecoder.')
                    self.poswise_networks.append(pw_net)

            # Built in _build()
            self.context = None
            self.context_sequence_length = None
            self.embedding = None
            self._helper = None
            self._cache = None
            self.max_decoding_length = None
Example #6
    def __init__(self, embedding, hparams=None):
        ModuleBase.__init__(self, hparams)

        with tf.variable_scope(self.variable_scope):
            if self._hparams.initializer:
                tf.get_variable_scope().set_initializer(
                    layers.get_initializer(self._hparams.initializer))

            if self._hparams.position_embedder_type == 'sinusoids':
                self.position_embedder = SinusoidsPositionEmbedder(
                    self._hparams.position_embedder_hparams)
            else:
                self.position_embedder = PositionEmbedder(
                    position_size=self._hparams.position_size,
                    hparams=self._hparams.position_embedder_hparams)

            self._embedding = embedding
            self._vocab_size = self._embedding.get_shape().as_list()[0]

            self.output_layer = \
                self._build_output_layer(shape_list(self._embedding)[-1])

            self.multihead_attentions = {'self_att': [], 'encdec_att': []}
            self.poswise_networks = []
            for i in range(self._hparams.num_blocks):
                layer_name = 'layer_{}'.format(i)
                with tf.variable_scope(layer_name):
                    with tf.variable_scope("self_attention"):
                        multihead_attention = MultiheadAttentionEncoder(
                            self._hparams.multihead_attention)
                        self.multihead_attentions['self_att'].append(
                            multihead_attention)
                    # pylint: disable=protected-access
                    if self._hparams.dim != \
                        multihead_attention._hparams.output_dim:
                        raise ValueError('The output dimension of '
                                         'MultiheadEncoder should be equal '
                                         'to the dim of TransformerDecoder')

                    with tf.variable_scope('encdec_attention'):
                        multihead_attention = MultiheadAttentionEncoder(
                            self._hparams.multihead_attention)
                        self.multihead_attentions['encdec_att'].append(
                            multihead_attention)
                    if self._hparams.dim != \
                        multihead_attention._hparams.output_dim:
                        raise ValueError('The output dimension of '
                                         'MultiheadEncoder should be equal '
                                         'to the dim of TransformerDecoder')

                    poswise_network = FeedForwardNetwork(
                        hparams=self._hparams['poswise_feedforward'])
                    if self._hparams.dim != \
                        poswise_network._hparams.layers[-1]['kwargs']['units']:
                        raise ValueError('The output dimension of '
                                         'FeedForwardNetwork should be equal '
                                         'to the dim of TransformerDecoder')
                    self.poswise_networks.append(poswise_network)
Example #7
    def __init__(self,
                 pretrained_model_name: Optional[str] = None,
                 cache_dir: Optional[str] = None,
                 hparams=None):

        super().__init__(hparams=hparams)

        # Create the underlying encoder
        encoder_hparams = dict_fetch(hparams, XLNetEncoder.default_hparams())

        self._encoder = XLNetEncoder(
            pretrained_model_name=pretrained_model_name,
            cache_dir=cache_dir,
            hparams=encoder_hparams)

        # TODO: The logic here is very similar to that in XLNetClassifier.
        #  We need to reduce the code redundancy.
        if self._hparams.use_projection:
            if self._hparams.regr_strategy == 'all_time':
                self.projection = nn.Linear(
                    self._encoder.output_size * self._hparams.max_seq_length,
                    self._encoder.output_size * self._hparams.max_seq_length)
            else:
                self.projection = nn.Linear(self._encoder.output_size,
                                            self._encoder.output_size)
        self.dropout = nn.Dropout(self._hparams.dropout)

        logit_kwargs = self._hparams.logit_layer_kwargs
        if logit_kwargs is None:
            logit_kwargs = {}
        elif not isinstance(logit_kwargs, HParams):
            raise ValueError("hparams['logit_layer_kwargs'] "
                             "must be a dict.")
        else:
            logit_kwargs = logit_kwargs.todict()

        if self._hparams.regr_strategy == 'all_time':
            self.hidden_to_logits = nn.Linear(
                self._encoder.output_size * self._hparams.max_seq_length, 1,
                **logit_kwargs)
        else:
            self.hidden_to_logits = nn.Linear(self._encoder.output_size, 1,
                                              **logit_kwargs)

        if self._hparams.initializer:
            initialize = get_initializer(self._hparams.initializer)
            assert initialize is not None
            if self._hparams.use_projection:
                initialize(self.projection.weight)
                initialize(self.projection.bias)
            initialize(self.hidden_to_logits.weight)
            if self.hidden_to_logits.bias is not None:
                initialize(self.hidden_to_logits.bias)
        else:
            if self._hparams.use_projection:
                self.projection.apply(init_weights)
            self.hidden_to_logits.apply(init_weights)
Example #8
    def __init__(self, hparams=None):
        EncoderBase.__init__(self, hparams)

        with tf.variable_scope(self.variable_scope):
            if self._hparams.initializer:
                tf.get_variable_scope().set_initializer(
                    layers.get_initializer(self._hparams.initializer))

            self.position_embedder = \
                SinusoidsPositionEmbedder(
                    self._hparams.position_embedder_hparams)
Example #9
    def __init__(self,
                 pretrained_model_name: Optional[str] = None,
                 cache_dir: Optional[str] = None,
                 hparams=None):

        super().__init__(hparams=hparams)

        # Create the underlying encoder
        encoder_hparams = dict_fetch(hparams, GPT2Encoder.default_hparams())

        self._encoder = GPT2Encoder(
            pretrained_model_name=pretrained_model_name,
            cache_dir=cache_dir,
            hparams=encoder_hparams)

        # Create a dropout layer
        self._dropout_layer = nn.Dropout(self._hparams.dropout)

        # Create an additional classification layer if needed
        self.num_classes = self._hparams.num_classes
        if self.num_classes <= 0:
            self._logits_layer = None
        else:
            logit_kwargs = self._hparams.logit_layer_kwargs
            if logit_kwargs is None:
                logit_kwargs = {}
            elif not isinstance(logit_kwargs, HParams):
                raise ValueError("hparams['logit_layer_kwargs'] "
                                 "must be a dict.")
            else:
                logit_kwargs = logit_kwargs.todict()

            if self._hparams.clas_strategy == 'all_time':
                self._logits_layer = nn.Linear(
                    self._encoder.output_size * self._hparams.max_seq_length,
                    self.num_classes, **logit_kwargs)
            else:
                self._logits_layer = nn.Linear(self._encoder.output_size,
                                               self.num_classes,
                                               **logit_kwargs)

        if self._hparams.initializer:
            initialize = get_initializer(self._hparams.initializer)
            assert initialize is not None
            if self._logits_layer is not None:
                initialize(self._logits_layer.weight)
                if self._logits_layer.bias is not None:
                    initialize(self._logits_layer.bias)

        self.is_binary = (self.num_classes == 1) or \
                         (self.num_classes <= 0 and
                          self._hparams.dim == 1)
Example #10
def get_embedding(hparams=None,
                  init_value=None,
                  num_embeds=None,
                  variable_scope='Embedding'):
    """Creates embedding variable if not exists.

    Args:
        hparams (dict or HParams, optional): Embedding hyperparameters. Missing
            hyperparameters are set to default values. See
            :func:`~texar.modules.default_embedding_hparams`
            for all hyperparameters and default values.

            If :attr:`init_value` is given, :attr:`hparams["initializer"]`
            and :attr:`hparams["dim"]` are ignored.
        init_value (Tensor or numpy array, optional): Initial values of the
            embedding variable. If not given, embedding is initialized as
            specified in :attr:`hparams["initializer"]`.
        num_embeds (int, optional): The number of embedding items
            (e.g., vocabulary size). Required if :attr:`init_value` is
            not provided.
        variable_scope (str or VariableScope, optional): Variable scope of
            the embedding variable.

    Returns:
        Variable or Tensor: A 2D `Variable` or `Tensor` of the same shape as
        :attr:`init_value` or of the shape
        :attr:`[num_embeds, hparams["dim"]]`.
    """
    with tf.variable_scope(variable_scope):
        if hparams is None or isinstance(hparams, dict):
            hparams = HParams(hparams, default_embedding_hparams())
        regularizer = layers.get_regularizer(hparams["regularizer"])
        if init_value is None:
            initializer = layers.get_initializer(hparams["initializer"])
            dim = hparams["dim"]
            if not isinstance(hparams["dim"], (list, tuple)):
                dim = [dim]
            embedding = tf.get_variable(name='w',
                                        shape=[num_embeds] + dim,
                                        initializer=initializer,
                                        regularizer=regularizer,
                                        trainable=hparams["trainable"])
        else:
            init_value = tf.cast(init_value, tf.float32)
            embedding = tf.get_variable(name='w',
                                        initializer=init_value,
                                        regularizer=regularizer,
                                        trainable=hparams["trainable"])

        return embedding
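A minimal usage sketch for the get_embedding helper above. The vocabulary size, the "dim" value, and the scope name are illustrative assumptions, not values taken from the source:

    # Hypothetical call: create a [10000, 512] embedding variable whose
    # initializer and regularizer come from the (default-filled) hparams.
    emb_hparams = {"dim": 512}
    token_embedding = get_embedding(hparams=emb_hparams,
                                    num_embeds=10000,
                                    variable_scope='token_embedding')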
Example #11
    def __init__(self,
                 vocab_size=None,
                 output_layer=None,
                 tau=None,
                 hparams=None):
        EncoderBase.__init__(self, hparams)

        with tf.variable_scope(self.variable_scope):
            if self._hparams.initializer:
                tf.get_variable_scope().set_initializer(
                    layers.get_initializer(self._hparams.initializer))

            # Make the output layer
            self._output_layer, self._vocab_size = _make_output_layer(
                output_layer, vocab_size, self._hparams.output_layer_bias,
                self.variable_scope)

            # Make attention and poswise networks
            self.graph_multihead_attention_list = []
            self.poswise_networks = []
            for i in range(self._hparams.num_blocks):
                with tf.variable_scope("layer_{}".format(i)):

                    with tf.variable_scope('attention'):
                        mh_attn = GraphMultiheadAttentionEncoder(
                            self._hparams.graph_multihead_attention)
                        self.graph_multihead_attention_list.append(mh_attn)

                        if self._hparams.dim != mh_attn.hparams.output_dim:
                            raise ValueError(
                                'The "dim" in the hparams of '
                                '"multihead_attention" should be equal to the '
                                '"dim" of CrossGraphTransformerFixedLengthDecoder'
                            )

                    pw_net = FeedForwardNetwork(
                        hparams=self._hparams['poswise_feedforward'])
                    final_dim = pw_net.hparams.layers[-1]['kwargs']['units']
                    if self._hparams.dim != final_dim:
                        raise ValueError(
                            'The output dimension of '
                            '"poswise_feedforward" should be equal '
                            'to the "dim" of CrossGraphTransformerFixedLengthDecoder.'
                        )
                    self.poswise_networks.append(pw_net)

            self._helper = None
            self._tau = tau
Example #12
    def __init__(self, hparams=None):
        EncoderBase.__init__(self, hparams)

        with tf.variable_scope(self.variable_scope):
            if self._hparams.initializer:
                tf.get_variable_scope().set_initializer(
                    layers.get_initializer(self._hparams.initializer))
            if self._hparams.position_embedder_type == 'sinusoids':
                self.position_embedder = SinusoidsPositionEmbedder(
                    self._hparams.position_embedder_hparams)
            else:
                self.position_embedder = PositionEmbedder(
                    position_size=self._hparams.position_size,
                    hparams=self._hparams.position_embedder_hparams)
            # pylint: disable=protected-access
            if self._hparams.dim != \
                self.position_embedder._hparams.dim:
                raise ValueError('"dim" in '
                                 'TransformerEncoder hparams must be equal '
                                 'to "dim" in its '
                                 'position_embedder_hparams.')

            self.multihead_attention_list = []
            self.poswise_networks = []
            for i in range(self._hparams.num_blocks):
                with tf.variable_scope("layer_{}".format(i)):
                    with tf.variable_scope('attention'):
                        multihead_attention = MultiheadAttentionEncoder(
                            self._hparams.multihead_attention)
                        self.multihead_attention_list.append(
                            multihead_attention)
                    # pylint: disable=protected-access
                    if self._hparams.dim != \
                        multihead_attention._hparams.output_dim:
                        raise ValueError('The "dim" in the hparams of '
                                         'multihead_attention should be equal '
                                         'to the "dim" of TransformerEncoder')
                    poswise_network = FeedForwardNetwork(
                        hparams=self._hparams['poswise_feedforward'])
                    # pylint: disable=protected-access
                    if self._hparams.dim != \
                        poswise_network._hparams.layers[-1]['kwargs']['units']:
                        raise ValueError('The "units" in the "kwargs" of '
                                         'FeedForwardNetwork should be equal '
                                         'to the "dim" of TransformerEncoder')
                    self.poswise_networks.append(poswise_network)
Example #13
    def __init__(self, embedding, hparams=None):
        ModuleBase.__init__(self, hparams)

        with tf.variable_scope(self.variable_scope):
            if self._hparams.initializer:
                tf.get_variable_scope().set_initializer(
                    layers.get_initializer(self._hparams.initializer))

            self.position_embedder = \
                SinusoidsPositionEmbedder(
                    self._hparams.position_embedder_hparams)

            self._embedding = embedding
            self._vocab_size = self._embedding.get_shape().as_list()[0]

        self.output_layer = \
            self._build_output_layer(shape_list(self._embedding)[-1])
Example #14
def get_embedding(num_embeds: Optional[int] = None,
                  init_value: Optional[torch.Tensor] = None,
                  hparams=None):
    r"""Creates embedding variable if not exists.

    Args:
        hparams (dict or HParams, optional): Embedding hyperparameters. Missing
            hyperparameters are set to default values. See
            :func:`~texar.modules.default_embedding_hparams`
            for all hyperparameters and default values.

            If :attr:`init_value` is given, :attr:`hparams["initializer"]`
            and :attr:`hparams["dim"]` are ignored.
        init_value (Tensor or numpy array, optional): Initial values of the
            embedding variable. If not given, embedding is initialized as
            specified in :attr:`hparams["initializer"]`.
        num_embeds (int, optional): The number of embedding items
            (e.g., vocabulary size). Required if :attr:`init_value` is
            not provided.

    Returns:
        A 2D :tensor:`Tensor` of the same shape as :attr:`init_value` or of
        the shape ``[num_embeds, hparams["dim"]]``.
    """
    if hparams is None or isinstance(hparams, dict):
        hparams = HParams(hparams, default_embedding_hparams())
    if init_value is None:
        initializer = layers.get_initializer(
            getattr(hparams, "initializer", None))
        # TODO Shibiao: add regularizer
        dim = hparams["dim"]
        if not isinstance(hparams["dim"], (list, tuple)):
            dim = [dim]
        embedding = torch.empty(size=[num_embeds] + dim)
        # initializer should be set by layers.get_initializer
        if initializer:
            embedding = initializer(embedding)
        else:
            embedding = torch.nn.init.xavier_uniform_(embedding)
    else:
        if torch.is_tensor(init_value):
            embedding = init_value  # Do not copy the tensor.
        else:
            embedding = torch.tensor(init_value, dtype=torch.float)

    return embedding
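A similar usage sketch for the PyTorch variant above; the sizes are illustrative assumptions, and wrapping the result in nn.Parameter to make it trainable is an assumed caller-side step, not shown in the source:

    import torch.nn as nn

    # Hypothetical call: build a [10000, 512] embedding tensor initialized via
    # hparams["initializer"] (or Xavier uniform when no initializer is given).
    emb_hparams = {"dim": 512}
    weight = get_embedding(num_embeds=10000, hparams=emb_hparams)
    embedding = nn.Parameter(weight)  # assumption: register as trainable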
Example #15
    def __init__(self,
                 pretrained_model_name: Optional[str] = None,
                 cache_dir: Optional[str] = None,
                 hparams=None):

        super().__init__(pretrained_model_name=pretrained_model_name,
                         cache_dir=cache_dir,
                         hparams=hparams)

        if self.pretrained_model_dir:
            self._hparams = HParams(self.pretrained_model_hparams,
                                    self._hparams.todict())

        # Word embedding
        self.word_embedder = WordEmbedder(vocab_size=self._hparams.vocab_size,
                                          hparams=self._hparams.embed)

        # Segment embedding for each type of tokens
        self.segment_embedder = WordEmbedder(
            vocab_size=self._hparams.type_vocab_size,
            hparams=self._hparams.segment_embed)

        # Position embedding
        self.position_embedder = PositionEmbedder(
            position_size=self._hparams.position_size,
            hparams=self._hparams.position_embed)

        # The BERT encoder (a TransformerEncoder)
        self.encoder = TransformerEncoder(hparams=self._hparams.encoder)

        self.pooler = nn.Sequential(
            nn.Linear(self._hparams.hidden_size, self._hparams.hidden_size),
            nn.Tanh())

        if self.pretrained_model_dir:
            bert_utils.init_bert_checkpoint(self, self.pretrained_model_dir)
        elif self._hparams.initializer:
            initialize = layers.get_initializer(self._hparams.initializer)
            assert initialize is not None
            # Do not re-initialize LayerNorm modules.
            for name, param in self.named_parameters():
                if name.split(
                        '.')[-1] == 'weight' and 'layer_norm' not in name:
                    initialize(param)
Example #16
    def __init__(self, hparams=None):
        EncoderBase.__init__(self, hparams)

        with tf.variable_scope(self.variable_scope):
            if self._hparams.initializer:
                tf.get_variable_scope().set_initializer(
                    layers.get_initializer(self._hparams.initializer))

            self.Q_dense = tf.layers.Dense(self._hparams.num_units,
                                           use_bias=False,
                                           name='q')
            self.K_dense = tf.layers.Dense(self._hparams.num_units,
                                           use_bias=False,
                                           name='k')
            self.V_dense = tf.layers.Dense(self._hparams.num_units,
                                           use_bias=False,
                                           name='v')
            self.O_dense = tf.layers.Dense(self._hparams.output_dim,
                                           use_bias=False,
                                           name='o')
Example #17
    def __init__(self,
                 pretrained_model_name=None,
                 cache_dir=None,
                 hparams=None):
        EncoderBase.__init__(self, hparams)
        BertBase.__init__(self, pretrained_model_name, cache_dir,
                          hparams)  # put these things into BertBase
        if self.pretrained_model:
            self._hparams = HParams(self.pretrained_model_hparams,
                                    self._hparams.todict())

        with tf.variable_scope(self.variable_scope):
            if self._hparams.initializer:
                tf.get_variable_scope().set_initializer(
                    layers.get_initializer(self._hparams.initializer))

            # Word embedding
            self.word_embedder = WordEmbedder(
                vocab_size=self._hparams.vocab_size,
                hparams=self._hparams.embed)

            # Segment embedding for each type of tokens
            self.segment_embedder = WordEmbedder(
                vocab_size=self._hparams.type_vocab_size,
                hparams=self._hparams.segment_embed)

            # Position embedding
            self.position_embedder = PositionEmbedder(
                position_size=self._hparams.position_size,
                hparams=self._hparams.position_embed)

            # The BERT encoder (a TransformerEncoder)
            self.encoder = TransformerEncoder(hparams=self._hparams.encoder)

            with tf.variable_scope("pooler"):
                kwargs_i = {
                    "units": self._hparams.hidden_size,
                    "activation": tf.tanh
                }
                layer_hparams = {"type": "Dense", "kwargs": kwargs_i}
                self.pooler = layers.get_layer(hparams=layer_hparams)
Example #18
    def __init__(self, hparams=None):
        EncoderBase.__init__(self, hparams)
        use_bias = self._hparams.use_bias

        with tf.variable_scope(self.variable_scope):
            if self._hparams.initializer:
                tf.get_variable_scope().set_initializer(
                    layers.get_initializer(self._hparams.initializer))

            self.Q_dense = tf.layers.Dense(self._hparams.num_units,
                                           use_bias=use_bias,
                                           name='query')
            self.K_dense = tf.layers.Dense(self._hparams.num_units,
                                           use_bias=use_bias,
                                           name='key')
            self.V_dense = tf.layers.Dense(self._hparams.num_units,
                                           use_bias=use_bias,
                                           name='value')
            self.O_dense = tf.layers.Dense(self._hparams.output_dim,
                                           use_bias=use_bias,
                                           name='output')
Example #19
    def __init__(self, embedding, vocab_size=None, hparams=None):
        EncoderBase.__init__(self, hparams)
        self._vocab_size = vocab_size
        self._embedding = None
        self.enc = None
        with tf.variable_scope(self.variable_scope):
            if self._hparams.initializer:
                tf.get_variable_scope().set_initializer(
                    layers.get_initializer(self._hparams.initializer))
            if self._hparams.position_embedder.name == 'sinusoids':
                self.position_embedder = \
                    position_embedders.SinusoidsPositionEmbedder(
                        self._hparams.position_embedder.hparams)

        if self._hparams.use_embedding:
            if isinstance(embedding, tf.Variable):
                self._embedding = embedding
            embed_dim = self._embedding.get_shape().as_list()[-1]
            if self._hparams.zero_pad:  # TODO(zhiting): vocab has zero pad
                if not self._hparams.bos_pad:
                    self._embedding = tf.concat(
                        (tf.zeros(shape=[1, embed_dim]),
                         self._embedding[1:, :]), 0)
                else:
                    self._embedding = tf.concat(
                        (tf.zeros(shape=[2, embed_dim]),
                         self._embedding[2:, :]), 0)
            if self._vocab_size is None:
                self._vocab_size = self._embedding.get_shape().as_list()[0]
        with tf.variable_scope(self.variable_scope):
            if self._hparams.target_space_id is not None:
                space_embedding = tf.get_variable(
                    'target_space_embedding', [32, embed_dim])
                self.target_symbol_embedding = tf.gather(
                    space_embedding, self._hparams.target_space_id)
            else:
                self.target_symbol_embedding = None
        self.stack_output = None
Example #20
    def __init__(self,
                 pretrained_model_name: Optional[str] = None,
                 cache_dir: Optional[str] = None,
                 hparams=None):

        super().__init__(pretrained_model_name=pretrained_model_name,
                         cache_dir=cache_dir,
                         hparams=hparams)

        if self.pretrained_model_dir:
            self._hparams = HParams(self.pretrained_model_hparams,
                                    self._hparams.todict())

        # Word embedding
        self.word_embedder = WordEmbedder(vocab_size=self._hparams.vocab_size,
                                          hparams=self._hparams.embed)

        # Position embedding
        self.position_embedder = PositionEmbedder(
            position_size=self._hparams.position_size,
            hparams=self._hparams.position_embed)

        # The GPT2 decoder (a TransformerDecoder)
        self.decoder = TransformerDecoder(
            vocab_size=self._hparams.vocab_size,
            output_layer=self.word_embedder.embedding,
            hparams=self._hparams.decoder)

        if self.pretrained_model_dir:
            gpt2_utils.init_gpt2_checkpoint(self, self.pretrained_model_dir)
        elif self._hparams.initializer:
            initialize = layers.get_initializer(self._hparams.initializer)
            assert initialize is not None
            # Do not re-initialize LayerNorm modules.
            for name, param in self.named_parameters():
                if name.split(
                        '.')[-1] == 'weight' and 'layer_norm' not in name:
                    initialize(param)
Example #21
    def __init__(self, hparams=None):
        EncoderBase.__init__(self, hparams)

        with tf.variable_scope(self.variable_scope):
            if self._hparams.initializer:
                tf.get_variable_scope().set_initializer(
                    layers.get_initializer(self._hparams.initializer))

            self.position_embedder = \
                SinusoidsPositionEmbedder(
                    self._hparams.position_embedder_hparams)
            self.multihead_attention_list = []
            self.poswise_networks = []
            for i in range(self._hparams.num_blocks):
                with tf.variable_scope("layer_{}".format(i)):
                    with tf.variable_scope('self_attention'):
                        multihead_attention = MultiheadAttentionEncoder(
                            self._hparams.multihead_attention)
                        self.multihead_attention_list.append(
                            multihead_attention)
                    # pylint: disable=protected-access
                    if self._hparams.dim != \
                        multihead_attention._hparams.output_dim:
                        raise ValueError('The output dimension of '
                                         'MultiheadEncoder should be equal '
                                         'to the dim of TransformerEncoder')
                    poswise_network = FeedForwardNetwork(
                        hparams=self._hparams['poswise_feedforward'])
                    # pylint: disable=protected-access
                    if self._hparams.dim != \
                        poswise_network._hparams.layers[-1]['kwargs']['units']:
                        raise ValueError('The output dimension of '
                                         'FeedForwardNetwork should be equal '
                                         'to the dim of TransformerEncoder')
                    self.poswise_networks.append(poswise_network)
Example #22
    def __init__(self, hparams=None):
        EncoderBase.__init__(self, hparams)
        self._input_size = self._hparams.dim
        self.self_attns = nn.ModuleList()
        if not self._hparams.use_bert_config:
            self.self_attn_layer_norm = nn.ModuleList()
        self.poswise_networks = nn.ModuleList()
        self.poswise_layer_norm = nn.ModuleList()
        self.output_layer_norm = nn.ModuleList()

        if self._hparams.use_bert_config:
            # In TensorFlow, eps for LayerNorm is 1e-12 by default.
            eps = 1e-12
        else:
            # In PyTorch, eps for LayerNorm is 1e-6 by default.
            eps = 1e-6

        for _ in range(self._hparams.num_blocks):
            mh_attn = MultiheadAttentionEncoder(
                self._input_size, self._hparams.multihead_attention)
            self.self_attns.append(mh_attn)
            if not self._hparams.use_bert_config:
                self.self_attn_layer_norm.append(
                    nn.LayerNorm(self._input_size, eps=eps))
            if self._hparams.dim != mh_attn.hparams.output_dim:
                raise ValueError(
                    'The "dim" in the hparams of '
                    '"multihead_attention" should be equal to the '
                    '"dim" of TransformerEncoder')

            pw_net = FeedForwardNetwork(
                hparams=self._hparams['poswise_feedforward'])

            final_dim = pw_net.hparams.layers[-1]['kwargs']['out_features']
            if self._hparams.dim != final_dim:
                raise ValueError('The output dimension of '
                                 '"poswise_feedforward" should be equal '
                                 'to the "dim" of TransformerEncoder.')

            self.poswise_networks.append(pw_net)
            self.poswise_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=eps))
            if self._hparams.use_bert_config:
                self.output_layer_norm.append(
                    nn.LayerNorm(self._input_size, eps=eps))

        self.embed_dropout = nn.Dropout(p=self._hparams.embedding_dropout)
        self.residual_dropout = nn.Dropout(p=self._hparams.residual_dropout)

        if self._hparams.use_bert_config:
            self.input_normalizer = nn.LayerNorm(self._input_size, eps=eps)
        else:
            self.final_layer_normalizer = nn.LayerNorm(self._input_size,
                                                       eps=eps)

        if self._hparams.initializer:
            initialize = layers.get_initializer(self._hparams.initializer)
            assert initialize is not None
            # Do not re-initialize LayerNorm modules.
            for name, param in self.named_parameters():
                if name.split(
                        '.')[-1] == 'weight' and 'layer_norm' not in name:
                    initialize(param)
Example #23
    def __init__(self,
                 vocab_size: Optional[int] = None,
                 output_layer: Optional[Union[nn.Module, torch.Tensor]] = None,
                 hparams: Optional[HParams] = None):
        super().__init__(
            0,  # dummy value for input_size
            vocab_size,
            input_time_major=False,
            output_time_major=False,
            hparams=hparams)
        self._input_size = self._hparams.dim

        self._output_layer, self._vocab_size = _make_output_layer(
            output_layer, vocab_size, self._input_size,
            self._hparams.output_layer_bias)

        self.self_attns = nn.ModuleList()
        self.self_attn_layer_norm = nn.ModuleList()
        self.enc_dec_attns = nn.ModuleList()
        self.end_dec_attn_layer_norm = nn.ModuleList()
        self.poswise_networks = nn.ModuleList()
        self.poswise_layer_norm = nn.ModuleList()

        if self._hparams.use_gpt_config:
            eps = 1e-5
        else:
            eps = 1e-12

        for _ in range(self._hparams.num_blocks):
            attn_module = MultiheadAttentionEncoder(
                self._input_size, self._hparams.multihead_attention)
            if self._hparams.dim != attn_module.output_size:
                raise ValueError("The output dimension of "
                                 "MultiheadEncoder should be equal "
                                 "to the dim of TransformerDecoder")
            self.self_attns.append(attn_module)
            self.self_attn_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=eps))

            attn_module = MultiheadAttentionEncoder(
                self._input_size, self._hparams.multihead_attention)
            if self._hparams.dim != attn_module.output_size:
                raise ValueError("The output dimension of "
                                 "MultiheadEncoder should be equal "
                                 "to the dim of TransformerDecoder")
            self.enc_dec_attns.append(attn_module)
            self.end_dec_attn_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=eps))

            poswise_network = FeedForwardNetwork(
                hparams=self._hparams.poswise_feedforward)
            if (poswise_network.hparams.layers[-1]['kwargs']['out_features'] !=
                    self._hparams.dim):
                raise ValueError("The output dimension of "
                                 "FeedForwardNetwork should be equal "
                                 "to the dim of TransformerDecoder")
            self.poswise_networks.append(poswise_network)
            self.poswise_layer_norm.append(
                nn.LayerNorm(self._input_size, eps=eps))

        self.final_layer_norm = nn.LayerNorm(self._input_size, eps=eps)
        self.embed_dropout = nn.Dropout(self._hparams.embedding_dropout)
        self.residual_dropout = nn.Dropout(self._hparams.residual_dropout)

        if self._hparams.initializer:
            # TODO: This might be different from what TensorFlow does
            initialize = layers.get_initializer(self._hparams.initializer)
            assert initialize is not None
            # Do not re-initialize LayerNorm modules.
            for name, param in self.named_parameters():
                if name.split(
                        ".")[-1] == "weight" and "layer_norm" not in name:
                    initialize(param)