Example No. 1
 def __init__(self,
              model_dimension,
              num_heads,
              intermediate_size,
              initializer_stddev=0.02,
              activation_dropout_rate=0.0,
              attention_dropout_rate=0.0,
              **kwargs):
     super(FunnelTransformerEncoder, self).__init__(**kwargs)
     self.model_dimension = model_dimension
     self.parameters.initializer = tf.keras.initializers.TruncatedNormal(
         stddev=initializer_stddev)
     self.self_attn = FunnelAttention(
         model_dimension,
         num_heads,
         attention_dropout_rate=attention_dropout_rate,
         parameters=self.parameters)
     self.prx = dense_layers.BaseQDenseVarLen(model_dimension,
                                              activation=None,
                                              parameters=self.parameters)
     self.upprx = dense_layers.BaseQDenseVarLen(intermediate_size,
                                                parameters=self.parameters)
     self.downprx = dense_layers.BaseQDenseVarLen(
         model_dimension, activation=None, parameters=self.parameters)
     self.activation_dropout_rate = activation_dropout_rate
     self.ln1 = normalization_layers.LayerNormalization(**kwargs)
     self.ln2 = normalization_layers.LayerNormalization(**kwargs)
     self.q1 = quantization_layers.ActivationQuantization(**kwargs)
     self.q2 = quantization_layers.ActivationQuantization(**kwargs)
Example No. 2
 def __init__(self, **kwargs):
     # Pop shape/initializer so they are not forwarded to the quantization
     # layer below; they are passed back explicitly to the parent constructor.
     shape = kwargs.pop("shape", None)
     initializer = kwargs.pop("initializer", None)
     self.qoutput = quantization_layers.ActivationQuantization(**kwargs)
     super(EmbeddingFullyConnected, self).__init__(shape=shape,
                                                   initializer=initializer,
                                                   **kwargs)
Example No. 3
 def __init__(self, num_layers, max_time_step, vocabulary_size,
              embedding_size, model_dimension, num_heads, intermediate_size,
              **kwargs):
     self.max_time_step = max_time_step
     self.vocabulary_size = vocabulary_size
     self.embedding_size = embedding_size
     activation_dropout_rate = kwargs.pop('activation_dropout_rate', 0.0)
     attention_dropout_rate = kwargs.pop('attention_dropout_rate', 0.0)
     self.layers = []
     for _ in range(num_layers):
         self.layers.append(
             TransformerEncoder(
                 model_dimension=model_dimension,
                 num_heads=num_heads,
                 intermediate_size=intermediate_size,
                 activation_dropout_rate=activation_dropout_rate,
                 attention_dropout_rate=attention_dropout_rate,
                 **kwargs))
     self.embedding = embedding_layers.EmbeddingLayer(
         shape=[self.vocabulary_size, self.embedding_size], **kwargs)
     self.positional_embedding = embedding_layers.EmbeddingLayer(
         shape=[self.max_time_step, self.embedding_size], **kwargs)
     self.ln = normalization_layers.LayerNormalization(**kwargs)
     self.qact = quantization_layers.ActivationQuantization(**kwargs)
     super(TransformerEncoderStack, self).__init__(**kwargs)
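The stack above pairs a token embedding with a learned positional embedding, then applies layer normalization and activation quantization before the encoder layers. Below is a minimal standalone sketch of that embedding step in plain TensorFlow (illustrative sizes; it does not use the library's quantized EmbeddingLayer):

import tensorflow as tf

vocabulary_size, max_time_step, embedding_size = 1000, 64, 128
token_table = tf.Variable(
    tf.keras.initializers.GlorotUniform()([vocabulary_size, embedding_size]))
position_table = tf.Variable(
    tf.keras.initializers.GlorotUniform()([max_time_step, embedding_size]))

token_ids = tf.constant([[5, 17, 42, 0]])      # [batch, time]
positions = tf.range(tf.shape(token_ids)[1])   # [time]
embedded = (tf.nn.embedding_lookup(token_table, token_ids) +
            tf.nn.embedding_lookup(position_table, positions))
# Normalize before the encoder layers, mirroring self.ln above.
normalized = tf.keras.layers.LayerNormalization()(embedded)
print(normalized.shape)  # (1, 4, 128)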
Example No. 4
 def __init__(self,
              model_dimension,
              max_time_step,
              num_heads,
              intermediate_size,
              activation_dropout_rate=0.0,
              attention_dropout_rate=0.0,
              beam_size=1,
              cached_kv=False,
              **kwargs):
     self.model_dimension = model_dimension
     self.decoder_uniform_attn = transformer_layers.DecoderUniformAttention(
         model_dimension,
         max_time_step,
         attention_dropout_rate=attention_dropout_rate,
         beam_size=beam_size,
         **kwargs)
     self.multihead_cross_attn = transformer_layers.DecoderMultiheadAttention(
         model_dimension,
         num_heads,
         cached_kv=cached_kv,
         attention_dropout_rate=attention_dropout_rate,
         **kwargs)
     self.prx = dense_layers.BaseQDense(model_dimension,
                                        activation=None,
                                        normalize=False,
                                        bias=False,
                                        **kwargs)
     self.upprx = dense_layers.BaseQDense(intermediate_size,
                                          normalize=False,
                                          **kwargs)
     self.downprx = dense_layers.BaseQDense(model_dimension,
                                            activation=None,
                                            normalize=False,
                                            **kwargs)
     self.activation_dropout_rate = activation_dropout_rate
     self.ln1 = normalization_layers.LayerNormalization(**kwargs)
     self.ln2 = normalization_layers.LayerNormalization(**kwargs)
     self.q0 = quantization_layers.ActivationQuantization(**kwargs)
     self.q1 = quantization_layers.ActivationQuantization(**kwargs)
     self.q2 = quantization_layers.ActivationQuantization(**kwargs)
     super(TransformerUniformAttnDecoder, self).__init__(**kwargs)
Example No. 5
 def __init__(self, scalar=True, **kwargs):
   self.scalar = scalar
   # Attention logits should not have an activation after the linear layer, so
   # that they can be positive or negative and the attention distribution can
   # take whatever shape the network needs. A relu activation biases the
   # attention distribution towards uniform. Leaving the activation out gives
   # better results for attention pooling: although some outputs are emphasized
   # for the classification decision, all other outputs keep a non-zero
   # probability of influencing the class, which seems to result in better
   # backprop.
   self.attention = dense_layers.BaseQDenseVarLen(units=1, rank=3, **kwargs)
   self.qactivation = quantization_layers.ActivationQuantization(**kwargs)
   super(AttentionPooling, self).__init__(**kwargs)
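The comment above motivates leaving the logit projection unactivated. Here is a minimal sketch of the attention-pooling idea in plain TensorFlow (not the library's quantized implementation; names and shapes are illustrative): project each timestep to a single logit, mask padded steps, softmax over time, and take the weighted sum.

import tensorflow as tf

def attention_pool(inputs, mask):
  """inputs: [batch, time, dim]; mask: [batch, time], 1.0 for valid steps."""
  # Linear projection with no activation, so logits can be negative or positive.
  logits = tf.squeeze(tf.keras.layers.Dense(1, activation=None)(inputs), -1)
  logits += (1.0 - mask) * -1e9                    # push padded positions to ~-inf
  weights = tf.nn.softmax(logits, axis=-1)         # [batch, time]
  return tf.einsum("bt,btd->bd", weights, inputs)  # [batch, dim]

x = tf.random.normal([2, 5, 8])
m = tf.constant([[1., 1., 1., 0., 0.], [1., 1., 1., 1., 1.]])
print(attention_pool(x, m).shape)  # (2, 8)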
Example No. 6
 def __init__(self,
              shape,
              num_bits=8,
              initializer=None,
              trainable=True,
              **kwargs):
     self.shape = shape
     self.quantizer = quantization_layers.ActivationQuantization(
         num_bits=num_bits, **kwargs)
     super(EmbeddingLayer, self).__init__(**kwargs)
     if initializer is None:
         initializer = tf.keras.initializers.GlorotUniform()
     self.initializer = initializer
     self.trainable = trainable
Example No. 7
 def __init__(self,
              units,
              activation=tf.keras.layers.ReLU(),
              bias=True,
              rank=2,
              **kwargs):
     self.units = units
     self.rank = rank
     assert 2 <= rank <= 4
     self.activation = activation
     self.bias = bias
     self.qoutput = quantization_layers.ActivationQuantization(**kwargs)
     self._create_normalizer(**kwargs)
     super(BaseQDense, self).__init__(**kwargs)
Example No. 8
 def __init__(self,
              model_dimension,
              num_heads,
              attention_dropout_rate=0.0,
              **kwargs):
     self.model_dimension = model_dimension
     self.num_heads = num_heads
     self.filters = model_dimension // num_heads
     self.dense_layers = dense_layers.BaseQDenseVarLen(
         units=model_dimension * 3, activation=None, **kwargs)
     self.qactivation = quantization_layers.ActivationQuantization(**kwargs)
     self.attention_dropout_rate = attention_dropout_rate
     self.qconcat = quantization_layers.ConcatQuantization(axis=1, **kwargs)
     super(SelfAttentionV2, self).__init__(**kwargs)
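The fused projection above has `units=model_dimension * 3`, which is conventionally split into query, key and value, each further split into `num_heads` heads of size `filters = model_dimension // num_heads`. A rough standalone sketch of that pattern (plain TF; padding masks, dropout and quantization omitted, and this is not the library's exact SelfAttentionV2 computation):

import tensorflow as tf

def self_attention(x, num_heads):
  batch, time, model_dimension = x.shape
  filters = model_dimension // num_heads
  # Fused QKV projection, mirroring units=model_dimension * 3 above.
  qkv = tf.keras.layers.Dense(model_dimension * 3, activation=None)(x)
  q, k, v = tf.split(qkv, 3, axis=-1)           # each [batch, time, model_dim]

  def split_heads(t):
    t = tf.reshape(t, [batch, time, num_heads, filters])
    return tf.transpose(t, [0, 2, 1, 3])        # [batch, heads, time, filters]

  q, k, v = split_heads(q), split_heads(k), split_heads(v)
  scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(float(filters))
  context = tf.matmul(tf.nn.softmax(scores, axis=-1), v)
  context = tf.transpose(context, [0, 2, 1, 3])
  return tf.reshape(context, [batch, time, model_dimension])

print(self_attention(tf.random.normal([2, 6, 32]), num_heads=4).shape)  # (2, 6, 32)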
Example No. 9
 def __init__(self,
              zoneout_probability=0.0,
              forward=True,
              pooling=QUASI_RNN_POOLING_FO,
              output_quantized=True,
              **kwargs):
   self.zoneout_probability = zoneout_probability
   self.pooling = pooling
   self.forward = forward
   self.output_quantized = output_quantized
   if output_quantized and self.pooling == QUASI_RNN_POOLING_IFO:
     self.qoutputs = quantization_layers.ActivationQuantization()
   # Validate the pooling mode before looking up its gate count.
   assert pooling in _QUASI_RNN_POOLING_TO_NUMBER_OF_GATES_MAP
   self.num_gates = _QUASI_RNN_POOLING_TO_NUMBER_OF_GATES_MAP[pooling]
   self.pooling_core = QRNNUnidirectionalPoolingCore(forward=forward, **kwargs)
   super(QRNNUnidirectionalPooling, self).__init__(**kwargs)
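The pooling constants suggest the quasi-RNN pooling recurrences: for 'fo' pooling the gates are z, f and o, with c_t = f_t * c_{t-1} + (1 - f_t) * z_t and h_t = o_t * c_t. A rough sketch of that recurrence, assuming pre-computed gate activations (plain TF; zoneout, masking and quantization omitted, and this may differ from the library's pooling core):

import tensorflow as tf

def qrnn_fo_pooling(z, f, o, forward=True):
  """z, f, o: [batch, time, dim]; z tanh-activated, f and o sigmoid-activated."""
  steps = range(z.shape[1]) if forward else reversed(range(z.shape[1]))
  c = tf.zeros_like(z[:, 0, :])
  outputs = [None] * z.shape[1]
  for t in steps:
    # c_t = f_t * c_{t-1} + (1 - f_t) * z_t ;  h_t = o_t * c_t
    c = f[:, t, :] * c + (1.0 - f[:, t, :]) * z[:, t, :]
    outputs[t] = o[:, t, :] * c
  return tf.stack(outputs, axis=1)

z = tf.tanh(tf.random.normal([2, 7, 16]))
f = tf.sigmoid(tf.random.normal([2, 7, 16]))
o = tf.sigmoid(tf.random.normal([2, 7, 16]))
print(qrnn_fo_pooling(z, f, o).shape)  # (2, 7, 16)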
Example No. 10
 def __init__(self,
              model_dimension,
              max_time_step,
              attention_dropout_rate=0.0,
              beam_size=1,
              **kwargs):
     self.model_dimension = model_dimension
     self.max_time_step = max_time_step
     self.beam_size = beam_size
     # Lower-triangular mask: position t attends only to positions <= t.
     self.causal_mask = tf.expand_dims(
         tf.linalg.band_part(
             tf.ones([max_time_step, max_time_step]), -1, 0), 0)
     self.dense_layers = dense_layers.BaseQDenseVarLen(
         units=model_dimension,
         activation=None,
         normalize=False,
         bias=False,
         rank=3,
         **kwargs)
     self.qoutput = quantization_layers.ActivationQuantization(**kwargs)
     super(DecoderUniformAttention, self).__init__(**kwargs)
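The `causal_mask` built above is a lower-triangular [1, max_time_step, max_time_step] tensor, so position t can attend only to positions <= t. A small runnable check of what `tf.linalg.band_part` produces:

import tensorflow as tf

max_time_step = 4
causal_mask = tf.expand_dims(
    tf.linalg.band_part(tf.ones([max_time_step, max_time_step]), -1, 0), 0)
print(causal_mask[0].numpy())
# [[1. 0. 0. 0.]
#  [1. 1. 0. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 1.]]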
Example No. 11
 def __init__(self,
              model_dimension,
              num_heads,
              attention_dropout_rate=0.0,
              cached_kv=False,
              **kwargs):
     self.model_dimension = model_dimension
     self.num_heads = num_heads
     self.filters = model_dimension // num_heads
     self.cached_kv = cached_kv
     self.q_dense_layers = dense_layers.BaseQDense(units=model_dimension,
                                                   activation=None,
                                                   normalize=False,
                                                   bias=False,
                                                   **kwargs)
     self.kv_dense_layers = dense_layers.BaseQDenseVarLen(
         units=model_dimension * 2, activation=None, **kwargs)
     self.qactivation = quantization_layers.ActivationQuantization(**kwargs)
     self.attention_dropout_rate = attention_dropout_rate
     self.qconcat = quantization_layers.ConcatQuantization(axis=1, **kwargs)
     super(DecoderMultiheadAttention, self).__init__(**kwargs)
Example No. 12
 def __init__(self,
              filters,
              ksize,
              stride=1,
              padding="SAME",
              dilations=None,
              activation=tf.keras.layers.ReLU(),
              bias=True,
              rank=4,
              **kwargs):
   self.out_filters = filters
   assert 3 <= rank <= 4
   self.rank = rank
   self.ksize = self._unpack(ksize)
   self.strides = self._unpack(stride)
   if dilations:
     self.dilations = [1] + self._unpack(dilations) + [1]
   else:
     self.dilations = None
   self.activation = activation
   self.bias = bias
   self.padding = padding
   self.qoutput = quantization_layers.ActivationQuantization(**kwargs)
   self._create_normalizer(**kwargs)
   super(EncoderQConvolution, self).__init__(**kwargs)
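The `_unpack` helpers turn scalar `ksize`/`stride`/`dilations` into the list forms expected by the convolution op; for NHWC input, `tf.nn.conv2d` takes `strides` and `dilations` as `[1, h, w, 1]`, which matches the `[1] + ... + [1]` construction above. A brief standalone sketch with hypothetical shapes (no normalization or quantization):

import tensorflow as tf

images = tf.random.normal([2, 28, 28, 3])    # NHWC input
kernel = tf.random.normal([5, 5, 3, 16])     # [ksize_h, ksize_w, in, out_filters]
out = tf.nn.conv2d(
    images,
    kernel,
    strides=[1, 1, 1, 1],
    padding="SAME",
    dilations=[1, 2, 2, 1])                  # [1] + unpacked dilations + [1]
print(out.shape)  # (2, 28, 28, 16)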
Example No. 13
    def __init__(self, config, mode):

        super(Model, self).__init__()

        def _get_params(varname, default_value=None):
            value = config[varname] if varname in config else default_value
            default = "" if varname in config else " (default)"
            logging.info("%s = %s%s", varname, value, default)
            setattr(self, varname, value)

        _get_params("intermediate_size")
        _get_params("max_dec_time_step")
        _get_params("max_enc_time_step")
        _get_params("embedding_size")
        _get_params("vocabulary_size")
        _get_params("num_layers")
        _get_params("labels")
        _get_params("regularizer_scale")
        _get_params("num_heads")
        _get_params("model_dimension")
        _get_params("beam_size", 1)
        _get_params("quantize", True)
        _get_params("cached_kv", False)
        _get_params("attention_dropout_rate", 0.0)
        _get_params("activation_dropout_rate", 0.0)
        # If set, a separate dense layer is used to generate the logits instead of
        # re-using the input embedding table.
        _get_params("use_output_layer", False)
        self.parameters = base_layers.Parameters(mode, self.quantize,
                                                 self.regularizer_scale)
        # Activation/Normalization enabled on input bottleneck as there is no
        # temporal information.
        self.input_bottleneck = dense_layers.BaseQDenseVarLen(
            self.model_dimension, rank=3, parameters=self.parameters)
        self.output_bottleneck = dense_layers.BaseQDense(
            self.embedding_size,
            normalize=False,
            activation=None,
            bias=False,
            parameters=self.parameters)

        self.embedding = embedding_layers.EmbeddingFullyConnected(
            shape=[self.vocabulary_size, self.embedding_size],
            initializer=tf.random_uniform_initializer(-math.sqrt(3),
                                                      math.sqrt(3)),
            parameters=self.parameters)
        if self.use_output_layer:
            self.output_layer = dense_layers.BaseQDense(
                self.vocabulary_size,
                activation=None,
                normalize=False,
                bias=False,
                parameters=self.parameters)
        self.positional_embedding = embedding_layers.EmbeddingLayer(
            shape=[self.max_dec_time_step, self.model_dimension],
            initializer=tf.random_uniform_initializer(-math.sqrt(3),
                                                      math.sqrt(3)),
            parameters=self.parameters)
        self.ln = normalization_layers.LayerNormalization(
            parameters=self.parameters)
        self.qact = quantization_layers.ActivationQuantization(
            parameters=self.parameters)
        # Scales the weights for computing logits.
        self.logits_fc_weights_scale_factor = None
        self.logits_fc_bias = self.add_weight(
            "logits_fc_bias",
            shape=[self.vocabulary_size],
            initializer=tf.constant_initializer(0),
            dtype="float32")
        # Optional bias which can be used to mask logits output.
        self.output_bias = None
        self.transformer_uniform_attn_decoder = TransformerUniformAttnDecoderStack(
            parameters=self.parameters,
            num_layers=self.num_layers,
            intermediate_size=self.intermediate_size,
            embedding_size=self.embedding_size,
            max_time_step=self.max_dec_time_step,
            num_heads=self.num_heads,
            model_dimension=self.model_dimension,
            vocabulary_size=self.vocabulary_size,
            beam_size=self.beam_size,
            cached_kv=self.cached_kv,
            attention_dropout_rate=self.attention_dropout_rate,
            activation_dropout_rate=self.activation_dropout_rate)
        # Beam search output.
        self.finished_seq = None
        self.finished_scores = None
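The `_get_params` helper reads each key from `config`, falling back to the given default and logging which values were defaulted. A hypothetical config dict covering the keys read above (all values are illustrative, and `mode` is a library mode constant not shown in this snippet):

config = {
    "intermediate_size": 512,
    "max_dec_time_step": 64,
    "max_enc_time_step": 64,
    "embedding_size": 128,
    "vocabulary_size": 8000,
    "num_layers": 4,
    "labels": [],
    "regularizer_scale": 1e-5,
    "num_heads": 4,
    "model_dimension": 128,
    # Optional keys and the defaults shown above if omitted: beam_size=1,
    # quantize=True, cached_kv=False, attention_dropout_rate=0.0,
    # activation_dropout_rate=0.0, use_output_layer=False.
}
# model = Model(config, mode)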
Example No. 14
  def __init__(self, config, mode, **kwargs):
    super(Encoder, self).__init__(**kwargs)

    def _get_params(varname, default_value=None):
      value = config[varname] if varname in config else default_value
      default = "" if varname in config else " (default)"
      logging.info("%s = %s%s", varname, value, default)
      setattr(self, varname, value)

    _get_params("labels", [])
    _get_params("regularizer_scale")
    _get_params("quantize")
    _get_params("feature_size")
    _get_params("bottleneck_size")

    self.max_seq_len = config.get("max_seq_len", 128)
    self.gbst_max_token_len = config.get("gbst_max_token_len", 128)
    # Including 3 additional special token ids (0=padding, 1=EOS, 2=UNK).
    self.vocabulary_size = config.get("vocabulary_size", 259)
    self.parameters = base_layers.Parameters(
        mode, quantize=self.quantize, regularizer_scale=self.regularizer_scale)

    self.embedding = embedding_layers.EmbeddingLayer(
        shape=[self.vocabulary_size, self.feature_size],
        parameters=self.parameters)
    self.gbst_downsample_rate = config.get("gbst_downsample_rate", 1)
    self.positional_embedding = embedding_layers.EmbeddingLayer(
        shape=[self.gbst_max_token_len, self.feature_size],
        parameters=self.parameters)
    self.ln = normalization_layers.LayerNormalization(
        parameters=self.parameters)
    self.qact = quantization_layers.ActivationQuantization(
        parameters=self.parameters)

    self.bottleneck_layer = None
    gbst_size = self.feature_size
    if self.bottleneck_size != self.feature_size:
      self.bottleneck_layer = dense_layers.BaseQDenseVarLen(
          self.bottleneck_size,
          rank=3,
          normalize=False,
          activation=None,
          parameters=self.parameters)
      gbst_size = self.bottleneck_size

    self.gbst_max_subword_block_width = config.get(
        "gbst_max_subword_block_width", 5)
    self.gbst_conv_kernel_size = config.get("gbst_conv_kernel_size", 5)
    self.gbst_block_mixing_mode = config.get("gbst_block_mixing_mode", None)
    self.gbst_layer = misc_layers.GBSTLayerV2(
        feature_size=gbst_size,
        max_seq_len=self.gbst_max_token_len,
        downsample_rate=self.gbst_downsample_rate,
        max_subword_block_width=self.gbst_max_subword_block_width,
        conv_kernel_size=self.gbst_conv_kernel_size,
        block_mixing_mode=self.gbst_block_mixing_mode,
        parameters=self.parameters)

    self.pool_windows = config.get("pool_windows", None)
    if self.pool_windows:
      self.transformer_encoder_layer = transformer_encoder.FunnelTransformerModel(
          config, mode)
    else:
      self.transformer_encoder_layer = transformer_encoder.ModelWithEmbeddings(
          config, mode)
    self.attention_pool = misc_layers.AttentionPooling(
        parameters=self.parameters)
    self.num_classes = len(self.labels)
    if self.num_classes:
      self.final_fc = dense_layers.BaseQDense(
          units=self.num_classes,
          rank=2,
          parameters=self.parameters,
          activation=None)
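As above, `_get_params` and `config.get` define the keys this encoder reads. A hypothetical config dict (values are illustrative; the defaults noted in comments come from the snippet itself):

config = {
    "labels": ["negative", "positive"],   # illustrative label set
    "regularizer_scale": 1e-5,
    "quantize": True,
    "feature_size": 128,
    "bottleneck_size": 64,   # differs from feature_size, so a bottleneck layer is built
    # Optional keys and their defaults: max_seq_len=128, gbst_max_token_len=128,
    # vocabulary_size=259, gbst_downsample_rate=1, gbst_max_subword_block_width=5,
    # gbst_conv_kernel_size=5, gbst_block_mixing_mode=None, pool_windows=None.
}
# encoder = Encoder(config, mode)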
Example No. 15
 def __init__(self, axes=None, **kwargs):
     self.axes = axes or [-1]
     self.qactivation = quantization_layers.ActivationQuantization(**kwargs)
     super(LayerNormalization, self).__init__(**kwargs)
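For reference, layer normalization over the given `axes` (last axis by default) subtracts the mean and divides by the standard deviation computed along those axes; the library version additionally quantizes the activations and presumably applies learned scale and offset terms, both omitted in this minimal sketch:

import tensorflow as tf

def layer_norm(x, axes=(-1,), epsilon=1e-6):
  mean, variance = tf.nn.moments(x, axes=list(axes), keepdims=True)
  return (x - mean) * tf.math.rsqrt(variance + epsilon)

x = tf.random.normal([2, 5, 8])
y = layer_norm(x)
# Per-position mean ~0 and std ~1 along the normalized axis.
print(tf.reduce_mean(y[0, 0]).numpy(), tf.math.reduce_std(y[0, 0]).numpy())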
Example No. 16
 def __init__(self, **kwargs):
   self.qactivation = quantization_layers.ActivationQuantization(**kwargs)
   super(TreeInductionLayer, self).__init__(**kwargs)