class Discriminator(object):
    def __init__(self, x_k, n_steps, hidden_dim):
        self.x_k = x_k
        self.hidden_dim = hidden_dim
        constraint = lambda: ClipConstraint(1e-2)
        self.lstm = LSTM(hidden_dim)
        self.lstm.build((None, n_steps, 1))
        for w in self.lstm.trainable_weights:
            # print("Weight: {}".format(w))
            self.lstm.constraints[w] = constraint()
        self.dense = Dense(1, W_constraint=constraint())
        self.dense.build((None, hidden_dim))
        self.weights = self.lstm.trainable_weights + self.dense.trainable_weights
        self.constraints = self.lstm.constraints.copy()
        self.constraints.update(self.dense.constraints)
        # print("Constraints: {}".format(self.constraints))

    def call(self, x):
        return self.dense.call(self.lstm.call(x))
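
# The ClipConstraint used above is not defined in this snippet. A minimal sketch,
# assuming it does the usual WGAN-style weight clipping (every weight clipped into
# [-clip_value, +clip_value] so the critic stays roughly Lipschitz); this is an
# illustration of the assumed behaviour, not the original class.
from keras import backend as K
from keras.constraints import Constraint

class ClipConstraint(Constraint):
    def __init__(self, clip_value):
        self.clip_value = clip_value

    def __call__(self, weights):
        # Element-wise clip of the weight tensor.
        return K.clip(weights, -self.clip_value, self.clip_value)

    def get_config(self):
        return {'clip_value': self.clip_value}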
Example #2
    # Fragment: a `call` override from a class that subclasses Dense; it simply
    # delegates to the parent implementation.
    def call(self, x, **kwargs):
        return Dense.call(self, x)
Example #3
class NSE(Layer):
    '''
    Simple Neural Semantic Encoder.
    '''
    def __init__(self,
                 output_dim,
                 input_length=None,
                 composer_activation='linear',
                 return_mode='last_output',
                 weights=None,
                 **kwargs):
        '''
        Arguments:
        output_dim (int)
        input_length (int)
        composer_activation (str): activation used in the MLP
        return_mode (str): One of last_output, all_outputs, output_and_memory
            This is analogous to the return_sequences flag in Keras' Recurrent.
            last_output returns only the last h_t
            all_outputs returns the whole sequence of h_ts
            output_and_memory returns the last output and the last memory concatenated
                (needed if this layer is followed by an MMA-NSE)
        weights (list): Initial weights
        '''
        self.output_dim = output_dim
        self.input_dim = output_dim  # Equation 2 in the paper makes this assumption.
        self.initial_weights = weights
        self.input_spec = [InputSpec(ndim=3)]
        self.input_length = input_length
        self.composer_activation = composer_activation
        super(NSE, self).__init__(**kwargs)
        self.reader = LSTM(self.output_dim,
                           return_sequences=True,
                           name="{}_reader".format(self.name))
        # TODO: Let the writer use parameter dropout and any consume_less mode.
        # Setting dropout to 0 here to eliminate the need for constants.
        # Setting consume_less to mem to eliminate need for preprocessing
        self.writer = LSTM(self.output_dim,
                           dropout_W=0.0,
                           dropout_U=0.0,
                           consume_less="mem",
                           name="{}_writer".format(self.name))
        self.composer = Dense(self.output_dim * 2,
                              activation=self.composer_activation,
                              name="{}_composer".format(self.name))
        if return_mode not in [
                "last_output", "all_outputs", "output_and_memory"
        ]:
            raise Exception("Unrecognized return mode: %s" % (return_mode))
        self.return_mode = return_mode

    def get_output_shape_for(self, input_shape):
        input_length = input_shape[1]
        if self.return_mode == "last_output":
            return (input_shape[0], self.output_dim)
        elif self.return_mode == "all_outputs":
            return (input_shape[0], input_length, self.output_dim)
        else:
            # return_mode is output_and_memory. Output will be concatenated to memory.
            return (input_shape[0], input_length + 1, self.output_dim)

    def compute_mask(self, input, mask):
        if mask is None or self.return_mode == "last_output":
            return None
        elif self.return_mode == "all_outputs":
            return mask  # (batch_size, input_length)
        else:
            # Return mode is output_and_memory
            # Mask memory corresponding to all the inputs that are masked, and do not mask the output
            # (batch_size, input_length + 1)
            return K.cast(K.concatenate([K.zeros_like(mask[:, :1]), mask]),
                          'uint8')

    def get_composer_input_shape(self, input_shape):
        # Takes concatenation of output and memory summary
        return (input_shape[0], self.output_dim * 2)

    def get_reader_input_shape(self, input_shape):
        return input_shape

    def build(self, input_shape):
        self.input_spec = [InputSpec(shape=input_shape)]
        input_dim = input_shape[-1]
        assert self.reader.return_sequences, "The reader has to return sequences!"
        reader_input_shape = self.get_reader_input_shape(input_shape)
        print >> sys.stderr, "NSE reader input shape:", reader_input_shape
        writer_input_shape = (input_shape[0], 1, self.output_dim * 2
                              )  # Will process one timestep at a time
        print >> sys.stderr, "NSE writer input shape:", writer_input_shape
        composer_input_shape = self.get_composer_input_shape(input_shape)
        print >> sys.stderr, "NSE composer input shape:", composer_input_shape
        self.reader.build(reader_input_shape)
        self.writer.build(writer_input_shape)
        self.composer.build(composer_input_shape)

        # Aggregate weights of individual components for this layer.
        reader_weights = self.reader.trainable_weights
        writer_weights = self.writer.trainable_weights
        composer_weights = self.composer.trainable_weights
        self.trainable_weights = reader_weights + writer_weights + composer_weights

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

    def read(self, nse_input, input_mask=None):
        '''
        This method produces the 'read' output (equation 1 in the paper) for all timesteps
        and initializes the memory slot mem_0.

        Input: nse_input (batch_size, input_length, input_dim)
        Outputs:
            o (batch_size, input_length, output_dim)
            flattened_mem_0 (batch_size, input_length * output_dim)
 
        While this method simply copies input to mem_0, variants that inherit from this class can do
        something fancier.
        '''
        input_to_read = nse_input
        mem_0 = input_to_read
        flattened_mem_0 = K.batch_flatten(mem_0)
        o = self.reader.call(input_to_read, input_mask)
        o_mask = self.reader.compute_mask(input_to_read, input_mask)
        return o, [flattened_mem_0], o_mask

    @staticmethod
    def summarize_memory(o_t, mem_tm1):
        '''
        This method selects the relevant parts of the memory given the read output and summarizes the
        memory. Implements Equations 2-3 or 8-11 in the paper.
        '''
        # Selecting relevant memory slots, Equation 2
        z_t = K.softmax(K.sum(K.expand_dims(o_t, dim=1) * mem_tm1,
                              axis=2))  # (batch_size, input_length)
        # Summarizing memory, Equation 3
        m_rt = K.sum(K.expand_dims(z_t, dim=2) * mem_tm1,
                     axis=1)  # (batch_size, output_dim)
        return z_t, m_rt

    def compose_memory_and_output(self, output_memory_list):
        '''
        This method takes a list of tensors and applies the composition function on their concatenation.
        Implements equation 4 or 12 in the paper.
        '''
        # Composition, Equation 4
        c_t = self.composer.call(
            K.concatenate(output_memory_list))  # (batch_size, output_dim)
        return c_t

    def update_memory(self, z_t, h_t, mem_tm1):
        '''
        This method takes the attention vector (z_t), writer output (h_t) and previous timestep's memory (mem_tm1)
        and updates the memory. Implements equations 6, 14 or 15.
        '''
        tiled_z_t = K.tile(
            K.expand_dims(z_t),
            (self.output_dim))  # (batch_size, input_length, output_dim)
        input_length = K.shape(mem_tm1)[1]
        # (batch_size, input_length, output_dim)
        tiled_h_t = K.permute_dimensions(
            K.tile(K.expand_dims(h_t), (input_length)), (0, 2, 1))
        # Updating memory. First term in summation corresponds to selective forgetting and the second term to
        # selective addition. Equation 6.
        mem_t = mem_tm1 * (
            1 - tiled_z_t
        ) + tiled_h_t * tiled_z_t  # (batch_size, input_length, output_dim)
        return mem_t

    def compose_and_write_step(self, o_t, states):
        '''
        This method is a step function that updates the memory at each time step and produces
        a new output vector (Equations 2 to 6 in the paper).
        The memory_state is flattened because K.rnn requires all states to be of the same shape as the output,
        since it uses the same mask for the output and the states.
        Inputs:
            o_t (batch_size, output_dim)
            states (list[Tensor])
                flattened_mem_tm1 (batch_size, input_length * output_dim)
                writer_h_tm1 (batch_size, output_dim)
                writer_c_tm1 (batch_size, output_dim)

        Outputs:
            h_t (batch_size, output_dim)
            flattened_mem_t (batch_size, input_length * output_dim)
        '''
        flattened_mem_tm1, writer_h_tm1, writer_c_tm1 = states
        input_mem_shape = K.shape(flattened_mem_tm1)
        mem_tm1_shape = (input_mem_shape[0],
                         input_mem_shape[1] / self.output_dim, self.output_dim)
        mem_tm1 = K.reshape(
            flattened_mem_tm1,
            mem_tm1_shape)  # (batch_size, input_length, output_dim)
        z_t, m_rt = self.summarize_memory(o_t, mem_tm1)
        c_t = self.compose_memory_and_output([o_t, m_rt])
        # Collecting the necessary variables to directly call writer's step function.
        writer_constants = self.writer.get_constants(
            c_t)  # returns dropouts for W and U (all 1s, see init)
        writer_states = [writer_h_tm1, writer_c_tm1] + writer_constants
        # Making a call to writer's step function, Equation 5
        h_t, [_, writer_c_t] = self.writer.step(
            c_t, writer_states)  # h_t, writer_c_t: (batch_size, output_dim)
        mem_t = self.update_memory(z_t, h_t, mem_tm1)
        flattened_mem_t = K.batch_flatten(mem_t)
        return h_t, [flattened_mem_t, h_t, writer_c_t]

    def call(self, x, mask=None):
        # input_shape = (batch_size, input_length, input_dim). This needs to be defined in build.
        read_output, initial_memory_states, output_mask = self.read(x, mask)
        initial_write_states = self.writer.get_initial_states(
            read_output)  # h_0 and c_0 of the writer LSTM
        initial_states = initial_memory_states + initial_write_states
        # last_output: (batch_size, output_dim)
        # all_outputs: (batch_size, input_length, output_dim)
        # last_states:
        #       last_memory_state: (batch_size, input_length, output_dim)
        #       last_output
        #       last_writer_ct
        last_output, all_outputs, last_states = K.rnn(
            self.compose_and_write_step,
            read_output,
            initial_states,
            mask=output_mask)
        last_memory = last_states[0]
        if self.return_mode == "last_output":
            return last_output
        elif self.return_mode == "all_outputs":
            return all_outputs
        else:
            # return mode is output_and_memory
            expanded_last_output = K.expand_dims(
                last_output, dim=1)  # (batch_size, 1, output_dim)
            # (batch_size, 1+input_length, output_dim)
            return K.concatenate([expanded_last_output, last_memory], axis=1)

    def get_config(self):
        config = {
            'output_dim': self.output_dim,
            'input_length': self.input_length,
            'composer_activation': self.composer_activation,
            'return_mode': self.return_mode
        }
        base_config = super(NSE, self).get_config()
        config.update(base_config)
        return config
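
# A plain-NumPy sketch of the per-timestep memory update implemented above
# (Equations 2, 3 and 6), written out only to make the tensor shapes explicit.
# The writer LSTM step (Equation 5) is replaced by a simple stand-in here, so
# this is an illustration of the arithmetic, not the layer itself.
import numpy as np

batch_size, input_length, output_dim = 2, 5, 4
o_t = np.random.rand(batch_size, output_dim)                     # read output at time t
mem_tm1 = np.random.rand(batch_size, input_length, output_dim)   # previous memory

# Equation 2: attention over memory slots.
scores = np.sum(o_t[:, None, :] * mem_tm1, axis=2)               # (batch_size, input_length)
z_t = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)

# Equation 3: memory summary.
m_rt = np.sum(z_t[:, :, None] * mem_tm1, axis=1)                 # (batch_size, output_dim)

# Stand-in for the writer step (Equation 5), just to get an h_t of the right shape.
h_t = (o_t + m_rt) / 2.0                                         # (batch_size, output_dim)

# Equation 6: selective forgetting plus selective addition.
mem_t = mem_tm1 * (1 - z_t[:, :, None]) + h_t[:, None, :] * z_t[:, :, None]
assert mem_t.shape == (batch_size, input_length, output_dim)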
Example #4
class CNNEncoder(Layer):
    '''
    CNNEncoder is a combination of multiple convolution layers and max pooling layers. This is
    defined as a single layer to be consistent with the other encoders in terms of input and output
    specifications.  The input to this "layer" is of shape (batch_size, num_words, embedding_size)
    and the output is of size (batch_size, output_dim).

    The CNN has one convolution layer for each ngram filter size. Each convolution operation gives
    out a vector of size num_filters. The number of times a convolution layer will be applied
    depends on the ngram size: input_length - ngram_size + 1. The corresponding maxpooling layer
    aggregates all these outputs from the convolution layer and outputs the max.

    This operation is repeated for every ngram size passed, and consequently the dimensionality of
    the output after maxpooling is len(ngram_filter_sizes) * num_filters.

    We then use a fully connected layer to project it back to the desired output_dim.  For more
    details, refer to "A Sensitivity Analysis of (and Practitioners’ Guide to) Convolutional Neural
    Networks for Sentence Classification", Zhang and Wallace 2016, particularly Figure 1.
    '''
    def __init__(self, weights=None, **kwargs):
        self.supports_masking = True

        # This is the output dim for each convolutional layer, which is the same as the number of
        # "filters" learned by that layer.
        self.num_filters = kwargs.pop('num_filters')

        # This specifies both the number of convolutional layers we will create and their sizes.
        # Must be a List[int].  The default of (2, 3, 4, 5) will have four convolutional layers,
        # corresponding to encoding ngrams of size 2 to 5 with some number of filters.
        ngram_filter_sizes = kwargs.pop('ngram_filter_sizes', (2, 3, 4, 5))
        self.ngram_filter_sizes = ngram_filter_sizes

        self.output_dim = kwargs.pop('output_dim')

        conv_layer_activation = kwargs.pop('conv_layer_activation', 'relu')
        self.conv_layer_activation = conv_layer_activation

        self.l1_regularization = kwargs.pop("l1_regularization", None)
        self.l2_regularization = kwargs.pop("l2_regularization", None)
        self.regularizer = lambda: l1l2(l1=self.l1_regularization,
                                        l2=self.l2_regularization)

        # These are member variables that will be defined during self.build().
        self.convolution_layers = None
        self.max_pooling_layers = None
        self.projection_layer = None

        self.input_spec = [InputSpec(ndim=3)]
        self.initial_weights = weights
        super(CNNEncoder, self).__init__(**kwargs)

    def build(self, input_shape):
        input_length = input_shape[1]  # number of words
        # We define convolution, maxpooling and dense layers first.
        self.convolution_layers = [
            Convolution1D(nb_filter=self.num_filters,
                          filter_length=ngram_size,
                          activation=self.conv_layer_activation,
                          W_regularizer=self.regularizer(),
                          b_regularizer=self.regularizer())
            for ngram_size in self.ngram_filter_sizes
        ]
        self.max_pooling_layers = [
            MaxPooling1D(pool_length=input_length - ngram_size + 1)
            for ngram_size in self.ngram_filter_sizes
        ]
        self.projection_layer = Dense(self.output_dim)
        # Building all layers because these sub-layers are not explicitly part of the computational graph.
        for convolution_layer, max_pooling_layer in zip(
                self.convolution_layers, self.max_pooling_layers):
            convolution_layer.build(input_shape)
            max_pooling_layer.build(
                convolution_layer.get_output_shape_for(input_shape))
        maxpool_output_dim = self.num_filters * len(self.ngram_filter_sizes)
        projection_input_shape = (input_shape[0], maxpool_output_dim)
        self.projection_layer.build(projection_input_shape)
        # Defining the weights of this "layer" as the set of weights from all convolution
        # and maxpooling layers.
        self.trainable_weights = []
        for layer in self.convolution_layers + self.max_pooling_layers + [
                self.projection_layer
        ]:
            self.trainable_weights.extend(layer.trainable_weights)

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

        super(CNNEncoder, self).build(input_shape)

    def call(self, x, mask=None):
        # Each convolution layer returns output of size (samples, pool_length, num_filters),
        #       where pool_length = num_words - ngram_size + 1
        # Each maxpooling layer returns output of size (samples, 1, num_filters).
        # We need to flatten to remove the second dimension of length 1 from the maxpooled output.
        filter_outputs = [
            K.batch_flatten(
                max_pooling_layer.call(convolution_layer.call(x, mask)))
            for max_pooling_layer, convolution_layer in zip(
                self.max_pooling_layers, self.convolution_layers)
        ]
        maxpool_output = merge(
            filter_outputs,
            mode='concat') if len(filter_outputs) > 1 else filter_outputs[0]
        return self.projection_layer.call(maxpool_output)

    def get_output_shape_for(self, input_shape):
        return (input_shape[0], self.output_dim)

    def compute_mask(self, input, input_mask=None):  # pylint: disable=redefined-builtin
        # By default Keras propagates the mask from a layer that supports masking. We don't need it
        # anymore. So eliminating it from the flow.
        return None

    def get_config(self):
        config = {
            "output_dim": self.output_dim,
            "num_filters": self.num_filters,
            "ngram_filter_sizes": self.ngram_filter_sizes,
            "conv_layer_activation": self.conv_layer_activation,
            "l1_regularization": self.l1_regularization,
            "l2_regularization": self.l2_regularization,
        }
        base_config = super(CNNEncoder, self).get_config()
        config.update(base_config)
        return config
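
# A small NumPy sketch of the shape bookkeeping described in the docstring above:
# each ngram size yields input_length - ngram_size + 1 convolution positions, max
# pooling keeps one num_filters-sized vector per ngram size, and the concatenation
# has len(ngram_filter_sizes) * num_filters features. Numbers are made up for
# illustration only.
import numpy as np

input_length, num_filters = 10, 6
ngram_filter_sizes = (2, 3, 4, 5)

pooled = []
for ngram_size in ngram_filter_sizes:
    conv_positions = input_length - ngram_size + 1
    conv_out = np.random.rand(1, conv_positions, num_filters)  # (batch, positions, filters)
    pooled.append(conv_out.max(axis=1))                        # max over time -> (batch, filters)

maxpool_output = np.concatenate(pooled, axis=-1)
assert maxpool_output.shape == (1, len(ngram_filter_sizes) * num_filters)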
Example #5
class CNNEncoder(MaskedLayer):
    '''
    CNNEncoder is a combination of multiple convolution layers and max pooling layers. This is
    defined as a single layer to be consistent with the other encoders in terms of input and output
    specifications.  The input to this "layer" is of shape (batch_size, num_words, embedding_dim)
    and the output is of size (batch_size, output_dim).

    The CNN has one convolution layer for each ngram filter size. Each convolution operation gives
    out a vector of size num_filters. The number of times a convolution layer will be applied
    depends on the ngram size: input_length - ngram_size + 1. The corresponding maxpooling layer
    aggregates all these outputs from the convolution layer and outputs the max.

    This operation is repeated for every ngram size passed, and consequently the dimensionality of
    the output after maxpooling is len(ngram_filter_sizes) * num_filters.

    We then use a fully connected layer to project it back to the desired output_dim.  For more
    details, refer to "A Sensitivity Analysis of (and Practitioners’ Guide to) Convolutional Neural
    Networks for Sentence Classification", Zhang and Wallace 2016, particularly Figure 1.

    Parameters
    ----------
    units: int
        After doing convolutions, we'll project the collected features into a vector of this size.
        This used to be ``output_dim``, but Keras changed it to ``units``.  I prefer the name
        ``output_dim``, so we'll leave the code using ``output_dim``, and just use the name
        ``units`` in the external API.
    num_filters: int
        This is the output dim for each convolutional layer, which is the same as the number of
        "filters" learned by that layer.
    ngram_filter_sizes: Tuple[int], optional (default=(2, 3, 4, 5))
        This specifies both the number of convolutional layers we will create and their sizes.  The
        default of (2, 3, 4, 5) will have four convolutional layers, corresponding to encoding
        ngrams of size 2 to 5 with some number of filters.
    conv_layer_activation: str, optional (default='relu')
    l1_regularization: float, optional (default=None)
    l2_regularization: float, optional (default=None)
    '''
    def __init__(self,
                 units: int,
                 num_filters: int,
                 ngram_filter_sizes: Tuple[int] = (2, 3, 4, 5),
                 conv_layer_activation: str = 'relu',
                 l1_regularization: float = None,
                 l2_regularization: float = None,
                 **kwargs):
        self.num_filters = num_filters
        self.ngram_filter_sizes = ngram_filter_sizes
        self.output_dim = units
        self.conv_layer_activation = conv_layer_activation
        self.l1_regularization = l1_regularization
        self.l2_regularization = l2_regularization
        self.regularizer = lambda: l1_l2(l1=self.l1_regularization,
                                         l2=self.l2_regularization)

        # These are member variables that will be defined during self.build().
        self.convolution_layers = None
        self.max_pooling_layers = None
        self.projection_layer = None

        self.input_spec = [InputSpec(ndim=3)]
        super(CNNEncoder, self).__init__(**kwargs)

    @overrides
    def build(self, input_shape):
        input_length = input_shape[1]  # number of words
        # We define convolution, maxpooling and dense layers first.
        self.convolution_layers = [
            Convolution1D(filters=self.num_filters,
                          kernel_size=ngram_size,
                          activation=self.conv_layer_activation,
                          kernel_regularizer=self.regularizer(),
                          bias_regularizer=self.regularizer())
            for ngram_size in self.ngram_filter_sizes
        ]
        self.max_pooling_layers = [
            MaxPooling1D(pool_size=input_length - ngram_size + 1)
            for ngram_size in self.ngram_filter_sizes
        ]
        self.projection_layer = Dense(self.output_dim)
        # Building all layers because these sub-layers are not explicitly part of the computational graph.
        for convolution_layer, max_pooling_layer in zip(
                self.convolution_layers, self.max_pooling_layers):
            with K.name_scope(convolution_layer.name):
                convolution_layer.build(input_shape)
            with K.name_scope(max_pooling_layer.name):
                max_pooling_layer.build(
                    convolution_layer.compute_output_shape(input_shape))
        maxpool_output_dim = self.num_filters * len(self.ngram_filter_sizes)
        projection_input_shape = (input_shape[0], maxpool_output_dim)
        with K.name_scope(self.projection_layer.name):
            self.projection_layer.build(projection_input_shape)
        # Defining the weights of this "layer" as the set of weights from all convolution
        # and maxpooling layers.
        self.trainable_weights = []
        for layer in self.convolution_layers + self.max_pooling_layers + [
                self.projection_layer
        ]:
            self.trainable_weights.extend(layer.trainable_weights)

        super(CNNEncoder, self).build(input_shape)

    @overrides
    def call(self, inputs, mask=None):  # pylint: disable=unused-argument
        # Each convolution layer returns output of size (samples, pool_length, num_filters),
        #       where pool_length = num_words - ngram_size + 1
        # Each maxpooling layer returns output of size (samples, 1, num_filters).
        # We need to flatten to remove the second dimension of length 1 from the maxpooled output.
        # TODO(matt): we need to use a convolutional layer here that supports masking.
        filter_outputs = [
            K.batch_flatten(
                max_pooling_layer.call(convolution_layer.call(inputs)))
            for max_pooling_layer, convolution_layer in zip(
                self.max_pooling_layers, self.convolution_layers)
        ]
        if K.backend() == 'theano':
            # Just using the `call` method on layers does not set the _keras_shape, which is
            # necessary with the theano backend.  So we set it manually here to what we expect the
            # shape to be.
            for filter_output in filter_outputs:
                filter_output._keras_shape = (None, self.num_filters)  # pylint: disable=protected-access
        maxpool_output = Concatenate()(
            filter_outputs) if len(filter_outputs) > 1 else filter_outputs[0]
        return self.projection_layer.call(maxpool_output)

    @overrides
    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_dim)

    @overrides
    def compute_mask(self, inputs, mask=None):  # pylint: disable=unused-argument
        # By default Keras propagates the mask from a layer that supports masking. We don't need it
        # anymore. So eliminating it from the flow.
        return None

    @overrides
    def get_config(self):
        config = {
            "units": self.output_dim,
            "num_filters": self.num_filters,
            "ngram_filter_sizes": self.ngram_filter_sizes,
            "conv_layer_activation": self.conv_layer_activation,
            "l1_regularization": self.l1_regularization,
            "l2_regularization": self.l2_regularization,
        }
        base_config = super(CNNEncoder, self).get_config()
        config.update(base_config)
        return config
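
# A quick NumPy illustration of the penalty that keras.regularizers.l1_l2 adds per
# weight tensor: l1 * sum(|w|) + l2 * sum(w**2). Wrapping it in a lambda above is
# presumably so that each Convolution1D gets its own regularizer instance rather
# than sharing one object across layers (an assumption, not stated in the code).
import numpy as np

def l1_l2_penalty(w, l1=0.0, l2=0.0):
    return l1 * np.abs(w).sum() + l2 * np.square(w).sum()

w = np.array([[0.5, -1.0], [2.0, 0.0]])
print(l1_l2_penalty(w, l1=0.01, l2=0.001))  # 0.01 * 3.5 + 0.001 * 5.25 = 0.04025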
Example #6
class NSE(Layer):
    '''
    Simple Neural Semantic Encoder.
    '''
    def __init__(self,
                 output_dim,
                 input_length=None,
                 composer_activation='linear',
                 return_mode='last_output',
                 weights=None,
                 **kwargs):
        '''
        Arguments:
        output_dim (int)
        input_length (int)
        composer_activation (str): activation used in the MLP
        return_mode (str): One of last_output, all_outputs, output_and_memory
            This is analogous to the return_sequences flag in Keras' Recurrent.
            last_output returns only the last h_t
            all_outputs returns the whole sequence of h_ts
            output_and_memory returns the last output and the last memory concatenated
                (needed if this layer is followed by an MMA-NSE)
        weights (list): Initial weights
        '''
        self.output_dim = output_dim
        self.input_dim = output_dim  # Equation 2 in the paper makes this assumption.
        self.initial_weights = weights
        self.input_spec = [InputSpec(ndim=3)]
        self.input_length = input_length
        self.composer_activation = composer_activation
        super(NSE, self).__init__(**kwargs)
        self.reader = LSTM(self.output_dim,
                           dropout_W=0.0,
                           dropout_U=0.0,
                           consume_less="gpu",
                           name="{}_reader".format(self.name))
        # TODO: Let the writer use parameter dropout and any consume_less mode.
        # Setting dropout to 0 here to eliminate the need for constants.
        # Setting consume_less to gpu to eliminate need for preprocessing
        self.writer = LSTM(self.output_dim,
                           dropout_W=0.0,
                           dropout_U=0.0,
                           consume_less="gpu",
                           name="{}_writer".format(self.name))
        self.composer = Dense(self.output_dim * 2,
                              activation=self.composer_activation,
                              name="{}_composer".format(self.name))
        if return_mode not in [
                "last_output", "all_outputs", "output_and_memory"
        ]:
            raise Exception("Unrecognized return mode: %s" % (return_mode))
        print("vj golden NSE.__init__ return_mode is {}".format(return_mode))
        self.return_mode = return_mode

    def get_output_shape_for(self, input_shape):
        input_length = input_shape[1]
        if self.return_mode == "last_output":
            return (input_shape[0], self.output_dim)
        elif self.return_mode == "all_outputs":
            return (input_shape[0], input_length, self.output_dim)
        else:
            # return_mode is output_and_memory. Output will be concatenated to memory.
            return (input_shape[0], input_length + 1, self.output_dim)

    def compute_mask(self, input, mask):
        if mask is None or self.return_mode == "last_output":
            return None
        elif self.return_mode == "all_outputs":
            return mask  # (batch_size, input_length)
        else:
            # Return mode is output_and_memory
            # Mask memory corresponding to all the inputs that are masked, and do not mask the output
            # (batch_size, input_length + 1)
            return K.cast(K.concatenate([K.zeros_like(mask[:, :1]), mask]),
                          'uint8')

    def get_composer_input_shape(self, input_shape):
        # Takes concatenation of output and memory summary
        return (input_shape[0], self.output_dim * 2)

    def get_reader_input_shape(self, input_shape):
        return input_shape

    def build(self, input_shape):
        self.input_spec = [InputSpec(shape=input_shape)]
        input_dim = input_shape[-1]
        reader_input_shape = self.get_reader_input_shape(input_shape)
        print >> sys.stderr, "NSE reader input shape:", reader_input_shape
        writer_input_shape = (input_shape[0], 1, self.output_dim * 2
                              )  # Will process one timestep at a time
        print >> sys.stderr, "NSE writer input shape:", writer_input_shape
        composer_input_shape = self.get_composer_input_shape(input_shape)
        print >> sys.stderr, "NSE composer input shape:", composer_input_shape
        self.reader.build(reader_input_shape)
        self.writer.build(writer_input_shape)
        self.composer.build(composer_input_shape)

        # Aggregate weights of individual components for this layer.
        reader_weights = self.reader.trainable_weights
        writer_weights = self.writer.trainable_weights
        composer_weights = self.composer.trainable_weights
        self.trainable_weights = reader_weights + writer_weights + composer_weights

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

    def get_initial_states(self, nse_input, input_mask=None):
        '''
        This method produces the initial states of the reader LSTM
        and initializes the memory slot mem_0.

        Input: nse_input (batch_size, input_length, input_dim)
        Output: list[Tensors]:
                h_0 (batch_size, output_dim)
                c_0 (batch_size, output_dim)
                flattened_mem_0 (batch_size, input_length * output_dim)
 
        While this method simply copies input to mem_0, variants that inherit from this class can do
        something fancier.
        '''
        input_to_read = nse_input
        mem_0 = input_to_read
        flattened_mem_0 = K.batch_flatten(mem_0)
        flattened_mem_0 = TF_PRINT(flattened_mem_0,
                                   "get_initial_states.flattened_mem_0",
                                   expected_shape=[BATCH, LENGTH * DIM])
        initial_states = self.reader.get_initial_states(nse_input)

        initial_states += [flattened_mem_0]

        return initial_states

    @staticmethod
    def summarize_memory(o_t, mem_tm1):
        '''
        This method selects the relevant parts of the memory given the read output and summarizes the
        memory. Implements Equations 2-3 or 8-11 in the paper.
        '''
        # Selecting relevant memory slots, Equation 2
        z_t = K.softmax(K.sum(K.expand_dims(o_t, dim=1) * mem_tm1,
                              axis=2))  # (batch_size, input_length)
        z_t = TF_PRINT(z_t,
                       "summarize_memory.z_t",
                       expected_shape=[BATCH, LENGTH])

        # Summarizing memory, Equation 3
        m_rt = K.sum(K.expand_dims(z_t, dim=2) * mem_tm1,
                     axis=1)  # (batch_size, output_dim)
        m_rt = TF_PRINT(m_rt,
                        "summarize_memory.m_rt",
                        expected_shape=[BATCH, DIM])
        return z_t, m_rt

    def compose_memory_and_output(self, output_memory_list):
        '''
        This method takes a list of tensors and applies the composition function on their concatenation.
        Implements equation 4 or 12 in the paper.
        '''
        # Composition, Equation 4
        c_t = self.composer.call(
            K.concatenate(output_memory_list))  # (batch_size, output_dim)
        c_t = TF_PRINT(c_t,
                       "compose_memory_and_output.c_t",
                       expected_shape=[BATCH, DIM])
        return c_t

    def update_memory(self, z_t, h_t, mem_tm1):
        '''
        This method takes the attention vector (z_t), writer output (h_t) and previous timestep's memory (mem_tm1)
        and updates the memory. Implements equations 6, 14 or 15.
        '''
        """ 
        The following is written assuming the equations in the paper are implemented as they are written:
        tiled_z_t_trans = K.tile(K.expand_dims(z_t,1), [1,self.output_dim,1])  # (batch_size, input_length, output_dim)
        input_length = K.shape(mem_tm1)[1]
        # (batch_size, input_length, output_dim)
#        tiled_h_t = K.permute_dimensions(K.tile(K.expand_dims(h_t, -1), [1,input_length]), (0, 2, 1))
        tiled_h_t = K.tile(K.expand_dims(h_t, -1), [1,1, input_length])
# Updating memory. First term in summation corresponds to selective forgetting and the second term to
        # selective addition. Equation 6.
        mem_t = mem_tm1 * (1 - tiled_z_t_trans) + tiled_h_t * tiled_z_t_trans  # (batch_size, input_length, output_dim)
        """
        """ 
        The following code assumes that mem_t is actually the transpose of what is in the paper.
        Implemented by simply wrapping a K.permute_dimensions(_, (0, 2, 1)) call around the original value.
        """
        tiled_z_t = K.permute_dimensions(
            K.tile(K.expand_dims(z_t, 1), [1, self.output_dim, 1]),
            (0, 2, 1))  # (batch_size, input_length, output_dim)
        input_length = K.shape(mem_tm1)[1]
        # (batch_size, input_length, output_dim)
        #        tiled_h_t = K.permute_dimensions(K.tile(K.expand_dims(h_t, -1), [1,input_length]), (0, 2, 1))
        tiled_h_t = K.permute_dimensions(
            K.tile(K.expand_dims(h_t, -1), [1, 1, input_length]), (0, 2, 1))

        # Updating memory. First term in summation corresponds to selective forgetting and the second term to
        # selective addition. Equation 6.
        mem_t = mem_tm1 * (
            1 - tiled_z_t
        ) + tiled_h_t * tiled_z_t  # (batch_size, input_length, output_dim)
        mem_t = TF_PRINT(mem_t,
                         "update_memory.mem_t",
                         expected_shape=[BATCH, LENGTH, DIM])

        return mem_t

    @staticmethod
    def split_states(states):
        # This method is a helper for the step function to split the states into reader states, memory and
        # writer states.
        return states[:2], states[2], states[3:]

    def step(self, input_t, states):
        '''
        This method is a step function that updates the memory at each time step and produces
        a new output vector (Equations 1 to 6 in the paper).
        The memory_state is flattened because K.rnn requires all states to be of the same shape as the output,
        since it uses the same mask for the output and the states.
        Inputs:
            input_t (batch_size, input_dim)
            states (list[Tensor])
                flattened_mem_tm1 (batch_size, input_length * output_dim)
                writer_h_tm1 (batch_size, output_dim)
                writer_c_tm1 (batch_size, output_dim)

        Outputs:
            h_t (batch_size, output_dim)
            flattened_mem_t (batch_size, input_length * output_dim)
        '''
        input_t = TF_PRINT(input_t,
                           "step.input_t",
                           expected_shape=[BATCH, DIM])

        reader_states, flattened_mem_tm1, writer_states = self.split_states(
            states)
        input_mem_shape = K.shape(flattened_mem_tm1)
        mem_tm1_shape = (input_mem_shape[0],
                         input_mem_shape[1] / self.output_dim, self.output_dim)

        mem_tm1 = K.reshape(
            flattened_mem_tm1,
            mem_tm1_shape)  # (batch_size, input_length, output_dim)
        mem_tm1 = TF_PRINT(mem_tm1,
                           "step.mem_tm1",
                           expected_shape=[BATCH, LENGTH, DIM])

        reader_constants = self.reader.get_constants(
            input_t)  # Does not depend on input_t, see init.
        reader_states = reader_states[:2] + tuple(
            reader_constants) + reader_states[2:]
        o_t, [_, reader_c_t] = self.reader.step(
            input_t,
            reader_states)  # o_t, reader_c_t: (batch_size, output_dim)

        o_t = TF_PRINT(o_t, "step.o_t", expected_shape=[BATCH, DIM])
        reader_c_t = TF_PRINT(reader_c_t,
                              "step.reader_c_t",
                              expected_shape=[BATCH, DIM])

        z_t, m_rt = self.summarize_memory(o_t, mem_tm1)
        c_t = self.compose_memory_and_output([o_t, m_rt])

        # Collecting the necessary variables to directly call writer's step function.
        writer_constants = self.writer.get_constants(
            c_t)  # returns dropouts for W and U (all 1s, see init)
        writer_states += tuple(writer_constants)

        # Making a call to writer's step function, Equation 5
        h_t, [_, writer_c_t] = self.writer.step(
            c_t, writer_states)  # h_t, writer_c_t: (batch_size, output_dim)

        h_t = TF_PRINT(h_t, "step.h_t", expected_shape=[BATCH, DIM])
        writer_c_t = TF_PRINT(writer_c_t,
                              "step.writer_c_t",
                              expected_shape=[BATCH, DIM])

        mem_t = self.update_memory(z_t, h_t, mem_tm1)

        flattened_mem_t = K.batch_flatten(mem_t)
        flattened_mem_t = TF_PRINT(flattened_mem_t,
                                   "step.flattened_mem_t",
                                   expected_shape=[BATCH, LENGTH * DIM])

        return h_t, [o_t, reader_c_t, flattened_mem_t, h_t, writer_c_t]

    def loop(self, x, initial_states, mask):
        # This is a separate method because Ontoaware variants will have to override this to make a call
        # to changingdim rnn.

        last_output, all_outputs, last_states = K.rnn(self.step,
                                                      x,
                                                      initial_states,
                                                      mask=mask)
        last_output = TF_PRINT(last_output, "loop.last_output")
        all_outputs = TF_PRINT(all_outputs, "loop.all_outputs")
        #        last_states = TF_PRINT(last_states, "loop.last_states")
        return last_output, all_outputs, last_states

    def call(self, x, mask=None):
        # input_shape = (batch_size, input_length, input_dim). This needs to be defined in build.
        if mask is not None:
            print("vj golden call.mask ={}. Being set to None.".format(mask))
            mask = None
        initial_read_states = self.get_initial_states(x, mask)

        fake_writer_input = K.expand_dims(initial_read_states[0],
                                          dim=1)  # (batch_size, 1, output_dim)
        fake_writer_input = TF_PRINT(fake_writer_input,
                                     "call.fake_writer_input",
                                     expected_shape=[BATCH, 1, DIM])

        initial_write_states = self.writer.get_initial_states(
            fake_writer_input)  # h_0 and c_0 of the writer LSTM
        initial_states = initial_read_states + initial_write_states

        # last_output: (batch_size, output_dim)
        # all_outputs: (batch_size, input_length, output_dim)
        # last_states:
        #       last_memory_state: (batch_size, input_length, output_dim)
        #       last_output
        #       last_writer_ct
        last_output, all_outputs, last_states = self.loop(
            x, initial_states, mask)
        last_memory = last_states[0]

        if self.return_mode == "last_output":
            return last_output
        elif self.return_mode == "all_outputs":
            return all_outputs
        else:
            # return mode is output_and_memory
            expanded_last_output = K.expand_dims(
                last_output, dim=1)  # (batch_size, 1, output_dim)
            expanded_last_output = TF_PRINT(expanded_last_output,
                                            "call.expanded_last_output",
                                            expected_size=[BATCH, 1, DIM])
            # (batch_size, 1+input_length, output_dim)
            result = K.concatenate([expanded_last_output, last_memory], axis=1)
            result = TF_PRINT(result,
                              "call.result",
                              expected_size=[BATCH, 1 + LENGTH, DIM])
            return result

    def get_config(self):
        config = {
            'output_dim': self.output_dim,
            'input_length': self.input_length,
            'composer_activation': self.composer_activation,
            'return_mode': self.return_mode
        }
        base_config = super(NSE, self).get_config()
        config.update(base_config)
        return config
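
# A tiny NumPy sketch of the flatten/reshape round trip the step function above
# relies on: K.rnn uses the same mask for the output and the states, so the
# (input_length, output_dim) memory is carried between timesteps as a flat
# (input_length * output_dim) vector and reshaped back inside step().
import numpy as np

batch_size, input_length, output_dim = 2, 5, 4
mem = np.random.rand(batch_size, input_length, output_dim)
flattened_mem = mem.reshape(batch_size, -1)                             # K.batch_flatten analogue
restored = flattened_mem.reshape(batch_size, input_length, output_dim)  # as in step()
assert np.allclose(mem, restored)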
Example #7
class AttentionTransformer(Layer):
    """
    Keras implementation of the multihead attention layers in tensorflow, adapted from
         https://github.com/Kyubyong/transformer

    3 inputs - queries, keys, values (in this order)
    generally: [batch size; length of sequence; features vector]
    queries: A 3d tensor with shape of [N_batches, T_q, C_q].
    keys: A 3d tensor with shape of [N_batches, T_k, C_k].
    values: A 3d tensor with shape of [N_batches, T_v, C_v].
    If called with one input, assumes keys = queries = values, as in "Attention Is All You Need".
    """
    
    def __init__(self, usesoftmax=True, num_units=None, num_heads=8, dropout_rate=0, activation='relu', causality=False,
                 usequerymasks=True, **kwargs):
        self.activation = activation
        self.num_units = num_units
        self.num_heads = num_heads
        self.dropout_rate = dropout_rate
        self.causality = causality
        self.usesoftmax = usesoftmax
        self.usequerymasks = usequerymasks
        Layer.__init__(self, **kwargs)
    
    def get_config(self):
        config = {'activation': self.activation,
                  'num_units': self.num_units,
                  'num_heads': self.num_heads,
                  'dropout_rate': self.dropout_rate,
                  'causality': self.causality,
                  'usesoftmax': self.usesoftmax,
                  'usequerymasks': self.usequerymasks,
                  }
        base_config = super(AttentionTransformer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
    
    def build(self, input_shape):
        (queries, keys, values) = self._care_inputs(input_shape)
        queries = list(queries)
        keys = list(keys)
        values = list(values)
        if self.num_units is None:
            self.num_units = queries[-1]
        # we will now accept inputs as sequences, so if something is not a sequence it IS a sequence of len 1
        if len(queries) <= 2:
            queries.insert(-1, 1)
        if len(keys) <= 2:
            keys.insert(-1, 1)
        if len(values) <= 2:
            values.insert(-1, 1)
        
        self.Q_dense = Dense(self.num_units, activation=self.activation, name="Q_dense")
        self.Q_dense.build(queries)
        self.K_dense = Dense(self.num_units, activation=self.activation, name="K_dense")
        self.K_dense.build(keys)
        self.V_dense = Dense(self.num_units, activation=self.activation, name="V_dense")
        self.V_dense.build(values)
        
        self.trainable_weights = self.Q_dense.trainable_weights + self.K_dense.trainable_weights + \
                                 self.V_dense.trainable_weights
        self.non_trainable_weights = self.Q_dense.non_trainable_weights + self.K_dense.non_trainable_weights + \
                                     self.V_dense.non_trainable_weights
        
        self.dropout = Dropout(rate=self.dropout_rate)
        self.built = True
    
    # a hint about the Keras implementation: it is all called in the sequence: build, compute_output_shape, call
    def _care_inputs(self, inputs):
        inputs = copy.copy(inputs)
        if (isinstance(inputs, list)):
            while (len(inputs) < 3):
                inputs.append(inputs[-1])
            inputs = inputs[0:3]
        else:
            inputs = [inputs, inputs, inputs]
        return inputs
    
    def compute_output_shape(self, input_shape):
        (queries, keys, values) = self._care_inputs(input_shape)
        # assert input_shape and len(input_shape) >= 2
        # assert input_shape[-1]
        output_shape = list(queries)
        output_shape[-1] = self.num_units  # (N, T_q, C); num_units defaults to C_q if unspecified by the user
        return tuple(output_shape)
    
    def call(self, inputs, training=None):
        # expects 3 inputs as merge layer https://github.com/keras-team/keras/blob/master/keras/layers/merge.py
        (queries, keys, values) = self._care_inputs(inputs)
        if self.num_units is None:  # done in build too
            self.num_units = queries.get_shape().as_list()[-1]
        # we will now accept inputs as sequences, so if something is not a sequence it IS a sequence of len 1
        if len(queries.shape) <= 2:
            queries = tf.expand_dims(queries, -2)
        if len(keys.shape) <= 2:
            keys = tf.expand_dims(keys, -2)
        if len(values.shape) <= 2:
            values = tf.expand_dims(values, -2)
        Q = self.Q_dense.call(queries)  # calling .call directly is one way to use a layer inside another layer
        K = self.K_dense.call(keys)
        V = self.V_dense.call(values)
        if len(Q.shape) <= 2:
            Q = tf.expand_dims(Q, -2)
        if len(K.shape) <= 2:
            K = tf.expand_dims(K, -2)
        if len(V.shape) <= 2:
            V = tf.expand_dims(V, -2)
        return self.multihead_attention_mechanism(Q, K, V,
                                                  queries=queries, keys=keys,
                                                  num_heads=self.num_heads,
                                                  causality=self.causality,
                                                  usequerymasks=self.usequerymasks,
                                                  scope="multihead_attention",
                                                  usesoftmax=self.usesoftmax,
                                                  reuse=None)
    
    def normalize(self, inputs,
                  epsilon=1e-8,
                  scope="ln",
                  reuse=None):
        """Applies layer normalization.

        Args:
        ----
          inputs: A tensor with 2 or more dimensions, where the first dimension has
            `batch_size`.
          epsilon: A floating number. A very small number for preventing ZeroDivision Error.
          scope: Optional scope for `variable_scope`.
          reuse: Boolean, whether to reuse the weights of a previous layer
            by the same name.

        Returns
        -------
          A tensor with the same shape and data dtype as `inputs`.
        """
        with tf.variable_scope(scope, reuse=reuse):
            inputs_shape = inputs.get_shape()
            params_shape = inputs_shape[-1:]
            mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
            beta = tf.Variable(tf.zeros(params_shape))
            gamma = tf.Variable(tf.ones(params_shape))
            normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
            outputs = gamma * normalized + beta
        return outputs
    
    def multihead_attention_mechanism(self,
                                      Qinp, Kinp, Vinp,
                                      queries, keys,
                                      num_heads=8,
                                      causality=False,
                                      usequerymasks=True,
                                      scope="multihead_attention",
                                      usesoftmax=True,
                                      reuse=None):
        """Applies multihead attention mechanism. Just the computation eithout trainable weights.

        Args:
        ----
          queries: A 3d tensor with shape of [N, T_q, C_q].
          keys: A 3d tensor with shape of [N, T_k, C_k].
          causality: Boolean. If true, units that reference the future are masked.
          num_heads: An int. Number of heads.
          scope: Optional scope for `variable_scope`.
          reuse: Boolean, whether to reuse the weights of a previous layer
            by the same name.

        Returns
        -------
          A 3d tensor with shape of (N, T_q, C)
        """
        assert (len(Qinp.shape) + len(Kinp.shape) + len(Vinp.shape) > 3 * 2)
        with tf.variable_scope(scope, reuse=reuse):
            # Split and concat - for keras, the N dimension is HIDDEN, but in tf we see it!
            Q_ = tf.concat(tf.split(Qinp, num_heads, axis=2), axis=0)  # (h*N, T_q, C/h)
            K_ = tf.concat(tf.split(Kinp, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)
            V_ = tf.concat(tf.split(Vinp, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)
            # Multiplication                                     # T_q, T_k are the original queries and keys -
            # sequence lengths (and in the application they are the same)
            preoutputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # (h*N, T_q, T_k)
            # Scale
            preoutputs = preoutputs / (K_.get_shape().as_list()[-1] ** 0.5)
            # Key Masking
            key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1)))  # (N, T_k)
            key_masks = tf.tile(key_masks, [num_heads, 1])  # (h*N, T_k)
            key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1])  # (h*N, T_q, T_k)
            paddings = tf.ones_like(preoutputs) * (-2 ** 32 + 1)
            preoutputs = tf.where(tf.equal(key_masks, 0), paddings, preoutputs)  # (h*N, T_q, T_k)
            # Causality = Future blinding
            if causality:
                diag_vals = tf.ones_like(preoutputs[0, :, :])  # (T_q, T_k)
                tril = tf.contrib.linalg.LinearOperatorTriL(diag_vals).to_dense()  # (T_q, T_k)
                masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(preoutputs)[0], 1, 1])  # (h*N, T_q, T_k)
                paddings = tf.ones_like(masks) * (-2 ** 32 + 1)
                preoutputs = tf.where(tf.equal(masks, 0), paddings, preoutputs)  # (h*N, T_q, T_k)
            # Activation
            if (usesoftmax):
                preoutputs = tf.nn.softmax(preoutputs)  # (h*N, T_q, T_k)
            # Query Masking
            if usequerymasks:
                query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1)))  # (N, T_q)
                query_masks = tf.tile(query_masks, [num_heads, 1])  # (h*N, T_q)
                query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]])  # (h*N, T_q, T_k)
                preoutputs *= query_masks  # broadcasting. (h*N, T_q, T_k)
            outputs = self.dropout.call(preoutputs)
            # Weighted sum
            outputs = tf.matmul(outputs, V_)  # ( h*N, T_q, C/h)
            # Restore shape
            outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # (N, T_q, C)
            # Residual connection #still the same dimension
            outputs += queries
            # Normalize
            outputs = self.normalize(outputs)  # (N, T_q, C)
        return outputs
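
# A plain-NumPy sketch of the head split, scaled dot-product and head merge that
# multihead_attention_mechanism performs above. Key/query masking, causality,
# dropout and the residual + layer norm are omitted; shapes follow the comments
# in the original code.
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

N, T_q, T_k, C, h = 2, 3, 4, 8, 2            # batch, query len, key len, features, heads
Q = np.random.rand(N, T_q, C)
K_ = np.random.rand(N, T_k, C)
V = np.random.rand(N, T_k, C)

# Split the feature axis into h heads and stack the heads along the batch axis,
# matching tf.concat(tf.split(..., h, axis=2), axis=0).
split_heads = lambda x: np.concatenate(np.split(x, h, axis=2), axis=0)  # (h*N, T, C/h)
Qh, Kh, Vh = split_heads(Q), split_heads(K_), split_heads(V)

scores = Qh @ Kh.transpose(0, 2, 1) / np.sqrt(C // h)     # (h*N, T_q, T_k)
weights = softmax(scores, axis=-1)
heads = weights @ Vh                                      # (h*N, T_q, C/h)

# Merge the heads back: undo the batch-axis stacking and re-concatenate features.
out = np.concatenate(np.split(heads, h, axis=0), axis=2)  # (N, T_q, C)
assert out.shape == (N, T_q, C)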
Example #8
class CNNEncoder(Seq2VecEncoder):
    """ CNNEncoder is a combination of multiple convolutional layers and max
    pooling layers. This is defined as a single layer to be consistent with
    other encoders in terms of input and output specifications.

    Input shape: (batch_size, sequence_length, input_dim).
    Output shape: (batch_size, output_dim).

    The CNN has one convolution layer for each ngram filter size. Each
    convolution operation gives out a vector of size num_filters. The number
    of times a convolution layer will be applied depends on the ngram size:
    input_len - ngram_size + 1. The corresponding maxpooling layer aggregates
    all these outputs from the convolution layer and outputs the max.

    This operation is repeated for every ngram size passed, and consequently
    the dimensionality of the output after maxpooling is
    len(ngram_filter_sizes) * num_filters.

    We then use a fully connected layer to project it back to the desired
    output_dim.

    References: "A Sensitivity Analysis of (and Practitioners’ Guide to)
    Convolutional Neural Networks for Sentence Classification",
    Zhang and Wallace 2016, particularly Figure 1.

    Args:
        filters: Integer, the output dim for each convolutional layer.
        kernel_sizes: An integer tuple or list, the kernel sizes of the
            convolutional layers.
        units: After doing convolutions, we'll project the collected features
            into a vector of this size. If this value is `None`, just return the
            result of the max pooling.
        conv_layer_activation: string of convolutional layer `Activation`.
        l1_regularization: float.
        l2_regularization: float.
    """
    def __init__(self,
                 filters=100,
                 kernel_sizes=(2, 3, 4, 5),
                 units=None,
                 conv_layer_activation='relu',
                 l1_regularization=None,
                 l2_regularization=None,
                 **kwargs):
        self.filters = filters
        self.kernel_sizes = kernel_sizes
        self.units = units
        self.conv_layer_activation = conv_layer_activation
        self.l1_regularization = l1_regularization
        self.l2_regularization = l2_regularization
        self.regularizer = l1_l2(l1=self.l1_regularization,
                                 l2=self.l2_regularization)

        self.conv_layers = None
        self.projection_layer = None
        self.output_dim = None

        self.input_spec = [InputSpec(ndim=3)]
        super(CNNEncoder, self).__init__(**kwargs)

    def build(self, input_shape):
        self.conv_layers = [
            Conv1D(filters=self.filters,
                   kernel_size=kernel_size,
                   activation=self.conv_layer_activation,
                   kernel_regularizer=self.regularizer,
                   bias_regularizer=self.regularizer)
            for kernel_size in self.kernel_sizes
        ]
        for conv_layer in self.conv_layers:
            with K.name_scope(conv_layer.name):
                conv_layer.build(input_shape)
        maxpool_output_dim = self.filters * len(self.kernel_sizes)
        if self.units is not None:
            self.projection_layer = Dense(self.units)
            projection_input_shape = (input_shape[0], maxpool_output_dim)
            with K.name_scope(self.projection_layer.name):
                self.projection_layer.build(projection_input_shape)
            self.output_dim = self.units
            trainable_layers = self.conv_layers + [self.projection_layer]
        else:
            self.projection_layer = None
            self.output_dim = maxpool_output_dim
            trainable_layers = self.conv_layers
        # Define weights of this layer as the set of weights from all layers.
        self.trainable_weights = []
        for layer in trainable_layers:
            self.trainable_weights.extend(layer.trainable_weights)

        super(CNNEncoder, self).build(input_shape)

    def call(self, inputs, mask=None):
        # Each convolution layer returns output of size (batch_size, conv_length, filters),
        # where `conv_length = num_words - kernel_size + 1`. We then do max
        # pooling over the whole input sequence for each filter by simply using K.max,
        # giving a result tensor of shape (batch_size, filters), which then
        # gets projected using the projection layer.
        filter_outputs = [
            K.max(conv_layer.call(inputs), axis=1)
            for conv_layer in self.conv_layers
        ]
        maxpool_output = Concatenate()(filter_outputs) \
                         if len(filter_outputs) > 1 else filter_outputs[0]
        if self.projection_layer:
            result = self.projection_layer.call(maxpool_output)
        else:
            result = maxpool_output
        return result

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_dim)

    def compute_mask(self, inputs, mask=None):
        # By default Keras propagates the mask from a layer that supports masking. We don't need it
        # anymore. So eliminating it from the flow.
        return None

    def get_config(self):
        config = {
            "filters": self.filters,
            "kernel_sizes": self.kernel_sizes,
            "units": self.units,
            "conv_layer_activation": self.conv_layer_activation,
            "l1_regularization": self.l1_regularization,
            "l2_regularization": self.l2_regularization
        }
        base_config = super(CNNEncoder, self).get_config()
        config.update(base_config)
        return config
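
# A NumPy check of the pooling shortcut used in call() above: taking the max over
# the time axis of the convolution output gives the same result as the
# MaxPooling1D-over-the-full-length plus flatten combination used by the earlier
# CNNEncoder variants.
import numpy as np

conv_out = np.random.rand(2, 7, 5)                          # (batch, conv_length, filters)
via_k_max = conv_out.max(axis=1)                            # (batch, filters)
via_pool_then_flatten = conv_out.max(axis=1, keepdims=True).reshape(2, -1)
assert np.allclose(via_k_max, via_pool_then_flatten)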
Example #9
class Connected(Layer):
    """
    Darknet "connected" layer. Main difference vs. keras Dense layer is that 
    input also becomes flatten.
    The same as 
    
    ```
    def get_connected(params):
        activation = get_activation(params.get('activation', "linear"))
        def _connected(x):
            y = Flatten()(x)
            return Dense(params.get('output', 1), activation=activation)(y)
        
        return Lambda(_connected)
    ```
    
    - but also has weights.
    """
    def __init__(self, output=1, activation=None, batch_normalize=0, **kwargs):
        self.units = output
        self.batch_normalize = batch_normalize
        super(Connected, self).__init__(**kwargs)
        self.dense_layer = Dense(self.units, **kwargs)
        # TODO: axis check
        if self.batch_normalize:
            self.batchnorm_layer = BatchNormalization(scale=True, center=False)
        self.activation_layer = get_activation(activation)

    def build(self, input_shape):
        super(Connected, self).build(input_shape)
        densed_shape = (input_shape[0], np.prod(input_shape[1:]))
        self.dense_layer.build(densed_shape)
        if self.batch_normalize:
            densed_shape = self.dense_layer.compute_output_shape(densed_shape)
            self.batchnorm_layer.build(densed_shape)
        self.activation_layer.build(densed_shape)

    def call(self, x, training=None):
        flatten_inputs = K.batch_flatten(x)
        output = self.dense_layer.call(flatten_inputs)
        if self.batch_normalize:
            output = self.batchnorm_layer.call(output)
        output = self.activation_layer.call(output)
        return output

    def compute_output_shape(self, input_shape):
        dense_input_shape = (input_shape[0], np.prod(input_shape[1:]))
        shape = self.dense_layer.compute_output_shape(dense_input_shape)
        #if self.batch_normalize:
        #    shape = self.batch_normalize.compute_output_shape(shape)
        return shape

    def set_weights(self, weights):
        if self.batch_normalize:
            (weights, bias, scales, rolling_mean, rolling_variance) = weights
            self.dense_layer.set_weights((weights, bias))
            self.batchnorm_layer.set_weights(
                (scales, rolling_mean, rolling_variance))
        else:
            self.dense_layer.set_weights(weights)

    def get_weights(self):
        if self.batch_normalize:
            return self.dense_layer.get_weights(
            ) + self.batchnorm_layer.get_weights()
        return self.dense_layer.get_weights()
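
# A small NumPy check of the equivalence stated in the Connected docstring: the
# layer amounts to a flatten followed by a dense (fully connected) transform.
# All shapes and values here are made up for illustration.
import numpy as np

batch, h, w, c, units = 2, 3, 3, 2, 4
x = np.random.rand(batch, h, w, c)
W = np.random.rand(h * w * c, units)
b = np.random.rand(units)

flat = x.reshape(batch, -1)    # Flatten / K.batch_flatten
y = flat @ W + b               # Dense with linear activation
assert y.shape == (batch, units)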