Example #1
def test_model_var_inputs():
    # test a train graph that has additional inputs that do not need to be fed (e.g. variable state)
    n_features = 5
    embed_size = 4
    hidden_dim = 3
    seq_size = 3
    out_size = 2
    batch_size = 2

    x = tx.Input(np.random.random([batch_size, seq_size]),
                 n_units=seq_size,
                 dtype=tf.int32)
    y = tx.Input(np.random.random([batch_size, out_size]),
                 n_units=out_size,
                 dtype=tf.float32)
    lookup = tx.Lookup(x,
                       seq_size=seq_size,
                       embedding_shape=[n_features, embed_size])
    # seq = lookup.permute_batch_time()
    seq = tx.Transpose(lookup, [1, 0, 2])

    rnn1 = tx.RNN(seq, cell_config=tx.RNNCell.config(n_units=hidden_dim))
    y_ = tx.Linear(rnn1[seq_size - 1], n_units=out_size)

    # y_ = tx.Linear(tx.SeqConcat(lookup, seq_size=seq_size), n_units=out_size)

    # @tx.layer(n_units=2, dtype=tf.float32, name="loss")
    # def loss(pred, labels):
    #    return tx.mse(pred, labels)

    model = tx.Model(run_inputs=x,
                     run_outputs=y_,
                     train_inputs=[x, y],
                     train_outputs=y_,
                     train_loss=tx.MSE(y_, y))

    # model.draw("test.pdf")

    model.set_optimizer(tf.optimizers.SGD, lr=0.5)

    data1 = [[0, 1, 2], [2, 1, 0]]
    data2 = [[0., 1.], [1., 0.]]

    model.train_step(input_feed={x: data1, y: data2})
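
    # follow-up sketch, not part of the original test: because the RNN variable
    # state is not a fed input, repeated training steps only need x and y
    for _ in range(3):
        model.train_step(input_feed={x: data1, y: data2})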
Example #2
def test_attention_rnn_shape():
    """ test attention and rnn layers integration with shape inference
    """
    x1 = tx.Input(tf.ones([1, 2, 3]), n_units=3, name="x1")
    rnn1 = tx.RNN(x1,
                  cell_config=tx.LSTMCell.config(n_units=4),
                  n_units=4,
                  stateful=False)
    att = tx.MHAttention(rnn1, rnn1, rnn1, n_units=3)

    rnn1_res = rnn1()
    att_res = att()

    assert rnn1.n_units == 4
    assert rnn1.n_units == rnn1.cell.n_units
    assert tx.shape_equal(rnn1.shape[:-1], att.shape[:-1])
    assert att.shape[-1] == att.n_units

    assert tx.shape_equal(rnn1_res.shape[1:], rnn1.shape[1:])
    assert tx.shape_equal(att_res.shape[1:], att.shape[1:])
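
    # follow-up sketch, not part of the original test: assuming each layer's
    # static shape carries its n_units in the last dimension, the concrete
    # results can also be checked against n_units directly
    assert rnn1_res.shape[-1] == rnn1.n_units
    assert att_res.shape[-1] == att.n_units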
Example #3
def test_module_with_attention():
    """ Module + Attention integration
    This also tests Graph indirectly to check if we can add layers
    whose input layers are the same object (e.g. in self-attention)
    """

    x1 = tx.Input(tf.ones([1, 2, 3]), n_units=3, name="x1")
    rnn1 = tx.RNN(x1,
                  cell_config=tx.LSTMCell.config(n_units=4),
                  n_units=4,
                  stateful=False)
    att = tx.MHAttention(rnn1, rnn1, rnn1, n_units=3)
    m = tx.Module(inputs=x1, output=att, dependencies=rnn1.previous_state)
    g = tx.Graph.build(inputs=x1, outputs=m, add_missing_inputs=True)
    fn = g.as_function(ord_inputs=x1, ord_outputs=m)
    # Graph.compute returns a tuple of outputs
    out1 = g.compute(tf.ones([1, 2, 3]))
    # the compiled function returns the result directly
    out2 = fn(tf.ones([1, 2, 3]))

    assert tx.tensor_equal(out1[0], out2)
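
    # follow-up sketch, not part of the original test: with fixed weights and
    # stateful=False the module is deterministic, so another call to the
    # compiled function should reproduce the same result
    out3 = fn(tf.ones([1, 2, 3]))
    assert tx.tensor_equal(out2, out3)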
Example #4
kernel_u = tx.Concat(*kernel_u)
tx_kernel = tx.Merge(kernel_w,
                     kernel_u,
                     merge_fn=lambda l: tf.concat(l, axis=0))

# kernel = tx.Reshape(kernel, [-1, 4 * cell_units])

tf_zero_state = tf_cell.zero_state(batch_size, dtype=tf.float32)
# first call builds the TF cell variables so the kernel can be swapped below
tf_out, tf_state = tf_cell(t1.tensor, state=tf_zero_state)

# inject the tensorx kernel into the TensorFlow LSTM cell
tf_cell._kernel = tx_kernel
tf_out, tf_state = tf_cell(t1.tensor, state=tf_zero_state)

tx_rnn = tx.RNN(seq,
                cell_proto=lambda x, **kwargs: tx_cell.reuse_with(x, **kwargs),
                stateful=False)
tx_rnn = tx.Transpose(tx_rnn, [1, 0, 2])

# dynamic_rnn preserves the input format:
# time-major input produces time-major output, batch-major input produces batch-major output
tf_rnn, tf_state = tf.nn.dynamic_rnn(
    cell=tf_cell,
    inputs=lookup.tensor,
    sequence_length=None,
    initial_state=tf_zero_state,
    time_major=False,
)

with tf.Session() as sess:
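    # hypothetical completion, not part of the original excerpt (numpy imported
    # as np is assumed): initialize variables and check that the tensorx RNN
    # matches dynamic_rnn once both use the injected kernel; both outputs are
    # batch-major at this point
    sess.run(tf.global_variables_initializer())
    tx_out, tf_out_ = sess.run([tx_rnn.tensor, tf_rnn])
    np.testing.assert_array_almost_equal(tx_out, tf_out_)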
Example #5
    def __init__(self,
                 inputs,
                 labels,
                 vocab_size,
                 embed_dim,
                 h_dim,
                 embed_init=tx.zeros_init(),
                 logit_init=tx.glorot_uniform(),
                 num_h=1,
                 h_activation=tx.tanh,
                 h_init=tx.glorot_uniform(),
                 w_dropconnect=None,
                 u_dropconnect=None,
                 r_dropout=0.4,
                 y_dropout=0.4,
                 embed_dropout=0.3,
                 other_dropout=0.3,
                 l2_loss=False,
                 l2_weight=1e-5,
                 use_f_predict=False,
                 f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 embed_share=False,
                 logit_bias=False,
                 use_nce=False,
                 nce_samples=10,
                 skip_connections=False):
        if not isinstance(inputs, tx.Input):
            raise TypeError("inputs must be an Input layer")
        if not isinstance(labels, tx.Input):
            raise TypeError("labels must be an Input layer")
        self.inputs = inputs
        self.labels = labels

        if inputs.dtype not in (tf.int32, tf.int64):
            raise TypeError(
                "Invalid dtype for input: expected int32 or int64, "
                "got {}".format(inputs.dtype))

        if num_h < 0:
            raise ValueError("num_h must be >= 0")

        # ===============================================
        # RUN GRAPH
        # ===============================================
        var_reg = []

        with tf.name_scope("run"):
            # feature lookup

            embeddings = tx.Lookup(inputs,
                                   seq_size=None,
                                   lookup_shape=[vocab_size, embed_dim],
                                   weight_init=embed_init)
            var_reg.append(embeddings.weights)
            feature_lookup = embeddings.permute_batch_time()

            last_layer = feature_lookup

            cell_proto = tx.LSTMCell.proto(
                n_units=h_dim,
                activation=h_activation,
                gate_activation=tx.hard_sigmoid,
                w_init=h_init,
                u_init=h_init,
                w_dropconnect=w_dropconnect,
                u_dropconnect=u_dropconnect,
                r_dropout=r_dropout,
                x_dropout=None,
                y_dropout=y_dropout,
                regularized=False,
                name="cell",
            )

            lstm_layers = []
            for i in range(num_h):
                lstm_layer = tx.RNN(last_layer,
                                    cell_proto=cell_proto,
                                    regularized=False,
                                    stateful=True,
                                    name="LSTM_{}".format(i + 1))

                lstm_layers.append(lstm_layer)

                var_reg += [wi.weights for wi in lstm_layer.cell.w]
                var_reg += [ui.weights for ui in lstm_layer.cell.u]

                last_layer = lstm_layer

            # last time step is the state used to make the prediction
            # last_layer = tx.Reshape(last_layer, [-1, h_dim])

            # TODO this is not consistent with locked dropout for the last layer,
            # where the same mask should be applied across time steps.
            # To do this we need either y_dropout to be available here or some kind
            # of map operation that works with layers outputting 3D tensors,
            # i.e. something equivalent to https://keras.io/layers/wrappers/ which
            # applies a layer to every temporal slice of an input (they implement
            # this the same way they implement an RNN)

            # feature prediction for Energy-Based Model
            if use_f_predict:
                last_layer = tx.Linear(last_layer,
                                       embed_dim,
                                       f_init,
                                       add_bias=True,
                                       name="f_predict")
                # proto = tx.GRUCell.proto(n_units=embed_dim,
                #                          activation=h_activation,
                #                          gate_activation=tx.hard_sigmoid,
                #                          w_init=h_init,
                #                          u_init=h_init,
                #                          w_dropconnect=w_dropconnect,
                #                          u_dropconnect=u_dropconnect,
                #                          r_dropout=r_dropout,
                #                          x_dropout=None,
                #                          y_dropout=y_dropout,
                #                          regularized=False)
                # last_layer1 = tx.RNN(last_layer, cell_proto=proto, regularized=False, stateful=False)
                # last_layer2 = last_layer1.reuse_with(last_layer, reverse=True)
                # last_layer = tx.Add(last_layer1, last_layer2)
                # last_layer = tx.Module(last_layer, last_layer)
                var_reg += last_layer.variables
                # var_reg.append(last_layer.weights)
                f_predict = last_layer

            shared_weights = feature_lookup.weights if embed_share else None
            transpose_weights = embed_share
            logit_init = logit_init if not embed_share else None
            run_logits = tx.Linear(last_layer,
                                   n_units=vocab_size,
                                   weight_init=logit_init,
                                   shared_weights=shared_weights,
                                   transpose_weights=transpose_weights,
                                   add_bias=logit_bias,
                                   name="logits")

            if not embed_share:
                var_reg.append(run_logits.weights)

            run_output = tx.Activation(run_logits,
                                       tx.softmax,
                                       name="run_output")

            # ===============================================
            # TRAIN GRAPH
            # ===============================================
            with tf.name_scope("train"):
                embeddings = embeddings.reuse_with(inputs)
                feature_lookup = embeddings.permute_batch_time()

                if embed_dropout:
                    feature_lookup = tx.Dropout(feature_lookup,
                                                probability=embed_dropout,
                                                name="drop_features")

                last_layer = feature_lookup

                for i in range(num_h):
                    lstm_layer = lstm_layers[i].reuse_with(last_layer,
                                                           regularized=True)
                    last_layer = lstm_layer

                # last_layer = tx.Reshape(last_layer, [-1, h_dim])

                # feature prediction for Energy-Based Model
                if use_f_predict:
                    # last_layer = f_predict.reuse_with(last_layer)
                    last_layer = f_predict.reuse_with(last_layer,
                                                      regularized=True)

                last_layer = tx.Dropout(last_layer,
                                        probability=other_dropout,
                                        locked=False)

                train_logits = run_logits.reuse_with(last_layer,
                                                     name="train_logits")

                train_output = tx.Activation(train_logits,
                                             tx.softmax,
                                             name="train_output")

            def categorical_loss(labels, logits):
                # labels come as a batch of classes [[1,2],[3,4]] -> [1,3,2,4];
                # the transpose + reshape orders time steps to match the time-major logits
                labels = tx.Transpose(labels)
                labels = tx.Reshape(labels, [-1])
                labels = tx.dense_one_hot(labels, num_cols=vocab_size)
                loss = tx.categorical_cross_entropy(labels=labels,
                                                    logits=logits)

                return tf.reduce_mean(loss)

            def nce_loss(labels, weights, bias, predict):
                # uniform_sampler is assumed to be a candidate sampler defined
                # elsewhere (e.g. tf.random.uniform_candidate_sampler)
                noise = uniform_sampler(labels, 1, nce_samples, True,
                                        vocab_size)
                loss = tf.nn.nce_loss(weights=weights,
                                      biases=bias,
                                      inputs=predict,
                                      labels=labels,
                                      num_sampled=nce_samples,
                                      num_classes=vocab_size,
                                      num_true=1,
                                      sampled_values=noise)
                return tf.reduce_mean(loss)

            if use_nce:
                bias = tx.VariableLayer(var_shape=[vocab_size],
                                        name="nce_bias")

                # wrap the embedding layer so that its weights are exposed as the
                # layer output, while keeping the embedding layer as its input
                nce_weights = tx.WrapLayer(embeddings,
                                           n_units=embeddings.n_units,
                                           wrap_fn=lambda x: x.weights,
                                           layer_fn=True)
                train_loss = tx.LambdaLayer(labels,
                                            nce_weights,
                                            bias,
                                            last_layer,
                                            apply_fn=nce_loss,
                                            name="nce_loss")
            else:
                train_loss = tx.LambdaLayer(labels,
                                            train_logits,
                                            apply_fn=categorical_loss,
                                            name="train_loss")

            if l2_loss:
                l2_losses = [tf.nn.l2_loss(var) for var in var_reg]
                train_loss = tx.LambdaLayer(
                    train_loss,
                    apply_fn=lambda x: x + l2_weight * tf.add_n(l2_losses),
                    name="train_loss_l2")

        # ===============================================
        # EVAL GRAPH
        # ===============================================
        with tf.name_scope("eval"):
            eval_loss = tx.LambdaLayer(labels,
                                       run_logits,
                                       apply_fn=categorical_loss,
                                       name="eval_loss")

        self.stateful_layers = lstm_layers
        # BUILD MODEL
        super().__init__(run_outputs=run_output,
                         run_inputs=inputs,
                         train_inputs=[inputs, labels],
                         train_outputs=train_output,
                         train_loss=train_loss,
                         eval_inputs=[inputs, labels],
                         eval_outputs=run_output,
                         eval_score=eval_loss)
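
# hypothetical usage sketch, not part of the excerpt: the class name
# `LanguageModel` is assumed (only its __init__ is shown above); the Input
# construction mirrors the pattern used in the other examples
seq_size = 20
batch_size = 32
vocab_size = 10000
word_ids = np.random.randint(0, high=vocab_size, size=[batch_size, seq_size])
next_ids = np.random.randint(0, high=vocab_size, size=[batch_size, seq_size])
inputs = tx.Input(init_value=word_ids, n_units=seq_size, dtype=tf.int32)
labels = tx.Input(init_value=next_ids, n_units=seq_size, dtype=tf.int32)
model = LanguageModel(inputs, labels,
                      vocab_size=vocab_size,
                      embed_dim=128,
                      h_dim=256,
                      num_h=2,
                      embed_share=True)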
Example #6
def test_rnn_layer():
    n_features = 5
    embed_size = 4
    hidden_dim = 3
    seq_size = 3
    batch_size = 2

    inputs = tx.Input(np.random.random([batch_size, seq_size]),
                      n_units=seq_size,
                      dtype=tf.int32)
    lookup = tx.Lookup(inputs,
                       seq_size=seq_size,
                       embedding_shape=[n_features, embed_size])
    seq = lookup.permute_batch_time()

    ones_state = tf.ones([batch_size, hidden_dim])
    zero_state = tf.zeros([batch_size, hidden_dim])

    rnn_proto = tx.RNNCell.config(n_units=hidden_dim)

    rnn1 = tx.RNN(seq,
                  cell_config=rnn_proto,
                  previous_state=ones_state,
                  return_state=True)
    rnn2 = rnn1.reuse_with(seq)

    # the RNN layer is built from modules that require all of their parameters
    # to produce the right output, so default values must be supplied for any
    # inputs that are not fed explicitly
    out1, last1 = rnn1()
    out2, last2 = rnn2()

    assert tx.tensor_equal(out1, out2)
    assert tx.tensor_equal(last1, last2)

    rnn3 = rnn1.reuse_with(seq, zero_state)
    rnn4 = rnn3.reuse_with(seq)
    rnn5 = rnn4.reuse_with(seq, ones_state)

    assert tx.tensor_equal(rnn2.previous_state, rnn1.previous_state)
    assert tx.tensor_equal(rnn3.previous_state, rnn4.previous_state)

    out3, last3 = rnn3()
    out4, last4 = rnn4()

    assert tx.tensor_equal(out3, out4)
    assert tx.tensor_equal(last3, last4)

    cell_state1 = rnn1.cell.previous_state[0]()
    cell_state2 = rnn2.cell.previous_state[0]()
    cell_state3 = rnn3.cell.previous_state[0]()
    cell_state4 = rnn4.cell.previous_state[0]()

    assert len(rnn1.cell.previous_state) == 1

    assert tx.tensor_equal(cell_state1, cell_state2)
    assert tx.tensor_equal(cell_state3, cell_state4)

    assert not tx.tensor_equal(out1, out3)

    out5, last5 = rnn5()

    assert tx.tensor_equal(out1, out5)
    assert tx.tensor_equal(last1, last5)
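
    # follow-up sketch, not part of the original test: assuming the default
    # stateful=False (previous_state was passed explicitly), another call does
    # not advance the state and reproduces the same output
    out1_again, last1_again = rnn1()
    assert tx.tensor_equal(out1, out1_again)
    assert tx.tensor_equal(last1, last1_again)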
Example #7
def test_lstm_rnn_stateful():
    n_units = 4
    batch_size = 12
    seq_size = 3
    n_features = 16
    embed_size = 6

    feature_indices = np.random.randint(0,
                                        high=n_features,
                                        size=[batch_size, seq_size])

    inputs = tx.Input(init_value=feature_indices,
                      n_units=seq_size,
                      dtype=tf.int32)
    lookup = tx.Lookup(inputs,
                       seq_size=seq_size,
                       embedding_shape=[n_features, embed_size])
    seq = lookup.permute_batch_time()

    # seq is time-major: [seq_size, batch_size, embed_size]
    # print(np.shape(seq()))

    lstm_cell = tx.LSTMCell.config(n_units=n_units,
                                   activation=tf.tanh,
                                   gate_activation=tf.sigmoid,
                                   forget_bias_init=tf.initializers.ones())

    # state0 = [s() for s in lstm0.previous_state]

    # inputs.value = tf.ones([batch_size, n_features])
    # res1 = lstm1(inputs, state0)
    # res1_ = lstm1(inputs, state0)

    lstm_layer = tx.RNN(input_seq=seq,
                        cell_config=lstm_cell,
                        stateful=True,
                        return_state=True)
    state0 = [s() for s in lstm_layer.previous_state]
    lstm_layer()
    state1 = [s() for s in lstm_layer.previous_state]

    for i in range(len(state0)):
        assert not tx.tensor_equal(state0[i], state1[i])

    assert np.shape(state1[0]) == (batch_size, n_units)

    tx_cell = lstm_layer.cell
    kernel = tf.concat([w.weights.value() for w in tx_cell.w], axis=-1)
    recurrent_kernel = tf.concat([u.weights.value() for u in tx_cell.u],
                                 axis=-1)
    bias = tf.concat([w.bias.value() for w in tx_cell.w], axis=-1)

    # create a keras LSTM with the same weights; since keras creates its
    # variables internally, a custom initializer is the only way to seed
    # them from the tensorx cell
    class FromOther(tf.keras.initializers.Initializer):
        def __init__(self, value):
            self.value = value

        def __call__(self, shape, dtype=None):
            if not tf.TensorShape(shape).is_compatible_with(
                    tf.shape(self.value)):
                raise Exception(
                    f"init called with shape {shape} != value shape {tf.shape(self.value)}"
                )
            else:
                return self.value

    # seq = lookup()
    # seq = tf.transpose(seq, [1, 0, 2])

    # lstm_cell = tf.compat.v1.nn.rnn_cell.LSTMCell(num_units=n_units)
    # lstm_cell.build(np.shape(seq[0]))

    # full_kernel = tf.concat([kernel, recurrent_kernel], axis=0)
    # lstm_cell = (full_kernel, bias)
    # lstm_cell.weights[0] = full_kernel
    # lstm_cell.weights[1] = bias
    # print(type())

    # print(lstm_cell(seq[0],state=tuple(state1)))
    # rnn = tf.keras.layers.RNN(cell=lstm_cell,
    #                         dtype=tf.float32,
    #                         return_sequences=True,
    #                         time_major=True,
    #                         unroll=False)
    # print(rnn(seq))
    # print(lstm_layer())
    # tf_lstm_output = rnn(seq, tuple(state1))
    # tx_lstm_output = lstm_layer()

    keras_lstm = tf.keras.layers.LSTM(
        units=n_units,
        activation=tf.tanh,
        kernel_initializer=FromOther(kernel.numpy()),
        recurrent_initializer=FromOther(recurrent_kernel.numpy()),
        bias_initializer=FromOther(bias.numpy()),
        recurrent_activation=tf.sigmoid,
        unit_forget_bias=False,
        implementation=2,
        time_major=True,
        unroll=True,
        return_sequences=True,
        stateful=False)

    # note: lookup output is [batch, seq_size, embed_size]; seq is its
    # time-major transpose, which is what the time_major keras LSTM expects
    keras_lstm_output = keras_lstm(seq(), initial_state=tuple(state1))

    assert tx.tensor_equal(keras_lstm.cell.kernel.value(), kernel)
    assert tx.tensor_equal(keras_lstm.cell.recurrent_kernel.value(),
                           recurrent_kernel)
    assert tx.tensor_equal(keras_lstm.cell.bias.value(), bias)

    tx_lstm_output = lstm_layer()[0]
    assert tx.tensor_all_close(keras_lstm_output, tx_lstm_output)
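
    # follow-up sketch, not part of the original test: since the tensorx RNN is
    # stateful, calling it again advances previous_state beyond state1
    lstm_layer()
    state2 = [s() for s in lstm_layer.previous_state]
    for s1, s2 in zip(state1, state2):
        assert not tx.tensor_equal(s1, s2)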