def test_dynamic_concat():
    seq1 = [[1, 2], [3, 4]]
    seq2 = [[1, 2, 3], [4, 5, 6]]
    n = 10
    m = 4

    inputs = tx.Input(seq2, shape=[None, None], dtype=tf.int32, constant=False)
    inputs2 = tx.Input(seq2, dtype=tf.int32, constant=True)

    lookup = tx.Lookup(inputs, seq_size=None, embedding_shape=[n, m])
    lookup2 = tx.Lookup(inputs2, seq_size=3, embedding_shape=[n, m])

    concat1 = lookup.as_concat()
    concat2 = lookup2.as_concat()

    assert concat1.n_units is None
    assert concat2.n_units is not None

    concat3 = tx.SeqConcat(lookup, time_major=False)
    concat4 = tx.SeqConcat(lookup, seq_size=3, time_major=False)
    assert tx.shape_equal(concat4.shape, (None, 3 * 4))

    c1, c2 = concat1(), concat3()
    assert tx.tensor_equal(c1, c2)
    assert concat3.n_units is None
    assert concat4.n_units == 3 * lookup.n_units

    inputs.value = seq1
    l1 = lookup()
    inputs.value = seq2
    l2 = lookup()

    assert np.shape(l1)[-1] == m
    assert np.shape(l2)[-1] == m
def test_coupled_gate():
    vocab_size = 4
    n_features = 3
    seq_size = 2

    inputs = tx.Input(init_value=np.array([[2, 0], [1, 2]]),
                      n_units=seq_size,
                      dtype=tf.int32,
                      constant=True)

    features1 = tx.Lookup(inputs, seq_size, embedding_shape=[vocab_size, n_features]).as_concat()
    features2 = tx.Lookup(inputs, seq_size, embedding_shape=[vocab_size, n_features]).as_concat()

    gate_w = tx.Linear(features1, seq_size, add_bias=True)
    coupled_gate = tx.CoupledGate(features1, features2, gate_w)

    sp_features1 = tx.ToSparse(features1)
    assert tx.tensor_equal(tf.sparse.to_dense(sp_features1()), features1())

    sp_gate = tx.CoupledGate(sp_features1, features2, gate_w)
    print(sp_gate())
    print(sp_gate.shape)
    # coupled_gate2 = coupled_gate.reuse_with(sp_features1, features2)

    r1 = coupled_gate()
def test_lookup_sequence_transform():
    vocab_size = 4
    embed_dim = 2
    seq_size = 2

    inputs = tx.Input(n_units=seq_size, dtype=tf.int32)
    input_data = np.array([[2, 0], [1, 2], [0, 2]])
    lookup = tx.Lookup(inputs,
                       seq_size=seq_size,
                       embedding_shape=[vocab_size, embed_dim],
                       add_bias=True)
    concat_lookup = lookup.as_concat()
    seq_lookup = lookup.permute_batch_time()

    assert hasattr(lookup, "seq_size")

    inputs.value = input_data

    v1 = lookup()
    v2 = concat_lookup()
    v3 = seq_lookup()

    assert np.shape(v1) == (np.shape(input_data)[0], seq_size, embed_dim)
    assert np.shape(v2) == (np.shape(input_data)[0], seq_size * embed_dim)
    assert np.shape(v3) == (seq_size, np.shape(input_data)[0], embed_dim)
    assert tx.tensor_equal(v1[:, 0], v3[0])
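# Note (sketch, not part of the original tests): `permute_batch_time` swaps the
# batch and time axes, so the time-major output should match a plain transpose
# of the batch-major output. Assuming the `lookup`/`inputs` from
# test_lookup_sequence_transform above, this mirrors the explicit
# `tx.Transpose(lookup, [1, 0, 2])` form used in test_model_var_inputs below:
#
#   v1 = lookup()                        # (batch, seq_size, embed_dim)
#   v3 = lookup.permute_batch_time()()   # (seq_size, batch, embed_dim)
#   assert tx.tensor_equal(v3, tf.transpose(v1, [1, 0, 2]))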
def test_conv1d():
    n_features = 3
    embed_size = 128
    seq_size = 3
    batch_size = 2

    inputs = tx.Constant(np.random.random([batch_size, seq_size]),
                         n_units=seq_size,
                         dtype=tf.int32)
    emb = tx.Lookup(inputs, seq_size=seq_size, embedding_shape=[n_features, embed_size])
    seq = emb()

    n_units = 100
    filter_size = 4
    cnn = tf.keras.layers.Conv1D(filters=n_units,
                                 kernel_size=filter_size,
                                 padding='same')
    res = cnn(seq)

    cnn2 = tx.Conv1D(emb, n_units=100, filter_size=filter_size)
    res2 = cnn2(seq)

    # both layers create the same number of variables (kernel + bias)
    assert len(cnn.variables) == len(cnn2.variables)

    # share the tx filters/bias with the keras layer and recompute
    cnn.kernel = cnn2.filters
    cnn.bias = cnn2.bias
    res3 = cnn(seq)

    assert not tx.tensor_equal(res, res2)
    assert tx.tensor_equal(res2, res3)
def test_lookup_dynamic_sequence():
    seq1 = [[1, 2], [3, 4]]
    seq2 = [[1, 2, 3], [4, 5, 6]]
    n = 10
    h = 4

    inputs = tx.Input(dtype=tf.int32, constant=False)
    lookup = tx.Lookup(inputs, seq_size=None, embedding_shape=[n, h])
    assert tx.shape_equal(lookup.shape, (None, None, h))

    concat = lookup.as_concat()

    inputs.value = seq1
    inputs()
    inputs.value = seq2
    inputs()

    inputs.value = seq1
    l1 = lookup()
    inputs.value = seq2
    l2 = lookup()

    inputs.value = seq1
    c1 = concat()
    inputs.value = seq2
    c2 = concat()

    assert np.shape(l1)[-1] == h
    assert np.shape(l2)[-1] == h
    assert np.shape(c1)[-1] == h * 2
    assert np.shape(c2)[-1] == h * 3
def test_as_concat_wrap():
    n = 10
    h = 4

    inputs = tx.Input(dtype=tf.int32, constant=False)
    lookup = tx.Lookup(inputs, seq_size=None, embedding_shape=[n, h])
    assert tx.shape_equal(lookup.shape, (None, None, h))

    concat = lookup.as_concat()
    assert tx.shape_equal(concat.shape, (None, None))

    lookup = tx.Lookup(inputs, seq_size=2, embedding_shape=[n, h])
    concat = lookup.as_concat()
    assert tx.shape_equal(concat.shape, (None, 2 * 4))

    seq1 = [[1, 2], [3, 4]]
    inputs.value = seq1
    concat_tensor = concat()
    assert concat_tensor.shape[-1] == concat.shape[-1]
def test_lookup_sequence_mismatch():
    inputs = tx.Input(np.array([[2, 0], [1, 2]]), 2, dtype=tf.int64)

    lookup = tx.Lookup(inputs, None, embedding_shape=[2, 10],
                       batch_size=None, batch_padding=True)
    assert lookup.shape.is_compatible_with(lookup().shape)

    # a seq_size that differs from the input seq_len is not validated
    lookup = tx.Lookup(inputs, 1, embedding_shape=[2, 10],
                       batch_size=None, batch_padding=True)
    assert lookup.batch_size is None
    assert lookup.shape.is_compatible_with(lookup().shape)
def test_lookup_config():
    inputs = tx.Input(np.array([[2, 0], [1, 2]]), 2, dtype=tf.int64)
    lookup = tx.Lookup(inputs, None, embedding_shape=[2, 10],
                       batch_size=None, batch_padding=True)

    assert lookup.config['embedding_shape'] == [2, 10]
    assert lookup.config['batch_size'] is None
    assert lookup.config['batch_padding'] is True
    assert lookup.config['seq_size'] is None
def test_lookup_sequence_sparse():
    input_dim = 10
    embed_dim = 3
    seq_size = 2
    batch_size = 3

    sparse_input = tf.SparseTensor([[0, 2], [1, 0], [2, 1]], [1, 1, 1], [3, input_dim])
    sparse_input_1d = tf.SparseTensor([[2], [0], [1]], [1, 1, 1], [input_dim])

    tensor_input = tx.Constant(sparse_input, input_dim)
    tensor_input_1d = tx.Constant(sparse_input_1d, input_dim)

    lookup = tx.Lookup(tensor_input, seq_size,
                       embedding_shape=[input_dim, embed_dim],
                       batch_size=batch_size,
                       batch_padding=False)
    lookup_padding = tx.Lookup(tensor_input, seq_size,
                               embedding_shape=[input_dim, embed_dim],
                               batch_size=batch_size,
                               batch_padding=True)
    lookup_1d = tx.Lookup(tensor_input_1d, seq_size,
                          embedding_shape=[input_dim, embed_dim],
                          batch_size=batch_size,
                          batch_padding=True)

    result = lookup()
    result_padding = lookup_padding()
    result_1d = lookup_1d()

    assert np.shape(result) == (2, seq_size, embed_dim)
    assert np.shape(result_padding) == (batch_size, seq_size, embed_dim)
    assert np.shape(result_1d) == (batch_size, seq_size, embed_dim)
def test_multihead_attention():
    """ TODO check causality
    """
    n_features = 3
    embed_size = 128
    seq_size = 3
    batch_size = 2
    n_heads = 8

    inputs = tx.Constant(np.random.random([batch_size, seq_size]),
                         n_units=seq_size,
                         dtype=tf.int32)
    emb = tx.Lookup(inputs, seq_size=seq_size, embedding_shape=[n_features, embed_size])

    attention = tx.MHAttention(query=emb,
                               key=emb,
                               value=emb,
                               n_units=embed_size,
                               n_heads=n_heads,
                               causality=False,
                               attention_dropout=0.1,
                               regularized=False)

    assert len(attention.inputs) == 3
    # 3 "kernels" (wq, wk, wv), no bias
    assert len(attention.variables) == 3

    attention_reg = attention.reuse_with(emb, emb, emb, regularized=True)
    attention_2 = attention.reuse_with(emb, emb, emb, regularized=False)
    attention_causal = attention.reuse_with(emb, emb, emb, causality=True)
    attention_causal()

    result = attention()
    result_reg = attention_reg()
    result2 = attention_2()

    assert tx.same_shape(result, result_reg)
    assert tx.tensor_equal(result, result2)

    vars1 = map(lambda v: v.ref(), attention.variables)
    vars2 = map(lambda v: v.ref(), attention_2.variables)
    assert set(vars1) == set(vars2)
def test_lookup_sequence_bias():
    vocab_size = 4
    n_features = 3
    seq_size = 2

    inputs = tx.Input(n_units=seq_size, dtype=tf.int32)
    input_data = np.array([[2, 0], [1, 2], [0, 2]])
    lookup = tx.Lookup(input_layer=inputs,
                       seq_size=seq_size,
                       embedding_shape=[vocab_size, n_features],
                       add_bias=True)

    inputs.value = input_data
    v1 = lookup()
    assert np.shape(v1) == (np.shape(input_data)[0], seq_size, n_features)
def test_drop_lookup():
    """ Embedding Dropout

        TODO finish test
    """
    seq_size = 4
    vocab_size = 10
    embed_dim = 3

    input_data = tf.constant([[2, 0, 2, 0], [1, 2, 2, 3], [0, 3, 0, 2]])
    inputs = tx.Input(init_value=input_data, n_units=seq_size, dtype=tf.int32)
    lookup = tx.Lookup(inputs,
                       seq_size=seq_size,
                       embedding_shape=[vocab_size, embed_dim],
                       add_bias=True)

    tx.DropLookup(lookup, probability=0.5)
def test_biRNN():
    # bidirectional RNN
    n_features = 5
    embed_size = 4
    hidden_dim = 3
    seq_size = 6
    batch_size = 2

    inputs = tx.Input(np.random.random([batch_size, seq_size]),
                      n_units=seq_size,
                      dtype=tf.int32)
    lookup = tx.Lookup(inputs, seq_size=seq_size, embedding_shape=[n_features, embed_size])
    seq = lookup.permute_batch_time()

    rnn_proto = tx.RNNCell.config(n_units=hidden_dim)
    rnn0 = tx.RNN(seq, cell_config=rnn_proto, stateful=False, return_state=True)
    # because a stateful rnn0 has a variable layer as input as well
    rnn_m0 = tx.Module(inputs=rnn0.inputs, output=rnn0)

    rnn1 = rnn0.reuse_with(seq, reverse=True, stateful=False, return_state=True)

    # this solves rnn output multiple tensors
    r01 = rnn_m0.compute(seq(), rnn0.previous_state[0]())
    rnn0.reset()
    r02 = rnn0()
    assert tx.tensor_equal(r01[0], r02[0])

    rnn0_0 = rnn0[0]
    rnn1_0 = rnn1[0]

    rnn0 = tx.Wrap(rnn0, wrap_fn=lambda y: y[0], n_units=rnn0.n_units)
    rnn1 = tx.Wrap(rnn1, wrap_fn=lambda y: y[0], n_units=rnn1.n_units)

    rnn0_tensor = rnn0()
    rnn1_tensor = rnn1()
    rnn0_0_tensor = rnn0_0()

    print(rnn0_tensor.shape)
    print(rnn0_0_tensor.shape)
def test_attention():
    n_features = 3
    embed_size = 8
    seq_size = 3
    batch_size = 2

    inputs = tx.Constant(np.random.random([batch_size, seq_size]),
                         n_units=seq_size,
                         dtype=tf.int32)
    emb = tx.Lookup(inputs, seq_size=seq_size, embedding_shape=[n_features, embed_size])
    seq = emb()

    # keras attention doesn't have multiple heads
    attention = Attention(use_scale=False)
    res = attention([seq, seq, seq])

    attention2 = tx.MHAttention(emb, emb, emb, n_units=embed_size, n_heads=1)
    assert len(attention2.variables) == 3

    # use identity projections so both attention layers see the same q, k, v
    attention2.wq = tx.Linear(emb, n_units=None,
                              weights=tf.linalg.eye(embed_size, embed_size),
                              add_bias=False)
    attention2.wk = tx.Linear(emb, n_units=None,
                              weights=tf.linalg.eye(embed_size, embed_size),
                              add_bias=False)
    attention2.wv = tx.Linear(emb, n_units=None,
                              weights=tf.linalg.eye(embed_size, embed_size),
                              add_bias=False)

    assert tx.tensor_equal(attention2.wq(seq), seq)

    res2 = attention2()

    g = tx.Graph.build(inputs=emb, outputs=attention2)
    g = g.as_function(ord_inputs=emb, ord_outputs=attention2)
    res3 = g(seq)

    assert tx.tensor_equal(res, res2)
    assert tx.tensor_equal(res, res3)
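# Reference sketch (not part of the original tests): keras Attention(use_scale=False)
# computes unscaled dot-product attention, softmax(q @ k^T) @ v, which is what the
# test above compares against. Assuming batch-major inputs like `seq`, the same
# computation in plain TF ops is:
def _dot_product_attention_sketch(q, k, v):
    # scores: (batch, q_len, k_len)
    scores = tf.matmul(q, k, transpose_b=True)
    weights = tf.nn.softmax(scores, axis=-1)
    # weighted sum of values: (batch, q_len, units)
    return tf.matmul(weights, v)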
def test_stateful_rnn_layer():
    n_features = 5
    embed_size = 4
    hidden_dim = 3
    seq_size = 3
    batch_size = 2

    inputs = tx.Input(np.random.random([batch_size, seq_size]),
                      n_units=seq_size,
                      dtype=tf.int32)
    lookup = tx.Lookup(inputs, seq_size=seq_size, embedding_shape=[n_features, embed_size])
    seq = lookup.permute_batch_time()

    rnn_proto = tx.RNNCell.config(n_units=hidden_dim)
    rnn1 = tx.RNN(seq, cell_config=rnn_proto, stateful=True, return_state=True)
    lstm1 = tx.RNN(seq,
                   cell_config=tx.LSTMCell.config(n_units=hidden_dim),
                   stateful=True,
                   return_state=True)

    zero_state0 = [layer() for layer in rnn1.previous_state]
    assert len(zero_state0) == 1
    expected_state = tf.zeros([1, hidden_dim], dtype=tf.float32)
    assert tx.tensor_equal(zero_state0[0], expected_state)

    # import logging
    # logging.getLogger("tensorx").setLevel(logging.DEBUG)

    out1, state1 = rnn1()

    tx.Graph.build(inputs=None, outputs=lstm1)
    # out2, state2 = lstm1()
    lstm1()

    # state after single run
    # zero_state1 = [layer() for layer in ]
    zero_state1 = rnn1.previous_state[0]()
    assert tx.tensor_equal(zero_state1, state1)

    rnn1.reset()
    reset_state = rnn1.previous_state[0]()
    assert tx.tensor_equal(reset_state, zero_state0[0])
def test_model_var_inputs():
    # test a train graph with extra inputs that do not need to be fed (e.g. variable state)
    n_features = 5
    embed_size = 4
    hidden_dim = 3
    seq_size = 3
    out_size = 2
    batch_size = 2

    x = tx.Input(np.random.random([batch_size, seq_size]), n_units=seq_size, dtype=tf.int32)
    y = tx.Input(np.random.random([batch_size, out_size]), n_units=out_size, dtype=tf.float32)
    lookup = tx.Lookup(x, seq_size=seq_size, embedding_shape=[n_features, embed_size])
    # seq = lookup.permute_batch_time()
    seq = tx.Transpose(lookup, [1, 0, 2])

    rnn1 = tx.RNN(seq, cell_config=tx.RNNCell.config(n_units=hidden_dim))
    y_ = tx.Linear(rnn1[seq_size - 1], n_units=out_size)
    # y_ = tx.Linear(tx.SeqConcat(lookup, seq_size=seq_size), n_units=out_size)

    # @tx.layer(n_units=2, dtype=tf.float32, name="loss")
    # def loss(pred, labels):
    #     return tx.mse(pred, labels)

    model = tx.Model(run_inputs=x,
                     run_outputs=y_,
                     train_inputs=[x, y],
                     train_outputs=y_,
                     train_loss=tx.MSE(y_, y))
    # model.draw("test.pdf")
    model.set_optimizer(tf.optimizers.SGD, lr=0.5)

    data1 = [[0, 1, 2], [2, 1, 0]]
    data2 = [[0., 1.], [1., 0.]]

    model.train_step(input_feed={x: data1, y: data2})
def test_map_seq():
    n_features = 5
    embed_size = 4
    seq_size = 3
    batch_size = 2

    inputs = tx.Input(np.random.random([batch_size, seq_size]),
                      n_units=seq_size,
                      dtype=tf.int32)
    lookup = tx.Lookup(inputs, seq_size=seq_size, embedding_shape=[n_features, embed_size])
    seq = lookup.permute_batch_time()

    n_units = 2
    linear_fn = tx.Linear.config(n_units=n_units)

    assert tx.tensor_equal(tf.shape(seq()), [seq_size, batch_size, embed_size])

    seq_map = tx.SeqMap(seq, n_units=2, layer_config=linear_fn)
    assert tx.tensor_equal(tf.shape(seq_map), [seq_size, batch_size, n_units])
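# Reference sketch (assumption, not the tx implementation): mapping a shared dense
# layer over every time step of a time-major sequence amounts to one matmul
# broadcast over the leading axes. With a hypothetical shared kernel `w` of shape
# [embed_size, n_units]:
#
#   w = tf.random.normal([embed_size, n_units])
#   mapped = tf.einsum('tbe,eu->tbu', seq(), w)   # (seq_size, batch, n_units)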
def test_lookup_sequence_dense():
    input_dim = 4
    embed_dim = 3
    seq_size = 2
    batch_size = 3

    inputs = tx.Input(np.array([[2, 0], [1, 2]]), 2, dtype=tf.int64)
    tensor_input = tx.Input(tf.constant([2]), 1, dtype=tf.int64)

    lookup = tx.Lookup(inputs, seq_size,
                       embedding_shape=[input_dim, embed_dim],
                       batch_size=batch_size,
                       batch_padding=True)
    lookup_from_tensor = lookup.reuse_with(tensor_input)

    v1 = lookup()
    v2 = lookup_from_tensor()

    assert np.shape(v1) == (batch_size, seq_size, embed_dim)
    assert np.shape(v2) == (batch_size, seq_size, embed_dim)
def test_lookup_dynamic_sparse_sequence():
    """ Sparse inputs to Lookup with a dynamic seq_size passed through an
        Input layer that acts as a scalar parameter (n_units == 0)
    """
    k = 8
    m = 3
    seq1 = tf.SparseTensor(indices=[[0, 1], [1, 2], [2, 3], [3, 4]],
                           values=[1, 2, 3, 4],
                           dense_shape=[4, k])
    seq2 = tf.SparseTensor(indices=[[0, 1], [1, 2], [2, 3], [3, 3], [4, 4], [5, 5]],
                           values=[1, 2, 3, 3, 4, 5],
                           dense_shape=[6, k])

    inputs = tx.Input(n_units=k, sparse=True, dtype=tf.int32, constant=False)
    seq_len = tx.Input(init_value=2, shape=[], constant=False)
    assert seq_len.n_units == 0

    lookup = tx.Lookup(inputs, seq_size=seq_len, embedding_shape=[k, m])
    # concat = lookup.as_concat()

    inputs.value = seq1
    inputs()

    # set seq_len to 4
    seq_len.value = 4
    lookup_4 = lookup()
    # (batch, seq_len, embed_dim)
    assert lookup_4.numpy().shape == (1, 4, m)

    # set seq_len to 3
    inputs.value = seq2
    seq_len.value = 3
    lookup_3 = lookup()
    # (batch, seq_len, embed_dim)
    assert lookup_3.numpy().shape == (2, 3, 3)
def test_lookup_sparse_padding():
    """ Sparse Lookup Padding

        Lookup adds padding if seq_size is greater than the maximum
        row index in the input SparseTensor.
    """
    input_dim = 6
    embed_dim = 4
    seq_size = 3

    sparse_input = tf.SparseTensor(indices=[[0, 1], [0, 3], [1, 0]],
                                   values=[1, 1, 1],
                                   dense_shape=[2, input_dim])
    sparse_input = tx.Constant(sparse_input, input_dim)

    lookup = tx.Lookup(sparse_input,
                       seq_size=seq_size,
                       embedding_shape=[input_dim, embed_dim],
                       batch_size=None,
                       batch_padding=False)

    result = lookup()
    assert tf.sparse.to_dense(sparse_input()).shape == (2, input_dim)
    # the last time step of the first sample has no entry, so it is zero-padded
    assert tx.tensor_equal(result[0][-1], tf.zeros([embed_dim]))
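# For reference (worked out from the indices above, as a sanity check): the dense
# view of the sparse input is
#   [[0, 1, 0, 1, 0, 0],
#    [1, 0, 0, 0, 0, 0]]
# so sample 0 has active columns 1 and 3 (two lookup steps) and sample 1 only
# column 0 (one step). With seq_size=3 the remaining steps are filled with zero
# vectors, which is what the final assert above verifies for sample 0.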
def __init__(self,
             run_inputs,
             label_inputs,
             eval_label_input,
             ctx_size,
             k_dim,
             ri_tensor_input,
             embed_dim,
             h_dim,
             embed_init=tx.random_uniform(minval=-0.01, maxval=0.01),
             num_h=1,
             h_activation=tx.relu,
             h_init=tx.he_normal_init,
             use_dropout=False,
             embed_dropout=False,
             keep_prob=0.95,
             l2_loss=False,
             l2_loss_coef=1e-5,
             f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
             use_nce=False,
             nce_samples=2,
             nce_noise_amount=0.1,
             noise_input=None):
    self.embed_dim = embed_dim
    var_reg = []

    # ===============================================
    # RUN GRAPH
    # ===============================================
    with tf.name_scope("run"):
        feature_lookup = tx.Lookup(run_inputs,
                                   seq_size=ctx_size,
                                   lookup_shape=[k_dim, embed_dim],
                                   weight_init=embed_init,
                                   name="lookup")
        self.embeddings = feature_lookup
        var_reg.append(feature_lookup.weights)
        feature_lookup = feature_lookup.as_concat()

        # ===========================================================
        with tf.name_scope("cache_embeddings"):
            # ris = [sign_index.get_ri(sign_index.get_sign(i)) for i in range(len(sign_index))]
            # self.all_ris = ris_to_sp_tensor_value(ri_seq=ris,
            #                                       dim=sign_index.generator.dim,
            #                                       all_positive=not sign_index.generator.symmetric)
            all_embeddings = tx.Linear(ri_tensor_input,
                                       n_units=self.embed_dim,
                                       shared_weights=self.embeddings.weights,
                                       bias=False,
                                       name='all_features')
            # caches all embedding computation for run/eval
            self.all_embeddings = tx.VariableLayer(all_embeddings, trainable=False)
        # ===========================================================

        last_layer = feature_lookup
        h_layers = []
        for i in range(num_h):
            hi = tx.FC(last_layer,
                       n_units=h_dim,
                       activation=h_activation,
                       weight_init=h_init,
                       name="h_{i}".format(i=i))
            h_layers.append(hi)
            last_layer = hi
            var_reg.append(hi.linear.weights)

        self.h_layers = h_layers

        # feature prediction for Energy-Based Model
        f_prediction = tx.Linear(last_layer, embed_dim, f_init, bias=True, name="f_predict")
        var_reg.append(f_prediction.weights)

        # RI DECODING ===============================================
        # shape is (?, ?) because batch size and vocab size are unknown
        # when we build the graph
        run_logits = tx.Linear(f_prediction,
                               n_units=None,
                               shared_weights=self.all_embeddings.variable,
                               transpose_weights=True,
                               bias=False,
                               name="logits")
        # ===========================================================
        embed_prob = tx.Activation(run_logits, tx.softmax, name="run_output")

    # ===============================================
    # TRAIN GRAPH
    # ===============================================
    with tf.name_scope("train"):
        if use_dropout and embed_dropout:
            feature_lookup = feature_lookup.reuse_with(run_inputs)
            last_layer = tx.Dropout(feature_lookup, probability=keep_prob)
        else:
            last_layer = feature_lookup

        # add dropout between each layer
        for layer in h_layers:
            h = layer.reuse_with(last_layer)
            if use_dropout:
                h = tx.Dropout(h, probability=keep_prob)
            last_layer = h

        f_prediction = f_prediction.reuse_with(last_layer)

        train_logits = run_logits.reuse_with(f_prediction, name="train_logits")
        train_embed_prob = tx.Activation(train_logits, tx.softmax, name="train_output")

        # convert labels to random indices
        model_prediction = f_prediction.tensor

        if use_nce:
            train_loss = tx.sparse_cnce_loss(label_features=label_inputs.tensor,
                                             noise_features=noise_input.tensor,
                                             model_prediction=model_prediction,
                                             weights=feature_lookup.weights,
                                             num_samples=nce_samples,
                                             noise_ratio=nce_noise_amount)
        else:
            one_hot_dense = tx.dense_one_hot(column_indices=label_inputs[0].tensor,
                                             num_cols=label_inputs[1].tensor)
            train_loss = tx.categorical_cross_entropy(one_hot_dense, train_logits.tensor)
            train_loss = tf.reduce_mean(train_loss)

        if l2_loss:
            losses = [tf.nn.l2_loss(var) for var in var_reg]
            train_loss = train_loss + l2_loss_coef * tf.add_n(losses)

    # ===============================================
    # EVAL GRAPH
    # ===============================================
    with tf.name_scope("eval"):
        one_hot_dense = tx.dense_one_hot(column_indices=eval_label_input[0].tensor,
                                         num_cols=label_inputs[1].tensor)
        train_loss = tx.categorical_cross_entropy(one_hot_dense, train_logits.tensor)
        eval_loss = tx.categorical_cross_entropy(one_hot_dense, run_logits.tensor)
        eval_loss = tf.reduce_mean(eval_loss)

    if use_nce:
        train_loss_in = [label_inputs, noise_input]
    else:
        train_loss_in = label_inputs

    # BUILD MODEL
    super().__init__(run_inputs=run_inputs,
                     run_outputs=embed_prob,
                     train_inputs=run_inputs,
                     train_outputs=train_embed_prob,
                     eval_inputs=run_inputs,
                     eval_outputs=embed_prob,
                     train_out_loss=train_loss,
                     train_in_loss=train_loss_in,
                     eval_out_score=eval_loss,
                     eval_in_score=eval_label_input,
                     update_inputs=ri_tensor_input)
import os

import numpy as np
import tensorflow as tf
import tensorx as tx

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

n_features = 3
embed_size = 4
cell_units = 2
seq_size = 3
batch_size = 2

inputs = tx.TensorLayer(np.random.random([batch_size, seq_size]),
                        n_units=seq_size,
                        dtype=tf.int32)
lookup = tx.Lookup(inputs, seq_size=seq_size, lookup_shape=[n_features, embed_size])
seq = lookup.permute_batch_time()

# first step of a sequence
t1 = seq[0]

ks_cell = tf.keras.layers.LSTMCell(units=cell_units)
tf_cell = tf.nn.rnn_cell.LSTMCell(num_units=cell_units, state_is_tuple=True)
tx_cell = tx.LSTMCell(t1, n_units=cell_units)

kernel_w = [
    tx_cell.w_i.weights,
    tx_cell.w_c.weights,
    tx_cell.w_f.weights,
    tx_cell.w_o.weights
]
kernel_u = [
generator = Generator(k, s)
ris = [generator.generate() for _ in range(vocab_size)]
ri_tensor = RandomIndexTensor.from_ri_list(ris, k, s)

sp_values = ri_tensor.gather(flat_labels).to_sparse_tensor()
sp_indices = tx.sparse_indices(sp_values)

print(sp_values.get_shape())
print(tensor_util.constant_value_as_shape(sp_values.dense_shape))
print(tensor_util.constant_value(sp_values.dense_shape))
print(sp_values.dense_shape[-1].eval())
print(tf.shape(sp_values).eval())

lookup = tx.Lookup(tx.TensorLayer(sp_values), seq_size=1, lookup_shape=[k, embed_size])
linear = tx.Linear(tx.TensorLayer(sp_values), n_units=k, shared_weights=lookup.weights)

w = embedding_lookup_sparse(params=lookup.weights,
                            sp_ids=sp_indices,
                            sp_weights=sp_values,
                            combiner="sum",
                            partition_strategy="mod")

tf.global_variables_initializer().run()

np.testing.assert_array_equal(w.eval(), tx.Flatten(lookup).eval())
import numpy as np
import tensorflow as tf
import tensorx as tx
from deepsign.models.nrp import RandomIndexTensor
from deepsign.rp.ri import Generator, RandomIndex

sess = tf.InteractiveSession()

vocab_size = 8
k = 6
s = 2
embed_dim = 3

generator = Generator(k, s)
ris = [generator.generate() for _ in range(vocab_size)]
ri_tensor = RandomIndexTensor.from_ri_list(ris, k, s)

ri_input = ri_tensor.gather([[0, 1, 0], [1, 2, 0]])
sp = ri_input.to_sparse_tensor()
sp = tx.TensorLayer(sp, k)
print(sp.tensor.eval())

embed = tx.Lookup(sp, seq_size=3, lookup_shape=[k, 3])

tf.global_variables_initializer().run()
print(np.shape(embed.tensor.eval()))
def __init__(self,
             inputs,
             labels,
             vocab_size,
             embed_dim,
             h_dim,
             embed_init=tx.zeros_init(),
             logit_init=tx.glorot_uniform(),
             num_h=1,
             h_activation=tx.tanh,
             h_init=tx.glorot_uniform(),
             w_dropconnect=None,
             u_dropconnect=None,
             r_dropout=0.4,
             y_dropout=0.4,
             embed_dropout=0.3,
             other_dropout=0.3,
             l2_loss=False,
             l2_weight=1e-5,
             use_f_predict=False,
             f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
             embed_share=False,
             logit_bias=False,
             use_nce=False,
             nce_samples=10,
             skip_connections=False):
    if not isinstance(inputs, tx.Input):
        raise TypeError("inputs must be an Input layer")
    self.inputs = inputs
    self.labels = labels
    if not isinstance(labels, tx.Input):
        raise TypeError("labels must be an Input layer")

    if inputs.dtype != tf.int32 and inputs.dtype != tf.int64:
        raise TypeError(
            "Invalid dtype for input: expected int32 or int64, got {}".format(inputs.dtype))

    if num_h < 0:
        raise ValueError("num hidden should be >= 0")

    # ===============================================
    # RUN GRAPH
    # ===============================================
    var_reg = []
    with tf.name_scope("run"):
        # feature lookup
        embeddings = tx.Lookup(inputs,
                               seq_size=None,
                               lookup_shape=[vocab_size, embed_dim],
                               weight_init=embed_init)
        var_reg.append(embeddings.weights)
        feature_lookup = embeddings.permute_batch_time()
        last_layer = feature_lookup

        cell_proto = tx.LSTMCell.proto(n_units=h_dim,
                                       activation=h_activation,
                                       gate_activation=tx.hard_sigmoid,
                                       w_init=h_init,
                                       u_init=h_init,
                                       w_dropconnect=w_dropconnect,
                                       u_dropconnect=u_dropconnect,
                                       r_dropout=r_dropout,
                                       x_dropout=None,
                                       y_dropout=y_dropout,
                                       regularized=False,
                                       name="cell")

        lstm_layers = []
        for i in range(num_h):
            lstm_layer = tx.RNN(last_layer,
                                cell_proto=cell_proto,
                                regularized=False,
                                stateful=True,
                                name="LSTM_{}".format(i + 1))
            lstm_layers.append(lstm_layer)
            var_reg += [wi.weights for wi in lstm_layer.cell.w]
            var_reg += [ui.weights for ui in lstm_layer.cell.u]
            last_layer = lstm_layer

        # last time step is the state used to make the prediction
        # last_layer = tx.Reshape(last_layer, [-1, h_dim])

        # TODO this is not consistent with locked dropout for the last layer
        # where the same mask should be applied across time steps
        # to do this I need either y_dropout to be available or some sort of map
        # operation I can use with layers outputting 3D tensors
        # something equivalent to https://keras.io/layers/wrappers/ which applies
        # a layer to every temporal slice of an input. They implement this the same
        # way they implement an RNN

        # feature prediction for Energy-Based Model
        if use_f_predict:
            last_layer = tx.Linear(last_layer, embed_dim, f_init, add_bias=True, name="f_predict")
            # proto = tx.GRUCell.proto(n_units=embed_dim,
            #                          activation=h_activation,
            #                          gate_activation=tx.hard_sigmoid,
            #                          w_init=h_init,
            #                          u_init=h_init,
            #                          w_dropconnect=w_dropconnect,
            #                          u_dropconnect=u_dropconnect,
            #                          r_dropout=r_dropout,
            #                          x_dropout=None,
            #                          y_dropout=y_dropout,
            #                          regularized=False)
            # last_layer1 = tx.RNN(last_layer, cell_proto=proto, regularized=False, stateful=False)
            # last_layer2 = last_layer1.reuse_with(last_layer, reverse=True)
            # last_layer = tx.Add(last_layer1, last_layer2)
            # last_layer = tx.Module(last_layer, last_layer)
            var_reg += last_layer.variables
            # var_reg.append(last_layer.weights)

        f_predict = last_layer

        shared_weights = feature_lookup.weights if embed_share else None
        transpose_weights = embed_share
        logit_init = logit_init if not embed_share else None
        run_logits = tx.Linear(last_layer,
                               n_units=vocab_size,
                               weight_init=logit_init,
                               shared_weights=shared_weights,
                               transpose_weights=transpose_weights,
                               add_bias=logit_bias,
                               name="logits")
        if not embed_share:
            var_reg.append(run_logits.weights)

        run_output = tx.Activation(run_logits, tx.softmax, name="run_output")

    # ===============================================
    # TRAIN GRAPH
    # ===============================================
    with tf.name_scope("train"):
        embeddings = embeddings.reuse_with(inputs)
        feature_lookup = embeddings.permute_batch_time()

        if embed_dropout:
            feature_lookup = tx.Dropout(feature_lookup,
                                        probability=embed_dropout,
                                        name="drop_features")

        last_layer = feature_lookup

        for i in range(num_h):
            lstm_layer = lstm_layers[i].reuse_with(last_layer, regularized=True)
            last_layer = lstm_layer

        # last_layer = tx.Reshape(last_layer, [-1, h_dim])

        # feature prediction for Energy-Based Model
        if use_f_predict:
            # last_layer = f_predict.reuse_with(last_layer)
            last_layer = f_predict.reuse_with(last_layer, regularized=True)

        last_layer = tx.Dropout(last_layer, probability=other_dropout, locked=False)

        train_logits = run_logits.reuse_with(last_layer, name="train_logits")
        train_output = tx.Activation(train_logits, tx.softmax, name="run_output")

        def categorical_loss(labels, logits):
            # labels come as a batch of classes [[1,2],[3,4]] -> [1,3,2,4]
            # time steps are ordered to match logits
            labels = tx.Transpose(labels)
            labels = tx.Reshape(labels, [-1])
            labels = tx.dense_one_hot(labels, num_cols=vocab_size)
            loss = tx.categorical_cross_entropy(labels=labels, logits=logits)
            return tf.reduce_mean(loss)

        def nce_loss(labels, weights, bias, predict):
            noise = uniform_sampler(labels, 1, nce_samples, True, vocab_size)
            loss = tf.nn.nce_loss(weights=weights,
                                  biases=bias,
                                  inputs=predict,
                                  labels=labels,
                                  num_sampled=nce_samples,
                                  num_classes=vocab_size,
                                  num_true=1,
                                  sampled_values=noise)
            return tf.reduce_mean(loss)

        if use_nce:
            bias = tx.VariableLayer(var_shape=[vocab_size], name="nce_bias")

            # wraps a layer to expose the weights as a layer but with the layer as its input
            nce_weights = tx.WrapLayer(embeddings,
                                       n_units=embeddings.n_units,
                                       wrap_fn=lambda x: x.weights,
                                       layer_fn=True)
            train_loss = tx.LambdaLayer(labels, nce_weights, bias, last_layer,
                                        apply_fn=nce_loss,
                                        name="nce_loss")
        else:
            train_loss = tx.LambdaLayer(labels, train_logits,
                                        apply_fn=categorical_loss,
                                        name="train_loss")

        if l2_loss:
            l2_losses = [tf.nn.l2_loss(var) for var in var_reg]
            train_loss = tx.LambdaLayer(train_loss,
                                        apply_fn=lambda x: x + l2_weight * tf.add_n(l2_losses),
                                        name="train_loss_l2")

    # ===============================================
    # EVAL GRAPH
    # ===============================================
    with tf.name_scope("eval"):
        eval_loss = tx.LambdaLayer(labels, run_logits,
                                   apply_fn=categorical_loss,
                                   name="eval_loss")

    self.stateful_layers = lstm_layers

    # BUILD MODEL
    super().__init__(run_outputs=run_output,
                     run_inputs=inputs,
                     train_inputs=[inputs, labels],
                     train_outputs=train_output,
                     train_loss=train_loss,
                     eval_inputs=[inputs, labels],
                     eval_outputs=run_output,
                     eval_score=eval_loss)
print([vocab[w] for w in vocab.keys()])

ri_dict = {vocab[word]: generator.generate() for word in vocab.keys()}

tokens = [vocab[w] for w in tokens]
data_it = window_it(tokens, seq_size)
data_it = batch_it(data_it, batch_size)

vocab_tensor = [ri_dict[i] for i in range(len(vocab))]
sp_ri = deepsign.data.transform.ris_to_sp_tensor_value(vocab_tensor, dim=k)

inputs = tx.Input(n_units=2)
ri_inputs = tx.gather_sparse(sp_ri, inputs.tensor)
ri_inputs = tx.TensorLayer(ri_inputs, k)

embed = tx.Lookup(ri_inputs, seq_size, [k, embed_dim])

# logits: take the embeddings and get the features for all random indexes
ri_layer = tx.TensorLayer(sp_ri, n_units=k)
logits = tx.Linear(input_layer=ri_layer,
                   n_units=embed_dim,
                   shared_weights=embed.weights,
                   bias=True)

single_input = tx.Input(1)
ri_input = tx.TensorLayer(tx.gather_sparse(sp_ri, single_input.tensor), k)
logit = logits.reuse_with(ri_input)

session = tf.InteractiveSession()
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

input_size = 10000
var_size = 500
batch_size = 20
seq_size = 30

inputs = tf.constant(np.random.randint(0, 10, size=[batch_size, seq_size]), name="inputs")
targets = tf.constant(np.random.randint(0, 10, size=[batch_size * seq_size]), name="targets")
targets = tf.one_hot(targets, input_size)
inputs = tx.TensorLayer(inputs)

with jit_scope():
    with tf.name_scope("scope1"):
        lookup = tx.Lookup(inputs,
                           seq_size=seq_size,
                           lookup_shape=[input_size, var_size],
                           name="lookup")
        seq = lookup.permute_batch_time()
        seq = tx.Reshape(seq, [-1, var_size], name="flatten")

        mul1 = tx.Linear(seq, input_size, name="test_logits")
        mul2 = tx.Linear(seq,
                         n_units=input_size,
                         shared_weights=lookup.weights,
                         transpose_weights=True,
                         name="shared_embeddings")

    with tf.name_scope("scope2"):
        mul1 = mul1.reuse_with(seq)
        mul2 = mul2.reuse_with(seq)

rnd_loss1 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=targets, logits=mul1))
rnd_loss2 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=targets, logits=mul2))
    name=name)

""" Test staged implementation """
n_hidden = 20
embed_dim = 10
seq_size = 2
vocab_size = 10000
feature_shape = [vocab_size, embed_dim]

loss_inputs = tx.Input(1, dtype=tf.int32)
in_layer = tx.Input(seq_size, dtype=tf.int32)

# [batch x seq_size * feature_shape[1]]
lookup = tx.Lookup(in_layer, seq_size=seq_size, lookup_shape=feature_shape)

# reshape to [batch x seq_size x feature_shape[1]]
# lookup_to_seq =
# this reshape could be done automatically based on the input shape of
# the tensor fed to the RNN cell layer
out = tx.WrapLayer(lookup,
                   embed_dim,
                   shape=[None, seq_size, embed_dim],
                   wrap_fn=lambda tensor: tf.reshape(tensor, [-1, seq_size, embed_dim]))
out = tx.WrapLayer(out, embed_dim, wrap_fn=lambda tensor: tensor[0])

# apply rnn cell to single input batch
def test_rnn_layer():
    n_features = 5
    embed_size = 4
    hidden_dim = 3
    seq_size = 3
    batch_size = 2

    inputs = tx.Input(np.random.random([batch_size, seq_size]),
                      n_units=seq_size,
                      dtype=tf.int32)
    lookup = tx.Lookup(inputs, seq_size=seq_size, embedding_shape=[n_features, embed_size])
    seq = lookup.permute_batch_time()

    ones_state = tf.ones([batch_size, hidden_dim])
    zero_state = tf.zeros([batch_size, hidden_dim])

    rnn_proto = tx.RNNCell.config(n_units=hidden_dim)

    rnn1 = tx.RNN(seq, cell_config=rnn_proto, previous_state=ones_state, return_state=True)
    rnn2 = rnn1.reuse_with(seq)

    # the RNN layer uses modules that require all params to output the right
    # answer, so default values need to be supplied for the remaining inputs
    out1, last1 = rnn1()
    out2, last2 = rnn2()

    assert tx.tensor_equal(out1, out2)
    assert tx.tensor_equal(last1, last2)

    rnn3 = rnn1.reuse_with(seq, zero_state)
    rnn4 = rnn3.reuse_with(seq)
    rnn5 = rnn4.reuse_with(seq, ones_state)

    assert tx.tensor_equal(rnn2.previous_state, rnn1.previous_state)
    assert tx.tensor_equal(rnn3.previous_state, rnn4.previous_state)

    out3, last3 = rnn3()
    out4, last4 = rnn4()

    assert tx.tensor_equal(out3, out4)
    assert tx.tensor_equal(last3, last4)

    cell_state1 = rnn1.cell.previous_state[0]()
    cell_state2 = rnn2.cell.previous_state[0]()
    cell_state3 = rnn3.cell.previous_state[0]()
    cell_state4 = rnn4.cell.previous_state[0]()

    assert len(rnn1.cell.previous_state) == 1
    assert tx.tensor_equal(cell_state1, cell_state2)
    assert tx.tensor_equal(cell_state3, cell_state4)

    assert not tx.tensor_equal(out1, out3)

    out5, last5 = rnn5()
    assert tx.tensor_equal(out1, out5)
    assert tx.tensor_equal(last1, last5)
""" Test staged implementation """ n_hidden = 20 embed_dim = 3 seq_size = 2 vocab_size = 10000 feature_shape = [vocab_size, embed_dim] loss_inputs = tx.Input(1, dtype=tf.int32) in_layer = tx.Input(seq_size, dtype=tf.int32) lookup = tx.Lookup(in_layer, seq_size=seq_size, lookup_shape=feature_shape, as_sequence=True) lookup_flat = lookup.reuse_with(in_layer, as_sequence=False) with tf.name_scope("rnn"): rnn1 = RNNCell(lookup[0], 4, name="rnn1") rnn2 = rnn1.reuse_with(lookup[1], state=rnn1, name="rnn2") # setup optimizer optimizer = tx.AMSGrad(learning_rate=0.01) model = tx.Model(run_inputs=in_layer, run_outputs=[rnn1, rnn2]) runner = tx.ModelRunner(model) runner.set_session(runtime_stats=True)