def _build_graph(self, layer, previous_state):
    """Gated recurrent unit (GRU) with n_units cells."""
    with layer_scope(self):
        if previous_state is None:
            input_batch = tf.shape(layer.tensor)[0]
            zero_state = tf.zeros([input_batch, self.n_units])
            self.previous_state = tx.TensorLayer(zero_state, self.n_units)

        if self.share_state_with is None:
            # determines the weight of the previous state
            # we could add the bias at the end, but this way we just define a single bias for the r unit
            self.r_current_w = tx.Linear(layer, self.n_units, bias=True,
                                         weight_init=self.init, name="r_current_w")
            self.r_recurrent_w = tx.Linear(self.previous_state, self.n_units, bias=False,
                                           weight_init=self.recurrent_init, name="r_recurrent_w")

            self.u_current_w = tx.Linear(layer, self.n_units, bias=True,
                                         weight_init=self.init, name="u_current_w")
            self.u_recurrent_w = tx.Linear(self.previous_state, self.n_units, bias=False,
                                           weight_init=self.recurrent_init, name="u_recurrent_w")

            self.current_w = tx.Linear(layer, self.n_units, bias=True,
                                       weight_init=self.init, name="current_w")
            self.recurrent_w = tx.Linear(self.previous_state, self.n_units, bias=False,
                                         weight_init=self.recurrent_init, name="recurrent_w")

            # kernel_gate = tx.Activation()
            kernel_act = tx.Activation(self.current_w, self.activation)
            self.kernel = tx.Compose(self.current_w, kernel_act)
            # recurrent kernel for the candidate state (mirrors the shared-state branch below)
            self.recurrent_kernel = self.recurrent_w
        else:
            self.kernel = self.share_state_with.kernel.reuse_with(layer)
            self.recurrent_kernel = self.share_state_with.recurrent_kernel.reuse_with(self.previous_state)

        r_state = tx.Add(self.r_current_w, self.r_recurrent_w)
        r_state = tx.Bias(r_state)
        r_gate = tx.Activation(r_state, fn=tx.sigmoid, name="r_gate")

        return self.kernel.tensor + self.recurrent_kernel.tensor
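# Reference sketch (not TensorX API): the GRU update the builder above is working toward,
# written with plain TensorFlow ops. Argument names (w_r, u_r, ...) are illustrative
# assumptions; this follows the common convention new_h = u * h_prev + (1 - u) * candidate.
import tensorflow as tf

def gru_step(x, h_prev, w_r, u_r, b_r, w_u, u_u, b_u, w_c, u_c, b_c):
    r = tf.sigmoid(tf.matmul(x, w_r) + tf.matmul(h_prev, u_r) + b_r)   # reset gate
    u = tf.sigmoid(tf.matmul(x, w_u) + tf.matmul(h_prev, u_u) + b_u)   # update gate
    c = tf.tanh(tf.matmul(x, w_c) + tf.matmul(r * h_prev, u_c) + b_c)  # candidate state
    return u * h_prev + (1.0 - u) * c                                  # new state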
def test_loss_model_dependencies():
    inputs = tx.Input(n_units=2, name="x", constant=False)
    labels = tx.Input(n_units=2, name="y_", constant=False)

    y = tx.Linear(inputs, 2, name="y")
    out1 = tx.Activation(y, tf.nn.softmax, name="out1")
    out2 = tx.Activation(y, tf.nn.softmax, name="out2")

    @tx.layer(n_units=2, name="loss")
    def loss(pred, labs):
        return tf.losses.categorical_crossentropy(labs, pred)

    logging.basicConfig(level=logging.DEBUG)

    model = tx.Model(run_inputs=inputs,
                     run_outputs=[out1, out2],
                     train_inputs=[inputs, labels],
                     train_outputs=[out2, out1],
                     train_loss=loss(out1, labels))

    lr = tx.Param(0.5)
    opt = model.set_optimizer(tf.optimizers.SGD, lr=lr)
    assert isinstance(opt, tf.optimizers.Optimizer)

    it = model.train_graph.dependency_iter()
    layers = list(it)
    assert layers[0] is inputs
    assert layers[1] is labels
    assert len(layers) == 6
def test_model_run():
    data1 = tf.constant([[1., 1.]])

    x = tx.Input(n_units=2, name="x", constant=False)
    labels = tx.Input(n_units=2, name="y_", constant=False)
    y = tx.Linear(x, 2, name="y")
    out1 = tx.Activation(y, tf.nn.softmax)
    out2 = tx.Activation(y, tf.nn.softmax)

    @tx.layer(n_units=2, name="loss")
    def loss(pred, labs):
        return tf.losses.categorical_crossentropy(labs, pred)

    model = tx.Model(run_inputs=x,
                     run_outputs=[out1, out2],
                     train_inputs=[x, labels],
                     train_outputs=out1,
                     train_loss=loss(out1, labels))

    model.set_optimizer(tf.optimizers.SGD, lr=0.5)

    result1 = model.run({x: data1})
    result2 = model.run([data1])
    assert tx.tensor_equal(result1[0], result2[0])
    assert tx.tensor_equal(result1[1], result2[1])

    result3 = model.run({x: data1}, compiled_graph=True)
    assert tx.tensor_equal(result3[0], result2[0])
    assert tx.tensor_equal(result3[1], result2[1])
def test_set_optimizer():
    x = tx.Input(n_units=2, name="x", constant=False)
    labels = tx.Input(n_units=2, name="labels", constant=False)

    y = tx.Linear(x, 2, name="y")
    out1 = tx.Activation(y, tf.nn.softmax)
    out2 = tx.Activation(y, tf.nn.softmax)

    @tx.layer(n_units=2, name="loss")
    def loss(pred, labs):
        return tf.losses.categorical_crossentropy(labs, pred)

    model = tx.Model(run_inputs=x,
                     run_outputs=[out1, out2],
                     train_inputs=[x, labels],
                     train_outputs=[out2, out1],
                     train_loss=loss(out1, labels))

    lr = tx.Param(0.5)
    opt = model.set_optimizer(tf.optimizers.SGD, learning_rate=lr, clipnorm=0.1)
    assert isinstance(opt, tf.optimizers.Optimizer)
    assert model.optimizer.get_config()["learning_rate"] == 0.5

    data1 = [[1., 1.], [1., 1.]]
    data2 = tf.constant([[0., 1.], [0., 1.]])

    params = model.optimizer_params[model.optimizer]
    data_dict, params_dict = tx.Model.parse_input(
        {x: data1, "learning_rate": 0.2},
        model.run_graph.in_nodes,
        params)
    assert len(data_dict) == 1
    assert len(params_dict) == 1
    assert model.optimizer_params[opt]["learning_rate"] is lr

    result1 = model.train_step({x: data1, labels: data2})
    result2 = model.train_step([data1, data2])
    assert len(result1) == 3
    assert len(result2) == 3
    assert tf.reduce_all(tf.less(result2[-1], result1[-1]))

    result1 = model.run({x: np.array(data1, dtype=np.float32)})
    result2 = model.run([data1])
    result3 = model.run(np.array(data1, np.float32))

    x.value = data1
    o2 = out2()
    o1 = out1()
    result4 = (o2, o1)

    for i in range(2):
        assert tx.tensor_equal(result1[i], result2[i])
        assert tx.tensor_equal(result1[i], result3[i])
        assert tx.tensor_equal(result1[i], result4[i])
def test_override_out_nodes():
    x = tx.Input(n_units=2, name="x", constant=False)
    y = tx.Linear(x, 2, name="y")
    out1 = tx.Activation(y, tf.nn.softmax, name="out1")
    out2 = tx.Activation(out1, tf.nn.softmax, name="out2")

    graph = Graph.build(inputs=x, outputs=[out1, out2])
    assert out1 in graph.out_nodes
    assert out2 in graph.out_nodes

    graph = Graph.build(inputs=x, outputs=out1)
    assert out1 in graph.out_nodes
    assert out2 not in graph.out_nodes
def _build_graph(self, layer, previous_state):
    with layer_scope(self):
        if previous_state is None:
            input_batch = tf.shape(layer.tensor)[0]
            zero_state = tf.zeros([input_batch, self.n_units])
            self.previous_state = tx.TensorLayer(zero_state, self.n_units)

        if self.share_state_with is None:
            kernel_linear = tx.Linear(layer,
                                      self.n_units,
                                      bias=True,
                                      weight_init=self.init,
                                      name="linear_kernel")
            kernel_act = tx.Activation(kernel_linear, self.activation)
            self.kernel = tx.Compose([kernel_linear, kernel_act])

            self.recurrent_kernel = tx.Linear(self.previous_state,
                                              self.n_units,
                                              bias=False,
                                              weight_init=self.recurrent_init,
                                              name="recurrent_kernel")
        else:
            self.kernel = self.share_state_with.kernel.reuse_with(layer)
            self.recurrent_kernel = self.share_state_with.recurrent_kernel.reuse_with(self.previous_state)

        # TODO this might be wrong, I might need to couple the activation: act(kernel + recurrent + bias)
        # TODO it is wrong, see https://github.com/tensorflow/tensorflow/blob/r1.8/tensorflow/python/ops/rnn_cell_impl.py
        # Most basic RNN: output = new_state = act(W * input + U * state + B)
        return self.kernel.tensor + self.recurrent_kernel.tensor
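# Reference sketch for the TODO above: the basic RNN applies the activation once over the
# combined pre-activation, act(W * input + U * state + B), rather than activating the input
# kernel alone. Plain TensorFlow; argument names are illustrative assumptions.
import tensorflow as tf

def basic_rnn_step(x, h_prev, w, u, b, act=tf.tanh):
    # output = new_state = act(x @ W + h_prev @ U + B)
    return act(tf.matmul(x, w) + tf.matmul(h_prev, u) + b)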
def test_reuse_dropout():
    x1 = tx.Constant(np.ones(shape=[2, 4]), dtype=tf.float32)
    x2 = tx.Activation(x1)

    drop1 = tx.Dropout(x2, probability=0.5, locked=True)
    assert len(drop1.inputs) == 2
    assert drop1.inputs[0] is x2
    assert drop1.inputs[-1] is drop1.layer_state.mask

    # shared state overrides mask?
    _, mask = tx.dropout(x2, return_mask=True)
    drop2 = drop1.reuse_with(x2, mask)
    assert len(drop2.inputs) == 2
    assert drop2.inputs[0] is x2
    assert drop2.inputs[-1] is drop2.layer_state.mask
    assert not tx.tensor_equal(drop1(), drop2())

    graph = tx.Graph.build(inputs=None, outputs=[drop1, drop2])
    out1, out2 = graph()
    assert tx.tensor_equal(out1, out2)

    drop1 = tx.Dropout(x2, probability=0.5)
    drop2 = drop1.reuse_with(x1)
    graph.eval(drop1, drop2)
def test_to_sparse():
    inputs = tx.Input(init_value=tf.ones([2, 100]))
    linear = tx.Linear(inputs, n_units=100)
    relu = tx.Activation(linear, tx.relu)
    sparse = tx.ToSparse(relu)

    assert tx.shape_equal(sparse.shape, linear.shape)
    assert tx.shape_equal(sparse.shape, relu.shape)
def test_build_graph():
    x1 = tx.Input(n_units=1000, constant=False, dtype=tf.float32)
    x2 = tx.Input(init_value=tf.ones([1, 3]), dtype=tf.float32, constant=True)

    y10 = tx.Linear(x1, n_units=3)
    y11 = tx.Activation(y10)
    y1 = tx.Module(x1, y11)
    y2 = tx.Add(y1, x2)
    output = y2

    graph = Graph.build(inputs=None, outputs=[y1, y2])
    # module condenses 2 nodes so it's 4 and not 6
    assert len(graph.nodes) == 4

    @tf.function
    def simple_graph(in0):
        x1.value = in0
        return y2()

    simple_graph_2 = Graph.build(inputs=[x1, x2], outputs=y2)
    simple_graph_2 = tf.function(simple_graph_2)
    g = Graph.build(inputs=[x1, x2], outputs=y2)
    y2fn = y2.as_function()
    data = tf.ones([256, 1000])
    x1.value = data

    compiled_fn = g.as_function(ord_inputs=x1, ord_outputs=output)

    assert tx.tensor_equal(compiled_fn(data), y2fn())
    assert tx.tensor_equal(compiled_fn(data), simple_graph_2()[0])

    from timeit import timeit

    def update_run():
        x1.value = tf.random.uniform([256, 1000])
        return y2fn()

    n = 1000
    t_update_run = timeit(update_run, number=n)
    t_generated = timeit(lambda: compiled_fn(tf.random.uniform([256, 1000])), number=n)
    t_compile_value_set = timeit(lambda: simple_graph(tf.random.uniform([256, 1000])), number=n)
    t_graph_call_tf = timeit(lambda: simple_graph_2(tf.random.uniform([256, 1000])), number=n)

    assert t_generated < t_update_run
    assert t_generated < t_compile_value_set
    assert t_generated < t_graph_call_tf
    assert t_update_run > t_compile_value_set

    o1 = compiled_fn(tf.random.uniform([256, 1000]))
    o2 = compiled_fn(tf.random.uniform([256, 1000]))
    assert not tx.tensor_equal(o1, o2)
def test_model_train():
    x = tx.Input(n_units=2, name="x", constant=False)
    labels = tx.Input(n_units=2, name="labels", constant=False)

    y = tx.Linear(x, 2, name="y1", add_bias=False)
    out1 = tx.Activation(y, tf.nn.softmax)
    out2 = tx.Activation(y, tf.nn.softmax)

    @tx.layer(n_units=2, name="loss")
    def loss(pred, labs):
        return tf.losses.categorical_crossentropy(labs, pred)

    model = tx.Model(run_inputs=x,
                     run_outputs=[out1, out2],
                     train_inputs=[x, labels],
                     train_outputs=[out2, out1],
                     train_loss=loss(out1, labels))

    lr = tx.Param(0.5)
    opt = model.set_optimizer(tf.optimizers.SGD, learning_rate=lr, clipnorm=0.1)

    data1 = [[1., 1.], [1., 1.]]
    data2 = [[0., 1.], [0., 1.]]

    w1 = y.weights.numpy()
    epochs = 100
    model.train(train_data=[{x: data1, labels: data2}], epochs=epochs)
    w2 = y.weights.value()

    y.weights.assign(w1)
    for _ in range(epochs):
        model.train_step(input_feed={x: data1, labels: data2})
    w3 = y.weights.value()

    assert tx.tensor_equal(w2, w3)
def test_gate():
    inputs = tx.Input(init_value=tf.ones([2, 3]))
    linear = tx.Linear(inputs, n_units=4)
    nop = tx.Activation(linear, fn=tx.identity)
    gate_w = tx.Linear(linear, n_units=4, add_bias=True)

    gate1 = tx.Gate(linear, gate_w)
    gate2 = gate1.reuse_with(nop)
    assert tx.shape_equal(gate1.shape, gate2.shape)

    r1 = gate1()
    r2 = gate2()
    assert tx.tensor_equal(r1, r2)
def test_fully_connected():
    x1 = tx.Input(init_value=[[1., 1., 1., 1.]],
                  n_units=4,
                  dtype=tf.float32,
                  constant=True)
    x2 = tx.Input(init_value=np.random.uniform(size=[2, 4]),
                  dtype=tf.float32,
                  n_units=4,
                  constant=True)

    y1 = tx.FC(x1, 4, add_bias=True, activation=tf.sigmoid)
    y2 = tx.Linear(x1, 4, add_bias=True, weights=y1.linear.weights, bias=y1.linear.bias)
    a2 = tx.Activation(y2, fn=tf.sigmoid)

    w = y2.weights
    b = y2.bias
    assert y1.linear.weights is w
    assert y1.linear.bias is b

    x = x1()
    y = tf.matmul(x, w) + b
    a = tf.sigmoid(y)

    assert tx.tensor_equal(y2(), y)
    assert tx.tensor_equal(y1(), a)
    assert tx.tensor_equal(y1(), a2())
    assert tx.tensor_equal(a2(), a)

    y1 = y1.reuse_with(x2)
    y2 = y2.reuse_with(x2)

    assert y2.weights is w
    assert y2.bias is b
    assert y1.linear.weights is w
    assert y1.linear.bias is b
def test_activation():
    inputs = tx.Input(init_value=tf.ones([2, 2]), n_units=2)
    output = tx.Activation(inputs, tf.sigmoid)
    assert tx.shape_equal(inputs.shape, output.shape)
data = np.concatenate([v, labels], -1)
data = repeat_it(data, 2)
data = shuffle_it(iter(data), buffer_size=batch_size * 4)
data = batch_it(data, batch_size)

label_layer = tx.Input(1)
in_layer = tx.Input(M)

f1 = tx.FC(in_layer, 512, activation=tf.nn.tanh)
f2 = tx.FC(f1, 512, activation=tf.nn.relu)
fm = tx.Highway(f1, f2, carry_gate=True)

out = tx.Linear(f2, 1)
out_prob = tx.Activation(out, fn=tx.sigmoid)

loss = tx.binary_cross_entropy(labels=label_layer.tensor, logits=out.tensor)

model = tx.Model(run_inputs=in_layer,
                 run_outputs=out_prob,
                 train_in_loss=label_layer,
                 train_out_loss=loss)

runner = tx.ModelRunner(model)
runner.config_optimizer(optimizer=tf.train.AdamOptimizer(learning_rate=0.001))
runner.init_vars()

for data_batch in data:
    data_batch = np.array(data_batch)
    ctx_vector = data_batch[:, :-1]
def __init__(self, ctx_size, vocab_size, k_dim, ri_tensor: RandomIndexTensor, embed_dim, embed_init=tx.random_uniform(minval=-0.01, maxval=0.01), x_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01), logit_init=tx.random_uniform(minval=-0.01, maxval=0.01), embed_share=True, logit_bias=False, use_gate=True, use_hidden=False, h_dim=100, h_activation=tx.elu, h_init=tx.he_normal_init(), h_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01), use_dropout=True, embed_dropout=False, keep_prob=0.95, l2_loss=False, l2_loss_coef=1e-5): # GRAPH INPUTS run_inputs = tx.Input(ctx_size, dtype=tf.int32, name="input") loss_inputs = tx.Input(n_units=1, dtype=tf.int32, name="target") eval_inputs = loss_inputs # RUN GRAPH ===================================================== var_reg = [] with tf.name_scope("run"): # RI ENCODING =============================================== # convert ids to ris gather a set of random indexes based on the ids in a sequence # ri_layer = tx.TensorLayer(ri_tensor, n_units=k_dim) # ri_inputs = tx.gather_sparse(ri_layer.tensor, run_inputs.tensor) with tf.name_scope("ri_encode"): # used to compute logits if isinstance(ri_tensor, RandomIndexTensor): ri_layer = tx.TensorLayer(ri_tensor.to_sparse_tensor(), k_dim) ri_inputs = ri_tensor.gather(run_inputs.tensor) ri_inputs = ri_inputs.to_sparse_tensor() ri_inputs = tx.TensorLayer(ri_inputs, k_dim) else: ri_layer = tx.TensorLayer(ri_tensor, k_dim) ri_inputs = tx.gather_sparse(ri_layer.tensor, run_inputs.tensor) ri_inputs = tx.TensorLayer(ri_inputs, k_dim) # use those sparse indexes to lookup a set of features based on the ri values feature_lookup = tx.Lookup(ri_inputs, ctx_size, [k_dim, embed_dim], embed_init, name="lookup") var_reg.append(feature_lookup.weights) feature_lookup = feature_lookup.as_concat() # =========================================================== if use_gate or use_hidden: hl = tx.Linear(feature_lookup, h_dim, h_init, bias=True, name="h_linear") ha = tx.Activation(hl, h_activation, name="h_activation") h = tx.Compose(hl, ha, name="hidden") var_reg.append(hl.weights) features = feature_lookup if use_gate: features = tx.Gate(features, ctx_size, gate_input=h) gate = features var_reg.append(features.gate_weights) x_to_f = tx.Linear(features, embed_dim, x_to_f_init, bias=True, name="x_to_f") var_reg.append(x_to_f.weights) f_prediction = x_to_f if use_hidden: h_to_f = tx.Linear(h, embed_dim, h_to_f_init, bias=True, name="h_to_f") var_reg.append(h_to_f.weights) f_prediction = tx.Add(x_to_f, h_to_f, name="f_predicted") # RI DECODING =============================================== shared_weights = feature_lookup.weights if embed_share else None logit_init = logit_init if not embed_share else None # embedding feature vectors for all words: shape [vocab_size, embed_dim] # later, for NCE we don't need to get all the features all_embeddings = tx.Linear(ri_layer, embed_dim, logit_init, shared_weights, name="logits", bias=False) # dot product of f_predicted . 
all_embeddings with bias for each target word run_logits = tx.Linear(f_prediction, n_units=vocab_size, shared_weights=all_embeddings.tensor, transpose_weights=True, bias=logit_bias) if not embed_share: var_reg.append(all_embeddings.weights) # =========================================================== run_embed_prob = tx.Activation(run_logits, tx.softmax) # TRAIN GRAPH =================================================== with tf.name_scope("train"): if use_dropout and embed_dropout: feature_lookup = feature_lookup.reuse_with(ri_inputs) features = tx.Dropout(feature_lookup, probability=keep_prob) else: features = feature_lookup if use_gate or use_hidden: if use_dropout: h = h.reuse_with(features) h = tx.Dropout(h, probability=keep_prob) if use_gate: features = gate.reuse_with(features, gate_input=h) f_prediction = x_to_f.reuse_with(features) if use_hidden: h_to_f = h_to_f.reuse_with(h) if use_dropout: h_to_f = tx.Dropout(h_to_f, probability=keep_prob) f_prediction = tx.Add(f_prediction, h_to_f) else: f_prediction = f_prediction.reuse_with(features) # we already define all_embeddings from which these logits are computed before so this should be ok train_logits = run_logits.reuse_with(f_prediction) train_embed_prob = tx.Activation(train_logits, tx.softmax, name="train_output") one_hot = tx.dense_one_hot(column_indices=loss_inputs.tensor, num_cols=vocab_size) train_loss = tx.categorical_cross_entropy(one_hot, train_logits.tensor) train_loss = tf.reduce_mean(train_loss) if l2_loss: losses = [tf.nn.l2_loss(var) for var in var_reg] train_loss = train_loss + l2_loss_coef * tf.add_n(losses) # EVAL GRAPH =============================================== with tf.name_scope("eval"): one_hot = tx.dense_one_hot(column_indices=eval_inputs.tensor, num_cols=vocab_size) eval_loss = tx.categorical_cross_entropy(one_hot, run_logits.tensor) eval_loss = tf.reduce_mean(eval_loss) # SETUP MODEL CONTAINER ==================================== super().__init__(run_inputs=run_inputs, run_outputs=run_embed_prob, train_inputs=run_inputs, train_outputs=train_embed_prob, eval_inputs=run_inputs, eval_outputs=run_embed_prob, train_out_loss=train_loss, train_in_loss=loss_inputs, eval_out_score=eval_loss, eval_in_score=eval_inputs)
def __init__( self, inputs, label_inputs, vocab_size, embed_dim, h_dim, embed_init=tx.random_uniform(minval=-0.01, maxval=0.01), logit_init=tx.random_uniform(minval=-0.01, maxval=0.01), num_h=1, h_activation=tx.elu, h_init=tx.he_normal_init(), use_dropout=False, embed_dropout=False, drop_probability=0.05, l2_loss=False, l2_weight=1e-5, use_f_predict=False, f_init=tx.random_uniform(minval=-0.01, maxval=0.01), embed_share=False, logit_bias=False, use_nce=False, nce_samples=10, ): if not isinstance(inputs, tx.Input): raise TypeError("inputs must be an Input layer") self.inputs = inputs self.labels = label_inputs if not isinstance(label_inputs, tx.Input): raise TypeError("labels must be an Input layer") if inputs.dtype != tf.int32 and inputs.dtype != tf.int64: raise TypeError( "Invalid dtype for input: expected int32 or int64, got {}". format(inputs.dtype)) if num_h < 0: raise ValueError("num hidden should be >= 0") ctx_size = inputs.n_units # =============================================== # RUN GRAPH # =============================================== var_reg = [] with tf.name_scope("run"): # feature lookup embeddings = tx.Lookup(inputs, ctx_size, [vocab_size, embed_dim], weight_init=embed_init) var_reg.append(embeddings.weights) feature_lookup = embeddings.as_concat() last_layer = feature_lookup h_layers = [] for i in range(num_h): h_i = tx.FC(layer=last_layer, n_units=h_dim, activation=h_activation, weight_init=h_init, add_bias=True, name="h_{}".format(i + 1)) h_layers.append(h_i) last_layer = h_i var_reg.append(h_i.linear.weights) # feature prediction for Energy-Based Model if use_f_predict: last_layer = tx.Linear(last_layer, embed_dim, f_init, add_bias=True, name="f_predict") var_reg.append(last_layer.weights) f_predict = last_layer shared_weights = feature_lookup.weights if embed_share else None transpose_weights = embed_share logit_init = logit_init if not embed_share else None run_logits = tx.Linear(last_layer, n_units=vocab_size, weight_init=logit_init, shared_weights=shared_weights, transpose_weights=transpose_weights, add_bias=logit_bias, name="logits") if not embed_share: var_reg.append(run_logits.weights) run_output = tx.Activation(run_logits, tx.softmax, name="run_output") # =============================================== # TRAIN GRAPH # =============================================== with tf.name_scope("train"): if use_dropout and embed_dropout: last_layer = tx.Dropout(feature_lookup, probability=drop_probability, name="dropout_features") else: last_layer = feature_lookup # add dropout between each layer for i, layer in enumerate(h_layers): h = layer.reuse_with(last_layer) if use_dropout: h = tx.Dropout(h, probability=drop_probability, name="dropout_{}".format(i + 1)) last_layer = h # feature prediction for Energy-Based Model if use_f_predict: last_layer = f_predict.reuse_with(last_layer) train_logits = run_logits.reuse_with(last_layer, name="train_logits") train_output = tx.Activation(train_logits, tx.softmax, name="train_output") def categorical_loss(labels, logits): labels = tx.dense_one_hot(column_indices=labels, num_cols=vocab_size) loss = tx.categorical_cross_entropy(labels=labels, logits=logits) return tf.reduce_mean(loss) def nce_loss(labels, weights, bias, predict): noise = uniform_sampler(labels, 1, nce_samples, True, vocab_size) loss = tf.nn.nce_loss(weights=weights, biases=bias, inputs=predict, labels=labels, num_sampled=nce_samples, num_classes=vocab_size, num_true=1, sampled_values=noise) return tf.reduce_mean(loss) if use_nce: bias = 
tx.VariableLayer(var_shape=[vocab_size], name="nce_bias") nce_weights = tx.WrapLayer(embeddings, n_units=embeddings.n_units, wrap_fn=lambda x: x.weights, layer_fn=True) train_loss = tx.LambdaLayer(label_inputs, nce_weights, bias, last_layer, apply_fn=nce_loss, name="nce_loss") else: train_loss = tx.LambdaLayer(label_inputs, train_logits, apply_fn=categorical_loss, name="train_loss") if l2_loss: l2_losses = [tf.nn.l2_loss(var) for var in var_reg] train_loss = tx.WrapLayer( train_loss, wrap_fn=lambda x: x + l2_weight * tf.add_n(l2_losses), name="train_loss_l2") # =============================================== # EVAL GRAPH # =============================================== with tf.name_scope("eval"): eval_loss = tx.LambdaLayer(label_inputs, run_logits, apply_fn=categorical_loss, name="eval_loss") # BUILD MODEL super().__init__(run_outputs=run_output, run_inputs=inputs, train_inputs=[inputs, label_inputs], train_outputs=train_output, train_loss=train_loss, eval_inputs=[inputs, label_inputs], eval_outputs=run_output, eval_score=eval_loss)
all_embeddings = tx.Linear(ri_layer,
                           embed_size,
                           shared_weights=lookup.weights,
                           name="all_features",
                           bias=False)

# dot product of f_predicted . all_embeddings with a bias for each target word
run_logits = tx.Linear(feature_predict,
                       vocab_size,
                       shared_weights=all_embeddings.tensor,
                       transpose_weights=True,
                       bias=False,
                       name="logits")
embed_prob = tx.Activation(run_logits, tx.softmax, name="run_output")

one_hot = tx.dense_one_hot(column_indices=input_labels.tensor, num_cols=vocab_size)
val_loss = tx.categorical_cross_entropy(one_hot, run_logits.tensor)
val_loss = tf.reduce_mean(val_loss)

# *************************************
# Testing adaptive noise
# *************************************
noise_logits = tx.Linear(lookup, k, bias=True)
adaptive_noise = tx.sample_sigmoid_from_logits(noise_logits.tensor, n=1)
adaptive_noise = tx.TensorLayer(adaptive_noise, n_units=k)
# adaptive_noise = tx.to_sparse(adaptive_noise)
# *************************************
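# Minimal sketch of the tied-weight decoding used above, written with plain TensorFlow
# tensors rather than TensorX layers: the logits are the dot product of the predicted
# feature vector with every embedding row (optionally plus a per-word bias), which is
# what the Linear layer with shared, transposed weights computes.
import tensorflow as tf

def tied_logits(f_predicted, embeddings, bias=None):
    # f_predicted: [batch, embed_dim], embeddings: [vocab_size, embed_dim]
    logits = tf.matmul(f_predicted, embeddings, transpose_b=True)  # [batch, vocab_size]
    return logits if bias is None else logits + bias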
def __init__(self, inputs, labels, vocab_size, embed_dim, h_dim, embed_init=tx.random_uniform(minval=-0.01, maxval=0.01), logit_init=tx.random_uniform(minval=-0.01, maxval=0.01), num_h=1, h_activation=tx.tanh, h_init=tx.he_normal_init(), reset_state=True, embed_dropout=False, w_dropout=False, u_dropconnect=False, other_dropout=False, w_keep_prob=0.9, u_keep_prob=0.9, embed_keep_prob=0.9, other_keep_prob=0.9, l2_loss=False, l2_weight=1e-5, use_f_predict=False, f_init=tx.random_uniform(minval=-0.01, maxval=0.01), embed_share=False, logit_bias=False, use_nce=False, nce_samples=10, ): if not isinstance(inputs, tx.Input): raise TypeError("inputs must be an Input layer") self.inputs = inputs self.labels = labels if not isinstance(labels, tx.Input): raise TypeError("labels must be an Input layer") if inputs.dtype != tf.int32 and inputs.dtype != tf.int64: raise TypeError("Invalid dtype for input: expected int32 or int64, got {}".format(inputs.dtype)) if num_h < 0: raise ValueError("num hidden should be >= 0") ctx_size = inputs.n_units # =============================================== # RUN GRAPH # =============================================== var_reg = [] with tf.name_scope("run"): # feature lookup embeddings = tx.Lookup(inputs, ctx_size, [vocab_size, embed_dim], weight_init=embed_init) var_reg.append(embeddings.weights) feature_lookup = embeddings.permute_batch_time() last_layer = feature_lookup last_feature_layer = feature_lookup for i in range(num_h): h_i = tx.QRNN(feature_lookup, n_units=h_dim, activation=h_activation, filter_size= ) last_layer = h_i # save last state, this will be used by state of first cell var_reg += [wi.weights for wi in last_layer.w] var_reg += [ui.weights for ui in last_layer.u] if not reset_state: last_layer = zero_state.reuse_with(last_layer, name="cache_last_state") # feature prediction for Energy-Based Model if use_f_predict: last_layer = tx.Linear(last_layer, embed_dim, f_init, add_bias=True, name="f_predict") var_reg.append(last_layer.weights) f_predict = last_layer shared_weights = feature_lookup.weights if embed_share else None transpose_weights = embed_share logit_init = logit_init if not embed_share else None run_logits = tx.Linear(last_layer, n_units=vocab_size, weight_init=logit_init, shared_weights=shared_weights, transpose_weights=transpose_weights, add_bias=logit_bias, name="logits") if not embed_share: var_reg.append(run_logits.weights) run_output = tx.Activation(run_logits, tx.softmax, name="run_output") # =============================================== # TRAIN GRAPH # =============================================== with tf.name_scope("train"): embeddings = embeddings.reuse_with(inputs) feature_lookup = embeddings.as_seq() if other_dropout and embed_dropout: feature_lookup = tx.Dropout(feature_lookup, probability=embed_keep_prob, name="drop_features") # last_layer = last_layer.as_seq() # add dropout between each layer # for i, layer in enumerate(h_layers): cell = lstm_cells[0] for i in range(ctx_size): if i == 0: h = cell.reuse_with(input_layer=feature_lookup[i], previous_state=None, # copy from first cell previous_memory=None, # copy from first cell regularized=w_dropout or u_dropconnect, name="lstm_cell_{}".format(i)) else: h = cell.reuse_with(input_layer=feature_lookup[i], previous_state=last_layer, name="lstm_cell_{}".format(i)) cell = h # if use_dropout: # h = tx.ZoneOut(h, # previous_layer=h.previous_state, # keep_prob=keep_prob, # name="zoneout_{}".format(i)) last_layer = h if not reset_state: last_layer = zero_state.reuse_with(last_layer, 
name="cache_last_cell") # feature prediction for Energy-Based Model if use_f_predict: last_layer = f_predict.reuse_with(last_layer) train_logits = run_logits.reuse_with(last_layer, name="train_logits") train_output = tx.Activation(train_logits, tx.softmax, name="train_output") def categorical_loss(labels, logits): labels = tx.dense_one_hot(column_indices=labels, num_cols=vocab_size) loss = tx.categorical_cross_entropy(labels=labels, logits=logits) # loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels,logits=logits) return tf.reduce_mean(loss) def nce_loss(labels, weights, bias, predict): noise = uniform_sampler(labels, 1, nce_samples, True, vocab_size) loss = tf.nn.nce_loss(weights=weights, biases=bias, inputs=predict, labels=labels, num_sampled=nce_samples, num_classes=vocab_size, num_true=1, sampled_values=noise) return tf.reduce_mean(loss) if use_nce: bias = tx.VariableLayer(var_shape=[vocab_size], name="nce_bias") nce_weights = tx.WrapLayer(embeddings, n_units=embeddings.n_units, wrap_fn=lambda x: x.weights, layer_fn=True) train_loss = tx.LambdaLayer(labels, nce_weights, bias, last_layer, apply_fn=nce_loss, name="nce_loss") else: train_loss = tx.LambdaLayer(labels, train_logits, apply_fn=categorical_loss, name="train_loss") if l2_loss: l2_losses = [tf.nn.l2_loss(var) for var in var_reg] train_loss = tx.LambdaLayer(train_loss, apply_fn=lambda x: x + l2_weight * tf.add_n(l2_losses), name="train_loss_l2") # =============================================== # EVAL GRAPH # =============================================== with tf.name_scope("eval"): eval_loss = tx.LambdaLayer(labels, run_logits, apply_fn=categorical_loss, name="eval_loss") # BUILD MODEL super().__init__(run_outputs=run_output, run_inputs=inputs, train_inputs=[inputs, labels], train_outputs=train_output, train_loss=train_loss, eval_inputs=[inputs, labels], eval_outputs=run_output, eval_score=eval_loss)
from tqdm import tqdm

n_hidden = 20
embed_dim = 10
seq_size = 2
vocab_size = 100
feature_shape = [vocab_size, embed_dim]

loss_inputs = tx.Input(1, dtype=tf.int32)
in_layer = tx.Input(seq_size, dtype=tf.int32)

# output shape: [batch x seq_size * feature_shape[1]]
lookup = tx.Lookup(in_layer, seq_size=seq_size, lookup_shape=feature_shape)

h = tx.Linear(lookup, n_hidden, bias=True)
ha = tx.Activation(h, tx.elu)
h = tx.Compose(h, ha)

logits = tx.Linear(h, vocab_size, bias=True)
out = tx.Activation(logits, tx.softmax)

labels = tx.dense_one_hot(loss_inputs.tensor, vocab_size)
loss = tf.reduce_mean(tx.categorical_cross_entropy(labels=labels, logits=logits.tensor))

# setup optimizer
optimizer = tx.AMSGrad(learning_rate=0.01)

model = tx.Model(run_inputs=in_layer,
                 run_outputs=out,
                 train_inputs=in_layer,
# reshape to [batch x seq_size x feature_shape[1]]
lookup_to_seq = tf.reshape(lookup.tensor, [-1, seq_size, embed_dim])

# type of rnn cell
cell = tf.nn.rnn_cell.LSTMCell(num_units=n_hidden, state_is_tuple=True)
val, state = tf.nn.dynamic_rnn(cell, lookup_to_seq, dtype=tf.float32)
val = tf.transpose(val, [1, 0, 2])

# last = tf.gather(val, int(val.get_shape()[0]) - 1)
last = val[-1]

lstm_out = tx.TensorLayer(last, n_hidden)
logits = tx.Linear(lstm_out, vocab_size, bias=True)
out = tx.Activation(logits, tx.softmax)

labels = tx.dense_one_hot(loss_inputs.tensor, vocab_size)
loss = tf.reduce_mean(tx.categorical_cross_entropy(labels=labels, logits=logits.tensor))

# setup optimizer
optimizer = tx.AMSGrad(learning_rate=0.01)

model = tx.Model(run_inputs=in_layer,
                 run_outputs=out,
                 train_inputs=in_layer,
                 train_outputs=out,
                 train_in_loss=loss_inputs,
                 train_out_loss=loss,
                 eval_out_score=loss,
                 eval_in_score=loss_inputs)

print(model.feedable_train())
runner = tx.ModelRunner(model)
def __init__(self, ctx_size, vocab_size, embed_dim, embed_init=tx.random_uniform(minval=-0.01, maxval=0.01), x_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01), logit_init=tx.random_uniform(minval=-0.01, maxval=0.01), embed_share=True, use_gate=True, use_hidden=False, h_dim=100, h_activation=tx.elu, h_init=tx.he_normal_init(), h_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01), use_dropout=True, embed_dropout=False, keep_prob=0.95, l2_loss=False, l2_loss_coef=1e-5, use_nce=False, nce_samples=100): # GRAPH INPUTS run_inputs = tx.Input(ctx_size, dtype=tf.int32, name="input") loss_inputs = tx.Input(n_units=1, dtype=tf.int32, name="target") eval_inputs = loss_inputs # RUN GRAPH # if I create a scope here the Tensorboard graph will be a mess to read # because it groups everything by nested scope names # instead if I choose to create different scopes for train and eval only # the graph stays readable because it allows us to use the same names # under different scopes while still sharing variables var_reg = [] with tf.name_scope("run"): feature_lookup = tx.Lookup(run_inputs, ctx_size, [vocab_size, embed_dim], embed_init, name="lookup") var_reg.append(feature_lookup.weights) feature_lookup = feature_lookup.as_concat() if use_gate or use_hidden: hl = tx.Linear(feature_lookup, h_dim, h_init, bias=True, name="h_linear") ha = tx.Activation(hl, h_activation, name="h_activation") h = tx.Compose(hl, ha, name="hidden") var_reg.append(hl.weights) features = feature_lookup if use_gate: gate_w = tx.Linear(h, ctx_size, bias=True) gate = tx.Gate(features, gate_input=gate_w) # gate = tx.Module([h, features], gate) features = gate var_reg.append(gate_w.weights) x_to_f = tx.Linear(features, embed_dim, x_to_f_init, bias=True, name="x_to_f") var_reg.append(x_to_f.weights) f_prediction = x_to_f if use_hidden: h_to_f = tx.Linear(h, embed_dim, h_to_f_init, bias=True, name="h_to_f") var_reg.append(h_to_f.weights) f_prediction = tx.Add(x_to_f, h_to_f, name="f_predicted") # RI DECODING =============================================== shared_weights = tf.transpose( feature_lookup.weights) if embed_share else None logit_init = logit_init if not embed_share else None run_logits = tx.Linear(f_prediction, vocab_size, logit_init, shared_weights, bias=True, name="logits") if not embed_share: var_reg.append(run_logits.weights) y_prob = tx.Activation(run_logits, tx.softmax) # TRAIN GRAPH =============================================== with tf.name_scope("train"): if use_dropout and embed_dropout: feature_lookup = feature_lookup.reuse_with(run_inputs) features = tx.Dropout(feature_lookup, probability=keep_prob) else: features = feature_lookup if use_gate or use_hidden: if use_dropout: h = h.reuse_with(features) h = tx.Dropout(h, probability=keep_prob) if use_gate: gate_w = gate_w.reuse_with(h) features = gate.reuse_with(layer=features, gate_input=gate_w) f_prediction = x_to_f.reuse_with(features) if use_hidden: h_to_f = h_to_f.reuse_with(h) if use_dropout: h_to_f = tx.Dropout(h_to_f, probability=keep_prob) f_prediction = tx.Add(f_prediction, h_to_f) else: f_prediction = f_prediction.reuse_with(features) train_logits = run_logits.reuse_with(f_prediction) if use_nce: # uniform gets good enough results if enough samples are used # but we can load the empirical unigram distribution # or learn the unigram distribution during training sampled_values = uniform_sampler(loss_inputs.tensor, 1, nce_samples, True, vocab_size) train_loss = tf.nn.nce_loss(weights=tf.transpose( train_logits.weights), biases=train_logits.bias, 
inputs=f_prediction.tensor, labels=loss_inputs.tensor, num_sampled=nce_samples, num_classes=vocab_size, num_true=1, sampled_values=sampled_values) else: one_hot = tx.dense_one_hot(column_indices=loss_inputs.tensor, num_cols=vocab_size) train_loss = tx.categorical_cross_entropy( one_hot, train_logits.tensor) train_loss = tf.reduce_mean(train_loss) if l2_loss: losses = [tf.nn.l2_loss(var) for var in var_reg] train_loss = train_loss + l2_loss_coef * tf.add_n(losses) # EVAL GRAPH =============================================== with tf.name_scope("eval"): one_hot = tx.dense_one_hot(column_indices=eval_inputs.tensor, num_cols=vocab_size) eval_loss = tx.categorical_cross_entropy(one_hot, run_logits.tensor) eval_loss = tf.reduce_mean(eval_loss) # SETUP MODEL CONTAINER ==================================== super().__init__(run_inputs=run_inputs, run_outputs=y_prob, train_inputs=run_inputs, train_outputs=y_prob, eval_inputs=run_inputs, eval_outputs=y_prob, train_out_loss=train_loss, train_in_loss=loss_inputs, eval_out_score=eval_loss, eval_in_score=eval_inputs)
all_embeddings = tx.Linear(ri_layer,
                           embed_size,
                           shared_weights=lookup.weights,
                           name="all_features",
                           bias=False)

# dot product of f_predicted . all_embeddings with a bias for each target word
run_logits = tx.Linear(feature_predict,
                       vocab_size,
                       shared_weights=all_embeddings.tensor,
                       transpose_weights=True,
                       bias=False,
                       name="logits")
embed_prob = tx.Activation(run_logits, tx.softmax, name="run_output")

one_hot = tx.dense_one_hot(column_indices=input_labels.tensor, num_cols=vocab_size)
val_loss = tx.categorical_cross_entropy(one_hot, run_logits.tensor)
val_loss = tf.reduce_mean(val_loss)

# *************************************
# Testing adaptive noise
# *************************************
# TODO I need to test the infinite vocab scenario where we try to generate
# RIs directly; we can use sparsemax in that case
noise_logits = tx.Linear(lookup, vocab_size, bias=True)
adaptive_noise = tx.Activation(noise_logits, tx.softmax)
# adaptive_noise = tx.sample_sigmoid_from_logits(noise_logits.tensor, n=1)
def __init__(self, ctx_size, vocab_size, k_dim, s_active, ri_tensor, embed_dim, h_dim, embed_init=tx.random_uniform(minval=-0.01, maxval=0.01), logit_init=tx.random_uniform(minval=-0.01, maxval=0.01), num_h=1, h_activation=tx.relu, h_init=tx.he_normal_init, use_dropout=False, embed_dropout=False, keep_prob=0.95, l2_loss=False, l2_loss_coef=1e-5, f_init=tx.random_uniform(minval=-0.01, maxval=0.01), embed_share=True, logit_bias=False, use_nce=False, nce_samples=100, noise_level=0.1): run_inputs = tx.Input(ctx_size, dtype=tf.int32) loss_inputs = tx.Input(n_units=1, dtype=tf.int64) eval_inputs = loss_inputs if run_inputs.dtype != tf.int32 and run_inputs.dtype != tf.int64: raise TypeError( "Invalid dtype for input: expected int32 or int64, got {}". format(run_inputs.dtype)) if num_h < 0: raise ValueError("num hidden should be >= 0") # =============================================== # RUN GRAPH # =============================================== var_reg = [] with tf.name_scope("run"): # RI ENCODING =============================================== # convert ids to ris gather a set of random indexes based on the ids in a sequence # ri_layer = tx.TensorLayer(ri_tensor, n_units=k_dim) # ri_inputs = tx.gather_sparse(ri_layer.tensor, run_inputs.tensor) # ri_inputs = tx.TensorLayer(ri_inputs, n_units=k_dim) with tf.name_scope("ri_encode"): if isinstance(ri_tensor, RandomIndexTensor): ri_tensor = ri_tensor ri_layer = tx.TensorLayer(ri_tensor.to_sparse_tensor(), k_dim, shape=[vocab_size, k_dim]) ri_inputs = ri_tensor.gather(run_inputs.tensor) ri_inputs = ri_inputs.to_sparse_tensor() ri_inputs = tx.TensorLayer( ri_inputs, k_dim, shape=[ri_inputs.get_shape()[0], k_dim]) # ri_tensor is a sparse tensor else: raise TypeError( "please supply RandomIndexTensor instead of sparse Tensor" ) # ri_layer = tx.TensorLayer(ri_tensor, k_dim) # ri_inputs = tx.gather_sparse(ri_layer.tensor, run_inputs.tensor) # ri_inputs = tx.TensorLayer(ri_inputs, k_dim) feature_lookup = tx.Lookup(ri_inputs, ctx_size, [k_dim, embed_dim], embed_init, name="lookup") self.embeddings = feature_lookup var_reg.append(feature_lookup.weights) feature_lookup = feature_lookup.as_concat() # =========================================================== last_layer = feature_lookup h_layers = [] for i in range(num_h): h_i = tx.Linear(last_layer, h_dim, h_init, bias=True, name="h_{i}_linear".format(i=i)) h_a = tx.Activation(h_i, h_activation) h = tx.Compose(h_i, h_a, name="h_{i}".format(i=i)) h_layers.append(h) last_layer = h var_reg.append(h_i.weights) self.h_layers = h_layers # feature prediction for Energy-Based Model f_prediction = tx.Linear(last_layer, embed_dim, f_init, bias=True, name="f_predict") var_reg.append(f_prediction.weights) # RI DECODING =============================================== # Shared Embeddings if embed_share: shared_weights = feature_lookup.weights if embed_share else None logit_init = logit_init if not embed_share else None # ri_dense = tx.ToDense(ri_layer) all_embeddings = tx.Linear(ri_layer, embed_dim, logit_init, shared_weights, name="all_features", bias=False) # dot product of f_predicted . 
all_embeddings with bias for each target word run_logits = tx.Linear(f_prediction, vocab_size, shared_weights=all_embeddings.tensor, transpose_weights=True, bias=logit_bias, name="logits") else: run_logits = tx.Linear(f_prediction, vocab_size, bias=logit_bias, name="logits") if not embed_share: var_reg.append(run_logits.weights) # =========================================================== embed_prob = tx.Activation(run_logits, tx.softmax, name="run_output") # =============================================== # TRAIN GRAPH # =============================================== with tf.name_scope("train"): if use_dropout and embed_dropout: feature_lookup = feature_lookup.reuse_with(ri_inputs) last_layer = tx.Dropout(feature_lookup, probability=keep_prob) else: last_layer = feature_lookup # add dropout between each layer for layer in h_layers: h = layer.reuse_with(last_layer) if use_dropout: h = tx.Dropout(h, probability=keep_prob) last_layer = h f_prediction = f_prediction.reuse_with(last_layer) train_logits = run_logits.reuse_with(f_prediction, name="train_logits") train_embed_prob = tx.Activation(train_logits, tx.softmax, name="train_output") if use_nce: # labels labels = loss_inputs.tensor # convert labels to random indices def labels_to_ri(x): random_index_tensor = ri_tensor.gather(x) sp_features = random_index_tensor.to_sparse_tensor() return sp_features model_prediction = f_prediction.tensor train_loss = tx.sparse_cnce_loss( label_features=labels, model_prediction=model_prediction, weights=feature_lookup.weights, noise_ratio=noise_level, num_samples=nce_samples, labels_to_sparse_features=labels_to_ri) else: one_hot = tx.dense_one_hot(column_indices=loss_inputs.tensor, num_cols=vocab_size) train_loss = tx.categorical_cross_entropy( one_hot, train_logits.tensor) train_loss = tf.reduce_mean(train_loss) if l2_loss: losses = [tf.nn.l2_loss(var) for var in var_reg] train_loss = train_loss + l2_loss_coef * tf.add_n(losses) # =============================================== # EVAL GRAPH # =============================================== with tf.name_scope("eval"): one_hot = tx.dense_one_hot(column_indices=eval_inputs.tensor, num_cols=vocab_size) eval_loss = tx.categorical_cross_entropy(one_hot, run_logits.tensor) eval_loss = tf.reduce_mean(eval_loss) # BUILD MODEL super().__init__(run_inputs=run_inputs, run_outputs=embed_prob, train_inputs=run_inputs, train_outputs=train_embed_prob, eval_inputs=run_inputs, eval_outputs=embed_prob, train_out_loss=train_loss, train_in_loss=loss_inputs, eval_out_score=eval_loss, eval_in_score=eval_inputs)
def __init__(self, inputs, labels, vocab_size, embed_dim, h_dim, embed_init=tx.zeros_init(), logit_init=tx.glorot_uniform(), num_h=1, h_activation=tx.tanh, h_init=tx.glorot_uniform(), w_dropconnect=None, u_dropconnect=None, r_dropout=0.4, y_dropout=0.4, embed_dropout=0.3, other_dropout=0.3, l2_loss=False, l2_weight=1e-5, use_f_predict=False, f_init=tx.random_uniform(minval=-0.01, maxval=0.01), embed_share=False, logit_bias=False, use_nce=False, nce_samples=10, skip_connections=False): if not isinstance(inputs, tx.Input): raise TypeError("inputs must be an Input layer") self.inputs = inputs self.labels = labels if not isinstance(labels, tx.Input): raise TypeError("labels must be an Input layer") if inputs.dtype != tf.int32 and inputs.dtype != tf.int64: raise TypeError( "Invalid dtype for input: expected int32 or int64, got {}". format(inputs.dtype)) if num_h < 0: raise ValueError("num hidden should be >= 0") # =============================================== # RUN GRAPH # =============================================== var_reg = [] with tf.name_scope("run"): # feature lookup embeddings = tx.Lookup(inputs, seq_size=None, lookup_shape=[vocab_size, embed_dim], weight_init=embed_init) var_reg.append(embeddings.weights) feature_lookup = embeddings.permute_batch_time() last_layer = feature_lookup cell_proto = tx.LSTMCell.proto( n_units=h_dim, activation=h_activation, gate_activation=tx.hard_sigmoid, w_init=h_init, u_init=h_init, w_dropconnect=w_dropconnect, u_dropconnect=u_dropconnect, r_dropout=r_dropout, x_dropout=None, y_dropout=y_dropout, regularized=False, name="cell", ) lstm_layers = [] for i in range(num_h): lstm_layer = tx.RNN(last_layer, cell_proto=cell_proto, regularized=False, stateful=True, name="LSTM_{}".format(i + 1)) lstm_layers.append(lstm_layer) var_reg += [wi.weights for wi in lstm_layer.cell.w] var_reg += [ui.weights for ui in lstm_layer.cell.u] last_layer = lstm_layer # last time step is the state used to make the prediction # last_layer = tx.Reshape(last_layer, [-1, h_dim]) # TODO this is not consistent with locked dropout for the last layer # where the same mask should be applied across time steps # to do this I need either y_dropout to be available or some sort of map # operation I can use with layers outputting 3D tensors # something equivalent to https://keras.io/layers/wrappers/ which applies # a layer to every temporal slice of an input. 
They implement this the same way # they implement an RNN # feature prediction for Energy-Based Model if use_f_predict: last_layer = tx.Linear(last_layer, embed_dim, f_init, add_bias=True, name="f_predict") # proto = tx.GRUCell.proto(n_units=embed_dim, # activation=h_activation, # gate_activation=tx.hard_sigmoid, # w_init=h_init, # u_init=h_init, # w_dropconnect=w_dropconnect, # u_dropconnect=u_dropconnect, # r_dropout=r_dropout, # x_dropout=None, # y_dropout=y_dropout, # regularized=False) # last_layer1 = tx.RNN(last_layer, cell_proto=proto, regularized=False, stateful=False) # last_layer2 = last_layer1.reuse_with(last_layer, reverse=True) # last_layer = tx.Add(last_layer1, last_layer2) # last_layer = tx.Module(last_layer, last_layer) var_reg += last_layer.variables # var_reg.append(last_layer.weights) f_predict = last_layer shared_weights = feature_lookup.weights if embed_share else None transpose_weights = embed_share logit_init = logit_init if not embed_share else None run_logits = tx.Linear(last_layer, n_units=vocab_size, weight_init=logit_init, shared_weights=shared_weights, transpose_weights=transpose_weights, add_bias=logit_bias, name="logits") if not embed_share: var_reg.append(run_logits.weights) run_output = tx.Activation(run_logits, tx.softmax, name="run_output") # =============================================== # TRAIN GRAPH # =============================================== with tf.name_scope("train"): embeddings = embeddings.reuse_with(inputs) feature_lookup = embeddings.permute_batch_time() if embed_dropout: feature_lookup = tx.Dropout(feature_lookup, probability=embed_dropout, name="drop_features") last_layer = feature_lookup for i in range(num_h): lstm_layer = lstm_layers[i].reuse_with(last_layer, regularized=True) last_layer = lstm_layer # last_layer = tx.Reshape(last_layer, [-1, h_dim]) # feature prediction for Energy-Based Model if use_f_predict: # last_layer = f_predict.reuse_with(last_layer) last_layer = f_predict.reuse_with(last_layer, regularized=True) last_layer = tx.Dropout(last_layer, probability=other_dropout, locked=False) train_logits = run_logits.reuse_with(last_layer, name="train_logits") train_output = tx.Activation(train_logits, tx.softmax, name="run_output") def categorical_loss(labels, logits): # labels come as a batch of classes [[1,2],[3,4]] -> [1,3,2,4] time steps are ordered to match logits labels = tx.Transpose(labels) labels = tx.Reshape(labels, [-1]) labels = tx.dense_one_hot(labels, num_cols=vocab_size) loss = tx.categorical_cross_entropy(labels=labels, logits=logits) return tf.reduce_mean(loss) def nce_loss(labels, weights, bias, predict): noise = uniform_sampler(labels, 1, nce_samples, True, vocab_size) loss = tf.nn.nce_loss(weights=weights, biases=bias, inputs=predict, labels=labels, num_sampled=nce_samples, num_classes=vocab_size, num_true=1, sampled_values=noise) return tf.reduce_mean(loss) if use_nce: bias = tx.VariableLayer(var_shape=[vocab_size], name="nce_bias") # wraps a layer to expose the weights as a layer but with the layer as its input nce_weights = tx.WrapLayer(embeddings, n_units=embeddings.n_units, wrap_fn=lambda x: x.weights, layer_fn=True) train_loss = tx.LambdaLayer(labels, nce_weights, bias, last_layer, apply_fn=nce_loss, name="nce_loss") else: train_loss = tx.LambdaLayer(labels, train_logits, apply_fn=categorical_loss, name="train_loss") if l2_loss: l2_losses = [tf.nn.l2_loss(var) for var in var_reg] train_loss = tx.LambdaLayer( train_loss, apply_fn=lambda x: x + l2_weight * tf.add_n(l2_losses), name="train_loss_l2") # 
=============================================== # EVAL GRAPH # =============================================== with tf.name_scope("eval"): eval_loss = tx.LambdaLayer(labels, run_logits, apply_fn=categorical_loss, name="eval_loss") self.stateful_layers = lstm_layers # BUILD MODEL super().__init__(run_outputs=run_output, run_inputs=inputs, train_inputs=[inputs, labels], train_outputs=train_output, train_loss=train_loss, eval_inputs=[inputs, labels], eval_outputs=run_output, eval_score=eval_loss)
def __init__( self, run_inputs, label_inputs, eval_label_input, ctx_size, k_dim, ri_tensor_input, embed_dim, h_dim, embed_init=tx.random_uniform(minval=-0.01, maxval=0.01), num_h=1, h_activation=tx.relu, h_init=tx.he_normal_init, use_dropout=False, embed_dropout=False, keep_prob=0.95, l2_loss=False, l2_loss_coef=1e-5, f_init=tx.random_uniform(minval=-0.01, maxval=0.01), use_nce=False, nce_samples=2, nce_noise_amount=0.1, noise_input=None, ): self.embed_dim = embed_dim var_reg = [] # =============================================== # RUN GRAPH # =============================================== with tf.name_scope("run"): feature_lookup = tx.Lookup(run_inputs, seq_size=ctx_size, lookup_shape=[k_dim, embed_dim], weight_init=embed_init, name="lookup") self.embeddings = feature_lookup var_reg.append(feature_lookup.weights) feature_lookup = feature_lookup.as_concat() # =========================================================== with tf.name_scope("cache_embeddings"): # ris = [sign_index.get_ri(sign_index.get_sign(i)) for i in range(len(sign_index))] # self.all_ris = ris_to_sp_tensor_value(ri_seq=ris, # dim=sign_index.generator.dim, # all_positive=not sign_index.generator.symmetric) all_embeddings = tx.Linear( ri_tensor_input, n_units=self.embed_dim, shared_weights=self.embeddings.weights, bias=False, name='all_features') # caches all embedding computation for run/eval self.all_embeddings = tx.VariableLayer(all_embeddings, trainable=False) # =========================================================== last_layer = feature_lookup h_layers = [] for i in range(num_h): hi = tx.FC(last_layer, n_units=h_dim, activation=h_activation, weight_init=h_init, name="h_{i}".format(i=i)) h_layers.append(hi) last_layer = hi var_reg.append(hi.linear.weights) self.h_layers = h_layers # feature prediction for Energy-Based Model f_prediction = tx.Linear(last_layer, embed_dim, f_init, bias=True, name="f_predict") var_reg.append(f_prediction.weights) # RI DECODING =============================================== # shape is (?,?) 
because batch size is unknown and vocab size is unknown # when we build the graph run_logits = tx.Linear(f_prediction, n_units=None, shared_weights=self.all_embeddings.variable, transpose_weights=True, bias=False, name="logits") # =========================================================== embed_prob = tx.Activation(run_logits, tx.softmax, name="run_output") # =============================================== # TRAIN GRAPH # =============================================== with tf.name_scope("train"): if use_dropout and embed_dropout: feature_lookup = feature_lookup.reuse_with(run_inputs) last_layer = tx.Dropout(feature_lookup, probability=keep_prob) else: last_layer = feature_lookup # add dropout between each layer for layer in h_layers: h = layer.reuse_with(last_layer) if use_dropout: h = tx.Dropout(h, probability=keep_prob) last_layer = h f_prediction = f_prediction.reuse_with(last_layer) train_logits = run_logits.reuse_with(f_prediction, name="train_logits") train_embed_prob = tx.Activation(train_logits, tx.softmax, name="train_output") # convert labels to random indices model_prediction = f_prediction.tensor if use_nce: train_loss = tx.sparse_cnce_loss( label_features=label_inputs.tensor, noise_features=noise_input.tensor, model_prediction=model_prediction, weights=feature_lookup.weights, num_samples=nce_samples, noise_ratio=nce_noise_amount) else: one_hot_dense = tx.dense_one_hot( column_indices=label_inputs[0].tensor, num_cols=label_inputs[1].tensor) train_loss = tx.categorical_cross_entropy( one_hot_dense, train_logits.tensor) train_loss = tf.reduce_mean(train_loss) if l2_loss: losses = [tf.nn.l2_loss(var) for var in var_reg] train_loss = train_loss + l2_loss_coef * tf.add_n(losses) # =============================================== # EVAL GRAPH # =============================================== with tf.name_scope("eval"): one_hot_dense = tx.dense_one_hot( column_indices=eval_label_input[0].tensor, num_cols=label_inputs[1].tensor) train_loss = tx.categorical_cross_entropy(one_hot_dense, train_logits.tensor) eval_loss = tx.categorical_cross_entropy(one_hot_dense, run_logits.tensor) eval_loss = tf.reduce_mean(eval_loss) if use_nce: train_loss_in = [label_inputs, noise_input] else: train_loss_in = label_inputs # BUILD MODEL super().__init__(run_inputs=run_inputs, run_outputs=embed_prob, train_inputs=run_inputs, train_outputs=train_embed_prob, eval_inputs=run_inputs, eval_outputs=embed_prob, train_out_loss=train_loss, train_in_loss=train_loss_in, eval_out_score=eval_loss, eval_in_score=eval_label_input, update_inputs=ri_tensor_input)
v_dim = 1000
m_dim = 2
n_hidden = 100
seq_size = 2

w = [[0, 1], [1, 5], [0, 1]]
v2 = tf.constant(np.random.uniform(-1., 1., [v_dim, m_dim]))

inputs = tx.Input(2, dtype=tf.int32)
lookup = tx.Lookup(inputs, 2, lookup_shape=[v_dim, m_dim])

# GATING MECHANISM
# I can call this a seq gate: takes the parameters and divides by seq_size
h = tx.Linear(lookup, 100, bias=True)
h = tx.Activation(h, tx.elu)
gate = tx.Linear(h, 2, bias=True)
gate = tx.Activation(gate, tx.sigmoid)

# lookup might output a sequence format with [batch, seq_size, m_dim]
# lookup_out = lookup.tensor
lookup_out = tf.reshape(lookup.tensor, [-1, seq_size, m_dim])

# reshape works anyway
gated_out = tf.reshape(lookup_out, [-1, seq_size, m_dim]) * tf.expand_dims(gate.tensor, -1)
# gated_out = tf.reshape(gated_out, [-1, seq_size * m_dim])
# gated_out = tf.reshape(gated_out, [-1, lookup.n_units])
gated_out = tf.reshape(gated_out, tf.shape(lookup.tensor))