def state_size(self):
    """Return the state size(s) this cell reports.

    When ``self._state_is_tuple`` is truthy an ``LSTMStateTuple`` is returned
    whose two entries are both five times ``self._num_units``; otherwise a
    single value, ``2 * self._num_units``, is returned.
    """
    # NOTE(review): the non-tuple size (2x) is not the sum of the two tuple
    # components (5x + 5x) -- confirm this asymmetry is intended.
    if not self._state_is_tuple:
        return 2 * self._num_units

    cell_part = self._num_units * 5
    hidden_part = 5 * self._num_units
    return LSTMStateTuple(cell_part, hidden_part)
def __init__(self, is_train, params):
    """Build the multi-scope stacked-LSTM regression graph.

    One ``MultiRNNCell`` branch is created per scope. Each branch reads its
    own input placeholder, pools its outputs over time with a learned
    attention, and (from the second scope on) is seeded with and blended
    with the pooled output of the previous scope. The pooled output of the
    last scope is projected linearly to ``output_size`` and trained with an
    RMSE loss.

    Args:
        is_train: when truthy, the gradient-clipped Adam training op is
            added to the graph.
        params: mapping that provides the keys ``'batch_size'``,
            ``'n_seqs'``, ``'lstm_size'``, ``'num_layers'``,
            ``'keep_prob'``, ``'learning_rate'``, ``'input_size'`` and
            ``'output_size'``. ``n_seqs[0]`` determines the number of
            scopes and ``n_seqs[1][i]`` the sequence length of scope ``i``.

    Note:
        The attention weights are normalised over the time axis. The
        previous code applied softmax over the trailing size-1 axis, which
        made every weight exactly 1.0.
    """
    self.is_train = is_train
    self.batch_size = params['batch_size']
    self.n_seqs = params['n_seqs']
    self.lstm_size = params['lstm_size']
    self.num_layers = params['num_layers']
    self.keep_prob = params['keep_prob']
    self.learning_rate = params['learning_rate']
    self.input_size = params['input_size']
    self.output_size = params['output_size']
    self.num_scopes = len(self.n_seqs[0])
    self.grad_clip = 2

    self.targets = tf.placeholder(tf.float32,
                                  shape=[None, self.output_size])

    self.inputs = []
    self.initial_states = []
    self.states = []
    self.in_outputs = []
    self.scope_scales = []

    for i in range(self.num_scopes):
        with tf.variable_scope("lstm_scope" + str(i)):
            self.inputs.append(
                tf.placeholder(
                    tf.float32,
                    shape=[None, self.n_seqs[1][i], self.input_size]))
            if i > 0:
                # These placeholders are created but the graph below mixes
                # scopes with fixed 0.8 / 0.2 weights instead of using them.
                self.scope_scales.append(
                    tf.placeholder(tf.float32, [1], "scale"))

            cell = tf.nn.rnn_cell.MultiRNNCell(
                [self.lstm_cell() for _ in range(self.num_layers)])
            print(cell.name)

            # First layer: zero state for the first scope; later scopes use
            # the previous scope's pooled output as the initial h.
            if i == 0:
                first_h = tf.zeros([self.batch_size, self.lstm_size],
                                   tf.float32)
            else:
                first_h = self.in_outputs[i - 1]
            initstate = [
                LSTMStateTuple(
                    tf.zeros([self.batch_size, self.lstm_size], tf.float32),
                    first_h)
            ]
            # Remaining layers always start from zeros.
            for _ in range(self.num_layers - 1):
                initstate.append(
                    LSTMStateTuple(
                        tf.zeros([self.batch_size, self.lstm_size],
                                 tf.float32),
                        tf.zeros([self.batch_size, self.lstm_size],
                                 tf.float32)))
            self.initial_states.append(tuple(initstate))

            cell_inputs = self.inputs[i]
            cell_outputs, state = tf.nn.dynamic_rnn(
                cell,
                cell_inputs,
                initial_state=self.initial_states[i],
                dtype=tf.float32)
            self.states.append(state)

            with tf.variable_scope('attention'):
                Wc = tf.get_variable('weighted_c',
                                     shape=(self.lstm_size, 1))
                Wh = tf.get_variable('weighted_h',
                                     shape=(self.lstm_size, 1))
                # One score per example from the top layer's final c state.
                wcc = tf.matmul(state[self.num_layers - 1].c, Wc)
                # `b` (not `i`) so the scope index is never shadowed.
                ms = [
                    tf.nn.tanh(wcc[b] +
                               tf.matmul(cell_outputs[b, :, :], Wh))
                    for b in range(self.batch_size)
                ]
                # Stacked scores are [batch, time, 1]; normalise over time
                # (axis 1). The axis is passed positionally because the
                # keyword is `dim` in early TF 1.x and `axis` later.
                ws = tf.nn.softmax(tf.stack(ms), 1)
                re_outs = tf.reduce_sum(tf.multiply(ws, cell_outputs),
                                        axis=1)
                if i > 0:
                    # Blend with the previous scope's pooled output.
                    re_outs = tf.add(0.8 * re_outs,
                                     0.2 * self.in_outputs[i - 1])
                self.in_outputs.append(re_outs)

    # Linear read-out on the pooled output of the last scope.
    w_o = tf.get_variable('weight', [self.lstm_size, self.output_size])
    b_o = tf.get_variable('bias', [self.output_size])
    print(w_o.name)
    self.outputs = tf.matmul(self.in_outputs[-1], w_o) + b_o

    # Root-mean-squared-error loss.
    self.loss = tf.sqrt(
        tf.reduce_mean(tf.square(self.targets - self.outputs)))

    if is_train:
        # Adam with global-norm gradient clipping.
        self.tvars = tf.trainable_variables()
        self.grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.loss, self.tvars), self.grad_clip)
        self.optimizer = tf.train.AdamOptimizer(
            self.learning_rate).apply_gradients(
                zip(self.grads, self.tvars))
def make_model(self):
    """Build the classification graph and register its tensors.

    Placeholders are stored in ``self.placeholders`` and result tensors in
    ``self.ops``. The graph embeds executions and tokens with one shared
    embedding matrix, encodes each with its own LSTM, then runs a
    ``tf.while_loop`` that, at every step, attends over the per-execution
    state encodings plus the statement encoding and feeds the attended
    vector to a third ("trace") LSTM cell. The trace state captured at each
    path's final step is max-pooled per program and classified.

    Reads ``self.params`` keys ``number_of_programs``, ``number_of_paths``,
    ``number_of_executions``, ``vocabulary_embedding_size``, ``hidden_size``,
    ``rnn_state_dropout_keep_prob`` and ``num_labels``, and
    ``self.vocabulary``.
    """
    # number of programs * number of paths * number of executions * number of states/number of statements, number of variables
    self.placeholders["executions"] = tf.placeholder(tf.int32, [None, None], name="executions")
    # number of programs * number of paths * number of executions * number of states/number of statements
    self.placeholders["variable_number_sequence"] = tf.placeholder(tf.int32, [None], name="variable_number_sequence")
    # number of programs * number of paths * number of statements/number of states, number of tokens
    self.placeholders["tokens"] = tf.placeholder(tf.int32, [None, None], name="tokens")
    # number of programs * number of paths * number of statements/number of states
    self.placeholders["tokens_number_sequence"] = tf.placeholder(tf.int32, [None], name="tokens_number_sequence")
    # number of programs * number of paths
    self.placeholders["state_statement_number_sequence"] = tf.placeholder(tf.int32, [None], name="state_statement_number_sequence")
    # NOTE(review): this placeholder is not referenced again in this method;
    # the DropoutWrappers below read self.params["rnn_state_dropout_keep_prob"]
    # instead -- confirm which one is intended.
    self.placeholders['rnn_state_dropout_keep_prob'] = tf.placeholder(tf.float32, None, name='rnn_state_dropout_keep_prob')
    self.placeholders['mlp_dropout_keep_prob'] = tf.placeholder(tf.float32, None, name='mlp_dropout_keep_prob')
    # Upper bound on the number of while-loop iterations (see while_condition).
    self.placeholders["max_state_statement"] = tf.placeholder(tf.int32, None, name="max_state")
    self.placeholders['label'] = tf.placeholder(tf.int32, [None], name='label')

    # The graph is built for a fixed, Python-level batch size.
    batch_size = self.params["number_of_programs"] * self.params["number_of_paths"]

    # One embedding table shared by executions and tokens; one extra row
    # beyond the vocabulary (presumably for padding/unknown -- TODO confirm).
    embedding_matrix = tf.get_variable('embedding_matrix', [len(self.vocabulary)+1, self.params["vocabulary_embedding_size"]])

    # number of programs * number of paths * number of executions * number of states/number of statements, number of variables , embedding_size
    embedded_executions = tf.nn.embedding_lookup(params=embedding_matrix, ids=self.placeholders["executions"])

    # number of programs * number of paths * number of states/number of statements, number of tokens , embedding_size
    embedded_tokens = tf.nn.embedding_lookup(params=embedding_matrix, ids=self.placeholders["tokens"])

    with tf.variable_scope("state_encoding"):
        state_encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(self.params["hidden_size"])
        state_encoder_cell = tf.nn.rnn_cell.DropoutWrapper(state_encoder_cell, state_keep_prob=self.params["rnn_state_dropout_keep_prob"])
        # Only the final state is kept; the per-step outputs are discarded.
        _, states_embedding = tf.nn.dynamic_rnn(state_encoder_cell, embedded_executions, sequence_length=self.placeholders["variable_number_sequence"], initial_state=state_encoder_cell.zero_state(tf.shape(embedded_executions)[0], tf.float32), dtype=tf.float32)

    # number of programs * number of paths * number of executions, number of states/number of statements, embedding_size
    # Element [1] of the final LSTM state (the h part) is regrouped into
    # equal chunks along axis 0.
    dynamic_state_embedding = tf.convert_to_tensor(tf.split(states_embedding[1], batch_size * self.params["number_of_executions"], axis=0))

    with tf.variable_scope("statement_encoding"):
        statement_encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(self.params["hidden_size"])
        statement_encoder_cell = tf.nn.rnn_cell.DropoutWrapper(statement_encoder_cell, state_keep_prob=self.params["rnn_state_dropout_keep_prob"])
        _, statements_embedding = tf.nn.dynamic_rnn(statement_encoder_cell, embedded_tokens, sequence_length=self.placeholders["tokens_number_sequence"], initial_state=statement_encoder_cell.zero_state(tf.shape(embedded_tokens)[0], tf.float32), dtype=tf.float32)

    # number of programs * number of paths, number of states/number of statements, embedding_size
    static_tokens_embedding = tf.convert_to_tensor(tf.split(statements_embedding[1], batch_size, axis=0))

    # The trace encoder is stepped manually inside the while loop below.
    trace_encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(self.params["hidden_size"])
    trace_encoder_cell = tf.nn.rnn_cell.DropoutWrapper(trace_encoder_cell, state_keep_prob=self.params["rnn_state_dropout_keep_prob"])
    trace_encoder_initial_states = trace_encoder_cell.zero_state(batch_size, tf.float32)

    # axis zero stands for two LSTM states: c and h
    trace_encoder_final_states = tf.zeros([2, batch_size, self.params["hidden_size"]])

    # Loop-carried values. The monitor_* tensors start with a zero-length
    # leading axis and grow by concatenation on every iteration.
    loop_counter_inital = tf.constant(0)
    monitor_rnn_states = tf.zeros([0, batch_size, self.params["hidden_size"]])
    monitor_output = tf.zeros([0, batch_size, self.params["hidden_size"]])
    monitor_mask = tf.zeros([0, batch_size], tf.float32)
    monitor_attention_probabilities = tf.zeros([0, self.params["number_of_executions"]+1, 1], tf.float32)

    def while_condition(loop_counter,state_statement_number_sequence, rnn_states, trace_encoder_final_states, monitor_rnn_states, monitor_mask, monitor_output, monitor_attention_probabilities):
        # Iterate until the fed maximum number of states/statements is reached.
        return loop_counter < self.placeholders["max_state_statement"]

    def while_body(loop_counter, state_statement_number_sequence, rnn_states, trace_encoder_final_states, monitor_rnn_states, monitor_mask, monitor_output, monitor_attention_probabilities):
        # One trace-encoder step: gather the encodings at the current
        # position, attend over them, advance the LSTM, and record the state
        # for paths whose length equals the (incremented) counter.
        loop_counter_current = loop_counter

        # number of programs * number of paths * number of executions, embedding_size
        current_states = tf.gather_nd(dynamic_state_embedding, tf.stack([tf.range(0, batch_size * self.params["number_of_executions"]), tf.zeros([batch_size * self.params["number_of_executions"]], tf.int32)+loop_counter_current], axis=1))
        # number of programs * number of paths, number of executions, embedding_size
        current_states = tf.convert_to_tensor(tf.split(current_states, batch_size, axis=0))

        # number of programs * number of paths, embedding_size
        current_tokens = tf.gather_nd(static_tokens_embedding, tf.stack([tf.range(0, batch_size), tf.zeros([batch_size], tf.int32)+loop_counter_current], axis=1))
        # number of programs * number of paths, 1, embedding_size
        current_tokens = tf.expand_dims(current_tokens, axis=1)

        # number of programs * number of paths, number of executions+1, embedding_size
        current_states_and_tokens = tf.concat([current_states,current_tokens], axis=1)

        # number of programs * number of paths, 2 * lstm hidden_size
        rnn_states_concat = tf.concat((rnn_states[0], rnn_states[1]), axis=1)
        # number of programs * number of paths, 1, 2 * lstm hidden_size
        rnn_states_concat = tf.expand_dims(rnn_states_concat, axis=1)
        # 1 * number of executions+1 * 1
        replicate_factor = tf.ones([1,self.params["number_of_executions"]+1,1], tf.float32)
        # number of programs * number of paths, number of executions+1, 2 * lstm hidden_size
        # Broadcasting against the ones tensor replicates the state per candidate.
        rnn_states_concat = rnn_states_concat * replicate_factor

        # number of programs * number of paths, number of executions+1, embedding_size + 2 * lstm hidden_size
        rnn_inputs_and_states = tf.concat([current_states_and_tokens,rnn_states_concat], axis=-1)
        # number of programs * number of paths * number of executions+1, embedding_size + 2 * lstm hidden_size
        rnn_inputs_and_states = tf.concat(tf.unstack(rnn_inputs_and_states, num=batch_size, axis=0), 0)

        # number of programs * number of paths * number of executions+1, 1
        # MLP is defined outside this block; it is built here and called below
        # to obtain the scores.
        attention_scores_fn = MLP(rnn_inputs_and_states, 0, 1, self.placeholders['mlp_dropout_keep_prob'])
        # number of programs * number of paths, number of executions+1, 1
        attention_scores = tf.convert_to_tensor(tf.split(attention_scores_fn(), batch_size, axis=0))
        # Normalise over the candidates axis (executions + the statement).
        attention_probabilities = tf.nn.softmax(attention_scores, dim=1)
        monitor_attention_probabilities = tf.concat([monitor_attention_probabilities, attention_probabilities], axis=0)

        # number of programs * number of paths, embedding_size
        inputs_after_attention = tf.reduce_sum(attention_probabilities * current_states_and_tokens, axis=1)

        _, rnn_states = trace_encoder_cell(inputs_after_attention, rnn_states)
        monitor_rnn_states = tf.concat([monitor_rnn_states, rnn_states], axis=0)
        monitor_output = tf.concat([monitor_output, tf.expand_dims(rnn_states[1], axis=0)], axis=0)

        loop_counter_current += 1

        # Build a 0/1 mask with one entry per path: 1 where the path's length
        # equals the incremented counter, i.e. this was its last step.
        mask = tf.zeros([0], tf.float32)
        it_state_length = tf.unstack(state_statement_number_sequence, batch_size, axis=0)
        for each_state_length in it_state_length:
            def f1(): return tf.zeros([1], tf.float32)
            def f2(): return tf.ones([1], tf.float32)
            result = tf.cond(tf.equal(each_state_length,loop_counter_current), f2, f1)
            mask = tf.concat([mask, result], axis=0)
        monitor_mask = tf.concat([monitor_mask, tf.expand_dims(mask,0)], axis=0)

        # Accumulate the state of paths finishing at this step; all other
        # paths contribute zero.
        mask = tf.expand_dims(mask, axis=1)
        trace_encoder_final_states = trace_encoder_final_states + mask * rnn_states

        return [loop_counter_current, state_statement_number_sequence, rnn_states, trace_encoder_final_states, monitor_rnn_states, monitor_mask, monitor_output, monitor_attention_probabilities]

    # The monitor_* loop variables grow along axis 0, so their shape
    # invariants leave that axis as None.
    _, _, _, self.ops['l_res'], self.ops['l_mono'], self.ops['l_mono_mask'], self.ops['l_mono_out'], self.ops['l_mono_attention'] = \
        tf.while_loop(while_condition, while_body, loop_vars=[loop_counter_inital, self.placeholders['state_statement_number_sequence'], trace_encoder_initial_states, trace_encoder_final_states, monitor_rnn_states, monitor_mask, monitor_output, monitor_attention_probabilities], shape_invariants=[loop_counter_inital.shape, self.placeholders["state_statement_number_sequence"].shape, LSTMStateTuple(tf.TensorShape([batch_size,self.params["hidden_size"]]), tf.TensorShape([batch_size,self.params["hidden_size"]])), trace_encoder_final_states.shape, tf.TensorShape([None, batch_size, self.params["hidden_size"]]), tf.TensorShape([None, batch_size]), tf.TensorShape([None, batch_size, self.params["hidden_size"]]), tf.TensorShape([None, self.params["number_of_executions"]+1, 1])])

    # Mean attention over all recorded rows, with the trailing unit axis removed.
    self.ops['attention'] = tf.squeeze(tf.reduce_mean(self.ops['l_mono_attention'], axis=0), axis=-1)

    # NOTE(review): per the comment on trace_encoder_final_states, axis 0 is
    # ordered (c, h), so index 0 looks like the c state despite the variable
    # name -- confirm which state is meant to be used.
    h_states = self.ops['l_res'][0]
    # c_states = self.ops['l_res'][1]
    # hc_conca_states = tf.concat([h_states,c_states], axis=1)
    state_rep = h_states

    # Group the rows per program and max-pool over the paths of each program.
    self.ops['final_embeddings'] = tf.reduce_max(tf.stack(tf.split(state_rep, self.params["number_of_programs"], axis=0), axis=0), axis=1)

    W_pred = tf.get_variable("weights_for_prediction", [self.params["hidden_size"], self.params["num_labels"]], tf.float32)
    b_pred = tf.get_variable("bias_for_prediction", [self.params["num_labels"]], tf.float32)

    logits = tf.matmul(self.ops['final_embeddings'], W_pred) + b_pred
    predictions = tf.argmax(logits, 1)
    comparisons = tf.cast(tf.equal(tf.cast(predictions,tf.int32), self.placeholders['label']),tf.float32)
    accuracy = tf.reduce_mean(comparisons)
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.placeholders['label'], logits=logits)

    self.ops["predictions"] = predictions
    self.ops["comparisons"] = comparisons
    self.ops["accuracy"] = accuracy
    # The loss is summed (not averaged) over the programs in the batch.
    self.ops["loss"] = tf.reduce_sum(loss)
def Model(_abnormal_data, _abnormal_label, _hidden_num, _elem_num,
          _file_name, _partition):
    """Build the ensemble recurrent auto-encoder graph.

    Each of the ``ensemble_space`` members encodes the input sequence with
    its own cell (member 0: plain cell, member 1: residual cell, others:
    the sparse/skip cell). The final encoder states are concatenated (and
    optionally compressed by a shared dense layer) into one state that
    initialises every decoder.

    Besides its arguments the function reads these module-level names:
    ``batch_num``, ``ensemble_space``, ``cell_type``, ``compress``,
    ``decode_without_input``, ``reverse`` and ``learning_rate``.

    Args:
        _abnormal_data: array-like whose ``shape[1]`` and ``shape[2]`` give
            the number of time steps and the feature size of the input
            placeholder.
        _abnormal_label: not used by this function.
        _hidden_num: number of units of each encoder cell.
        _elem_num: output size of the decoder projection.
        _file_name: forwarded to ``RSLSTMCell`` as ``file_name``.
        _partition: forwarded to ``RSLSTMCell`` as ``partition``.

    Returns:
        Tuple ``(g, p_input, d_dec, loss, optimizer, saver)``: the graph,
        the input placeholder, the dict of per-member decoder outputs keyed
        ``'dec_output_<k>'``, the scalar loss, the Adam training op and a
        ``tf.train.Saver``.
    """
    tf.reset_default_graph()
    g = tf.Graph()
    with g.as_default():
        # Input placeholder and its per-time-step slices.
        p_input = tf.placeholder(tf.float32,
                                 shape=(batch_num, _abnormal_data.shape[1],
                                        _abnormal_data.shape[2]))
        p_inputs = [
            tf.squeeze(t, [1])
            for t in tf.split(p_input, _abnormal_data.shape[1], 1)
        ]

        d_enc = {}
        with tf.variable_scope('encoder'):
            for j in range(ensemble_space):
                if cell_type == 0:
                    enc_cell = tf.nn.rnn_cell.BasicRNNCell(_hidden_num)
                if cell_type == 1:
                    pure_enc_cell = LSTMCell(_hidden_num)
                    residual_enc_cell = RLSTMCell(_hidden_num,
                                                  reuse=tf.AUTO_REUSE)
                    enc_cell = RSLSTMCell(_hidden_num,
                                          file_name=_file_name,
                                          component=j,
                                          partition=_partition,
                                          type='enc',
                                          reuse=tf.AUTO_REUSE)
                if cell_type == 2:
                    pure_enc_cell = GRUCell(_hidden_num)
                    enc_cell = RSGRUCell(_hidden_num)

                # Member 0 uses the plain cell, member 1 the residual cell,
                # every other member the sparse/skip cell.
                if j == 0:
                    cell = pure_enc_cell
                elif j == 1:
                    cell = residual_enc_cell
                else:
                    cell = enc_cell

                enc_state = cell.zero_state(batch_size=batch_num,
                                            dtype=tf.float32)
                enc_outputs = []
                for enc_input in p_inputs:
                    enc_output_, enc_state = cell(enc_input, enc_state)
                    enc_outputs.append(enc_output_)
                d_enc['enc_output_{0}'.format(j)] = enc_outputs
                d_enc['enc_state_{0}'.format(j)] = enc_state

        # Concatenate the members' final states along the feature axis.
        shared_state_c = tf.concat([
            d_enc['enc_state_{0}'.format(j)].c for j in range(ensemble_space)
        ], axis=1)
        shared_state_h = tf.concat([
            d_enc['enc_state_{0}'.format(j)].h for j in range(ensemble_space)
        ], axis=1)
        if compress:
            # One dense layer (shared weights) squeezes both c and h back
            # to _hidden_num features.
            compress_state = tf.layers.Dense(units=_hidden_num,
                                             activation=tf.tanh,
                                             use_bias=True)
            shared_state_c = compress_state(shared_state_c)
            shared_state_h = compress_state(shared_state_h)
        shared_state = LSTMStateTuple(shared_state_c, shared_state_h)

        d_dec = {}
        with tf.variable_scope('decoder') as vs:
            dec_weight_ = tf.Variable(tf.truncated_normal(
                [_hidden_num * ensemble_space, _elem_num], dtype=tf.float32),
                                      name="dec_weight")
            dec_bias_ = tf.Variable(tf.constant(0.1,
                                                shape=[_elem_num],
                                                dtype=tf.float32),
                                    name="dec_bias")

            if decode_without_input:
                # Tile the projection once, into its own name. Re-tiling
                # the already batched tensor on every member (as before)
                # added a rank per iteration and made tf.tile fail from the
                # second member on.
                batched_dec_weight = tf.tile(tf.expand_dims(dec_weight_, 0),
                                             [batch_num, 1, 1])
                for k in range(ensemble_space):
                    if cell_type == 0:
                        dec_cell = tf.nn.rnn_cell.BasicRNNCell(_hidden_num)
                    if cell_type == 1:
                        if compress:
                            pure_dec_cell = LSTMCell(_hidden_num)
                            residual_dec_cell = RLSTMCell(_hidden_num)
                            dec_cell = RSLSTMCell(_hidden_num,
                                                  file_name=_file_name,
                                                  component=k,
                                                  partition=_partition,
                                                  type='dec',
                                                  reuse=tf.AUTO_REUSE)
                        else:
                            pure_dec_cell = LSTMCell(_hidden_num *
                                                     ensemble_space)
                            residual_dec_cell = RLSTMCell(_hidden_num *
                                                          ensemble_space)
                            dec_cell = RSLSTMCell(_hidden_num *
                                                  ensemble_space,
                                                  file_name=_file_name,
                                                  component=k,
                                                  partition=_partition,
                                                  type='dec',
                                                  reuse=tf.AUTO_REUSE)
                    if cell_type == 2:
                        if compress:
                            pure_dec_cell = GRUCell(_hidden_num)
                            dec_cell = RSGRUCell(_hidden_num)
                        else:
                            pure_dec_cell = GRUCell(_hidden_num *
                                                    ensemble_space)
                            dec_cell = RSGRUCell(_hidden_num *
                                                 ensemble_space)

                    if k == 0:
                        cell = pure_dec_cell
                    elif k == 1:
                        cell = residual_dec_cell
                    else:
                        cell = dec_cell

                    # The decoder is fed zeros at every step.
                    dec_inputs = [
                        tf.zeros(tf.shape(p_inputs[0]), dtype=tf.float32)
                        for _ in range(len(p_inputs))
                    ]
                    dec_outputs, dec_state = tf.contrib.rnn.static_rnn(
                        cell,
                        dec_inputs,
                        initial_state=shared_state,
                        dtype=tf.float32)

                    if reverse:
                        dec_outputs = dec_outputs[::-1]
                    dec_output_ = tf.transpose(tf.stack(dec_outputs),
                                               [1, 0, 2])
                    key = 'dec_output_{0}'.format(k)
                    d_dec[key] = tf.matmul(dec_output_,
                                           batched_dec_weight) + dec_bias_
                    if reverse:
                        # NOTE(review): this slices the leading axis of the
                        # projected tensor after the step list was already
                        # reversed above -- confirm the double reversal.
                        d_dec[key] = d_dec[key][::-1]
            else:
                for k in range(ensemble_space):
                    if cell_type == 0:
                        dec_cell = tf.nn.rnn_cell.BasicRNNCell(_hidden_num)
                    if cell_type == 1:
                        if compress:
                            pure_dec_cell = LSTMCell(_hidden_num)
                            residual_dec_cell = RLSTMCell(
                                _hidden_num, reuse=tf.AUTO_REUSE)
                            dec_cell = RSLSTMCell(_hidden_num,
                                                  file_name=_file_name,
                                                  component=k,
                                                  partition=_partition,
                                                  type='dec',
                                                  reuse=tf.AUTO_REUSE)
                        else:
                            pure_dec_cell = LSTMCell(_hidden_num *
                                                     ensemble_space)
                            residual_dec_cell = RLSTMCell(
                                _hidden_num * ensemble_space,
                                reuse=tf.AUTO_REUSE)
                            dec_cell = RSLSTMCell(_hidden_num *
                                                  ensemble_space,
                                                  file_name=_file_name,
                                                  component=k,
                                                  partition=_partition,
                                                  type='dec',
                                                  reuse=tf.AUTO_REUSE)
                    if cell_type == 2:
                        if compress:
                            pure_dec_cell = GRUCell(_hidden_num)
                            dec_cell = RSGRUCell(_hidden_num)
                        else:
                            pure_dec_cell = GRUCell(_hidden_num *
                                                    ensemble_space)
                            dec_cell = RSGRUCell(_hidden_num *
                                                 ensemble_space)

                    if k == 0:
                        cell = pure_dec_cell
                    elif k == 1:
                        cell = residual_dec_cell
                    else:
                        cell = dec_cell

                    # Each projected output is fed back as the next input.
                    dec_state = shared_state
                    dec_input_ = tf.zeros(tf.shape(p_inputs[0]),
                                          dtype=tf.float32)
                    dec_outputs = []
                    for step in range(len(p_inputs)):
                        if step > 0:
                            vs.reuse_variables()
                        dec_input_, dec_state = cell(dec_input_, dec_state)
                        dec_input_ = tf.matmul(dec_input_,
                                               dec_weight_) + dec_bias_
                        dec_outputs.append(dec_input_)

                    key = 'dec_output_{0}'.format(k)
                    d_dec[key] = dec_outputs
                    if reverse:
                        d_dec[key] = d_dec[key][::-1]

        # Loss: mean squared value of the summed member differences.
        sum_of_difference = 0
        for i in range(ensemble_space):
            sum_of_difference += d_dec['dec_output_{0}'.format(i)][0] - p_input
        loss = tf.reduce_mean(tf.square(sum_of_difference))
        optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(loss)

        # Add ops to save and restore all the variables.
        saver = tf.train.Saver()
    return g, p_input, d_dec, loss, optimizer, saver
def call(self, inputs, state):
    """Run one step of LSTM.

    The cell update and the output are selected by ``self._gate_mod``
    (``"lstm"``, ``"rkm_lstm"``, ``"rkm_cifg"``, ``"gated_linear"``,
    ``"linear"``, ``"gcnn"`` or ``"cnn"``). When ``self._use_peepholes`` is
    set, the peephole formulas are used instead and ``_gate_mod`` is not
    consulted for them.

    Args:
      inputs: input Tensor, must be 2-D, `[batch, input_size]`.
      state: if `state_is_tuple` is False, this must be a state Tensor,
        `2-D, [batch, state_size]`.  If `state_is_tuple` is True, this must
        be a tuple of state Tensors, both `2-D`, with column sizes `c_state`
        and `m_state`.

    Returns:
      A tuple containing:

      - A `2-D, [batch, output_dim]`, Tensor representing the output of the
        LSTM after reading `inputs` when previous state was `state`.
        Here output_dim is:
           num_proj if num_proj was set,
           num_units otherwise.
      - Tensor(s) representing the new state of LSTM after reading `inputs`
        when the previous state was `state`.  Same type and shape(s) as
        `state`.

    Raises:
      ValueError: If input size cannot be inferred from inputs via
        static shape inference.
      NotImplementedError: If `gate_mod` is not one of the supported modes
        (and peepholes are not used).
    """
    num_proj = self._num_units if self._num_proj is None else self._num_proj
    sigmoid = math_ops.sigmoid

    if self._state_is_tuple:
        (c_prev, m_prev) = state
    else:
        c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
        m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    input_size = inputs.get_shape().with_rank(2)[1]
    if input_size.value is None:
        raise ValueError(
            "Could not infer input size from inputs.get_shape()[-1]")

    # No feedback, if desired; also, gcnn/cnn do not have feedback
    if self._no_feedback or self._gate_mod in ["gcnn", "cnn"]:
        # zeros_like copes with an unknown static batch dimension and keeps
        # m_prev's dtype; tf.zeros(m_prev.shape) needs a fully defined shape.
        m_prev = array_ops.zeros_like(m_prev)

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    if self._ngram:
        # Inputs are added to the recurrent projection directly.
        lstm_matrix = inputs + math_ops.matmul(m_prev, self._kernel)
    else:
        lstm_matrix = math_ops.matmul(
            array_ops.concat([inputs, m_prev], 1), self._kernel)
    lstm_matrix = nn_ops.bias_add(lstm_matrix, self._bias)

    i, j, f, o = array_ops.split(value=lstm_matrix,
                                 num_or_size_splits=4,
                                 axis=1)

    # Cell update. Diagonal (peephole) connections take precedence.
    if self._use_peepholes:
        c = (sigmoid(f + self._forget_bias + self._w_f_diag * c_prev) *
             c_prev +
             sigmoid(i + self._w_i_diag * c_prev) * self._activation(j))
    elif self._gate_mod == "lstm":
        c = (sigmoid(f + self._forget_bias) * c_prev +
             sigmoid(i) * self._activation(j))
    elif self._gate_mod == "rkm_lstm":
        # No activation on the new input.
        c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) * j)
    elif self._gate_mod == "rkm_cifg":
        # Coupled gates: the input gate is (1 - forget gate).
        c = (sigmoid(f + self._forget_bias) * c_prev +
             (1 - sigmoid(f + self._forget_bias)) * j)
    elif self._gate_mod in ["gated_linear", "linear"]:
        # Fixed scalar weights replace the forget/input gates.
        c = (self._sigma2_f * c_prev + self._sigma2_i * j)
    elif self._gate_mod in ["gcnn", "cnn"]:
        # No dependence on the previous cell state.
        sigma2_i = 1
        c = sigma2_i * j
    else:
        raise NotImplementedError("Invalid gate_mod: {0}".format(
            self._gate_mod))

    if self._layer_norm:
        c = tf.contrib.layers.layer_norm(c)

    if self._cell_clip is not None:
        # pylint: disable=invalid-unary-operand-type
        c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
        # pylint: enable=invalid-unary-operand-type

    # Output.
    if self._use_peepholes:
        m = sigmoid(o + self._w_o_diag * c) * self._activation(c)
    elif self._gate_mod == "lstm":
        m = sigmoid(o) * self._activation(c)
    elif self._gate_mod in ["rkm_lstm", "rkm_cifg", "gated_linear", "gcnn"]:
        # Gated output without an activation on the cell state.
        m = sigmoid(o) * c
    elif self._gate_mod in ["linear", "cnn"]:
        # Ungated output.
        m = self._activation(c)
    else:
        raise NotImplementedError("Invalid gate_mod: {0}".format(
            self._gate_mod))

    if self._num_proj is not None:
        m = math_ops.matmul(m, self._proj_kernel)

        if self._proj_clip is not None:
            # pylint: disable=invalid-unary-operand-type
            m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
            # pylint: enable=invalid-unary-operand-type

    new_state = (LSTMStateTuple(c, m)
                 if self._state_is_tuple else array_ops.concat([c, m], 1))
    return m, new_state
def state_size(self):
    """Return the state size(s) this cell reports.

    When ``self._state_is_tuple`` is truthy an ``LSTMStateTuple`` is returned
    whose two entries both equal ``self.num_units``; otherwise the single
    value ``self.num_units`` is returned.
    """
    # NOTE(review): the non-tuple size equals one component rather than the
    # sum of both tuple components -- confirm this is intended.
    if not self._state_is_tuple:
        return 1 * self.num_units

    cell_part = self.num_units * 1
    hidden_part = 1 * self.num_units
    return LSTMStateTuple(cell_part, hidden_part)
def build(self): tf.reset_default_graph() config = self.build_config embeddings = [] rnn_out_drop = tf.get_variable(name='rnn_out_drop', trainable=False, initializer=config.rnn_out_drop) endings_inp_drop = tf.get_variable(name='endings_inp_drop', trainable=False, initializer=config.endings_inp_drop) gram_inp_drop = tf.get_variable(name='gram_inp_drop', trainable=False, initializer=config.gram_inp_drop) rnn_state_drop = tf.get_variable(name='rnn_state_drop', trainable=False, initializer=config.rnn_state_drop) dense_drop = tf.get_variable(name='dense_drop', trainable=False, initializer=config.dense_drop) self.training = tf.get_variable(name='is_training', trainable=False, dtype=tf.bool, initializer=True) self.lr = tf.get_variable(name='lr', initializer=self.train_config.lr, trainable=False) tf.summary.scalar('lr__', self.lr) self.weights = tf.placeholder(dtype=tf.float32, shape=[None, None], name='weights') weights = tf.reshape(self.weights, shape=[-1]) total = tf.reduce_sum(weights) self.total = total self.global_step = tf.train.get_or_create_global_step() if config.use_endings: with tf.variable_scope('word_endings'): voc_size = self.endings_vectorizer.get_size() self.endings_input = tf.placeholder(dtype=tf.int32, shape=[None, None], name='endings_input') self.endings_embedding = tf.get_variable( 'endings_embs', shape=[voc_size, config.endings_emb_size], initializer=tf.initializers.random_normal) endings_input = tf.nn.embedding_lookup(self.endings_embedding, self.endings_input) endings_input = tf.nn.dropout(endings_input, keep_prob=1. - endings_inp_drop) embeddings.append(endings_input) if config.use_gram: with tf.variable_scope('grammems'): gram_vec_size = self.grammeme_vectorizer_input.grammemes_count( ) self.grammems_input = tf.placeholder( dtype=tf.float32, shape=[None, None, gram_vec_size], name='grammems_input') grammems_input = tf.nn.dropout(self.grammems_input, keep_prob=1. 
- gram_inp_drop) grammems_input = self.dense_layer( in_size=gram_vec_size, out_size=config.gram_hidden_size, name='gram_embs', inputs=grammems_input, activation='relu') embeddings.append(grammems_input) if len(embeddings) > 1: embeddings = tf.concat(embeddings, axis=-1, name='concatenated_inputs') else: embeddings = embeddings[0] batch_size = tf.shape(embeddings, name='batch_size')[0] with tf.variable_scope('lstm_input'): lstm_input = tf.get_variable( name='lstm_input', shape=[ embeddings.get_shape().as_list()[-1], config.rnn_hidden_size ]) lstm_input_bias = tf.get_variable(name='lstm_input_bias', shape=[config.rnn_hidden_size]) lstm_input = tf.tensordot(embeddings, lstm_input, axes=( (-1), (0))) + lstm_input_bias lstm_input = tf.nn.relu(lstm_input) with tf.variable_scope('lstm'): if config.learn_init_state: initial_state_forward = tf.get_variable( name='f_initial_state_1', shape=[config.rnn_hidden_size * 2]) initial_state_backward = tf.get_variable( 'b_initial_state_1', shape=[config.rnn_hidden_size * 2]) f_init_state_c = tf.expand_dims( initial_state_forward[:config.rnn_hidden_size], axis=0) f_init_state_m = tf.expand_dims( initial_state_forward[config.rnn_hidden_size:], axis=0) b_init_state_c = tf.expand_dims( initial_state_backward[:config.rnn_hidden_size], axis=0) b_init_state_m = tf.expand_dims( initial_state_backward[config.rnn_hidden_size:], axis=0) f_init_state_c = tf.tile(f_init_state_c, multiples=[batch_size, 1]) f_init_state_m = tf.tile(f_init_state_m, multiples=[batch_size, 1]) b_init_state_c = tf.tile(b_init_state_c, multiples=[batch_size, 1]) b_init_state_m = tf.tile(b_init_state_m, multiples=[batch_size, 1]) f_init_state = LSTMStateTuple(f_init_state_c, f_init_state_m) b_init_state = LSTMStateTuple(b_init_state_c, b_init_state_m) else: f_init_state = None b_init_state = None f_lstm_cell = tf.nn.rnn_cell.LSTMCell(config.rnn_hidden_size, name='f_lstm_cell_1') b_lstm_cell = tf.nn.rnn_cell.LSTMCell(config.rnn_hidden_size, name='b_lstm_cell_1') 
f_lstm_cell = tf.nn.rnn_cell.DropoutWrapper( f_lstm_cell, state_keep_prob=1. - rnn_state_drop, output_keep_prob=1. - rnn_out_drop, seed=config.seed) b_lstm_cell = tf.nn.rnn_cell.DropoutWrapper( b_lstm_cell, state_keep_prob=1. - rnn_state_drop, output_keep_prob=1. - rnn_out_drop, seed=config.seed) (f_outputs, b_outputs), _ = bidirectional_dynamic_rnn( f_lstm_cell, b_lstm_cell, lstm_input, dtype=tf.float32, initial_state_fw=f_init_state, initial_state_bw=b_init_state) def merge_mode(forward, backward): if config.merge_mode == 'ave': outputs = tf.reduce_mean(tf.stack([forward, backward], axis=0), axis=0) elif config.merge_mode == 'concat': outputs = tf.concat([forward, backward], axis=-1, name='rnn_layer_outputs') elif config.merge_mode == 'sum': outputs = tf.reduce_sum(tf.stack([forward, backward], axis=0), axis=0) else: raise ValueError() return outputs outputs = merge_mode(f_outputs, b_outputs) # self.first_layer_outputs = outputs # [bs, seq_len, rnn_hidden_size (2 * rnn_hidden_size)] def make_cell(size, name): f_cell = tf.nn.rnn_cell.LSTMCell(size, name='f_' + name) b_cell = tf.nn.rnn_cell.LSTMCell(size, name='b_' + name) f_cell = tf.nn.rnn_cell.DropoutWrapper( f_cell, output_keep_prob=rnn_out_drop, state_keep_prob=1. - rnn_state_drop) b_cell = tf.nn.rnn_cell.DropoutWrapper( b_cell, output_keep_prob=rnn_out_drop, state_keep_prob=1. 
- rnn_state_drop) return (f_cell, b_cell) extra_rnn_layers = config.n_rnn_layers - 1 if extra_rnn_layers > 0: if config.learn_init_state: initial_state_forward = tf.get_variable( 'f_initial_state_2', shape=[config.rnn_hidden_size * 2]) initial_state_backward = tf.get_variable( 'b_initial_state_2', shape=[config.rnn_hidden_size * 2]) f_init_state_c = tf.expand_dims( initial_state_forward[:config.rnn_hidden_size], axis=0) f_init_state_m = tf.expand_dims( initial_state_forward[config.rnn_hidden_size:], axis=0) b_init_state_c = tf.expand_dims( initial_state_backward[:config.rnn_hidden_size], axis=0) b_init_state_m = tf.expand_dims( initial_state_backward[config.rnn_hidden_size:], axis=0) f_init_state_c = tf.tile(f_init_state_c, multiples=[batch_size, 1]) f_init_state_m = tf.tile(f_init_state_m, multiples=[batch_size, 1]) b_init_state_c = tf.tile(b_init_state_c, multiples=[batch_size, 1]) b_init_state_m = tf.tile(b_init_state_m, multiples=[batch_size, 1]) f_init_state = LSTMStateTuple(f_init_state_c, f_init_state_m) b_init_state = LSTMStateTuple(b_init_state_c, b_init_state_m) f_init_state = tuple([f_init_state] * extra_rnn_layers) b_init_state = tuple([b_init_state] * extra_rnn_layers) else: f_init_state = None b_init_state = None cells = [ make_cell(config.rnn_hidden_size, name=f'lstm_cell_{i + 2}') for i in range(extra_rnn_layers) ] f_cells = [x for (x, y) in cells] b_cells = [y for (x, y) in cells] f_cell = tf.nn.rnn_cell.MultiRNNCell(f_cells, state_is_tuple=True) b_cell = tf.nn.rnn_cell.MultiRNNCell(b_cells, state_is_tuple=True) (f_rnn_outputs, b_rnn_outputs), _ = bidirectional_dynamic_rnn( f_cell, b_cell, outputs, dtype=tf.float32, initial_state_fw=f_init_state, initial_state_bw=b_init_state) outputs = merge_mode( f_rnn_outputs, b_rnn_outputs) # [bs, seq_len, rnn_size (2 * rnn_size)] with tf.variable_scope('after_lstm'): rnn_output_size = config.rnn_hidden_size if config.merge_mode != 'concat' else ( config.rnn_hidden_size * 2) outputs = 
self.dense_layer(rnn_output_size, config.dense_size, 'dense_post_rnn', outputs) outputs = tf.nn.dropout(outputs, keep_prob=1. - dense_drop) outputs = tf.contrib.layers.batch_norm(inputs=outputs, updates_collections=None) outputs = tf.nn.relu(outputs) if config.use_pos_lm: with tf.variable_scope('next_pos'): self.next_pos_target = tf.placeholder(dtype=tf.int32, shape=[None, None]) next_pos = self.dense_layer(config.rnn_hidden_size, config.dense_size, 'dense_next_pos', f_outputs, 'relu') next_pos = self.dense_layer( config.dense_size, self.grammeme_vectorizer_output.pos_count() + 1, 'next_pos', next_pos, 'softmax') next_pos_loss = sequence_loss(logits=next_pos, targets=self.next_pos_target, weights=self.weights, average_across_timesteps=False, average_across_batch=False, name='next_pos_loss') next_pos_loss = tf.reshape(next_pos_loss, shape=[-1]) next_pos_loss *= weights self.next_pos_loss = tf.reduce_sum(next_pos_loss) self.next_pos_loss_avg = self.next_pos_loss / total tf.summary.scalar('next_pos_loss__', self.next_pos_loss_avg) with tf.variable_scope('pred_pos'): self.pred_pos_target = tf.placeholder(dtype=tf.int32, shape=[None, None]) pred_pos = self.dense_layer(config.rnn_hidden_size, config.dense_size, 'dense_pred_pos', b_outputs, 'relu') pred_pos = self.dense_layer( config.dense_size, self.grammeme_vectorizer_output.pos_count() + 1, 'pred_pos', pred_pos, 'softmax') pred_pos_loss = sequence_loss(logits=pred_pos, targets=self.pred_pos_target, weights=self.weights, average_across_timesteps=False, average_across_batch=False, name='pred_pos_loss') pred_pos_loss = tf.reshape(pred_pos_loss, shape=[-1]) pred_pos_loss *= weights self.pred_pos_loss = tf.reduce_sum(pred_pos_loss) self.pred_pos_loss_avg = self.pred_pos_loss / total tf.summary.scalar('pred_pos_loss__', self.pred_pos_loss_avg) with tf.variable_scope('main_pred'): self.target = tf.placeholder(dtype=tf.int32, shape=[None, None]) outputs = self.dense_layer( config.dense_size, 
self.grammeme_vectorizer_output.get_size() + 1, 'main_pred', outputs, 'softmax') main_loss = sequence_loss(logits=outputs, targets=self.target, weights=self.weights, average_across_timesteps=False, average_across_batch=False, name='main_loss') main_loss = tf.reshape(main_loss, shape=[-1]) main_loss *= weights self.main_loss = tf.reduce_sum(main_loss) self.main_loss_avg = self.main_loss / total tf.summary.scalar('main_loss__', self.main_loss_avg) targets = tf.reshape(self.target, shape=[-1]) predictions = tf.cast(tf.reshape(tf.argmax(outputs, axis=-1), shape=[-1]), dtype=tf.int32) correct = tf.cast(tf.equal(predictions, targets), dtype=tf.float32) self.correct = tf.reduce_sum(correct * weights) self.accuracy = divide(self.correct, total + 1e-12, name='accuracy') tf.summary.scalar('accuracy__', self.accuracy) self.summaries = tf.summary.merge_all() update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) if self.is_training: with tf.control_dependencies(update_ops): def get_optimizer(build_config, lr): if build_config.optimizer.lower() == 'adam': optimizer = tf.train.AdamOptimizer(lr) elif build_config.optimizer.lower() == 'sgd': optimizer = tf.train.GradientDescentOptimizer(lr) elif build_config.optimizer.lower() == 'rmsprop': optimizer = tf.train.RMSPropOptimizer(lr) elif build_config.optimizer.lower() == 'adagrad': optimizer = tf.train.AdagradOptimizer(lr) else: raise ValueError() return optimizer optimizer = get_optimizer(self.build_config, self.lr) trainable_variables = tf.trainable_variables() loss = tf.constant(0.0, dtype=tf.float32) if config.use_pos_lm: loss += self.pred_pos_loss_avg + self.next_pos_loss_avg loss += self.main_loss_avg if config.use_wd: self.wd = tf.get_variable(name='weight_decay', initializer=config.wd, trainable=False) l2_loss = tf.constant(0.0, dtype=tf.float32) for var in tf.trainable_variables(): l2_loss += tf.nn.l2_loss(var) loss = loss + l2_loss * self.wd grads = tf.gradients(loss, trainable_variables) if self.build_config.clip_norm is 
not None: self.clip_norm = tf.get_variable( name='clip_norm', initializer=self.build_config.clip_norm, trainable=False) grads = [ tf.clip_by_norm(grad, self.clip_norm) for grad in grads ] self.train_op = optimizer.apply_gradients( zip(grads, trainable_variables), global_step=self.global_step, name='train_op') self.variables_to_save = { 'lr': self.lr, 'global_step': self.global_step } self.variables_to_save.update( dict([(x.op.name, x) for x in tf.trainable_variables()])) self.saver = tf.train.Saver(self.variables_to_save)
def __call__(self, input, state, scope=None):
    """Run one step of the conditional variational recurrent cell.

    Args:
        input: 2-D tensor that is split along axis 1 into five pieces of
            widths ``n_x``, ``n_y``, ``n_z``, ``n_z`` and ``1``: the
            observation ``x``, the condition ``y``, an additive prior mean,
            an additive prior sigma, and a train flag column.
        state: 4-tuple ``(c, m, last_prior_mu, last_prior_sigma)``.
        scope: optional variable-scope name; defaults to the class name.

    Returns:
        ``(cell_output, new_state)`` where ``cell_output`` is the axis-1
        concatenation of ``enc_mu, enc_sigma, dec_mu, dec_sigma, dec_x,
        prior_mu, prior_sigma, z_encoder`` and ``new_state`` is a
        ``TPPCVRNNStateTuple(c, h, mu_p, sigma_p)``.
    """
    with tf.variable_scope(scope or type(self).__name__):
        c, m, last_prior_mu, last_prior_sigma, = state
        # TODO: why shall we apply c instead of m
        x, y, last_input_prior_mu, last_input_prior_sigma, train_flag_ph = \
            tf.split(value=input, num_or_size_splits=[self.n_x, self.n_y, self.n_z, self.n_z, 1], axis=1)
        # The last input column selects encoder (true) vs. prior (false)
        # samples further below.
        # NOTE(review): squeeze() without an axis also drops a batch
        # dimension of size 1 -- confirm that is acceptable for tf.where.
        train_flag_ph = tf.cast(tf.squeeze(train_flag_ph), tf.bool)
        with tf.variable_scope("phi_y"):
            y_phi = linear(y, self.n_h)
        # Prior network: conditioned on y and the previous hidden state m.
        with tf.variable_scope("Prior"):
            with tf.variable_scope("hidden"):
                prior_hidden = tf.nn.relu(
                    linear(tf.concat(values=[y_phi, m], axis=1),
                           self.n_prior_hidden))
            with tf.variable_scope("delta_mu"):
                delta_prior_mu = linear(prior_hidden, self.n_z)
            with tf.variable_scope("mu"):
                # The prior mean is accumulated: increment + previous prior
                # mean + the mean supplied through the input.
                prior_mu = delta_prior_mu + last_prior_mu + last_input_prior_mu
                # last_input_prior_mu is a zero matrix except the first time step
            with tf.variable_scope("delta_sigma"):
                delta_prior_sigma = linear(prior_hidden, self.n_z)
            with tf.variable_scope("sigma"):
                prior_sigma = delta_prior_sigma + last_prior_sigma + last_input_prior_sigma
                # last_input_prior_sigma is a zero matrix except the first time step
                prior_sigma = tf.nn.softplus(prior_sigma)
            # lambda_prior = tf.nn.relu(linear(tf.concat(values=[y_phi, m], axis=1), self.n_prior_hidden))
        # Features of x joined with a separate linear projection of y.
        with tf.variable_scope("cond_x"):
            xy = tf.concat(values=(x, linear(y, self.n_h)), axis=1)
        with tf.variable_scope("phi_x"):
            xy_phi = tf.nn.relu(linear(xy, self.n_h))
        # Encoder network: conditioned on the x/y features and m.
        with tf.variable_scope("Encoder"):
            with tf.variable_scope("hidden"):
                enc_hidden = tf.nn.relu(
                    linear(tf.concat(axis=1, values=(xy_phi, m)),
                           self.n_enc_hidden))
            with tf.variable_scope("mu"):
                enc_mu = linear(enc_hidden, self.n_z)
            with tf.variable_scope("sigma"):
                enc_sigma = tf.nn.softplus(linear(enc_hidden, self.n_z))
        # print x.get_shape().as_list()
        # eps = tf.random_normal((x.get_shape().as_list()[0], self.n_z), 0.0, 1.0, dtype=tf.float32)
        eps1 = tf.random_normal((tf.shape(x)[0], self.n_z), 0.0, 1.0, dtype=tf.float32)
        # z = mu + sigma*epsilon
        # The same noise eps1 is used for both the encoder and prior samples.
        z_encoder = tf.add(enc_mu, tf.multiply(enc_sigma, eps1))
        z_prior = tf.add(prior_mu, tf.multiply(prior_sigma, eps1))
        with tf.variable_scope("cond_z"):
            # Pick the encoder sample where the train flag is set, the prior
            # sample otherwise.
            z = tf.where(train_flag_ph, x=z_encoder, y=z_prior)
            zy = tf.concat(values=(z, linear(y, self.n_h)), axis=1)
        with tf.variable_scope("Phi_z"):
            zy_phi = tf.nn.relu(linear(zy, self.n_h))
        # Decoder network: conditioned on the z/y features and m.
        with tf.variable_scope("Decoder"):
            with tf.variable_scope("hidden"):
                dec_hidden_enc = tf.nn.relu(
                    linear(tf.concat(axis=1, values=(zy_phi, m)),
                           self.n_dec_hidden))
            with tf.variable_scope("mu"):
                dec_mu = linear(dec_hidden_enc, self.n_x)
            with tf.variable_scope("sigma"):
                dec_sigma = tf.nn.softplus(linear(dec_hidden_enc, self.n_x))
            with tf.variable_scope("rho"):
                # NOTE(review): dec_rho is computed (and creates variables)
                # but is not part of cell_output below -- confirm intent.
                dec_rho = tf.nn.sigmoid(linear(dec_hidden_enc, self.n_x))
        eps2 = tf.random_normal((tf.shape(x)[0], self.n_x), 0.0, 1.0, dtype=tf.float32)
        dec_x = tf.add(dec_mu, tf.multiply(dec_sigma, eps2))
        # Advance the wrapped LSTM on the concatenated x- and z-features.
        output, state_update = self.lstm(
            tf.concat(axis=1, values=(xy_phi, zy_phi)), LSTMStateTuple(c, m))
        # TODO: recheck it
        # return tf.nn.rnn_cell.LSTMStateTuple(h=(enc_mu, enc_sigma, dec_mu, dec_sigma, dec_rho, prior_mu, prior_sigma), c=state2)
        cell_output = tf.concat(values=(enc_mu, enc_sigma, dec_mu, dec_sigma, dec_x, prior_mu, prior_sigma, z_encoder), axis=1)
        c_update, m_update = state_update
        # The prior statistics are carried in the state so the next step can
        # accumulate onto them.
        tpp_cvrnn_state = TPPCVRNNStateTuple(c=c_update, h=m_update, mu_p=prior_mu, sigma_p=prior_sigma)
        return cell_output, tpp_cvrnn_state
dic_embeddings = tf.constant(dic_em()) # Encoder encode_outputs, encode_states, z_mean, z_stddev, new_states = get_encoder_layer( encoder_embed_input, keep_prob) #VAE samples = tf.random_normal(tf.shape(z_stddev)) z = z_mean + tf.exp(z_stddev * 0.5) * samples #Decoder # inital state vae_z h_state = tf.nn.softplus(tf.matmul(z, weights_de['w_']) + biases_de['b_']) # tf.nn.relu decoder_initial_state = LSTMStateTuple(h_state, encode_states[0][1]) decoder_output, predicting_logits, training_logits, masks, target = get_decoder_layer( z, decoder_embed_input, decoder_initial_state, keep_prob, is_train) latent_loss = 0.5 * tf.reduce_sum( tf.exp(z_stddev) - 1. - z_stddev + tf.square(z_mean), 1) #variable # a=tf.reduce_sum(tf.exp(z_stddev),1) # b=tf.reduce_sum(z_stddev,1) # c=tf.reduce_sum(tf.square(z_mean),1) latent_cost = tf.reduce_mean(latent_loss) laten_ = latentscale_iter * tf.reduce_mean(latent_loss) encropy_loss = tf.contrib.seq2seq.sequence_loss(training_logits, target, masks) cost = tf.reduce_mean( tf.contrib.seq2seq.sequence_loss(training_logits, target, masks) +
def call(self, inputs, state):
    """Run one step of the lexicon-aware LSTM cell.

    Batch rows are divided into those that carry lexicon-word states and
    those that do not, so the lexicon branch only runs when needed.

    Args:
        inputs: pair ``(char_inputs, state_inputs)``. Per the original
            annotations, ``char_inputs`` is ``[batch_size, input_dimension]``
            and ``state_inputs`` is ``[batch_size, max_num_of_lexicon_words,
            lexicon_state_dimension]``.
        state: ``(c, h)`` when ``self._state_is_tuple`` is true, otherwise a
            single tensor that is split in half along axis 1.

    Returns:
        ``(new_h, new_state)`` where ``new_state`` is an ``LSTMStateTuple``
        or the axis-1 concatenation ``[new_c, new_h]``, mirroring the form of
        ``state``.
    """
    char_inputs = inputs[0]
    state_inputs = inputs[1]

    # A lexicon slot / a whole row counts as "empty" when its values sum to
    # zero.
    # NOTE(review): a sum can also be zero when non-zero entries cancel --
    # confirm that lexicon states cannot do that.
    check_state_0 = tf.reduce_sum(state_inputs, axis=-1)  # [batch, max_words]
    check_state_1 = tf.reduce_sum(check_state_0, axis=-1)  # [batch]

    # Coordinates (row, word slot) of every non-empty lexicon state; only
    # these take part in the lexicon computation.
    state_inputs_indices_for_lexicon = tf.where(
        tf.not_equal(check_state_0, 0))

    # Rows without any lexicon match, as an [n, 1] index tensor (the form
    # scatter_nd_update wants) ...
    char_inputs_indices_for_not_lexicon = tf.where(
        tf.equal(check_state_1, 0))
    # ... and as a flat [n] vector. Squeezing only the last axis keeps the
    # result rank-1 even when exactly one row matches; an axis-less squeeze
    # would turn that case into a scalar and need a rank fix-up afterwards.
    state_inputs_indices_for_not_lexicon = tf.squeeze(
        char_inputs_indices_for_not_lexicon, axis=-1)

    # Rows that do have at least one lexicon match.
    char_inputs_indices_for_lexicon = tf.where(
        tf.not_equal(check_state_1, 0))

    if self._state_is_tuple:
        c, h = state
    else:
        c, h = tf.split(value=state, num_or_size_splits=2, axis=1)

    # One fused projection of [char_inputs, h], split into the four gate
    # pre-activations (i, j, f, o).
    gate_inputs = tf.matmul(tf.concat([char_inputs, h], 1), self._kernel)
    gate_inputs = tf.nn.bias_add(gate_inputs, self._bias)
    i, j, f, o = tf.split(value=gate_inputs, num_or_size_splits=4, axis=1)

    # Cell state for the rows without lexicon matches, written into the
    # cell's state buffer at those row positions.
    new_c_without_lexicon = self._new_c_without_lexicon(
        i=i, f=f, j=j, c=c,
        indices_tensor=state_inputs_indices_for_not_lexicon)
    new_c = tf.scatter_nd_update(
        self._char_state_tensor,
        indices=char_inputs_indices_for_not_lexicon,
        updates=new_c_without_lexicon)

    # If the number of lexicon-free rows differs from the batch size, at
    # least one row has a match and the lexicon branch must fill it in.
    new_c = tf.cond(
        tf.not_equal(
            tf.shape(state_inputs_indices_for_not_lexicon)[-1],
            tf.shape(state_inputs)[0]),
        true_fn=lambda: self._if_not_empty_lexicon_state(
            i, j, char_inputs, state_inputs,
            char_inputs_indices_for_lexicon,
            state_inputs_indices_for_lexicon, new_c),
        false_fn=lambda: new_c)

    # Output hidden state.
    new_h = tf.multiply(self._activation(new_c), tf.nn.sigmoid(o))

    if self._state_is_tuple:
        new_state = LSTMStateTuple(new_c, new_h)
    else:
        new_state = tf.concat([new_c, new_h], 1)
    return new_h, new_state
def model_fn(features, labels, mode, params, config):
    """Estimator ``model_fn`` for a character-level autoregressive RNN.

    In TRAIN/EVAL mode the cell is teacher-forced with the one-hot encoded
    input sequence; in any other mode it feeds its own samples back in.

    Args:
        features: in TRAIN/EVAL mode a 4-tuple ``(X_s, X_l, X_r, X_u)``; only
            ``X_s`` (the sequence ids) and ``X_l`` (compared against the time
            step to decide when a sequence is finished) are used. Not read in
            other modes.
        labels: unused.
        mode: an Estimator mode key.
        params: object providing ``num_char``, ``num_hidden``, ``cell``
            (``'lstm'`` or ``'sru'``), ``learning_rate`` and, for inference,
            ``infer_batch_size`` and ``infer_seq_length``.
        config: unused.

    Returns:
        A ``tf.estimator.EstimatorSpec``: loss and train op in TRAIN/EVAL
        mode, sampled ids as predictions otherwise.
    """
    # The TRAIN/EVAL test drives several branches; evaluate it once.
    is_fit = mode in (ModeKeys.TRAIN, ModeKeys.EVAL)

    cur_batch_D = params.num_char
    if is_fit:
        X_s, X_l, X_r, X_u = features
        cur_batch_B = tf.shape(X_s)[0]
        cur_batch_T = tf.shape(X_s)[1]
        Xs_embd = tf.one_hot(X_s, cur_batch_D)
        # Time-major TensorArray so loop_fn can read one step at a time.
        X_ta = tf.TensorArray(size=cur_batch_T, dtype=tf.float32).unstack(
            _transpose_batch_time(Xs_embd), 'TBD_Formatted_X')
    else:
        cur_batch_B = params.infer_batch_size
        cur_batch_T = params.infer_seq_length

    acell = {
        'lstm': lambda: LSTMCell(params.num_hidden),
        'sru': lambda: SRUCell(params.num_hidden)
    }[params.cell]()

    # NOTE(review): the layer is named as a linear projection but applies a
    # softmax, and its output is later used as `logits` -- confirm intent.
    output_layer_info = {
        'units': cur_batch_D,  # this is the size of vocabulary
        'name': 'out_to_character',  # linear
        'activation': tf.nn.softmax
    }

    # Create the shared dense layer's variables once on a dummy input; the
    # helpers below re-enter the scope with reuse=True.
    with tf.variable_scope('Shared_Dense', reuse=False) as dense_layer_scope:
        # this will be replaced by the cell_output later
        zeros_placeholder = tf.zeros([1, acell.output_size])
        tf.layers.dense(zeros_placeholder, **output_layer_info)

    def get_logits(cell_out):
        # useful when measuring the cross-entropy loss
        with tf.variable_scope(dense_layer_scope, reuse=True):
            return tf.layers.dense(cell_out, **output_layer_info)

    def get_dist(cell_out):
        return Categorical(logits=get_logits(cell_out),
                           name='categorical_dist',
                           allow_nan_stats=False,
                           dtype=tf.int32)

    def get_sample(cell_out):
        return tf.one_hot(get_dist(cell_out).sample(), cur_batch_D)

    def get_prob(cell_out, obs):
        # the observation is in
        return get_dist(cell_out).prob(obs)

    # Learned initial state, tiled across the batch.
    with tf.variable_scope('Initial_State'):
        h_init = tf.tile(
            tf.get_variable('init_state_h', [1, params.num_hidden],
                            initializer=tf.random_uniform_initializer(0)),
            [cur_batch_B, 1])
        c_init = tf.tile(
            tf.get_variable('init_state_c', [1, params.num_hidden],
                            initializer=tf.random_uniform_initializer(0)),
            [cur_batch_B, 1])
        cell_init_state = LSTMStateTuple(c_init, h_init)
        first_step = tf.zeros(shape=[cur_batch_B, cur_batch_D],
                              dtype=tf.float32,
                              name='first_character')

    with tf.name_scope('NADE'):
        output_ta = tf.TensorArray(size=cur_batch_T, dtype=tf.float32)

        def loop_fn(time, cell_output, cell_state, loop_state):
            emit_output = cell_output  # == None for time == 0
            if cell_output is None:
                next_cell_state = cell_init_state
                next_step = first_step
                next_loop_state = output_ta
            else:
                # pass the last state to the next
                next_cell_state = cell_state
                if is_fit:
                    # Teacher forcing: feed the previous ground-truth step.
                    next_step = X_ta.read(time - 1)
                else:
                    next_step = get_sample(cell_output)
                # The loop state records what was fed in at each step.
                next_loop_state = loop_state.write(time - 1, next_step)

            if is_fit:
                elements_finished = (time >= X_l)
            else:
                # NOTE(review): this comparison is not per-example, unlike
                # the X_l branch -- confirm raw_rnn accepts it.
                elements_finished = (time >= cur_batch_T)

            return (elements_finished, next_step, next_cell_state,
                    emit_output, next_loop_state)

        output_ta, _, loop_state_ta = tf.nn.raw_rnn(acell, loop_fn)

    with tf.name_scope('Output'):
        outputs = _transpose_batch_time(output_ta.stack())
        logits = get_logits(outputs)

    if is_fit:
        logp_loss = -tf.reduce_mean(tf.log(1e-6 + get_prob(outputs, X_s)))
        xentropy_loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=Xs_embd,
                                                    logits=logits),
            name='xtropy_loss')

        train_op = tf.train.RMSPropOptimizer(
            learning_rate=params.learning_rate).minimize(
                loss=logp_loss, global_step=tf.train.get_global_step())

        # Hand the hook the tensor itself: looking it up by the bare name
        # "xtropy_loss" fails whenever the op was created under a name scope
        # (its real name then carries the scope prefix). The log tag is
        # unchanged.
        logging_hook = tf.train.LoggingTensorHook(
            tensors={"xtropy_loss": xentropy_loss}, every_n_iter=100)

        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=logp_loss,
                                          train_op=train_op,
                                          training_chief_hooks=[logging_hook])

    # Inference: the loop state holds the one-hot samples that were fed back.
    X_sampled = tf.argmax(_transpose_batch_time(loop_state_ta.stack()),
                          axis=2)
    return tf.estimator.EstimatorSpec(mode=mode, predictions=X_sampled)
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
                   target_dict_dim, is_generating, beam_size,
                   max_generation_length):
    """Build an attention seq2seq graph with a bidirectional LSTM encoder.

    Args:
        embedding_dim: width of the source and target word embeddings.
        encoder_size: number of units in each encoder LSTM direction.
        decoder_size: number of units in the decoder LSTM.
        source_dict_dim: source vocabulary size.
        target_dict_dim: target vocabulary size.
        is_generating: build the beam-search inference graph when true,
            the teacher-forced training graph otherwise.
        beam_size: beam width (inference only).
        max_generation_length: maximum decode steps (inference only).

    Returns:
        ``(feeds, result)`` where ``feeds`` maps names to placeholders and
        ``result`` is the scalar loss (training) or the predicted ids
        (inference).
    """
    src_word_idx = tf.placeholder(tf.int32, shape=[None, None])
    src_sequence_length = tf.placeholder(tf.int32, shape=[
        None,
    ])

    src_embedding_weights = tf.get_variable("source_word_embeddings",
                                            [source_dict_dim, embedding_dim])
    src_embedding = tf.nn.embedding_lookup(src_embedding_weights,
                                           src_word_idx)

    src_forward_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
    src_reversed_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
    # no peephole
    encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=src_forward_cell,
        cell_bw=src_reversed_cell,
        inputs=src_embedding,
        sequence_length=src_sequence_length,
        dtype=tf.float32)

    # concat the forward outputs and backward outputs
    encoded_vec = tf.concat(encoder_outputs, axis=2)

    # project the encoder outputs to size of decoder lstm
    # Each direction emits `encoder_size` features, so the flattened depth is
    # 2 * encoder_size. Reshaping with embedding_dim here only worked when
    # the two sizes happened to be equal.
    encoded_proj = tf.contrib.layers.fully_connected(
        inputs=tf.reshape(encoded_vec, shape=[-1, encoder_size * 2]),
        num_outputs=decoder_size,
        activation_fn=None,
        biases_initializer=None)
    encoded_proj_reshape = tf.reshape(
        encoded_proj, shape=[-1, tf.shape(encoded_vec)[1], decoder_size])

    # get init state for decoder lstm's H
    # First time step of the backward encoder; its depth is encoder_size.
    backword_first = tf.slice(encoder_outputs[1], [0, 0, 0], [-1, 1, -1])
    decoder_boot = tf.contrib.layers.fully_connected(
        inputs=tf.reshape(backword_first, shape=[-1, encoder_size]),
        num_outputs=decoder_size,
        activation_fn=tf.nn.tanh,
        biases_initializer=None)

    # prepare the initial state for decoder lstm
    cell_init = tf.zeros(tf.shape(decoder_boot), tf.float32)
    initial_state = LSTMStateTuple(cell_init, decoder_boot)

    # create decoder lstm cell
    # For beam search every attention input is tiled beam_size times so it
    # lines up with the tiled decoder state.
    decoder_cell = LSTMCellWithSimpleAttention(
        decoder_size,
        encoded_vec
        if not is_generating else seq2seq.tile_batch(encoded_vec, beam_size),
        encoded_proj_reshape if not is_generating else seq2seq.tile_batch(
            encoded_proj_reshape, beam_size),
        src_sequence_length if not is_generating else seq2seq.tile_batch(
            src_sequence_length, beam_size),
        forget_bias=0.0)

    output_layer = Dense(target_dict_dim, name='output_projection')

    if not is_generating:
        trg_word_idx = tf.placeholder(tf.int32, shape=[None, None])
        trg_sequence_length = tf.placeholder(tf.int32, shape=[
            None,
        ])

        trg_embedding_weights = tf.get_variable(
            "target_word_embeddings", [target_dict_dim, embedding_dim])
        trg_embedding = tf.nn.embedding_lookup(trg_embedding_weights,
                                               trg_word_idx)

        training_helper = seq2seq.TrainingHelper(
            inputs=trg_embedding,
            sequence_length=trg_sequence_length,
            time_major=False,
            name='training_helper')

        training_decoder = seq2seq.BasicDecoder(cell=decoder_cell,
                                                helper=training_helper,
                                                initial_state=initial_state,
                                                output_layer=output_layer)

        # get the max length of target sequence
        max_decoder_length = tf.reduce_max(trg_sequence_length)

        decoder_outputs_train, _, _ = seq2seq.dynamic_decode(
            decoder=training_decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=max_decoder_length)

        decoder_logits_train = tf.identity(decoder_outputs_train.rnn_output)
        decoder_pred_train = tf.argmax(decoder_logits_train,
                                       axis=-1,
                                       name='decoder_pred_train')
        masks = tf.sequence_mask(lengths=trg_sequence_length,
                                 maxlen=max_decoder_length,
                                 dtype=tf.float32,
                                 name='masks')

        # place holder of label sequence
        lbl_word_idx = tf.placeholder(tf.int32, shape=[None, None])

        # compute the loss
        loss = seq2seq.sequence_loss(logits=decoder_logits_train,
                                     targets=lbl_word_idx,
                                     weights=masks,
                                     average_across_timesteps=True,
                                     average_across_batch=True)

        # return feeding list and loss operator
        return {
            'src_word_idx': src_word_idx,
            'src_sequence_length': src_sequence_length,
            'trg_word_idx': trg_word_idx,
            'trg_sequence_length': trg_sequence_length,
            'lbl_word_idx': lbl_word_idx
        }, loss
    else:
        start_tokens = tf.ones([
            tf.shape(src_word_idx)[0],
        ], tf.int32) * START_TOKEN_IDX
        # share the same embedding weights with target word
        trg_embedding_weights = tf.get_variable(
            "target_word_embeddings", [target_dict_dim, embedding_dim])

        inference_decoder = beam_search_decoder.BeamSearchDecoder(
            cell=decoder_cell,
            embedding=lambda tokens: tf.nn.embedding_lookup(
                trg_embedding_weights, tokens),
            start_tokens=start_tokens,
            end_token=END_TOKEN_IDX,
            initial_state=tf.nn.rnn_cell.LSTMStateTuple(
                tf.contrib.seq2seq.tile_batch(initial_state[0], beam_size),
                tf.contrib.seq2seq.tile_batch(initial_state[1], beam_size)),
            beam_width=beam_size,
            output_layer=output_layer)

        decoder_outputs_decode, _, _ = seq2seq.dynamic_decode(
            decoder=inference_decoder,
            output_time_major=False,
            # impute_finished=True,  # error occurs
            maximum_iterations=max_generation_length)

        predicted_ids = decoder_outputs_decode.predicted_ids

        return {
            'src_word_idx': src_word_idx,
            'src_sequence_length': src_sequence_length
        }, predicted_ids
def Model(_abnormal_data, _abnormal_label, _hidden_num, _elem_num, _file_name,
          _partition):
    """Build an ensemble recurrent auto-encoder graph.

    ``ensemble_space`` encoders read the same input; their final states are
    linearly mapped, concatenated and (optionally) compressed into one shared
    state that initialises ``ensemble_space`` decoders. The loss is the mean
    squared sum of the decoders' reconstruction errors plus an L1 penalty on
    the shared state.

    Reads the module-level settings ``batch_num``, ``ensemble_space``,
    ``cell_type``, ``compress``, ``decode_without_input``, ``reverse`` and
    ``learning_rate``.

    Args:
        _abnormal_data: array whose ``shape[1]`` and ``shape[2]`` size the
            input placeholder.
        _abnormal_label: unused in this function.
        _hidden_num: number of units per encoder cell.
        _elem_num: output width of the projection layer.
        _file_name: forwarded to the RKLSTMCell / RSLSTMCell constructors.
        _partition: forwarded to the RKLSTMCell / RSLSTMCell constructors.

    Returns:
        ``(g, p_input, d_dec, loss, optimizer, saver)``.
    """
    tf.reset_default_graph()
    g = tf.Graph()
    with g.as_default():
        # placeholder list
        p_input = tf.placeholder(tf.float32,
                                 shape=(batch_num, _abnormal_data.shape[1],
                                        _abnormal_data.shape[2]))
        # p_inputs = [tf.squeeze(t, [1]) for t in tf.split(p_input, _abnormal_data.shape[1], 1)]

        # Regularizer signature
        l1_regularizer = tf.contrib.layers.l1_regularizer(scale=0.005,
                                                          scope=None)

        # Projection layer
        projection_layer = tf.layers.Dense(units=_elem_num, use_bias=True)

        # with tf.device('/device:GPU:0'):
        d_enc = {}
        with tf.variable_scope('encoder'):
            for j in range(ensemble_space):
                # create RNN cell
                # NOTE(review): cell_type == 0 defines only enc_cell, and
                # cell_type == 2 defines no residual_enc_cell, yet
                # components 0 and 1 below use pure_enc_cell and
                # residual_enc_cell -- confirm which cell types are actually
                # supported.
                if cell_type == 0:
                    enc_cell = tf.nn.rnn_cell.BasicRNNCell(_hidden_num)
                if cell_type == 1:
                    pure_enc_cell = LSTMCell(_hidden_num)
                    residual_enc_cell = RLSTMCell(_hidden_num)
                    # enc_cell = RSLSTMCell(_hidden_num, file_name=_file_name, type='enc', partition=_partition,
                    #                       component=j, reuse=tf.AUTO_REUSE)
                    enc_cell = RKLSTMCell(_hidden_num,
                                          file_name=_file_name,
                                          type='enc',
                                          partition=_partition,
                                          component=j,
                                          reuse=tf.AUTO_REUSE)
                if cell_type == 2:
                    pure_enc_cell = GRUCell(_hidden_num)
                    enc_cell = RSGRUCell(_hidden_num)

                # Component 0 uses the plain cell, component 1 the residual
                # cell, every further component the enc_cell built above.
                if j == 0:
                    d_enc['enc_output_{0}'.format(j)], d_enc[
                        'enc_state_{0}'.format(j)] = tf.nn.dynamic_rnn(
                            pure_enc_cell, p_input, dtype=tf.float32)
                elif j == 1:
                    d_enc['enc_output_{0}'.format(j)], d_enc[
                        'enc_state_{0}'.format(j)] = tf.nn.dynamic_rnn(
                            residual_enc_cell, p_input, dtype=tf.float32)
                else:
                    d_enc['enc_output_{0}'.format(j)], d_enc[
                        'enc_state_{0}'.format(j)] = tf.nn.dynamic_rnn(
                            enc_cell, p_input, dtype=tf.float32)

            # shared_state_c = tf.concat([d_enc['enc_state_{0}'.format(j)].c for j in range(ensemble_space)], axis=1)
            # shared_state_h = tf.concat([d_enc['enc_state_{0}'.format(j)].h for j in range(ensemble_space)], axis=1)
            # One linear map for all c states and one for all h states; both
            # start at zero.
            w_c = tf.Variable(tf.zeros([_hidden_num, _hidden_num]))
            b_c = tf.Variable(tf.zeros([_hidden_num]))
            w_h = tf.Variable(tf.zeros([_hidden_num, _hidden_num]))
            b_h = tf.Variable(tf.zeros([_hidden_num]))

            # The `.c` / `.h` accessors require each encoder's final state to
            # expose those attributes.
            shared_state_c = tf.concat([
                tf.matmul(d_enc['enc_state_{0}'.format(j)].c, w_c) + b_c
                for j in range(ensemble_space)
            ],
                                       axis=1)
            shared_state_h = tf.concat([
                tf.matmul(d_enc['enc_state_{0}'.format(j)].h, w_h) + b_h
                for j in range(ensemble_space)
            ],
                                       axis=1)

            if compress:
                # The same Dense layer object is applied to both halves of
                # the state.
                compress_state = tf.layers.Dense(units=_hidden_num,
                                                 activation=tf.tanh,
                                                 use_bias=True)
                shared_state_c = compress_state(shared_state_c)
                shared_state_h = compress_state(shared_state_h)

            shared_state = LSTMStateTuple(shared_state_c, shared_state_h)

        # with tf.device('/device:GPU:1'):
        d_dec = {}
        with tf.variable_scope('decoder') as vs:
            if decode_without_input:
                # Decoders are fed all-zero inputs of the same shape as the
                # encoder input.
                dec_input = tf.zeros(
                    [p_input.shape[0], p_input.shape[1], p_input.shape[2]],
                    dtype=tf.float32)
                for k in range(ensemble_space):
                    # create RNN cell
                    # Without compression the shared state is ensemble_space
                    # times wider, so the decoder cells are widened to match.
                    if cell_type == 0:
                        dec_cell = tf.nn.rnn_cell.BasicRNNCell(_hidden_num)
                    if cell_type == 1:
                        if compress:
                            pure_dec_cell = LSTMCell(_hidden_num)
                            residual_dec_cell = RLSTMCell(_hidden_num)
                            dec_cell = RSLSTMCell(_hidden_num,
                                                  file_name=_file_name,
                                                  type='dec',
                                                  partition=_partition,
                                                  component=k,
                                                  reuse=tf.AUTO_REUSE)
                        else:
                            pure_dec_cell = LSTMCell(_hidden_num *
                                                     ensemble_space)
                            residual_dec_cell = RLSTMCell(_hidden_num *
                                                          ensemble_space)
                            dec_cell = RSLSTMCell(_hidden_num *
                                                  ensemble_space,
                                                  file_name=_file_name,
                                                  type='dec',
                                                  partition=_partition,
                                                  component=k,
                                                  reuse=tf.AUTO_REUSE)
                    if cell_type == 2:
                        if compress:
                            pure_dec_cell = GRUCell(_hidden_num)
                            dec_cell = RSGRUCell(_hidden_num)
                        else:
                            pure_dec_cell = GRUCell(_hidden_num *
                                                    ensemble_space)
                            dec_cell = RSGRUCell(_hidden_num *
                                                 ensemble_space)

                    if k == 0:
                        d_dec['dec_output_{0}'.format(k)], d_dec[
                            'dec_state_{0}'.format(k)] = tf.nn.dynamic_rnn(
                                pure_dec_cell,
                                dec_input,
                                initial_state=shared_state,
                                dtype=tf.float32)
                    elif k == 1:
                        d_dec['dec_output_{0}'.format(k)], d_dec[
                            'dec_state_{0}'.format(k)] = tf.nn.dynamic_rnn(
                                residual_dec_cell,
                                dec_input,
                                initial_state=shared_state,
                                dtype=tf.float32)
                    else:
                        d_dec['dec_output_{0}'.format(k)], d_dec[
                            'dec_state_{0}'.format(k)] = tf.nn.dynamic_rnn(
                                dec_cell,
                                dec_input,
                                initial_state=shared_state,
                                dtype=tf.float32)

                    if reverse:
                        # NOTE(review): [::-1] slices the first axis of the
                        # output -- confirm that is the axis meant to be
                        # reversed.
                        d_dec['dec_output_{0}'.format(k)] = d_dec[
                            'dec_output_{0}'.format(k)][::-1]

            else:
                # Decoders start from a zero input and are fed their own
                # projected output back through the InferenceHelper.
                dec_input = tf.zeros([p_input.shape[0], p_input.shape[2]],
                                     dtype=tf.float32)
                for k in range(ensemble_space):
                    # create RNN cell
                    if cell_type == 0:
                        dec_cell = tf.nn.rnn_cell.BasicRNNCell(_hidden_num)
                    if cell_type == 1:
                        if compress:
                            pure_dec_cell = LSTMCell(_hidden_num)
                            residual_dec_cell = RLSTMCell(_hidden_num)
                            # dec_cell = RSLSTMCell(_hidden_num, file_name=_file_name, type='dec', partition=_partition,
                            #                       component=k, reuse=tf.AUTO_REUSE)
                            dec_cell = RKLSTMCell(_hidden_num,
                                                  file_name=_file_name,
                                                  type='dec',
                                                  partition=_partition,
                                                  component=k,
                                                  reuse=tf.AUTO_REUSE)
                        else:
                            pure_dec_cell = LSTMCell(_hidden_num *
                                                     ensemble_space)
                            residual_dec_cell = RLSTMCell(_hidden_num *
                                                          ensemble_space)
                            # dec_cell = RSLSTMCell(_hidden_num * ensemble_space, file_name=_file_name, type='dec',
                            #                       partition=_partition, component=k, reuse=tf.AUTO_REUSE)
                            dec_cell = RKLSTMCell(_hidden_num *
                                                  ensemble_space,
                                                  file_name=_file_name,
                                                  type='dec',
                                                  partition=_partition,
                                                  component=k,
                                                  reuse=tf.AUTO_REUSE)
                    if cell_type == 2:
                        if compress:
                            pure_dec_cell = GRUCell(_hidden_num)
                            dec_cell = RSGRUCell(_hidden_num)
                        else:
                            pure_dec_cell = GRUCell(_hidden_num *
                                                    ensemble_space)
                            dec_cell = RSGRUCell(_hidden_num *
                                                 ensemble_space)

                    # sample_fn is the identity and end_fn is always False,
                    # so decoding length is bounded only by
                    # maximum_iterations below.
                    inference_helper = tf.contrib.seq2seq.InferenceHelper(
                        sample_fn=lambda outputs: outputs,
                        sample_shape=[_elem_num],
                        sample_dtype=tf.float32,
                        start_inputs=dec_input,
                        end_fn=lambda sample_ids: False)

                    if k == 0:
                        inference_decoder = tf.contrib.seq2seq.BasicDecoder(
                            pure_dec_cell,
                            inference_helper,
                            shared_state,
                            output_layer=projection_layer)
                    elif k == 1:
                        inference_decoder = tf.contrib.seq2seq.BasicDecoder(
                            residual_dec_cell,
                            inference_helper,
                            shared_state,
                            output_layer=projection_layer)
                    else:
                        inference_decoder = tf.contrib.seq2seq.BasicDecoder(
                            dec_cell,
                            inference_helper,
                            shared_state,
                            output_layer=projection_layer)

                    d_dec['dec_output_{0}'.format(
                        k)], _, _ = tf.contrib.seq2seq.dynamic_decode(
                            inference_decoder,
                            impute_finished=True,
                            maximum_iterations=p_input.shape[1])

                    if reverse:
                        # NOTE(review): here the stored value is the decoder
                        # output structure returned by dynamic_decode, so
                        # [::-1] reorders its fields rather than a tensor
                        # axis -- confirm intent.
                        d_dec['dec_output_{0}'.format(k)] = d_dec[
                            'dec_output_{0}'.format(k)][::-1]

        # Sum the reconstruction errors of all decoders before squaring.
        # NOTE(review): `[0]` picks the first field of a decoder output
        # structure in the feedback branch, but indexes the first row of a
        # plain tensor in the decode_without_input branch -- confirm.
        sum_of_difference = 0
        for i in range(ensemble_space):
            sum_of_difference += d_dec['dec_output_{0}'.format(i)][0] - p_input

        loss = tf.reduce_mean(tf.square(sum_of_difference))
        regularization_penalty = tf.contrib.layers.apply_regularization(
            l1_regularizer, [shared_state])
        loss = loss + regularization_penalty
        optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(loss)

        # Add ops to save and restore all the variables.
        saver = tf.train.Saver()

    return g, p_input, d_dec, loss, optimizer, saver
def call(self, skel_inputs, state):
    '''Run one step of the five-part skeleton LSTM.

    The input has shape (batch_size, feat_dim); for Kinect 2.0,
    feat_dim is 25 * 3 = 75. The 25 joints are grouped into five body
    parts (head, r_arm, l_arm, r_leg, l_leg), each with its own cell
    memory, while the hidden state is shared.

    divide config (1-based joint ids):
        head:  [ 3, 4, 1, 2, 21]
        r_arm: [ 5, 6, 7, 8, 22, 23, 1, 2, 21]
        l_arm: [ 9, 10, 11, 12, 24, 25, 1, 2, 21]
        r_leg: [13, 14, 15, 16, 1, 2, 21]
        l_leg: [17, 18, 19, 20, 1, 2, 21]

    Args:
        skel_inputs: tensor reshaped here to (-1, 25, 3).
        state: LSTMStateTuple of (Tensor(c1, c2, ..., c5), Tensor(h)) when
            `state_is_tuple` is set, otherwise one tensor that is split in
            half along axis 1.

    Returns:
        (new_h, new_state), with new_state in the same form as `state`.

    The batch dimension may be unknown at graph-construction time: every
    reshape relies only on the statically known joint count of each part.
    '''
    sigmoid = math_ops.sigmoid
    tanh = math_ops.tanh

    if self._state_is_tuple:
        cs, h = state
    else:
        # split the state into c and h
        cs, h = array_ops.split(value=state, num_or_size_splits=2, axis=1)

    # cs holds c1..c5 side by side, one slice per body part.
    cs = array_ops.split(cs, num_or_size_splits=5, axis=1)

    divide_config = {
        'head': (3, 4, 1, 2, 21),
        'r_arm': (5, 6, 7, 8, 22, 23, 1, 2, 21),
        'l_arm': (9, 10, 11, 12, 24, 25, 1, 2, 21),
        'r_leg': (13, 14, 15, 16, 1, 2, 21),
        'l_leg': (17, 18, 19, 20, 1, 2, 21)
    }
    # Fixed part order; it must match the order of the slices in `cs`.
    part_names = ('head', 'r_arm', 'l_arm', 'r_leg', 'l_leg')

    # assert skel_inputs.shape[1] == 75
    reshaped_input = array_ops.reshape(skel_inputs, shape=(-1, 25, 3))

    # One tensor per part, stacked as (n_joints, batch, 3).
    body_list = ops.convert_n_to_tensor([
        [reshaped_input[:, joint - 1, :] for joint in divide_config[part]]
        for part in part_names
    ])
    for ind, part in enumerate(part_names):
        tmp = array_ops.transpose(body_list[ind], perm=(1, 0, 2))
        # Flatten to (batch, n_joints * 3). Fixing the feature width instead
        # of the batch size keeps the last dimension static and avoids
        # int(tmp.shape[0]), which fails when the batch size is unknown.
        body_list[ind] = array_ops.reshape(
            tmp, shape=(-1, 3 * len(divide_config[part])))

    # A single output gate over all parts: 111 part features + h.
    o_all_skel = _linear(
        [
            body_list[0], body_list[1], body_list[2], body_list[3],
            body_list[4], h
        ],
        5 * self._num_units,
        True)
    o_all_skel = sigmoid(o_all_skel)

    new_c_list = []
    for ind, each_part in enumerate(body_list):
        # Per-part input gate, forget gate and candidate, each part with
        # its own weights.
        concat_p = _linear([each_part, h],
                           3 * self._num_units,
                           weight_name='weight_%d' % ind,
                           bias_name='bias_%d' % ind,
                           bias=True)
        ip, fp, gp = array_ops.split(value=concat_p,
                                     num_or_size_splits=3,
                                     axis=1)
        ip, fp, gp = sigmoid(ip), sigmoid(fp), tanh(gp)
        # NOTE(review): the forget bias is added after the sigmoid here, so
        # the gate can exceed 1 -- confirm this is intended before changing
        # it (doing so would alter trained models).
        new_c = cs[ind] * (fp + self._forget_bias) + ip * gp
        new_c_list.append(new_c)

    new_c_tensors = array_ops.concat(new_c_list, axis=1)
    new_h = o_all_skel * tanh(new_c_tensors)

    if self._state_is_tuple:
        new_state = LSTMStateTuple(new_c_tensors, new_h)
    else:
        new_state = array_ops.concat([new_c_tensors, new_h], 1)

    return new_h, new_state
def state_size(self):
    """Return the state layout: a (c, h) pair, each ``_hidden_size`` wide."""
    hidden = self._hidden_size
    return LSTMStateTuple(hidden, hidden)
def __init__(self,
             num_units,
             use_peepholes=False,
             cell_clip=None,
             initializer=None,
             num_proj=None,
             proj_clip=None,
             num_unit_shards=None,
             num_proj_shards=None,
             forget_bias=1.0,
             state_is_tuple=True,
             activation=None,
             reuse=None,
             name=None,
             dtype=None):
    """Store the configuration of the LSTM cell.

    Args:
      num_units: int, number of units in the cell.
      use_peepholes: bool, enables diagonal/peephole connections.
      cell_clip: optional float; when given, the cell state is clipped by
        this value before the cell output activation.
      initializer: optional initializer for the weight and projection
        matrices.
      num_proj: optional int, output dimensionality of the projection.
        No projection is performed when it is None.
      proj_clip: optional float; when `num_proj > 0` and `proj_clip` is
        given, projected values are clipped elementwise to
        `[-proj_clip, proj_clip]`.
      num_unit_shards: deprecated; use a variable_scope partitioner.
      num_proj_shards: deprecated; use a variable_scope partitioner.
      forget_bias: bias added to the forget gate, 1.0 by default to reduce
        forgetting at the start of training. Set it to `0.0` manually when
        restoring from CudnnLSTM-trained checkpoints.
      state_is_tuple: if True, states are `(c_state, m_state)` 2-tuples;
        if False, they are concatenated along the column axis (slower,
        to be deprecated).
      activation: activation of the inner states; `tanh` when not given.
      reuse: optional bool, whether to reuse variables of an existing
        scope.
      name: string, name of the layer.
      dtype: default dtype of the layer; None means the dtype of the first
        input is used. Required when `build` runs before `call`.
    """
    super(CustomLSTMCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)

    # Warn about deprecated configurations.
    if not state_is_tuple:
        logging.warn(
            "%s: Using a concatenated state is slower and will soon be "
            "deprecated. Use state_is_tuple=True.", self)
    sharding_requested = (num_unit_shards is not None
                          or num_proj_shards is not None)
    if sharding_requested:
        logging.warn(
            "%s: The num_unit_shards and proj_unit_shards parameters are "
            "deprecated and will be removed in Jan 2017. "
            "Use a variable scope with a partitioner instead.", self)

    # Inputs must be 2-dimensional.
    self.input_spec = base_layer.InputSpec(ndim=2)

    self._num_units = num_units
    self._use_peepholes = use_peepholes
    self._cell_clip = cell_clip
    self._initializer = initializer
    self._num_proj = num_proj
    self._proj_clip = proj_clip
    self._num_unit_shards = num_unit_shards
    self._num_proj_shards = num_proj_shards
    self._forget_bias = forget_bias
    self._state_is_tuple = state_is_tuple
    self._activation = activation or math_ops.tanh

    # The hidden/output width is the projection size when projecting and
    # the unit count otherwise; the cell-state width is always num_units.
    hidden_width = num_proj if num_proj else num_units
    if state_is_tuple:
        self._state_size = LSTMStateTuple(num_units, hidden_width)
    else:
        self._state_size = num_units + hidden_width
    self._output_size = hidden_width
def call(self, inputs, state):
    """Run one step of the long short-term unitary memory cell (LSTUM).

    The cell memory is a ``hidden_size x hidden_size`` matrix per batch
    element, stored flattened in ``c``. After the gated update, its columns
    are passed through a Gram-Schmidt style loop.

    Args:
        inputs: step input, passed to ``_linear`` together with ``h``.
        state: pair ``(c, h)``; ``c`` is reshaped to
            ``[size_batch, hidden_size, hidden_size]``.

    Returns:
        ``(new_h, LSTMStateTuple(new_c, new_h))``.

    The earlier version paused graph construction on ``input()`` (waiting
    for stdin), printed debugging values, and never advanced the loop
    counter, so the ``while_loop`` could not terminate. Those three defects
    are fixed here; the remaining review notes mark parts that still need
    confirmation.
    """
    c, h = state
    C = tf.reshape(
        c, [self._size_batch, self._hidden_size, self._hidden_size])

    concat = _linear([inputs, h], 4 * self._hidden_size, True)

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1)

    d = sigmoid(i) * tanh(j)
    # Scale the memory matrix by f and add the gated candidate, both
    # broadcast over the middle axis.
    e = tf.multiply(
        C, tf.reshape(f, [
            self._size_batch,
            1,
            self._hidden_size,
        ])) + tf.reshape(d, [self._size_batch, 1, self._hidden_size])

    # NOTE(review): these wrap per-step tensors in tf.Variable inside
    # call() -- confirm this is intended rather than plain tf.stack.
    e_l = tf.Variable(tf.unstack(e, axis=2))  # 128 128 128
    bList = tf.Variable([tf.nn.l2_normalize(e_l[0], 1)])  # 1 128 128

    # Gram-Schmidt loop
    step = tf.constant(0)
    loop_vars = [bList, step]
    # The basis list grows by one entry per iteration, so its first
    # dimension is left unspecified.
    shape_inv = [
        tf.TensorShape([None, self._size_batch, self._hidden_size]),
        step.get_shape()
    ]
    # NOTE(review): the bound is the batch size; the final reshape below
    # needs hidden_size basis vectors -- confirm the intended bound and
    # start index.
    cond = lambda b_l, i: tf.less(i, self._size_batch)

    def F(b_l, i):
        # NOTE(review): unstacking a tensor whose leading size is the
        # dynamic value i + 1 may not be inferable at graph-construction
        # time -- confirm.
        b_u = tf.unstack(
            tf.reshape(b_l, [i + 1, self._size_batch, self._hidden_size]))
        dot = b_l[i] * b_u
        reduce_dot_prime = tf.reduce_sum(dot, axis=2)
        reduce_dot_final = tf.reduce_sum(b_l * reduce_dot_prime, axis=0)
        # Remove the accumulated projection, then normalise.
        w_n = e_l[i] - reduce_dot_final
        w_n = tf.nn.l2_normalize(w_n, 1, epsilon=1e-8)
        b_l = tf.concat([
            b_l,
            tf.reshape(w_n, [1, self._size_batch, self._hidden_size])
        ], 0)
        # Advance the counter so the loop condition can become false.
        return b_l, i + 1

    b_list, _ = control_flow_ops.while_loop(cond, F, loop_vars, shape_inv)

    new_C = tf.stack(b_list, axis=1)

    o = tf.reshape(o, [self._size_batch, self._hidden_size, 1])

    new_h = tf.matmul(self._activation(new_C), o)
    new_h = tf.reshape(new_h, [self._size_batch, self._hidden_size])
    new_c = tf.reshape(new_C, [self._size_batch, self._hidden_size**2])
    new_state = LSTMStateTuple(new_c, new_h)
    return new_h, new_state
def call(self, inputs, state):
    """Advance the LSTM by a single time step.

    In addition to the usual ``[inputs, h_prev]`` pair, the gate
    pre-activations also see ``y_prev``, a softmax computed from the
    previous hidden state with ``self._y_w`` / ``self._y_b``.

    Args:
        inputs: 2-D input tensor, ``[batch, num_units]``.
        state: when ``state_is_tuple`` is True, a ``(c, h)`` tuple of 2-D
            tensors; otherwise one 2-D tensor holding ``c`` and ``h``
            side by side along axis 1.

    Returns:
        ``(h, new_state)``: the step output (width ``num_proj`` when a
        projection is configured, ``num_units`` otherwise) and the new
        state, in the same layout as ``state``.

    Raises:
        ValueError: if the input size cannot be inferred statically.
    """
    sigmoid = math_ops.sigmoid
    activation = self._activation
    peepholes = self._use_peepholes

    if self._num_proj is None:
        num_proj = self._num_units
    else:
        num_proj = self._num_proj

    if self._state_is_tuple:
        c_prev, h_prev = state
    else:
        c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
        h_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    input_size = inputs.get_shape().with_rank(2)[1]
    if input_size.value is None:
        raise ValueError(
            "Could not infer input size from inputs.get_shape()[-1]")

    # Softmaxed output of the previous step, fed back into the gates.
    y_prev = softmax(math_ops.matmul(self._y_w, h_prev) + self._y_b)

    gate_inputs = array_ops.concat([inputs, h_prev, y_prev], 1)
    pre_activations = nn_ops.bias_add(
        math_ops.matmul(gate_inputs, self._kernel), self._bias)
    # in_gate = input gate, cand = new input, forget = forget gate,
    # out_gate = output gate
    in_gate, cand, forget, out_gate = array_ops.split(
        value=pre_activations, num_or_size_splits=4, axis=1)

    # Diagonal (peephole) connections add a c_prev term to the gates.
    forget_pre = forget + self._forget_bias
    in_pre = in_gate
    if peepholes:
        forget_pre = forget_pre + self._w_f_diag * c_prev
        in_pre = in_pre + self._w_i_diag * c_prev
    c = sigmoid(forget_pre) * c_prev + sigmoid(in_pre) * activation(cand)

    if self._cell_clip is not None:
        # pylint: disable=invalid-unary-operand-type
        c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
        # pylint: enable=invalid-unary-operand-type

    out_pre = out_gate + self._w_o_diag * c if peepholes else out_gate
    h = sigmoid(out_pre) * activation(c)

    if self._num_proj is not None:
        h = math_ops.matmul(h, self._proj_kernel)
        if self._proj_clip is not None:
            # pylint: disable=invalid-unary-operand-type
            h = clip_ops.clip_by_value(h, -self._proj_clip, self._proj_clip)
            # pylint: enable=invalid-unary-operand-type

    if self._state_is_tuple:
        new_state = LSTMStateTuple(c, h)
    else:
        new_state = array_ops.concat([c, h], 1)
    return h, new_state
def call(self, inputs, state):
    """Run one step of the time-decayed LSTM cell.

    Column 0 of ``inputs`` is read as the time gap ``delt_t``; the
    remaining columns are the feature (text) vector. The cell state is
    first updated by a time-decay stage weighted by ``exp(-delt_t)`` and
    then by a second LSTM-style stage driven by the features.

    Args:
        inputs: 2-D tensor whose first column is the time gap.
        state: ``(c, h)`` tuple when ``state_is_tuple`` is set, otherwise a
            single tensor split into two halves along axis 1.

    Returns:
        ``(new_h, new_state)`` with ``new_state`` in the same layout as
        ``state``.
    """
    sigmoid, tanh = math_ops.sigmoid, math_ops.tanh

    if self._state_is_tuple:
        cell_prev, hidden_prev = state
    else:
        cell_prev, hidden_prev = array_ops.split(value=state,
                                                 num_or_size_splits=2,
                                                 axis=1)

    # Time gap lives in column 0; everything after it is the feature vector.
    features = inputs[:, 1:]
    delt_t = inputs[:, 0:1]
    operands = [features, hidden_prev]

    # Time-decay stage.
    with tf.variable_scope('1'):
        time_gates = _linear(operands, 3 * self.num_units, bias=True)
    # Text stage.
    with tf.variable_scope('2'):
        text_gates = _linear(operands, 3 * self.num_units, bias=True)
    # Output gate.
    with tf.variable_scope('3'):
        out_gate = _linear(operands, self.num_units, bias=True)

    time_i, time_j, time_f = array_ops.split(value=time_gates,
                                             num_or_size_splits=3,
                                             axis=1)
    text_i, text_j, text_f = array_ops.split(value=text_gates,
                                             num_or_size_splits=3,
                                             axis=1)

    # Old memory is scaled by exp(-delt_t); the new time-stage candidate
    # gets the complementary weight 1 - exp(-delt_t).
    decay = math_ops.exp(-1 * delt_t)
    kept = cell_prev * decay * sigmoid(time_f + self._forget_bias)
    added = (1 - decay) * sigmoid(time_i) * tanh(time_j)
    cell_after_time = kept + added

    cell_new = (cell_after_time * sigmoid(text_f + self._forget_bias) +
                sigmoid(text_i) * tanh(text_j))
    hidden_new = tanh(cell_new) * sigmoid(out_gate)

    if self._state_is_tuple:
        return hidden_new, LSTMStateTuple(cell_new, hidden_new)
    return hidden_new, array_ops.concat([cell_new, hidden_new], 1)
def call(self, inputs, state):
    """Run one step of the time-aware LSTM cell.

    The last three columns of ``inputs`` are side channels and are
    stripped before the regular LSTM computation:

    * ``inputs[:, -1]``: attention score,
    * ``inputs[:, -2]``: "time now" score,
    * ``inputs[:, -3]``: "time last" score.

    The two time scores are turned into extra sigmoid gates that scale
    the forget path (``time_last_state``) and the input path
    (``time_now_state``), and they also contribute to the output gate.
    All variables are created lazily on the first call.

    Args:
        inputs: 2-D tensor, features followed by the three score columns.
        state: ``(c, m)`` tuple when ``state_is_tuple`` is True, otherwise
            a single 2-D tensor with ``c`` and ``m`` concatenated on axis 1.

    Returns:
        ``(m, new_state)``: the step output and the new state, in the same
        layout as ``state``.

    Raises:
        ValueError: if the feature size cannot be inferred statically.
    """
    att_score = tf.expand_dims(inputs[:, -1], -1)
    time_now_score = tf.expand_dims(inputs[:, -2], -1)
    time_last_score = tf.expand_dims(inputs[:, -3], -1)
    inputs = inputs[:, :-3]

    num_proj = self._num_units if self._num_proj is None else self._num_proj
    sigmoid = math_ops.sigmoid

    if self._state_is_tuple:
        (c_prev, m_prev) = state
    else:
        c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
        m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    dtype = inputs.dtype
    input_size = inputs.get_shape().with_rank(2)[1]
    # Indexing a TensorShape yields a Dimension object under TF1 behaviour
    # (unknown size => ``.value is None``) and a plain int/None under TF2
    # behaviour. A bare ``is None`` test misses the first case, so unwrap
    # ``.value`` when it exists.
    if getattr(input_size, "value", input_size) is None:
        raise ValueError(
            "Could not infer input size from inputs.get_shape()[-1]")

    # Lazily create the time-gate variables on the first call.
    if self._time_kernel_w1 is None:
        scope = vs.get_variable_scope()
        with vs.variable_scope(
                scope, initializer=self._initializer) as unit_scope:
            with vs.variable_scope(unit_scope):
                self._time_input_w1 = vs.get_variable(
                    "_time_input_w1", shape=[self._num_units], dtype=dtype)
                self._time_input_bias1 = vs.get_variable(
                    "_time_input_bias1", shape=[self._num_units], dtype=dtype)
                self._time_input_w2 = vs.get_variable(
                    "_time_input_w2", shape=[self._num_units], dtype=dtype)
                self._time_input_bias2 = vs.get_variable(
                    "_time_input_bias2", shape=[self._num_units], dtype=dtype)
                self._time_kernel_w1 = vs.get_variable(
                    "_time_kernel_w1",
                    shape=[input_size, self._num_units],
                    dtype=dtype,
                )
                self._time_kernel_t1 = vs.get_variable(
                    "_time_kernel_t1",
                    shape=[self._num_units, self._num_units],
                    dtype=dtype,
                )
                self._time_bias1 = vs.get_variable("_time_bias1",
                                                   shape=[self._num_units],
                                                   dtype=dtype)
                self._time_kernel_w2 = vs.get_variable(
                    "_time_kernel_w2",
                    shape=[input_size, self._num_units],
                    dtype=dtype,
                )
                self._time_kernel_t2 = vs.get_variable(
                    "_time_kernel_t2",
                    shape=[self._num_units, self._num_units],
                    dtype=dtype,
                )
                self._time_bias2 = vs.get_variable("_time_bias2",
                                                   shape=[self._num_units],
                                                   dtype=dtype)
                self._o_kernel_t1 = vs.get_variable(
                    "_o_kernel_t1",
                    shape=[self._num_units, self._num_units],
                    dtype=dtype,
                )
                self._o_kernel_t2 = vs.get_variable(
                    "_o_kernel_t2",
                    shape=[self._num_units, self._num_units],
                    dtype=dtype,
                )

    # Expand each scalar time score to a num_units-wide feature vector.
    time_now_input = tf.nn.tanh(time_now_score * self._time_input_w1 +
                                self._time_input_bias1)
    time_last_input = tf.nn.tanh(time_last_score * self._time_input_w2 +
                                 self._time_input_bias2)

    # Pre-activations of the two time gates.
    time_now_state = (math_ops.matmul(inputs, self._time_kernel_w1) +
                      math_ops.matmul(time_now_input, self._time_kernel_t1) +
                      self._time_bias1)
    time_last_state = (math_ops.matmul(inputs, self._time_kernel_w2) +
                       math_ops.matmul(time_last_input, self._time_kernel_t2) +
                       self._time_bias2)

    if self._linear1 is None:
        scope = vs.get_variable_scope()
        with vs.variable_scope(
                scope, initializer=self._initializer) as unit_scope:
            if self._num_unit_shards is not None:
                unit_scope.set_partitioner(
                    partitioned_variables.fixed_size_partitioner(
                        self._num_unit_shards))
            self._linear1 = _Linear([inputs, m_prev], 4 * self._num_units,
                                    True)

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    lstm_matrix = self._linear1([inputs, m_prev])
    i, j, f, o = array_ops.split(value=lstm_matrix,
                                 num_or_size_splits=4,
                                 axis=1)
    # Both time features also feed the output gate.
    o = (o + math_ops.matmul(time_now_input, self._o_kernel_t1) +
         math_ops.matmul(time_last_input, self._o_kernel_t2))

    # Diagonal connections. Test for "not created yet" with ``is None``
    # (as for the other lazily created members) instead of relying on the
    # truth value of a variable.
    if self._use_peepholes and self._w_f_diag is None:
        scope = vs.get_variable_scope()
        with vs.variable_scope(
                scope, initializer=self._initializer) as unit_scope:
            with vs.variable_scope(unit_scope):
                self._w_f_diag = vs.get_variable("w_f_diag",
                                                 shape=[self._num_units],
                                                 dtype=dtype)
                self._w_i_diag = vs.get_variable("w_i_diag",
                                                 shape=[self._num_units],
                                                 dtype=dtype)
                self._w_o_diag = vs.get_variable("w_o_diag",
                                                 shape=[self._num_units],
                                                 dtype=dtype)

    # The forget path is additionally gated by sigmoid(time_last_state),
    # the input path by sigmoid(time_now_state).
    if self._use_peepholes:
        c = sigmoid(f + self._forget_bias + self._w_f_diag *
                    c_prev) * sigmoid(time_last_state) * c_prev + sigmoid(
                        i + self._w_i_diag * c_prev) * sigmoid(
                            time_now_state) * self._activation(j)
    else:
        c = sigmoid(f + self._forget_bias) * sigmoid(
            time_last_state) * c_prev + sigmoid(i) * sigmoid(
                time_now_state) * self._activation(j)

    if self._cell_clip is not None:
        # pylint: disable=invalid-unary-operand-type
        c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
        # pylint: enable=invalid-unary-operand-type

    if self._use_peepholes:
        m = sigmoid(o + self._w_o_diag * c) * self._activation(c)
    else:
        m = sigmoid(o) * self._activation(c)

    if self._num_proj is not None:
        if self._linear2 is None:
            scope = vs.get_variable_scope()
            with vs.variable_scope(scope, initializer=self._initializer):
                with vs.variable_scope("projection") as proj_scope:
                    if self._num_proj_shards is not None:
                        proj_scope.set_partitioner(
                            partitioned_variables.fixed_size_partitioner(
                                self._num_proj_shards))
                    self._linear2 = _Linear(m, self._num_proj, False)
        m = self._linear2(m)

        if self._proj_clip is not None:
            # pylint: disable=invalid-unary-operand-type
            m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
            # pylint: enable=invalid-unary-operand-type

    # NOTE(review): both operands of each blend are the *new* value, so
    # these two lines reduce algebraically to ``c`` and ``m``. An
    # attention-weighted update would presumably mix in c_prev / m_prev
    # instead -- confirm intent before changing, since that would alter
    # the behaviour of already-trained models.
    c = att_score * c + (1.0 - att_score) * c
    m = att_score * m + (1.0 - att_score) * m

    new_state = (LSTMStateTuple(c, m) if self._state_is_tuple else
                 array_ops.concat([c, m], 1))
    return m, new_state
def optimized_gcn_encode(self):
    """Encode the batch graphs by sampled neighbour aggregation.

    Node features are embedded once, then ``self.sample_layer_size``
    aggregation layers are applied in the forward direction and, when
    ``self.graph_encode_direction == "bi"``, in the backward direction too.

    Returns:
        ``(hidden, graph_embedding)``: the per-node representations and an
        ``LSTMStateTuple`` whose ``c`` and ``h`` are both the max-pooled
        graph vector.
    """
    # [node_size, hidden_layer_dim]
    embedded_node_rep = self.encode_node_feature(self.word_embeddings, self.feature_info)

    fw_sampler = UniformNeighborSampler(self.fw_adj_info)
    bw_sampler = UniformNeighborSampler(self.bw_adj_info)
    # Flatten the batch of node ids into one vector.
    nodes = tf.reshape(self.batch_nodes, [-1, ])

    # batch_size = tf.shape(nodes)[0]

    # the fw_hidden and bw_hidden is the initial node embedding
    # [node_size, dim_size]
    fw_hidden = tf.nn.embedding_lookup(embedded_node_rep, nodes)
    bw_hidden = tf.nn.embedding_lookup(embedded_node_rep, nodes)

    # [node_size, adj_size]
    fw_sampled_neighbors = fw_sampler((nodes, self.sample_size_per_layer))
    bw_sampled_neighbors = bw_sampler((nodes, self.sample_size_per_layer))

    # Placeholders; the real neighbour counts are computed at layer 0 below.
    fw_sampled_neighbors_len = tf.constant(0)
    bw_sampled_neighbors_len = tf.constant(0)

    # sample
    for layer in range(self.sample_layer_size):
        # Layer 0 consumes hidden_layer_dim inputs, later layers twice that.
        # presumably because the aggregator concatenates self and
        # neighbour vectors -- TODO confirm against MeanAggregator.
        if layer == 0:
            dim_mul = 1
        else:
            dim_mul = 2

        # Layers past index 6 reuse the aggregator stored at index 6
        # instead of creating a new one.
        if layer > 6:
            fw_aggregator = self.fw_aggregators[6]
        else:
            fw_aggregator = MeanAggregator(dim_mul * self.hidden_layer_dim, self.hidden_layer_dim, concat=self.concat, mode=self.mode)
            self.fw_aggregators.append(fw_aggregator)

        # [node_size, adj_size, word_embedding_dim]
        if layer == 0:
            neigh_vec_hidden = tf.nn.embedding_lookup(embedded_node_rep, fw_sampled_neighbors)

            # compute the neighbor size: a neighbour counts when the sum of
            # its ReLU'd embedding is positive.
            # presumably padding neighbours have an all-zero embedding --
            # TODO confirm.
            tmp_sum = tf.reduce_sum(tf.nn.relu(neigh_vec_hidden), axis=2)
            tmp_mask = tf.sign(tmp_sum)
            fw_sampled_neighbors_len = tf.reduce_sum(tmp_mask, axis=1)

        else:
            # One all-zero row is appended to the hidden matrix before the
            # lookup (presumably the target of padding indices -- TODO confirm).
            neigh_vec_hidden = tf.nn.embedding_lookup(
                tf.concat([fw_hidden, tf.zeros([1, dim_mul * self.hidden_layer_dim])], 0), fw_sampled_neighbors)

        fw_hidden = fw_aggregator((fw_hidden, neigh_vec_hidden, fw_sampled_neighbors_len))

        # Same steps for the backward direction.
        if self.graph_encode_direction == "bi":
            if layer > 6:
                bw_aggregator = self.bw_aggregators[6]
            else:
                bw_aggregator = MeanAggregator(dim_mul * self.hidden_layer_dim, self.hidden_layer_dim, concat=self.concat, mode=self.mode)
                self.bw_aggregators.append(bw_aggregator)

            if layer == 0:
                neigh_vec_hidden = tf.nn.embedding_lookup(embedded_node_rep, bw_sampled_neighbors)

                # compute the neighbor size
                tmp_sum = tf.reduce_sum(tf.nn.relu(neigh_vec_hidden), axis=2)
                tmp_mask = tf.sign(tmp_sum)
                bw_sampled_neighbors_len = tf.reduce_sum(tmp_mask, axis=1)

            else:
                neigh_vec_hidden = tf.nn.embedding_lookup(
                    tf.concat([bw_hidden, tf.zeros([1, dim_mul * self.hidden_layer_dim])], 0), bw_sampled_neighbors)

            bw_hidden = bw_aggregator((bw_hidden, neigh_vec_hidden, bw_sampled_neighbors_len))

    # hidden stores the representation for all nodes
    # NOTE(review): the reshape hard-codes a width of 2 * hidden_layer_dim,
    # which presumably assumes the aggregators run with concat enabled --
    # TODO confirm.
    fw_hidden = tf.reshape(fw_hidden, [-1, self.single_graph_nodes_size, 2 * self.hidden_layer_dim])
    if self.graph_encode_direction == "bi":
        bw_hidden = tf.reshape(bw_hidden, [-1, self.single_graph_nodes_size, 2 * self.hidden_layer_dim])
        hidden = tf.concat([fw_hidden, bw_hidden], axis=2)
    else:
        hidden = fw_hidden

    hidden = tf.nn.relu(hidden)

    # Max-pool over the node axis to get one vector per graph.
    pooled = tf.reduce_max(hidden, 1)
    if self.graph_encode_direction == "bi":
        graph_embedding = tf.reshape(pooled, [-1, 4 * self.hidden_layer_dim])
    else:
        graph_embedding = tf.reshape(pooled, [-1, 2 * self.hidden_layer_dim])

    # The same pooled vector is used for both c and h.
    graph_embedding = LSTMStateTuple(c=graph_embedding, h=graph_embedding)

    # shape of hidden: [batch_size, single_graph_nodes_size, 4 * hidden_layer_dim]
    # shape of graph_embedding: ([batch_size, 4 * hidden_layer_dim], [batch_size, 4 * hidden_layer_dim])
    # (the 4x widths above apply to the "bi" case; otherwise 2x)
    return hidden, graph_embedding
def __init__(self,
             num_units,
             use_peepholes=False,
             cell_clip=None,
             initializer=None,
             num_proj=None,
             proj_clip=None,
             num_unit_shards=None,
             num_proj_shards=None,
             forget_bias=1.0,
             state_is_tuple=True,
             activation=None,
             reuse=None):
    """Store the cell configuration; variables are not created here.

    Args:
        num_units: number of units in the cell.
        use_peepholes: whether to allocate slots for the diagonal weights.
        cell_clip: optional clipping bound, stored for later use.
        initializer: optional initializer, stored for later use.
        num_proj: optional projection size; when truthy it determines the
            output size and the size of the second state component.
        proj_clip: optional clipping bound for the projection.
        num_unit_shards: deprecated (a warning is logged when set).
        num_proj_shards: deprecated (a warning is logged when set).
        forget_bias: forget-gate bias, stored for later use.
        state_is_tuple: use an ``LSTMStateTuple`` state instead of one
            concatenated size (a warning is logged when False).
        activation: activation function; defaults to ``math_ops.tanh``.
        reuse: forwarded to the base class as ``_reuse``.
    """
    super(Time4LSTMCell, self).__init__(_reuse=reuse)

    if not state_is_tuple:
        logging.warn(
            "%s: Using a concatenated state is slower and will soon be "
            "deprecated.  Use state_is_tuple=True.",
            self,
        )
    shards_given = not (num_unit_shards is None and num_proj_shards is None)
    if shards_given:
        logging.warn(
            "%s: The num_unit_shards and proj_unit_shards parameters are "
            "deprecated and will be removed in Jan 2017.  "
            "Use a variable scope with a partitioner instead.",
            self,
        )

    # Plain configuration.
    self._num_units = num_units
    self._use_peepholes = use_peepholes
    self._cell_clip = cell_clip
    self._initializer = initializer
    self._num_proj = num_proj
    self._proj_clip = proj_clip
    self._num_unit_shards = num_unit_shards
    self._num_proj_shards = num_proj_shards
    self._forget_bias = forget_bias
    self._state_is_tuple = state_is_tuple
    self._activation = activation or math_ops.tanh

    # The second state component (and the output) has the projection size
    # when a projection is configured, the cell size otherwise.
    out_units = num_proj if num_proj else num_units
    if state_is_tuple:
        self._state_size = LSTMStateTuple(num_units, out_units)
    else:
        self._state_size = num_units + out_units
    self._output_size = out_units

    # Slots for members that are filled in later; None marks "not built".
    self._linear1 = self._linear2 = None
    self._time_input_w1 = self._time_input_w2 = None
    self._time_kernel_w1 = self._time_kernel_t1 = self._time_bias1 = None
    self._time_kernel_w2 = self._time_kernel_t2 = self._time_bias2 = None
    self._o_kernel_t1 = self._o_kernel_t2 = None
    if self._use_peepholes:
        self._w_f_diag = self._w_i_diag = self._w_o_diag = None
def call(self, inputs, state):
    """One step of the Multiple Input LSTM.

    ``inputs`` is split along axis 1 into the mainstream piece ``y`` and
    three auxiliary pieces (``p``, ``n``, ``i``). Each piece yields a
    gated candidate; the four candidates are mixed by softmax attention
    scored against the previous cell state, and the mix is added to the
    forget-gated cell state.

    Args:
        inputs: 2-D tensor, ``[batch_size, input_size]``.
        state: ``LSTMStateTuple`` ``(c, h)`` when ``state_is_tuple`` is
            True, otherwise one tensor that is split in two along axis 1.

    Returns:
        ``(new_h, new_state)``, where ``new_state`` is an
        ``LSTMStateTuple`` or a concatenated tensor depending on
        ``state_is_tuple``.
    """

    def affine(x, hidden, weight, bias):
        # [x, hidden] @ weight + bias: (B, p+q) x (p+q, p) -> (B, p)
        return nn_ops.bias_add(matmul(concat([x, hidden], 1), weight), bias)

    def candidate(x, hidden, weight, bias):
        # tanh candidate cell state, shape (B, p)
        return tanh(affine(x, hidden, weight, bias))

    def gate(x, hidden, weight, bias):
        # sigmoid gate, shape (B, p)
        return sigmoid(affine(x, hidden, weight, bias))

    def attention_score(gated, weight, prev_cell, bias):
        # tanh(sum_p((gated @ weight) * prev_cell) + bias), shape (B, 1)
        score = multiply(matmul(gated, weight), prev_cell)
        score = tf.expand_dims(tf.reduce_sum(score, 1), 1)
        return tanh(nn_ops.bias_add(score, bias))

    axis_cols = constant_op.constant(1, dtype=dtypes.int32)
    axis_rows = constant_op.constant(0, dtype=dtypes.int32)

    if self._state_is_tuple:
        c, h = state
    else:
        c, h = split(value=state, num_or_size_splits=2, axis=axis_cols)

    # Original author's note: W_cp, W_cn, W_ci and b_cp, b_cn, b_ci are
    # all initialised to 0, so the auxiliary factors are ignored at the
    # very beginning and are expected to flow in gradually during
    # training under the control of the mainstream.
    # The kernel and bias each pack ten parameter groups side by side.
    (W_f, W_c, W_cp, W_cn, W_ci,
     W_i, W_ip, W_in, W_ii, W_o) = split(value=self._kernel,
                                         num_or_size_splits=10,
                                         axis=axis_cols)
    (b_f, b_c, b_cp, b_cn, b_ci,
     b_i, b_ip, b_in, b_ii, b_o) = split(value=self._bias,
                                         num_or_size_splits=10,
                                         axis=axis_rows)

    # Split the inputs into mainstream (y) and auxiliary (p, n, i) pieces.
    chunks = split(value=inputs,
                   num_or_size_splits=self._input_divider,
                   axis=axis_cols)
    input_y, input_p, input_n, input_i = (chunks[0], chunks[1], chunks[2],
                                          chunks[3])

    # Candidates: each stream uses its own input piece.
    cand_y = candidate(input_y, h, W_c, b_c)
    cand_p = candidate(input_p, h, W_cp, b_cp)
    cand_n = candidate(input_n, h, W_cn, b_cn)
    cand_i = candidate(input_i, h, W_ci, b_ci)

    # Input gates: all four are driven by the mainstream input.
    gate_y = gate(input_y, h, W_i, b_i)
    gate_p = gate(input_y, h, W_ip, b_ip)
    gate_n = gate(input_y, h, W_in, b_in)
    gate_i = gate(input_y, h, W_ii, b_ii)

    gated_y = multiply(cand_y, gate_y)
    gated_p = multiply(cand_p, gate_p)
    gated_n = multiply(cand_n, gate_n)
    gated_i = multiply(cand_i, gate_i)

    # Attention over the four gated candidates, scored against c.
    attn_weight = self._w_attn
    bias_y, bias_p, bias_n, bias_i, = split(
        value=self._b_attn,
        num_or_size_splits=self._input_divider,
        axis=axis_rows)
    score_y = attention_score(gated_y, attn_weight, c, bias_y)
    score_p = attention_score(gated_p, attn_weight, c, bias_p)
    score_n = attention_score(gated_n, attn_weight, c, bias_n)
    score_i = attention_score(gated_i, attn_weight, c, bias_i)

    attn = tf.nn.softmax(concat([score_y, score_p, score_n, score_i],
                                axis=1))
    attn_y, attn_p, attn_n, attn_i = split(value=attn,
                                           num_or_size_splits=4,
                                           axis=axis_cols)

    # Attention-weighted cell input, shape (B, p).
    cell_input = (multiply(gated_y, attn_y) + multiply(gated_p, attn_p) +
                  multiply(gated_n, attn_n) + multiply(gated_i, attn_i))

    # Forget and output gates follow the ordinary LSTM form.
    forget_gate = gate(input_y, h, W_f, b_f)
    output_gate = gate(input_y, h, W_o, b_o)

    new_c = multiply(c, forget_gate) + cell_input
    new_h = multiply(tanh(new_c), output_gate)

    if self._state_is_tuple:
        return new_h, LSTMStateTuple(new_c, new_h)
    return new_h, concat([new_c, new_h], 1)
def call(self, inputs, state):
    """One step of the long short-term unitary memory cell (LSTUM).

    A per-example matrix ``res`` is assembled from the identity, the
    outer products of two unit vectors ``u`` and ``v`` and a 2x2
    cos/sin block. ``u`` is the normalised forget pre-activation and
    ``v`` is the normalised component of the update ``d`` orthogonal to
    ``u``. In matrix mode ``res`` becomes the new (flattened) state;
    otherwise it is applied to the previous cell vector.

    Args:
        inputs: tensor passed to ``_linear`` together with ``h``.
        state: pair ``(c, h)``.

    Returns:
        ``(new_h, LSTMStateTuple(new_c, new_h))``.
    """
    batch = self._size_batch
    hidden = self._hidden_size
    matrix_mode = self._isMatrix

    c, h = state
    if matrix_mode:
        C = tf.reshape(c, [batch, hidden, hidden])

    gates = _linear([inputs, h], 4 * hidden, True)
    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    i, j, f, o = array_ops.split(value=gates, num_or_size_splits=4, axis=1)

    d = sigmoid(i) * tanh(j)
    if matrix_mode:
        # Apply the stored matrix to the update vector first.
        d_col = tf.matmul(C, tf.reshape(d, [batch, hidden, 1]))
        d = tf.reshape(d_col, [batch, hidden])

    # cos / sin block built from the angle between f and d.
    u = tf.nn.l2_normalize(f, 1, epsilon=1e-8)
    d_unit = tf.nn.l2_normalize(d, 1, epsilon=1e-8)
    cos_theta = tf.reduce_sum(u * d_unit, 1)
    sin_theta = tf.sqrt(1 - cos_theta**2)
    cos_col = tf.reshape(cos_theta, [batch, 1])
    sin_col = tf.reshape(sin_theta, [batch, 1])
    Rth = tf.reshape(
        tf.concat([cos_col, -sin_col, sin_col, cos_col], axis=1),
        [batch, 2, 2])

    # v: unit vector along the part of d that is orthogonal to u.
    d_along_u = tf.reshape(tf.reduce_sum(u * d, 1), [batch, 1]) * u
    v = tf.nn.l2_normalize(d - d_along_u, 1, epsilon=1e-8)

    # Stack u and v as rows: [batch, 2, hidden], plus its transpose.
    uv_rows = tf.concat([
        tf.reshape(u, [batch, 1, hidden]),
        tf.reshape(v, [batch, 1, hidden])
    ], axis=1)
    uv_cols = tf.transpose(uv_rows, [0, 2, 1])

    # Batched outer products u u^T and v v^T.
    u_col = tf.reshape(u, [batch, hidden, 1])
    uuT = tf.matmul(u_col, tf.transpose(u_col, [0, 2, 1]))
    v_col = tf.reshape(v, [batch, hidden, 1])
    vvT = tf.matmul(v_col, tf.transpose(v_col, [0, 2, 1]))

    # Put everything together.
    eye = tf.eye(hidden, batch_shape=[batch])
    in_plane = tf.matmul(tf.matmul(uv_cols, Rth), uv_rows)
    res = eye - uuT - vvT - in_plane

    if matrix_mode:
        new_C = res
        o = tf.reshape(o, [batch, hidden, 1])
        new_h = tf.reshape(tf.matmul(self._activation(new_C), o),
                           [batch, hidden])
        new_c = tf.reshape(new_C, [batch, hidden**2])
    else:
        rotated = tf.matmul(res, tf.reshape(c, [batch, hidden, 1]))
        new_c = tf.reshape(rotated, [batch, hidden])
        new_h = self._activation(new_c) * o

    return new_h, LSTMStateTuple(new_c, new_h)
def call(self, inputs, state):
    """Long short-term memory cell with attention (LSTMA).

    Wraps ``self._cell`` with an attention window and additionally keeps
    a history of the last ``config.use_K_histroy`` label embeddings,
    which is folded into the cell state of the returned tuple.

    Args:
        inputs: input tensor for this step.
        state: when ``state_is_tuple`` is True, a 4-tuple
            ``(cell_state, attns, attn_states, history)``; otherwise a
            single concatenated tensor (see the note below).

    Returns:
        ``(output, new_wrapper_state)`` where ``new_wrapper_state`` is
        ``(LSTMStateTuple(c, h_new), new_attns, new_attn_states,
        flattened_history)``.

    NOTE(review): the non-tuple branch never binds ``histotry`` and the
    later ``c_new, h_new = new_state`` unpacks a pair, so only the
    ``state_is_tuple=True`` path looks usable -- confirm.
    """
    if self._state_is_tuple:
        state, attns, attn_states,histotry = state
    else:
        # Slice the concatenated state into cell state, attention vector
        # and the flattened attention window.
        states = state
        state = array_ops.slice(states, [0, 0], [-1, self._cell.state_size])
        attns = array_ops.slice(
            states, [0, self._cell.state_size], [-1, self._attn_size])
        attn_states = array_ops.slice(
            states, [0, self._cell.state_size + self._attn_size],
            [-1, self._attn_size * self._attn_length])
    attn_states = array_ops.reshape(attn_states,
                                    [-1, self._attn_length, self._attn_size])
    input_size = self._input_size
    if input_size is None:
        input_size = inputs.get_shape().as_list()[1]
    # Created once, on the first call.
    if self._linear1 is None:
        self._linear1 = _Linear([inputs, attns], input_size, True)
    # Mix the previous attention vector into the cell input.
    inputs = self._linear1([inputs, attns])
    cell_output, new_state = self._cell(inputs, state)
    #print("new state",new_state)
    if self._state_is_tuple:
        new_state_cat = array_ops.concat(nest.flatten(new_state), 1)
    else:
        new_state_cat = new_state
    new_attns, new_attn_states = self._attention(new_state_cat, attn_states)
    with vs.variable_scope("attn_output_projection"):
        if self._linear2 is None:
            self._linear2 = _Linear([cell_output, new_attns], self._attn_size, True)
        output = self._linear2([cell_output, new_attns])
    #print("output",output)
    # Append the new output to the attention window and flatten it again.
    new_attn_states = array_ops.concat(
        [new_attn_states, array_ops.expand_dims(output, 1)], 1)
    new_attn_states = array_ops.reshape(
        new_attn_states, [-1, self._attn_length * self._attn_size])

    c_new, h_new = new_state
    #print("c_new", c_new)
    #print("h_new", h_new)

    # Embed the step output with emb_M3.
    label_emb = tf.nn.relu(tf.matmul(output, self.emb_M3))
    # label_emb = tf.expand_dims(label_emb, axis=1)
    #print("label emb",label_emb)
    #print("new stat", new_state)
    pre_history = histotry
    pre_history= tf.reshape(pre_history, shape=[-1, self.config.use_K_histroy, self.config.label_emb_size])
    #print("pre_history",pre_history)
    # Drop the oldest history entry (index 0 on axis 1) ...
    new_history = tf.slice(pre_history, [0, 1, 0], [-1, self.config.use_K_histroy - 1, self.config.label_emb_size])
    #print("new_history", new_history)
    # print("label_emb", label_emb)
    # ... and append the newest label embedding at the end.
    concat_his = tf.concat([new_history, tf.expand_dims(label_emb,axis=1)], axis=1)
    #print("concat_his_tmp", concat_his)
    # The inner cell state is appended as one more entry.
    # assumes c_new has width config.label_emb_size -- TODO confirm
    concat_all = tf.concat([concat_his, tf.expand_dims(c_new,axis=1)], axis=1)
    #print("c_new",c_new)
    concat_all_flatten = tf.reshape(concat_all, shape=[-1, (self.config.use_K_histroy + 1) * self.config.label_emb_size])
    concat_his_flatten = tf.reshape(concat_his, shape=[-1, self.config.use_K_histroy * self.config.label_emb_size])

    # The new cell state is computed from history + inner cell state.
    c = tf.nn.relu(tf.matmul(concat_all_flatten, self.emb_M4k))
    new_state= LSTMStateTuple(c, h_new)

    new_wrapper_state = (new_state, new_attns, new_attn_states, concat_his_flatten)

    return output, new_wrapper_state
def call(self, inputs, state):
    """Run one step of the LSTM (optional peepholes and projection).

    The gate projection, the peephole weights and the output projection
    are created lazily on the first call.

    Args:
        inputs: 2-D input tensor.
        state: pair ``(c_prev, m_prev)``; it is always unpacked as a
            tuple, regardless of ``state_is_tuple``.

    Returns:
        ``(m, new_state)``, where ``new_state`` is
        ``LSTMStateTuple(c, m)`` when ``state_is_tuple`` is True and the
        concatenation of ``c`` and ``m`` along axis 1 otherwise.

    Raises:
        ValueError: if the input size cannot be inferred statically.
    """
    # NOTE: kept from the original; the value is not used in this method.
    num_proj = self._num_units if self._num_proj is None else self._num_proj
    sigmoid = math_ops.sigmoid

    (c_prev, m_prev) = state

    dtype = inputs.dtype
    input_size = inputs.get_shape().with_rank(2)[1]
    if input_size.value is None:
        raise ValueError(
            "Could not infer input size from inputs.get_shape()[-1]")

    if self._linear1 is None:
        scope = vs.get_variable_scope()
        with vs.variable_scope(
                scope, initializer=self._initializer) as unit_scope:
            if self._num_unit_shards is not None:
                unit_scope.set_partitioner(
                    partitioned_variables.fixed_size_partitioner(
                        self._num_unit_shards))
            self._linear1 = _Linear([inputs, m_prev], 4 * self._num_units,
                                    True)

    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    lstm_matrix = self._linear1([inputs, m_prev])
    i, j, f, o = array_ops.split(value=lstm_matrix,
                                 num_or_size_splits=4,
                                 axis=1)

    # Diagonal connections. "Not created yet" is tested with ``is None``
    # (like ``_linear1`` / ``_linear2``) rather than through the truth
    # value of a variable.
    if self._use_peepholes and self._w_f_diag is None:
        scope = vs.get_variable_scope()
        with vs.variable_scope(
                scope, initializer=self._initializer) as unit_scope:
            with vs.variable_scope(unit_scope):
                self._w_f_diag = vs.get_variable("w_f_diag",
                                                 shape=[self._num_units],
                                                 dtype=dtype)
                self._w_i_diag = vs.get_variable("w_i_diag",
                                                 shape=[self._num_units],
                                                 dtype=dtype)
                self._w_o_diag = vs.get_variable("w_o_diag",
                                                 shape=[self._num_units],
                                                 dtype=dtype)

    if self._use_peepholes:
        c = (sigmoid(f + self._forget_bias + self._w_f_diag * c_prev) *
             c_prev +
             sigmoid(i + self._w_i_diag * c_prev) * self._activation(j))
    else:
        c = (sigmoid(f + self._forget_bias) * c_prev +
             sigmoid(i) * self._activation(j))

    if self._cell_clip is not None:
        # pylint: disable=invalid-unary-operand-type
        c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
        # pylint: enable=invalid-unary-operand-type

    if self._use_peepholes:
        m = sigmoid(o + self._w_o_diag * c) * self._activation(c)
    else:
        m = sigmoid(o) * self._activation(c)

    if self._num_proj is not None:
        if self._linear2 is None:
            scope = vs.get_variable_scope()
            with vs.variable_scope(scope, initializer=self._initializer):
                with vs.variable_scope("projection") as proj_scope:
                    if self._num_proj_shards is not None:
                        proj_scope.set_partitioner(
                            partitioned_variables.fixed_size_partitioner(
                                self._num_proj_shards))
                    self._linear2 = _Linear(m, self._num_proj, False)
        m = self._linear2(m)

        if self._proj_clip is not None:
            # pylint: disable=invalid-unary-operand-type
            m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
            # pylint: enable=invalid-unary-operand-type

    new_state = (LSTMStateTuple(c, m) if self._state_is_tuple else
                 array_ops.concat([c, m], 1))
    return m, new_state
def call(self, inputs, state):
    """Run one step of the GraphLSTM cell.

    Args:
        inputs: `2-D` tensor with shape `[batch_size x input_size]`.
        state: An `LSTMStateTuple` of state tensors, each shaped
            `[batch_size x self.state_size]`, if `state_is_tuple` has been
            set to `True`. Otherwise, a `Tensor` shaped
            `[batch_size x 2 * self.state_size]`.

    Returns:
        A tuple containing the new hidden state and the new state (either
        a `LSTMStateTuple` or a concatenated state, depending on
        `state_is_tuple`).

    Raises:
        LookupError: if `self._neighbour_states` has not been set before
            this method runs.
    """
    sigmoid, tanh = math_ops.sigmoid, math_ops.tanh

    # Initialize cell weights.
    weights = self._init_weights(inputs)

    if self._state_is_tuple:
        m_i, h_i = state
    else:
        m_i, h_i = array_ops.split(value=state, num_or_size_splits=2, axis=1)

    # Design note from the original author: the paper shares the weights
    # U_fn between all nodes so that cells generalise to arbitrary
    # superpixel regions; here each cell is meant to specialise on its own
    # joint (hand pose estimation), i.e. a "hard wired" Graph LSTM.
    # self._neighbour_states: a list of n `LSTMStateTuples` (m_j, h_j).
    if not hasattr(self, "_neighbour_states"):
        raise LookupError(
            "Could not find variable 'self._neighbour_states' during 'GraphLSTMCell.call'.\n"
            "This likely means 'call' was called directly, instead of through '__call__' (which "
            "should be the case when called from inside the tensorflow framework)."
        )

    # Turn n (m, h) tuples into a tuple of n m's and a tuple of n h's.
    neighbour_m, neighbour_h = zip(*self._neighbour_states)

    # IMPLEMENTATION DIFFERS FROM PAPER: in eq. (2) g^f_ij uses h_j,t
    # whether or not node j has already been updated. Here the most
    # recent h of each neighbour is used (h_j,t if not yet updated,
    # h_j,t+1 otherwise), which avoids keeping the old states around.

    # Eq. 1: averaged hidden state of the neighbouring nodes, h^-_{i,t}
    h_avg = math_ops.reduce_mean(neighbour_h, axis=0)

    def linear(keys, operands):
        # Look the listed weights/biases up and apply the shared helper.
        return _graphlstm_linear([weights[key] for key in keys], operands)

    self_and_avg = [inputs, h_i, h_avg]

    # Eq. 2
    # input gate:
    #   g_u = sigmoid(f_{i,t+1} W_u + h_{i,t} U_u + h^-_{i,t} U_un + b_u)
    g_u = sigmoid(linear([_W_U, _U_U, _U_UN, _B_U], self_and_avg))
    # adaptive forget gate, one per neighbour j:
    #   g_fij = sigmoid(f_{i,t+1} W_f + h_{j,t} U_fn + b_f)
    g_fij = [
        sigmoid(linear([_W_F, _U_FN, _B_F], [inputs, h_j]))
        for h_j in neighbour_h
    ]
    # forget gate:
    #   g_fi = sigmoid(f_{i,t+1} W_f + h_{i,t} U_f + b_f)
    g_fi = sigmoid(linear([_W_F, _U_F, _B_F], [inputs, h_i]))
    # output gate:
    #   g_o = sigmoid(f_{i,t+1} W_o + h_{i,t} U_o + h^-_{i,t} U_on + b_o)
    g_o = sigmoid(linear([_W_O, _U_O, _U_ON, _B_O], self_and_avg))
    # memory gate:
    #   g_c = tanh(f_{i,t+1} W_c + h_{i,t} U_c + h^-_{i,t} U_cn + b_c)
    g_c = tanh(linear([_W_C, _U_C, _U_CN, _B_C], self_and_avg))

    # New memory state: mean over neighbours of g_fij .* m_j, plus
    # g_fi .* m_i, plus g_u .* g_c
    neighbour_memory = math_ops.reduce_mean(
        [gate * m_j for gate, m_j in zip(g_fij, neighbour_m)], axis=0)
    m_i_new = neighbour_memory + g_fi * m_i + g_u * g_c

    # New hidden state: h_i_new = tanh(g_o .* m_i_new)
    h_i_new = tanh(g_o * m_i_new)

    # Eq. 3 (return values)
    if self._state_is_tuple:
        return h_i_new, LSTMStateTuple(m_i_new, h_i_new)
    return h_i_new, array_ops.concat([m_i_new, h_i_new], 1)
dtype=tf.float32, swap_memory=False, time_major=True, scope=None) ) # bidirectional step(forward and backward) : expensive but better prediction encoder_outputs = tf.concat((encoder_fw_outputs, encoder_bw_outputs), 2) encoder_final_state_c = tf.concat( (encoder_fw_final_state.c, encoder_bw_final_state.c), 1) encoder_final_state_h = tf.concat( (encoder_fw_final_state.h, encoder_bw_final_state.h), 1) # combine all together(backward and forward final state) for decoder feed encoder_final_state = LSTMStateTuple( c=encoder_final_state_c, h=encoder_final_state_h ) # defining decoder :batch size is the most important one !! # LSTM (Long short term memory units) decoder_cell = LSTMCell(decoder_hidden_units) encoder_max_time, batch_size = tf.unstack(tf.shape(encoder_inputs)) decoder_lengths = encoder_inputs_length + 3 # ass 3 bcz 2 additional steps below # 1 for the leading end of sentence token for the decoder input # we want it to be a little bigger for the end of sentence token which indicates the end of sequence # dividing into small batch size=> make prediction better (little more computationally expensive), not always # GRU has less gates than LSTM (less expensive but tends to have better results specifically for dynamic network=>coooooool!!!) # defining weights and biases
def state_size(self):
    """Describe the size of the state this cell carries between steps.

    Returns:
      ``2 * self._num_units`` when ``self._state_is_tuple`` is falsy;
      otherwise an ``LSTMStateTuple`` whose two fields are both
      ``self._num_units``.
    """
    # Concatenated-state mode: both parts live in one vector, so the size
    # is a plain number.
    if not self._state_is_tuple:
        return 2 * self._num_units
    # Tuple mode: report the two parts separately, each `_num_units` wide.
    return LSTMStateTuple(self._num_units, self._num_units)
def build_graph(self, hparams, scope=None):
    """Create a sequence-to-sequence model with the dynamic RNN decoder API.

    In INFER mode every tensor of the encoder state is additionally routed
    through ``tf.Print`` so that the encoder's content vectors are dumped
    whenever the state is evaluated.

    Args:
      hparams: Hyperparameter configurations; `num_layers` and `num_gpus`
        are read here, the rest is forwarded to the encoder/decoder builders.
      scope: VariableScope for the created subgraph; default
        "dynamic_seq2seq".

    Returns:
      A tuple of the form (logits, loss, final_context_state, sample_id),
      where:
        logits: float32 Tensor [batch_size x num_decoder_symbols].
        loss: the total loss / batch_size, or None in INFER mode.
        final_context_state: The final state of decoder RNN.
        sample_id: the sample ids returned by `_build_decoder`.

    Raises:
      ValueError: if encoder_type differs from mono and bi, or
        attention_option is not (luong | scaled_luong | bahdanau |
        normed_bahdanau).  NOTE(review): not raised in this method itself;
        presumably comes from `_build_encoder` / `_build_decoder` -- confirm.
    """
    utils.print_out("# creating %s graph ..." % self.mode)
    dtype = tf.float32
    num_layers = hparams.num_layers
    num_gpus = hparams.num_gpus

    # Arguments shared by every `tf.Print` created below.
    summarize_size = 1024 * 16
    first_n_size = -1

    def print_tensor(tensor, message):
        """Return `tensor` routed through a `tf.Print` labelled `message`."""
        return tf.Print(tensor, [tensor], message,
                        first_n=first_n_size, summarize=summarize_size)

    def print_layer_state(layer_state, index):
        """Wrap the state of encoder layer `index` with print ops."""
        if isinstance(layer_state, LSTMStateTuple):
            return LSTMStateTuple(
                print_tensor(layer_state.c, "EncodeState%dC = " % index),
                print_tensor(layer_state.h, "EncodeState%dH = " % index))
        # Anything else is assumed to be a single state tensor (e.g. the
        # state of a GRU layer) -- TODO confirm against `_build_encoder`.
        return print_tensor(layer_state, "EncodeState%d = " % index)

    with tf.variable_scope(scope or "dynamic_seq2seq", dtype=dtype):
        # Encoder
        encoder_outputs, encoder_state = self._build_encoder(hparams)

        # When inferring, print the data of `encoder_state`, i.e. the
        # content vectors.
        # @see http://www.cnblogs.com/rocketfan/p/6257137.html
        # It seems that `encoder_state[0]` is the state of the hidden layer
        # of the encoder, `encoder_state[1]` is the state of the output
        # layer of the encoder, and `h` in a `LSTMStateTuple` is the output.
        if self.mode == tf.contrib.learn.ModeKeys.INFER:
            # `LSTMStateTuple` is itself a tuple, so it must be tested first:
            # a bare state (single-layer encoder) is wrapped as layer 0.
            if (isinstance(encoder_state, LSTMStateTuple)
                    or not isinstance(encoder_state, (tuple, list))):
                encoder_state = print_layer_state(encoder_state, 0)
            else:
                # Wrap every layer, not just the first two, so the decoder
                # still receives a state for each encoder layer.
                encoder_state = tuple(
                    print_layer_state(layer_state, index)
                    for index, layer_state in enumerate(encoder_state))

        ## Decoder
        logits, sample_id, final_context_state = self._build_decoder(
            encoder_outputs, encoder_state, hparams)

        ## Loss
        if self.mode != tf.contrib.learn.ModeKeys.INFER:
            # The loss is placed on the device of the last layer.
            with tf.device(
                    model_helper.get_device_str(num_layers - 1, num_gpus)):
                loss = self._compute_loss(logits)
        else:
            loss = None

        return logits, loss, final_context_state, sample_id