def _mask_sequences_tensor(sequence, sequence_length, dtype=None, time_major=False, tensor_rank=2): """Masks out sequence entries that are beyond the respective sequence lengths. Masks along the time dimension. Args: sequence: A Tensor of sequence values. If `time_major=False` (default), this must be a Tensor of shape: `[batch_size, max_time, d_2, ..., d_rank]`, where the rank of the Tensor is specified with :attr:`tensor_rank`. If `time_major=True`, this must be a Tensor of shape: `[max_time, batch_size, d_2, ..., d_rank].` sequence_length: A Tensor of shape `[batch_size]`. Time steps beyond the respective sequence lengths will be made zero. dtype (dtype): Type of :attr:`sequence`. If `None`, infer from :attr:`sequence` automatically. time_major (bool): The shape format of the inputs. If `True`, :attr:`sequence` must have shape `[max_time, batch_size, d_2, ..., d_rank]`. If `False` (default), :attr:`sequence` must have shape `[batch_size, max_time, d_2, ..., d_rank]`. tensor_rank (int): The number of dimensions of :attr:`sequence`. Default is 2, i.e., :attr:`sequence` is a 2D Tensor consisting of batch and time dimensions. Returns: The masked sequence, i.e., a Tensor of the same shape as :attr:`sequence` but with masked-out entries (set to zero). """ if tensor_rank is None: tensor_rank = 2 if tensor_rank < 2: raise ValueError( "tensor_rank must be > 2. Got tensor_rank = {}".format( tensor_rank)) if time_major: sequence = rnn._transpose_batch_time(sequence) max_time = tf.to_int32(tf.shape(sequence)[1]) if dtype is None: dtype = sequence.dtype mask = tf.sequence_mask(tf.to_int32(sequence_length), max_time, dtype=dtype) for _ in range(2, tensor_rank): mask = tf.expand_dims(mask, axis=-1) sequence = sequence * mask if time_major: sequence = rnn._transpose_batch_time(sequence) return sequence
def decoder_p3(self, inputs, reuse, max_time, char_sequence_length):
    """Run the autoregressive character decoder with `tf.nn.raw_rnn`.

    At every step the LSTM is fed the previous step's dense projection
    (zeros at step 0) concatenated with the context read from `inputs`.

    Args:
        inputs: Object exposing `.read(time)`; presumably a `tf.TensorArray`
            of per-step context vectors -- TODO confirm against the caller.
        reuse: Forwarded to the `'decoder_p3'` variable scope.
        max_time: Size of the TensorArray that collects the predictions.
        char_sequence_length: Per-example lengths; an example is marked
            finished once `time >= char_sequence_length - 1`.

    Returns:
        The stacked per-step predictions after swapping the time and batch
        axes with `_transpose_batch_time`.
    """
    context_ta = inputs
    predictions_ta = tf.TensorArray(
        dtype=tf.float32, size=max_time, name='pred_char_array')
    cell = tf.contrib.rnn.LSTMCell(self.decoder_p3_units)

    def loop_fn(time, cell_output, cell_state, loop_state):
        if cell_output is None:
            # Initial call (time == 0): no previous prediction exists yet,
            # so zeros stand in for it.
            state = cell.zero_state(self.batch_size, tf.float32)
            blank_prediction = tf.zeros(
                shape=[self.batch_size, self.dict_length], dtype=tf.float32)
            step_input = tf.concat(
                [blank_prediction, context_ta.read(time)], axis=-1)
            history = predictions_ta
        else:
            state = cell_state
            logits = tf.layers.dense(
                inputs=cell_output, activation=None, units=self.dict_length)
            # cell_output belongs to step time - 1, hence the write index.
            history = loop_state.write(time - 1, logits)
            # The raw projection is fed back (not an argmax / one-hot).
            step_input = tf.concat(
                [logits, context_ta.read(time)], axis=-1)

        done = time >= char_sequence_length - 1
        return done, step_input, state, cell_output, history

    with tf.variable_scope('decoder_p3', reuse=reuse):
        _, _, final_history = tf.nn.raw_rnn(cell, loop_fn)
        output = _transpose_batch_time(final_history.stack())
    return output
def vanilla_decoder(self, inputs, reuse):
    """Run an autoregressive decoder conditioned on one fixed `inputs` tensor.

    Step 0 feeds an all-zero vector of width
    `dict_length + lat_word_dim`; later steps feed the previous dense
    projection concatenated with `inputs`.

    NOTE(review): `max_time` and `sequence_length` are neither parameters
    nor locals of this method; they must resolve from an enclosing/module
    scope at call time -- confirm where they are meant to come from.

    Args:
        inputs: Tensor concatenated (on the last axis) with every fed-back
            prediction.
        reuse: Forwarded to the `'vanilla_decoder'` variable scope.

    Returns:
        The stacked per-step predictions after swapping the time and batch
        axes with `_transpose_batch_time`.
    """
    predictions_ta = tf.TensorArray(
        dtype=tf.float32, size=max_time, name='pred_char_array')
    cell = tf.contrib.rnn.LSTMCell(self.decoder_p3_units)

    def loop_fn(time, cell_output, cell_state, loop_state):
        if cell_output is None:
            # Initial call (time == 0).
            state = cell.zero_state(self.batch_size, tf.float32)
            zero_width = self.dict_length + self.lat_word_dim
            step_input = tf.concat(
                [tf.zeros(shape=[self.batch_size, zero_width],
                          dtype=tf.float32)],
                axis=-1)
            history = predictions_ta
        else:
            state = cell_state
            logits = tf.layers.dense(
                inputs=cell_output, activation=None, units=self.dict_length)
            # cell_output belongs to step time - 1, hence the write index.
            history = loop_state.write(time - 1, logits)
            step_input = tf.concat([logits, inputs], axis=-1)

        done = time >= sequence_length - 1
        return done, step_input, state, cell_output, history

    with tf.variable_scope('vanilla_decoder', reuse=reuse):
        _, _, final_history = tf.nn.raw_rnn(cell, loop_fn)
        output = _transpose_batch_time(final_history.stack())
    return output
def __init__(self,
             inputs,
             sequence_length,
             time_major=False,
             is_training=False,
             name=None):
    """Store the inputs and build the per-step TensorArray.

    Args:
        inputs: Input tensor, `[batch, time, ...]` when `time_major` is
            False and `[time, batch, ...]` when it is True.
        sequence_length: Values convertible to a rank-1 tensor.
        time_major: Whether `inputs` is already time-major.
        is_training: Stored on the instance as `self.is_training`.
        name: Accepted for interface compatibility; not used here.

    Raises:
        ValueError: If `sequence_length` is not a vector.
    """
    self._inputs = inputs
    self._sequence_length = tf.convert_to_tensor(
        sequence_length, name="sequence_length")
    if self._sequence_length.get_shape().ndims != 1:
        raise ValueError(
            "Expected sequence_length to be a vector, but received shape: %s"
            % self._sequence_length.get_shape()
        )
    self.time_major = time_major
    self.is_training = is_training

    # Normalise to time-major so every per-step read is a slice on axis 0.
    if not time_major:
        inputs = rnn._transpose_batch_time(inputs)

    # Read the batch size from the normalised tensor: axis 1 is the batch
    # axis for both layouts. Taking axis 0 of the raw inputs returned the
    # number of time steps whenever time_major=True.
    self._batch_size = tf.shape(inputs)[1]

    self._zero_inputs = tf.zeros_like(inputs[0, :])
    self._input_tas = tf.TensorArray(
        dtype=inputs.dtype,
        size=tf.shape(inputs)[0],
        element_shape=inputs.get_shape()[1:]
    ).unstack(inputs)
def assemble_mem_view(final_loop_state, series_list, vocab_size):
    """Build one RGB image tensor showing the series and the memory traces.

    Args:
        final_loop_state: Iterable of objects with `.stack()` (presumably
            the TensorArrays returned as the raw_rnn loop state). Entries 0,
            1 and 2 are used as read weightings, write weightings and usage
            vectors respectively.
        series_list: Integer id tensors (e.g. inputs/outputs/targets); the
            i-th one is tinted with colour channel `i % 3`.
        vocab_size: Depth of the one-hot encoding applied to each series.

    Returns:
        The series, usage and memory panels concatenated on axis 2 and then
        transposed with permutation `[0, 2, 1, 3]`.
    """
    # One [1, 1, 1, 3] colour mask per channel: R, G, B.
    channel_masks = [[[[[float(c == k) for c in range(3)]]]]
                     for k in range(3)]

    views = tuple(
        [_transpose_batch_time(trace.stack()) for trace in final_loop_state])

    # Inputs, outputs and targets: one-hot encode, then colour each series.
    encoded = [tf.one_hot(series, depth=vocab_size) for series in series_list]
    tinted = [
        tf.tile(tf.expand_dims(one_hot, -1), [1, 1, 1, 3])
        * channel_masks[idx % 3]
        for idx, one_hot in enumerate(encoded)
    ]
    series_panel = tf.concat(tinted, 2)

    # Memory views: index 0 on the third axis is selected (presumably the
    # first head -- TODO confirm); read -> R, write -> G, B stays zero.
    read_w = tf.expand_dims(views[0][:, :, 0, :], -1)
    write_w = tf.expand_dims(views[1][:, :, 0, :], -1)
    memory_panel = tf.concat([read_w, write_w, tf.zeros_like(write_w)], -1)

    usage_panel = tf.tile(tf.expand_dims(views[2], -1), [1, 1, 1, 3])

    # Concatenate the series views with the memory views.
    stacked = tf.concat([series_panel, usage_panel, memory_panel], 2)
    return tf.transpose(stacked, [0, 2, 1, 3])
def decoder2(self, inputs, reuse, hap_lens, units_lstm):
    """Decode one sigmoid prediction per time step with `tf.nn.raw_rnn`.

    Step 0 feeds zeros of width `dim_ancs + 1`; every later step feeds
    `inputs` concatenated with the previous single-unit projection.

    Args:
        inputs: Tensor prepended (on the last axis) to each fed-back
            prediction.
        reuse: Forwarded to the `'decoder_p2'` variable scope.
        hap_lens: Per-example lengths; an example is finished once
            `time >= hap_lens`.
        units_lstm: Number of units of the LSTM cell.

    Returns:
        Sigmoid of the collected projections, reshaped to
        `[self.batch_size, self.max_hap_len]`.
    """
    logits_ta = tf.TensorArray(dtype=tf.float32, size=self.max_hap_len)
    cell = tf.contrib.rnn.LSTMCell(units_lstm)

    def loop_fn(time, cell_output, cell_state, loop_state):
        if cell_output is None:
            # Initial call (time == 0).
            state = cell.zero_state(self.batch_size, tf.float32)
            history = logits_ta
            step_input = tf.zeros(
                shape=[self.batch_size, self.dim_ancs + 1], dtype=tf.float32)
        else:
            state = cell_state
            logit = tf.layers.dense(
                inputs=cell_output, activation=None, units=1)
            step_input = tf.concat([inputs, logit], axis=-1)
            # cell_output belongs to step time - 1, hence the write index.
            history = loop_state.write(time - 1, logit)

        done = time >= hap_lens
        return done, step_input, state, cell_output, history

    with tf.variable_scope('decoder_p2', reuse=reuse):
        _, _, final_history = tf.nn.raw_rnn(cell, loop_fn)
        batch_major_logits = _transpose_batch_time(final_history.stack())

    flat_logits = tf.reshape(
        batch_major_logits, [self.batch_size, self.max_hap_len])
    predictions = tf.nn.sigmoid(flat_logits)
    return predictions
def unrolled_prior(self, values, num_units, global_latent, word_lens, reuse):
    """Unroll the prior LSTM and collect per-step means and log-sigmas.

    Each step's LSTM output is projected to `2 * lat_word_dim` values that
    are split into `mu` and `logsig`. The next input is the given value of
    the step just processed (`values[time - 1]` in time-major layout)
    concatenated with `global_latent`; step 0 uses zeros instead.

    Args:
        values: Tensor transposed here with permutation `[1, 0, 2]`
            (so presumably `[batch, words, lat_word_dim]` -- TODO confirm).
        num_units: LSTM size. The projection weight has `lat_word_dim`
            rows, so this presumably has to equal `lat_word_dim`.
        global_latent: Tensor concatenated to every step input.
        word_lens: Per-example lengths; finished once `time >= word_lens`.
        reuse: Forwarded to the `'prior_pred'` and `'prior'` scopes.

    Returns:
        `[means, logsigmas]`, each stacked over time and passed through
        `_transpose_batch_time`.
    """
    values = tf.transpose(values, [1, 0, 2])
    mean_ta = tf.TensorArray(dtype=tf.float32, size=self.max_num_lat_words)
    logsigmas_ta = tf.TensorArray(
        dtype=tf.float32, size=self.max_num_lat_words)
    cell = tf.contrib.rnn.LSTMCell(num_units)

    def loop_fn(time, cell_output, cell_state, loop_state):
        if cell_output is None:
            # Initial call (time == 0).
            state = cell.zero_state(self.batch_size, tf.float32)
            history = (mean_ta, logsigmas_ta)
            no_previous_word = tf.zeros(
                shape=[self.batch_size, self.lat_word_dim], dtype=tf.float32)
            step_input = tf.concat(
                [no_previous_word, global_latent], axis=-1)
        else:
            state = cell_state
            with tf.variable_scope('prior_pred', reuse=reuse):
                w = tf.get_variable(
                    name='prior_dense_w',
                    shape=[self.lat_word_dim, self.lat_word_dim * 2],
                    dtype=tf.float32)
                b = tf.get_variable(
                    name='prior_dense_b',
                    shape=self.lat_word_dim * 2,
                    dtype=tf.float32)
                projected = tf.reshape(
                    tf.matmul(cell_output, w) + b,
                    [self.batch_size, self.lat_word_dim * 2])
            mu, logsig = tf.split(projected, axis=-1, num_or_size_splits=2)
            # Teacher forcing: feed the provided value, not a sample.
            step_input = tf.concat(
                [values[time - 1], global_latent], axis=-1)
            means_so_far, logsigs_so_far = loop_state[0], loop_state[1]
            history = (means_so_far.write(time - 1, mu),
                       logsigs_so_far.write(time - 1, logsig))

        done = time >= word_lens
        # The emitted value is the raw cell output, not the projection.
        return done, step_input, state, cell_output, history

    with tf.variable_scope('prior', reuse=reuse):
        _, _, final_history = tf.nn.raw_rnn(cell, loop_fn)
        mean_state_out = _transpose_batch_time(final_history[0].stack())
        logsigma_state_out = _transpose_batch_time(final_history[1].stack())
    return [mean_state_out, logsigma_state_out]
def dynamic_raw_rnn(cell, input_, batch_size, seq_length, horizon,
                    output_dim, rate, policy_number):
    """Unroll `cell` with `tf.nn.raw_rnn`, feeding its own output back.

    Every `horizon + 1` steps the recorded input frame is fed unchanged; on
    the other steps the projected output replaces the two columns starting
    at `policy_number * 4` of the recorded frame.

    Args:
        cell: RNN cell passed to `tf.nn.raw_rnn`.
        input_: Batch-major input tensor; its last static dimension is used
            as the feature width.
        batch_size: Batch size for the zero state and the padding input.
        seq_length: TensorArray size and the step at which decoding stops.
        horizon: Number of consecutive fed-back steps between two
            ground-truth frames.
        output_dim: Width of the projection emitted at each step.
        rate: Dropout rate passed to `tf.layers.dropout`.
            NOTE(review): no `training` argument is passed, so the layer
            uses its default mode -- confirm this is intended.
        policy_number: Index of the 4-feature group whose leading columns
            are overwritten by the fed-back output.

    Returns:
        `(outputs, final_state)`: the emitted outputs after
        `_transpose_batch_time`, and the last cell state.
    """
    # raw_rnn expects time-major inputs supplied through a TensorArray.
    frames_ta = tf.TensorArray(
        dtype=tf.float32, size=seq_length, clear_after_read=False)
    frames_ta = frames_ta.unstack(_transpose_batch_time(input_))

    feature_dim = input_.get_shape()[-1].value
    features_per_player = 4

    def loop_fn(time, cell_output, cell_state, loop_state):
        elements_finished = time >= seq_length
        all_done = tf.reduce_all(elements_finished)

        if cell_output is None:
            # Initial call: the zero vector only declares the emit shape.
            state = cell.zero_state(batch_size, tf.float32)
            emitted = tf.zeros([output_dim])
            step_input = frames_ta.read(time)
            return elements_finished, step_input, state, emitted, None

        projected = tf.contrib.layers.fully_connected(
            inputs=cell_output, num_outputs=output_dim)
        emitted = tf.layers.dropout(inputs=projected, rate=rate)

        def _padding():
            return tf.zeros([batch_size, feature_dim], dtype=tf.float32)

        def _ground_truth():
            return frames_ta.read(time)

        def _with_feedback():
            # Splice the emitted output over two columns of the frame.
            head = frames_ta.read(time)[
                :, :policy_number * features_per_player]
            tail = frames_ta.read(time)[
                :, policy_number * features_per_player + 2:]
            return tf.concat((head, emitted, tail), axis=1)

        def _live_input():
            at_horizon_start = tf.equal(
                tf.mod(time, horizon + 1), tf.constant(0))
            return tf.cond(at_horizon_start, _ground_truth, _with_feedback)

        step_input = tf.cond(all_done, _padding, _live_input)
        return elements_finished, step_input, cell_state, emitted, None

    outputs_ta, last_state, _ = tf.nn.raw_rnn(cell, loop_fn)
    outputs = _transpose_batch_time(outputs_ta.stack())
    return outputs, last_state
def dynamic_rnn(input_data, cell, loop_state_fn, initial_loop_state):
    """Unroll `cell` over `input_data` while threading a custom loop state.

    Args:
        input_data: Batch-major tensor (`[batch, time, ...]`); dimension 1
            is used as the number of steps.
        cell: RNN cell passed to `tf.nn.raw_rnn`.
        loop_state_fn: Callable `(time, loop_state, cell_state)` returning
            the loop state for the next step.
        initial_loop_state: Loop state returned by the initial call.

    Returns:
        `(output, final_state, final_loop_state)`, with `output` passed
        through `_transpose_batch_time`.
    """
    dynamic_shape = tf.shape(input_data)
    static_shape = input_data.get_shape().as_list()
    batch_size = dynamic_shape[0]
    num_steps = dynamic_shape[1]

    # Zero input fed once every sequence has finished.
    pad_input = tf.zeros([batch_size, ] + static_shape[2:])

    # raw_rnn uses TensorArrays for inputs and outputs, with tensors laid
    # out as [time, batch_size, input_depth].
    inputs_ta = tf.TensorArray(size=num_steps, dtype=tf.float32).unstack(
        _transpose_batch_time(input_data), 'TBD_Input')
    initial_state = cell.zero_state(batch_size, None)

    def loop_fn(time, previous_output, previous_state, previous_loop_state):
        # num_steps is a scalar, so this flag is shared by the whole batch;
        # it is False at the initial step (time == 0).
        finished = time >= num_steps

        if previous_state is None:
            # Initial call (time == 0).
            return (finished, inputs_ta.read(time), initial_state,
                    previous_output, initial_loop_state)

        def _padding():
            return pad_input

        def _next_frame():
            return inputs_ta.read(time)

        step_input = tf.cond(tf.reduce_all(finished), _padding, _next_frame)
        updated_loop_state = loop_state_fn(
            time, previous_loop_state, previous_state)
        return (finished, step_input, previous_state, previous_output,
                updated_loop_state)

    outputs_ta, final_state, final_loop_state = tf.nn.raw_rnn(cell, loop_fn)
    output = _transpose_batch_time(outputs_ta.stack())
    return output, final_state, final_loop_state
def generation(self, samples):
    """Sample latent words from the learned prior and decode them.

    The prior LSTM is unrolled for `max_num_lat_words` steps. Each step
    projects the cell output with the existing `'prior_pred'` weights,
    draws `mu + eps * exp(logsig)` and feeds the draw (concatenated with
    `samples`) to the next step. The drawn words are then passed through
    `decoder_p2` and `decoder_p3`, all with `reuse=True`.

    Args:
        samples: Global latent tensor concatenated to every step input and
            forwarded to `decoder_p2` as `global_latent`.

    Returns:
        The output of `self.decoder_p3`.
    """
    words_ta = tf.TensorArray(dtype=tf.float32, size=self.max_num_lat_words)
    cell = tf.contrib.rnn.LSTMCell(self.decoder_units)
    print('GENER samples {}'.format(np.shape(samples)))

    def loop_fn(time, cell_output, cell_state, loop_state):
        if cell_output is None:
            # Initial call (time == 0): zeros stand in for the previous word.
            state = cell.zero_state(self.batch_size, tf.float32)
            history = words_ta
            no_previous_word = tf.zeros(
                shape=[self.batch_size, self.lat_word_dim], dtype=tf.float32)
            step_input = tf.concat([no_previous_word, samples], axis=-1)
        else:
            state = cell_state
            with tf.variable_scope('prior_pred', reuse=True):
                w = tf.get_variable(name='prior_dense_w')
                b = tf.get_variable(name='prior_dense_b')
                projected = tf.reshape(
                    tf.matmul(cell_output, w) + b,
                    [self.batch_size, self.lat_word_dim * 2])
            mu, logsig = tf.split(projected, axis=-1, num_or_size_splits=2)
            eps = tf.random_normal(
                shape=[self.batch_size, self.lat_word_dim], dtype=tf.float32)
            # Reparameterised draw from N(mu, exp(logsig)^2).
            word = eps * tf.exp(logsig) + mu
            step_input = tf.concat([word, samples], axis=-1)
            history = loop_state.write(time - 1, word)

        done = time >= self.max_num_lat_words
        # The emitted value is the raw cell output, not the projection.
        return done, step_input, state, cell_output, history

    with tf.variable_scope('prior', reuse=True):
        _, _, final_history = tf.nn.raw_rnn(cell, loop_fn)
        sampled_words = _transpose_batch_time(final_history.stack())

    context = self.decoder_p2(
        num_hidden_word_units=self.lat_word_dim,
        inputs=sampled_words,
        char_sequence_length=np.repeat(
            self.num_sentence_characters, self.batch_size, axis=-1),
        global_latent=samples,
        reuse=True,
        context_dim=self.decoder_units,
        max_time=self.num_sentence_characters)

    predictions = self.decoder_p3(
        inputs=context,
        reuse=True,
        char_sequence_length=np.repeat(
            self.num_sentence_characters, self.batch_size, axis=-1),
        max_time=self.num_sentence_characters)
    return predictions
def transpose_batch_time(inputs):
    """Swaps the batch and time dimensions of every tensor in `inputs`.

    Args:
        inputs: A Tensor of shape `[batch_size, max_time, ...]`
            (batch-major) or `[max_time, batch_size, ...]` (time-major), or
            a (possibly nested) tuple of such elements.

    Returns:
        A structure matching `inputs` whose tensors have their batch and
        time dimensions transposed.
    """
    # First pass: convert every leaf to a tensor.
    tensors = list(map(ops.convert_to_tensor, nest.flatten(inputs)))

    # Second pass: transpose each leaf.
    # pylint: disable=protected-access
    transposed = []
    for tensor in tensors:
        transposed.append(rnn._transpose_batch_time(tensor))

    return nest.pack_sequence_as(structure=inputs, flat_sequence=transposed)
def alt_encoder2_rnn(self, dense_intmdt_pred_units, input_encoder,
                     temperature, units_lstm, train, hap_lens, reuse):
    """Unroll the two-haplotype ancestor encoder with `tf.nn.raw_rnn`.

    The LSTM output is split in two halves (one per haplotype). Each half
    is projected, scored against the ancestor table `Ancs` via
    `self.att_dot`, and an ancestor selection is sampled from the result
    (relaxed one-hot when `train` is truthy, hard one-hot otherwise). The
    sampled selections are fed back as the next input together with the
    encoder input of the current step, and are also used to predict the
    current and next value for each haplotype.

    NOTE(review): the extent of each `variable_scope` block below was
    reconstructed from flattened source -- confirm the variable names
    against existing checkpoints.

    Args:
        dense_intmdt_pred_units: Width of the intermediate dense layer in
            front of the prediction heads.
        input_encoder: Batch-major encoder input; cast to float32 here.
        temperature: Temperature of the relaxed one-hot distribution.
        units_lstm: Width of each haplotype half; the LSTM cell has
            `2 * units_lstm` units.
        train: Selects relaxed (truthy) or categorical (falsy) sampling.
        hap_lens: Per-example lengths; finished once `time >= hap_lens`.
        reuse: Forwarded to the `'enc_p2'` and `'state'` variable scopes.

    Returns:
        Tuple `(X_sampled_h1, X_sampled_h2, reconstruction_h1,
        pred_next_rec_h1, reconstruction_h2, pred_next_rec_h2)`. The two
        `pred_next_*` tensors drop their last time step.
    """
    input_encoder = tf.cast(input_encoder, dtype=tf.float32)
    with tf.variable_scope('enc_p2', reuse=reuse):
        # Ancestors
        Anc = tf.get_variable(name='Ancs',
                              shape=[self.len_ancs, self.dim_ancs])
        w_proj = tf.get_variable(shape=[units_lstm, self.dim_ancs],
                                 dtype=tf.float32, name='w_proj')
        b_proj = tf.get_variable(shape=[self.dim_ancs],
                                 dtype=tf.float32, name='b_proj')

    cell = tf.contrib.rnn.LSTMCell(units_lstm * 2)
    inputs = tf.transpose(input_encoder, perm=[1, 0, 2])
    # One extra all-zero step is appended so that inputs[time] is still
    # valid on the last loop_fn call.
    inputs = tf.concat(
        [inputs,
         tf.zeros([1, self.batch_size, tf.shape(inputs)[-1]],
                  dtype=tf.float32)],
        axis=0)
    output_ta = (tf.TensorArray(size=self.max_hap_len, dtype=tf.float32),
                 tf.TensorArray(size=self.max_hap_len, dtype=tf.float32),
                 tf.TensorArray(size=self.max_hap_len, dtype=tf.float32),
                 tf.TensorArray(size=self.max_hap_len, dtype=tf.float32),
                 tf.TensorArray(size=self.max_hap_len, dtype=tf.float32),
                 tf.TensorArray(size=self.max_hap_len, dtype=tf.float32))

    print(input_encoder)
    print(output_ta)
    # take out when using placeholders
    print(tf.transpose(input_encoder, perm=[1, 0, 2]))
    print('here')

    def loop_fn(time, cell_output, cell_state, loop_state):
        print('cell_output {}'.format(cell_output))
        print('cell_state {}'.format(cell_state))
        # The emitted value is not used by the caller; everything of
        # interest travels in loop_state, which need not match the RNN
        # output shape.
        emit_output = cell_output
        if cell_output is None:  # time == 0
            print('here1')
            next_cell_state = cell.zero_state(self.batch_size, tf.float32)
            print('here2')
            print(time)
            next_anc = tf.concat(
                [tf.zeros(shape=[self.batch_size, self.len_ancs * 2],
                          dtype=tf.float32),
                 inputs[[time]]],
                axis=-1)
            print('here2.5')
            print('here3')
            next_loop_state = output_ta
        else:
            print('here4')
            next_cell_state = cell_state
            hap_1, hap_2 = tf.split(cell_output, num_or_size_splits=2,
                                    axis=-1)
            with tf.variable_scope('enc_p2', reuse=True):
                pre_next_anc1 = tf.nn.relu(tf.matmul(hap_1, w_proj) + b_proj)
                pre_next_anc2 = tf.nn.relu(tf.matmul(hap_2, w_proj) + b_proj)
            print('here5')
            anc_distribution_h1 = self.att_dot(query=pre_next_anc1,
                                               values=Anc)
            anc_distribution_h2 = self.att_dot(query=pre_next_anc2,
                                               values=Anc)
            if train:
                dist_h1 = tf.contrib.distributions.ExpRelaxedOneHotCategorical(
                    temperature=temperature, probs=anc_distribution_h1)
                next_anc_sample_h1 = dist_h1.sample()
                dist_h2 = tf.contrib.distributions.ExpRelaxedOneHotCategorical(
                    temperature=temperature, probs=anc_distribution_h2)
                next_anc_sample_h2 = dist_h2.sample()
            else:
                dist_h1 = tf.contrib.distributions.Categorical(
                    probs=anc_distribution_h1)
                next_anc_sample_h1 = tf.cast(
                    tf.one_hot(dist_h1.sample(), depth=self.len_ancs,
                               axis=-1),
                    dtype=tf.float32)
                dist_h2 = tf.contrib.distributions.Categorical(
                    probs=anc_distribution_h2)
                next_anc_sample_h2 = tf.cast(
                    tf.one_hot(dist_h2.sample(), depth=self.len_ancs,
                               axis=-1),
                    dtype=tf.float32)
            # This is sent as input to the next iteration of the cell.
            next_anc = tf.concat(
                [tf.concat([next_anc_sample_h1, next_anc_sample_h2],
                           axis=-1),
                 inputs[[time]]],
                axis=-1)

            # Weight each ancestor row by its sampled selection and sum
            # over the ancestors.
            anc_h1 = tf.reduce_sum(
                tf.reshape(
                    tf.matmul(
                        tf.reshape(tf.matrix_diag(next_anc_sample_h1),
                                   [-1, self.len_ancs]),
                        Anc),
                    [self.batch_size, self.len_ancs, self.dim_ancs]),
                1)
            anc_h2 = tf.reduce_sum(
                tf.reshape(
                    tf.matmul(
                        tf.reshape(tf.matrix_diag(next_anc_sample_h2),
                                   [-1, self.len_ancs]),
                        Anc),
                    [self.batch_size, self.len_ancs, self.dim_ancs]),
                1)
            anc_h1_2 = tf.layers.dense(anc_h1, units=dense_intmdt_pred_units,
                                       activation=tf.nn.relu)
            anc_h2_2 = tf.layers.dense(anc_h2, units=dense_intmdt_pred_units,
                                       activation=tf.nn.relu)
            pred_current_h1 = tf.layers.dense(anc_h1_2, units=1,
                                              activation=None)
            pred_next_h1 = tf.layers.dense(anc_h1_2, units=1,
                                           activation=None)
            pred_current_h2 = tf.layers.dense(anc_h2_2, units=1,
                                              activation=None)
            pred_next_h2 = tf.layers.dense(anc_h2_2, units=1,
                                           activation=None)
            print('here7')
            # Output to store for this iteration. Slot 1 holds the second
            # haplotype's sample: it is returned as X_sampled_h2, so it
            # must not repeat the first haplotype's sample.
            next_loop_state = (
                loop_state[0].write(time - 1, next_anc_sample_h1),
                loop_state[1].write(time - 1, next_anc_sample_h2),
                loop_state[2].write(time - 1, pred_current_h1),
                loop_state[3].write(time - 1, pred_next_h1),
                loop_state[4].write(time - 1, pred_current_h2),
                loop_state[5].write(time - 1, pred_next_h2))
        print('out_loop')
        # Boolean vector telling which batch elements have finished.
        elements_finished = time >= hap_lens
        print(elements_finished)
        return (elements_finished, next_anc, next_cell_state, emit_output,
                next_loop_state)

    with tf.variable_scope('state', reuse=reuse):
        _, _, loop_state_ta = tf.nn.raw_rnn(cell, loop_fn)

    print('Anc_O {}'.format(_transpose_batch_time(loop_state_ta[0].stack())))
    print('params_O {}'.format(
        _transpose_batch_time(loop_state_ta[1].stack())))
    X_sampled_h1 = _transpose_batch_time(loop_state_ta[0].stack())
    X_sampled_h2 = _transpose_batch_time(loop_state_ta[1].stack())
    reconstruction_h1 = tf.nn.sigmoid(
        _transpose_batch_time(loop_state_ta[2].stack()))
    # The last "next" prediction has no target, so it is cut off.
    pred_next_rec_h1 = tf.nn.sigmoid(
        _transpose_batch_time(loop_state_ta[3].stack())[:, 0:-1])
    reconstruction_h2 = tf.nn.sigmoid(
        _transpose_batch_time(loop_state_ta[4].stack()))
    pred_next_rec_h2 = tf.nn.sigmoid(
        _transpose_batch_time(loop_state_ta[5].stack())[:, 0:-1])
    return (X_sampled_h1, X_sampled_h2, reconstruction_h1, pred_next_rec_h1,
            reconstruction_h2, pred_next_rec_h2)
epoch_loss_avg(loss_value) #add current batch loss #end epoch train_loss_results.append(epoch_loss_avg.result()) if epoch % 5 == 0: print('Epoch: {}, Loss: {}'.format(epoch, train_loss_results[epoch])) #save model weights model.save_weights('C:\\deep_SSM\\model_rank0.h5') ''' creating latent variables used in the second phase ''' #producing latent variables as new features for the next phase (environment state) rep = 100 #number of running replication, to reduce variance latents_to_average = np.zeros(shape=(rep, seq_length, num_seq, model.latent_dim)) for i in range(rep): latents_to_average[i] = SSM_model(model, train_data)[1] latents_to_average = tf.convert_to_tensor(latents_to_average) env_state = tf.reduce_mean(latents_to_average, axis=0) env_state = _transpose_batch_time(env_state) #adjust with common format env_state = tf.reshape(env_state, (-1, model.latent_dim)) #make it 2D #saving the results np.savetxt('C:\\deep_SSM\\envstate_rank0.csv', env_state, delimiter=',')
def mask_and_reduce(sequence,
                    sequence_length,
                    rank=2,
                    average_across_batch=True,
                    average_across_timesteps=False,
                    average_across_remaining=False,
                    sum_over_batch=False,
                    sum_over_timesteps=True,
                    sum_over_remaining=True,
                    dtype=None,
                    time_major=False):
    """Masks out sequence entries that are beyond the respective sequence
    lengths, and reduces (average or sum) away dimensions.

    This is a combination of :func:`~texar.tf.utils.shapes.mask_sequences`
    and :func:`~texar.tf.losses.losses_utils.reduce_batch_time`.

    Args:
        sequence: A Tensor of sequence values.
            If `time_major=False` (default), this must be a Tensor of shape
            `[batch_size, max_time, d_2, ..., d_rank]`, where the rank of
            the Tensor is specified with :attr:`rank`.
            The batch and time dimensions are exchanged if `time_major` is
            True.
        sequence_length: A Tensor of shape `[batch_size]`. Time steps beyond
            the respective sequence lengths will be made zero. If `None`,
            no masking is performed.
        rank (int): The rank of :attr:`sequence`. Must be >= 2. Default is
            2, i.e., `sequence` is a 2D Tensor consisting of batch and time
            dimensions.
        average_across_timesteps (bool): If set, average the sequence across
            the time dimension. Must not set `average_across_timesteps`
            and `sum_over_timesteps` at the same time.
        average_across_batch (bool): If set, average the sequence across the
            batch dimension. Must not set `average_across_batch`
            and `sum_over_batch` at the same time.
        average_across_remaining (bool): If set, average the sequence across
            the remaining dimensions. Must not set
            `average_across_remaining` and `sum_over_remaining` at the same
            time.
        sum_over_timesteps (bool): If set, sum the loss across the
            time dimension. Must not set `average_across_timesteps`
            and `sum_over_timesteps` at the same time.
        sum_over_batch (bool): If set, sum the loss across the
            batch dimension. Must not set `average_across_batch`
            and `sum_over_batch` at the same time.
        sum_over_remaining (bool): If set, sum the loss across the
            remaining dimensions. Must not set `average_across_remaining`
            and `sum_over_remaining` at the same time.
        time_major (bool): The shape format of the inputs. If `True`,
            :attr:`sequence` must have shape `[max_time, batch_size, ...]`.
            If `False` (default), `sequence` must have
            shape `[batch_size, max_time, ...]`.
        dtype (dtype): Type of :attr:`sequence`. If `None`, infer from
            :attr:`sequence` automatically.

    Returns:
        A Tensor containing the masked and reduced sequence.

    Raises:
        ValueError: If `rank` < 2, or if `rank` > 2 and both
            `average_across_remaining` and `sum_over_remaining` are set.
    """
    if rank < 2:
        raise ValueError('`rank` must be >= 2.')

    result = sequence
    # Everything below operates on the batch-major layout.
    if time_major:
        result = rnn._transpose_batch_time(result)

    if sequence_length is not None:
        result = mask_sequences(result, sequence_length, dtype=dtype,
                                time_major=False, tensor_rank=rank)

    if rank > 2:
        if average_across_remaining and sum_over_remaining:
            raise ValueError("Only one of `average_across_remaining` and "
                             "`sum_over_remaining` can be set.")
        # Collapse every dimension after batch and time, if requested.
        if average_across_remaining:
            result = tf.reduce_mean(result, axis=np.arange(2, rank))
        elif sum_over_remaining:
            result = tf.reduce_sum(result, axis=np.arange(2, rank))

    result = reduce_batch_time(result, sequence_length,
                               average_across_batch,
                               average_across_timesteps,
                               sum_over_batch,
                               sum_over_timesteps)

    # Both leading axes survive only if neither was reduced; in that case
    # hand the tensor back in the caller's time-major layout.
    time_reduced = average_across_timesteps or sum_over_timesteps
    batch_reduced = average_across_batch or sum_over_batch
    if time_major and not (time_reduced or batch_reduced):
        result = rnn._transpose_batch_time(result)

    return result
def __init__(self, max_seq_len, input_size, rnn_size, batch_size, lr,
             train_keep_prob, decay_rate=0.95, lambda_a=0.1, lambda_z=0.1,
             df_size=200, num_class=60, class_lr=1e-3, dtype=tf.float32):
    """Build the sequence-prediction graph and the classifier on top of it.

    A 3-layer bidirectional GRU encodes `enc_in`. A GRU decoder, driven by
    `tf.nn.raw_rnn`, is seeded with the first input frame and then fed its
    own previous output; its cell state is reset to the encoder's final
    state at every step. The prediction loss is the masked L1 distance to
    `enc_in[:, 1:, :]`. A separate classifier head consumes the encoder's
    final state.

    NOTE(review): the extent of each `variable_scope` block below was
    reconstructed from flattened source -- confirm the variable names
    against existing checkpoints.

    Args:
        max_seq_len: Time dimension of the input placeholders.
        input_size: Feature dimension of each frame.
        rnn_size: Decoder GRU size; each encoder direction uses half of it.
        batch_size: Default value of the `batch_size` placeholder.
        lr: Initial learning rate of the prediction optimizer.
        train_keep_prob: Not used in this constructor.
        decay_rate: Multiplier applied by the two decay ops.
        lambda_a: Not used in this constructor.
        lambda_z: Not used in this constructor.
        df_size: Stored as `self.df_size`.
        num_class: Width of the one-hot label placeholder.
        class_lr: Initial learning rate of the classifier optimizer.
        dtype: dtype of the input placeholders and learning-rate variables.
    """
    self.max_seq_len = max_seq_len
    self.rnn_size = rnn_size
    self.df_size = df_size
    self.batch_size = tf.placeholder_with_default(batch_size, shape=())
    self.input_size = input_size
    self.class_lr = tf.Variable(float(class_lr), trainable=False,
                                dtype=dtype)
    self.lr = tf.Variable(float(lr), trainable=False, dtype=dtype)
    self.learning_rate_decay_op = self.lr.assign(self.lr * decay_rate)
    # Decay the classifier rate in place. Assigning the product to self.lr
    # instead would overwrite the prediction learning rate.
    self.cls_lr_decay = self.class_lr.assign(self.class_lr * decay_rate)
    self.keep_prob = tf.placeholder_with_default(1.0, shape=())
    self.global_step = tf.Variable(0, trainable=False)

    with tf.variable_scope("prediction"):
        with tf.variable_scope("inputs"):
            self.enc_in = tf.placeholder(
                dtype, shape=[None, self.max_seq_len, input_size],
                name='enc_in')
            self.dec_in = tf.placeholder(
                dtype, shape=[None, self.max_seq_len, input_size],
                name='dec_in')
            # Give this placeholder its own name instead of a second
            # 'dec_in'.
            self.dec_rel = tf.placeholder(
                dtype, shape=[None, self.max_seq_len, input_size],
                name='dec_rel')
            self.seq_len = tf.placeholder(tf.int32, [None])
            self.label = tf.placeholder(tf.float32, shape=[None, num_class],
                                        name='labels')
            # 1.0 for target frames with any non-zero feature, else 0.0.
            mask = tf.sign(
                tf.reduce_max(tf.abs(self.enc_in[:, 1:, :]), 2))

        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            cell_fw = [
                tf.nn.rnn_cell.GRUCell(self.rnn_size // 2) for _ in range(3)
            ]
            cell_bw = [
                tf.nn.rnn_cell.GRUCell(self.rnn_size // 2) for _ in range(3)
            ]
            ref_outputs, ref_fw_state, ref_bw_state = \
                tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, self.enc_in, dtype=tf.float32,
                    sequence_length=self.seq_len)
            self.encoder_all_states = ref_outputs
            self.ref_concat = tf.keras.layers.concatenate(
                [ref_fw_state[-1], ref_bw_state[-1]], axis=1)
            self.ref_final_state = self.ref_concat

        pred_cell = tf.nn.rnn_cell.GRUCell(self.rnn_size)
        cell_ = LinearSpaceDecoderWrapper(pred_cell, self.input_size)
        cell = ResidualWrapper(cell_)

        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):

            def loop_fn(time, cell_output, cell_state, loop_state):
                """Control the decoder input and emitted output per step.

                :param time: current time step
                :param cell_output: output from the previous time step, or
                    None on the initial call (time == 0)
                :param cell_state: cell state from the previous time step
                :param loop_state: unused custom loop state
                :return: tuple `(elements_finished, next_input,
                    next_cell_state, emit_output, next_loop_state)` as
                    required by `tf.nn.raw_rnn`
                """
                # The cell state is pinned to the encoder's final state on
                # every step rather than carried forward from cell_state.
                next_cell_state = self.ref_final_state
                input_original = self.enc_in[:, 0, :]

                if cell_output is None:
                    # Initial call: this value only tells raw_rnn what the
                    # later emits look like.
                    emit_output = tf.zeros([self.input_size])
                    # Seed the decoder with the first input frame.
                    next_in = input_original
                else:
                    # cell_output is the output from step time - 1; it is
                    # emitted unchanged and fed back as the next input.
                    emit_output = cell_output
                    next_in = cell_output

                # Check which elements are finished.
                elements_finished = (time >= self.seq_len - 1)
                finished = tf.reduce_all(elements_finished)

                # Once the whole batch is finished, feed zeros.
                next_input = tf.cond(
                    finished,
                    lambda: tf.zeros([self.batch_size, self.input_size],
                                     dtype=tf.float32),
                    lambda: next_in)
                # Set the shape manually; tf.cond leaves it undefined.
                next_input.set_shape([None, self.input_size])

                next_loop_state = None
                return (elements_finished, next_input, next_cell_state,
                        emit_output, next_loop_state)

            outputs_ta, dec_final_state, _ = tf.nn.raw_rnn(cell, loop_fn)
            dec_outputs = _transpose_batch_time(outputs_ta.stack())

        with tf.variable_scope("pred_fc", reuse=tf.AUTO_REUSE):
            self.pred_skel = dec_outputs

            # Masked L1 error per frame, normalised by the number of
            # unmasked frames of each sequence.
            loss_l2 = tf.reduce_sum(
                tf.abs(self.pred_skel - self.enc_in[:, 1:, :]), 2) * mask
            loss_l2 = tf.reduce_sum(loss_l2, axis=1)
            loss_l2 /= tf.reduce_sum(mask, 1)
            self.loss_pred = tf.reduce_mean(loss_l2)

            self.loss = self.loss_pred
            self.pred_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, "prediction")
            opt = tf.train.AdamOptimizer(self.lr)
            # NOTE: this rebinds self.pred_vars to the variables returned by
            # compute_gradients.
            gradients, self.pred_vars = zip(
                *opt.compute_gradients(self.loss))
            clipped_gradients, norm = tf.clip_by_global_norm(gradients, 25)
            self.gradient_norms = norm
            self.updates = opt.apply_gradients(
                zip(clipped_gradients, self.pred_vars),
                global_step=self.global_step)

    with tf.variable_scope("classifier"):
        logits = self.Classifier(self.ref_final_state)
        self.cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=logits, labels=self.label),
            name='cost')

        # Use the learning-rate variable (same initial value as the raw
        # float) so that running cls_lr_decay actually affects training.
        optimizer = tf.train.AdamOptimizer(self.class_lr)

        self.encoder_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, "prediction/encoder")
        self.classifier_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, "classifier")
        self.classification_vars = self.encoder_vars + self.classifier_vars

        # Fine-tune updates encoder + classifier; fixed updates only the
        # classifier.
        self.train_finetune = optimizer.minimize(
            self.cost, var_list=self.classification_vars)
        self.train_fixed = optimizer.minimize(
            self.cost, var_list=self.classifier_vars)

        correct_pred = tf.equal(tf.argmax(logits, 1),
                                tf.argmax(self.label, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32),
                                       name='accuracy')
        self.pred_label = tf.argmax(logits, 1)

    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
def build_generator(z_prior,
                    embeddings,
                    num_classes,
                    hidden_layer_size,
                    embedding_size,
                    z_prior_size,
                    max_sentence_length,
                    real_sentences=None,
                    after_sentence_id=None):
    """Build the LSTM sentence generator on top of ``tf.nn.raw_rnn``.

    The initial hidden state is a tanh projection of ``z_prior``. At every
    step the hidden state is projected to ``num_classes`` logits; the word id
    is their argmax, and the next cell input is the "soft-argmax" embedding
    ``softmax(L * logits) @ embeddings`` so that gradients can flow back to
    the projection weights (a hard argmax has no gradient).

    Args:
        z_prior: latent codes; the first dimension is taken as the batch size.
        embeddings: embedding matrix that the class probabilities are
            multiplied with to obtain the next input word.
        num_classes: number of output classes (size of the logits).
        hidden_layer_size: number of units of the LSTM cell.
        embedding_size: size of one word embedding; used for the emit
            structure handed to ``raw_rnn``.
        z_prior_size: not used by this function; kept for interface
            compatibility.
        max_sentence_length: number of steps to generate.
        real_sentences: if not None, each sentence in real_sentences is the
            sentence which generated the corresponding entry in z_prior.
            It is used for pretraining. Shape: [batch_size, sentence_length];
            each entry is a word id, not a word embedding.
        after_sentence_id: must not be None if real_sentences is not None.
            Positions whose true word equals this id contribute a negative
            log probability of 0.

    Returns:
        A tuple ``(word_ids, words, neg_log_probability)``; each element is
        the per-step emit stacked over time and passed through
        ``_transpose_batch_time``. Without ``real_sentences`` the last
        element is all zeros.
    """
    with tf.variable_scope('generator'):
        batch_size = tf.shape(z_prior)[0]

        cell = tf.nn.rnn_cell.LSTMCell(hidden_layer_size, state_is_tuple=True)
        init_state = cell.zero_state(batch_size, tf.float32)

        # Batch indices 0..batch_size-1, paired with the true word ids below
        # to pick one softmax entry per sentence (see Gan 2016, section 2.1).
        increasing = tf.range(
            start=0,
            limit=tf.cast(batch_size, tf.int64),
            delta=1,
            dtype=tf.int64)

        def loop_fn(time, cell_output, cell_state, loop_state):
            if cell_output is None:
                # time == 0: everything here is used for initialization only.
                with tf.variable_scope('C', reuse=tf.AUTO_REUSE):
                    h1 = tf.layers.dense(
                        z_prior,
                        hidden_layer_size,
                        activation=tf.tanh,
                        kernel_regularizer=None,  # TODO
                        bias_regularizer=None)
                next_cell_state = tf.contrib.rnn.LSTMStateTuple(
                    c=init_state.c, h=h1)
                h = h1
                # Tells raw_rnn what a single batch item emits per step:
                # (word id, soft-argmax word embedding, negative log prob).
                emit_output = (
                    tf.zeros([], dtype=tf.int64),
                    tf.zeros([embedding_size], dtype=tf.float32),
                    tf.zeros([], dtype=tf.float32))
            else:
                # Emit what the previous call computed and stored in the
                # loop state.
                emit_output = loop_state
                next_cell_state = cell_state
                h = next_cell_state.h

            with tf.variable_scope('V', reuse=tf.AUTO_REUSE):
                mul = tf.layers.dense(
                    h,
                    num_classes,
                    activation=None,
                    kernel_regularizer=None,  # TODO
                    bias_regularizer=None)
            next_word_id = tf.argmax(mul, axis=1)
            # Soft-argmax (Zhang, section 2.5). ``L`` is not defined in this
            # function; it has to be provided at module level.
            next_word = tf.matmul(tf.nn.softmax(L * mul, axis=1), embeddings)

            # TODO this should be improved
            elements_finished = (time >= max_sentence_length)

            if real_sentences is not None:
                # For each sentence, take the negative log probability of the
                # ACTUAL word that should have been generated; the sum of
                # these forms the pretraining objective (Gan 2016).
                # The cond only avoids an out-of-range column index on the
                # final call, whose result is not used.
                gather_indices = tf.stack(
                    [
                        increasing,
                        real_sentences[:, tf.cond(time < max_sentence_length,
                                                  lambda: time,
                                                  lambda: 0)]
                    ],
                    axis=1)
                sm = tf.nn.softmax(mul)
                masked = tf.gather_nd(params=sm, indices=gather_indices)
                # Only keep probabilities of valid words; use 1 elsewhere so
                # the sum of logs is not affected.
                masked = tf.where(
                    tf.not_equal(gather_indices[:, 1], after_sentence_id),
                    masked,
                    tf.ones([batch_size], dtype=tf.float32))
                # A zero probability gives -log(0) = inf. Replacing the inf
                # with tf.where only AFTER the log keeps the forward value
                # finite but still back-propagates 0 * inf = NaN. Feeding a
                # safe value into the log as well keeps the forward result
                # identical (1e2 for zero probabilities) with finite
                # gradients.
                is_zero = tf.equal(masked, 0.0)
                safe_masked = tf.where(is_zero, tf.ones_like(masked), masked)
                replace = tf.ones_like(masked) * tf.constant(1e2)
                neg_log_probability = tf.where(
                    is_zero, replace, -tf.log(safe_masked))
                next_loop_state = (next_word_id, next_word,
                                   neg_log_probability)
            else:
                next_loop_state = (next_word_id, next_word,
                                   tf.zeros([batch_size], dtype=tf.float32))

            return (elements_finished, next_word, next_cell_state,
                    emit_output, next_loop_state)

        emit_ta, _, _ = tf.nn.raw_rnn(cell, loop_fn)
        word_ids, words, neg_log_probability_ta = emit_ta
        out_log_prob = _transpose_batch_time(neg_log_probability_ta.stack())
        # must transpose first two dimensions from [sentence_length, batch_size]
        # to [batch_size, sentence_length]
        return (_transpose_batch_time(word_ids.stack()),
                _transpose_batch_time(words.stack()),
                out_log_prob)
def forward(self, x, keep_prob):
    """Encode ``x`` with an RNN and decode a reconstruction from its state.

    x -- input features, [batch_size, n_seg, n_input]
    keep_prob -- keep probability of the encoder's input dropout

    Nothing is returned. The encoder output at the last time step is stored
    in ``self.hidden`` and the reconstruction, reshaped to
    ``[-1, n_seg, n_input]``, in ``self.x_recon``.
    """
    # Optionally reverse the sequence (claimed to be useful for NMT).
    features = x[:, ::-1, :] if self.reverse else x

    n_batch = tf.shape(features)[0]
    step_counts = tf.ones((n_batch, ), dtype='int32') * self.n_seg

    def zero_input():
        # All-zero decoder input for one time step.
        return tf.zeros([n_batch, self.n_input], dtype=tf.float32)

    ###################### Encoder ###################
    embedded = tf.nn.xw_plus_b(
        tf.reshape(features, [-1, self.n_input]), self.W_encode,
        self.b_encode)
    embedded = tf.reshape(
        tf.nn.relu(embedded), [-1, self.n_seg, self.emb_dim])

    # Only input dropout is used on the encoder cell.
    encoder_cell = tf.contrib.rnn.DropoutWrapper(
        self.encoder_cell, input_keep_prob=keep_prob)
    enc_outputs, enc_state = tf.nn.dynamic_rnn(
        encoder_cell,
        embedded,
        step_counts,
        dtype=tf.float32,
        scope="Seq2seqTSN/encoder")
    self.hidden = enc_outputs[:, -1]

    ###################### Decoder ###################
    def loop_fn(time, cell_output, cell_state, loop_state):
        # The first call has no state yet: start from the encoder state.
        state = enc_state if cell_state is None else cell_state
        done = (time >= step_counts)
        # Un-conditioned decoder: the input is zeros at every step, whether
        # or not all sequences are finished. (The conditioned variant would
        # feed tf.nn.xw_plus_b(cell_output, self.W_ho, self.b_o) instead.)
        step_input = tf.cond(tf.reduce_all(done), zero_input, zero_input)
        return (done, step_input, state, cell_output, None)

    decoded_ta, _, _ = tf.nn.raw_rnn(
        self.decoder_cell, loop_fn, scope="Seq2seqTSN/decoder")
    # [batch_size, time, output_dim] after the transpose.
    decoded = _transpose_batch_time(decoded_ta.stack())
    decoded = tf.reshape(decoded, [-1, self.emb_dim])

    hidden_dec = tf.nn.relu(
        tf.nn.xw_plus_b(decoded, self.W_decode1, self.b_decode1))
    # The output projection reuses the transposed encoder weights.
    recon = tf.nn.xw_plus_b(hidden_dec, tf.transpose(self.W_encode),
                            self.b_decode2)
    self.x_recon = tf.reshape(recon, [-1, self.n_seg, self.n_input])
# Turn the flat action vector into a single-column 2-D array.
act_direct = act_direct.reshape((act_direct.shape[0], 1))
act_direct.shape  # to make it (30000,1); bare expression, only displays in a notebook

# Load the true latent data (from the Phase 1 model with rank 2).
masterlatent = np.genfromtxt('C:\\deep_SSM\\envstate_rank2.csv', delimiter=',')
masterlatent = np.array(masterlatent, dtype='float32')
masterlatent.shape

# Global variables
pre_seq_length = 20  # original trajectory length of the training data
seq_length = 19  # effective length used for modelling phase 2 is 20-1 (the first opponent action is assumed given)
num_seq = 1500  # number of sequences/trajectories in the training data

# Prepare the shape of the training data: split into trajectories, then swap
# the first two axes with _transpose_batch_time (presumably TensorFlow's
# batch/time swap helper -- confirm where it is imported).
temp_action = tf.reshape(act_direct, shape=(num_seq, pre_seq_length, -1))
temp_action = _transpose_batch_time(temp_action)
# Actions: keep the LAST seq_length entries along the first axis.
action = temp_action[-seq_length:, :, :]

temp_latent = tf.reshape(masterlatent, shape=(num_seq, pre_seq_length, -1))
temp_latent = _transpose_batch_time(temp_latent)
# Latents: keep the FIRST seq_length entries along the first axis.
latent = temp_latent[:seq_length, :, :]

# Concatenate actions and latents along the last axis.
train_data = tf.concat([action, latent], -1)
train_data.shape


# Define a class for the model (simple: rank 0); in fact it is also an RNN cell.
class SSM_phase2(tf.keras.Model):
    def __init__(self, latent_dim=2, emission_dim=1, phase1latent_dim=4):
        # NOTE(review): emission_dim and phase1latent_dim are accepted but
        # not stored in the part of the class visible here.
        super(SSM_phase2, self).__init__()
        self.latent_dim = latent_dim
def _build_net(self):
    """Build the whole graph under the variable scope ``self.name``.

    Creates the input placeholders, a shared RNN driven by ``tf.nn.raw_rnn``
    whose loop function also records attention scores and hidden states,
    the attention-weighted context vector, one fully-connected sub-network
    per event, the softmax output ``self.out``, and finally the losses and
    the two Adam solvers (``self.solver`` and ``self.solver_burn_in``).

    Nothing is returned; all results are stored as attributes on ``self``.
    """
    with tf.variable_scope(self.name):
        #### PLACEHOLDER DECLARATION
        self.mb_size = tf.placeholder(tf.int32, [], name='batch_size')

        self.lr_rate = tf.placeholder(tf.float32)
        self.keep_prob = tf.placeholder(tf.float32)  #keeping rate
        # Weights of LOSS_1, LOSS_2 and LOSS_3 in LOSS_TOTAL (see the end of
        # this function).
        self.a = tf.placeholder(tf.float32)
        self.b = tf.placeholder(tf.float32)
        self.c = tf.placeholder(tf.float32)

        self.x = tf.placeholder(tf.float32, shape=[None, self.max_length, self.x_dim])
        self.x_mi = tf.placeholder(
            tf.float32, shape=[None, self.max_length, self.x_dim]
        )  #this is the missing indicator (including for cont. & binary) (includes delta)
        self.k = tf.placeholder(
            tf.float32, shape=[None, 1])  #event/censoring label (censoring:0)
        self.t = tf.placeholder(tf.float32, shape=[None, 1])

        self.fc_mask1 = tf.placeholder(
            tf.float32,
            shape=[None, self.num_Event, self.num_Category])  #for denominator
        self.fc_mask2 = tf.placeholder(
            tf.float32,
            shape=[None, self.num_Event, self.num_Category])  #for Loss 1
        self.fc_mask3 = tf.placeholder(tf.float32, shape=[None, self.num_Category ])  #for Loss 2

        seq_length = get_seq_length(self.x)
        # [1, max_length] row of time indices, compared against
        # (seq_length - 1) to build the masks below.
        tmp_range = tf.expand_dims(tf.range(0, self.max_length, 1), axis=0)

        # rnn_mask1: 1 for every time index <= seq_length - 1.
        self.rnn_mask1 = tf.cast(
            tf.less_equal(tmp_range, tf.expand_dims(seq_length - 1, axis=1)),
            tf.float32)
        # rnn_mask2: 1 only at time index == seq_length - 1 (the last
        # measurement).
        self.rnn_mask2 = tf.cast(
            tf.equal(tmp_range, tf.expand_dims(seq_length - 1, axis=1)),
            tf.float32)

        ### DEFINE LOOP FUNCTION FOR RAW_RNN w/ TEMPORAL ATTENTION
        # NOTE: this closure reads `cell`, `loop_state_ta`, `all_last` and
        # `inputs_ta`, which are only assigned further down; they exist by
        # the time tf.nn.raw_rnn calls it.
        def loop_fn_att(time, cell_output, cell_state, loop_state):
            emit_output = cell_output

            if cell_output is None:  # time == 0
                next_cell_state = cell.zero_state(self.mb_size, tf.float32)
                next_loop_state = loop_state_ta
            else:
                next_cell_state = cell_state
                tmp_h = utils.create_concat_state(next_cell_state, self.num_layers_RNN, self.RNN_type)

                # Attention score from the current state and the last
                # measurement; exponentiated here, normalized after the loop.
                e = utils.create_FCNet(tf.concat([tmp_h, all_last], axis=1), self.num_layers_ATT, self.h_dim2, tf.nn.tanh, 1, None, self.initial_W, keep_prob=self.keep_prob)
                e = tf.exp(e)

                next_loop_state = (
                    loop_state[0].write(time - 1, e),  # save att power (e_{j})
                    loop_state[1].write(time - 1, tmp_h)
                )  # save all the hidden states

            # elements_finished = (time >= seq_length)
            elements_finished = (time >= self.max_length - 1)

            #this gives the break-point (no more recurrence after the max_length)
            finished = tf.reduce_all(elements_finished)
            next_input = tf.cond(
                finished,
                lambda: tf.zeros([self.mb_size, 2 * self.x_dim], dtype=tf.float32),  # [x_hist, mi_hist]
                lambda: inputs_ta.read(time))
            return (elements_finished, next_input, next_cell_state, emit_output, next_loop_state)

        # divide into the last x and previous x's
        # NOTE: the next two assignments to x_last are overwritten by the
        # masked reduce_sum below; only the later value is used.
        x_last = tf.slice(self.x, [0, (self.max_length - 1), 1], [-1, -1, -1])  #current measurement
        x_last = tf.reshape(x_last, [-1, (self.x_dim_cont + self.x_dim_bin) ])  #remove the delta of the last measurement

        x_last = tf.reduce_sum(
            tf.tile(tf.expand_dims(self.rnn_mask2, axis=2), [1, 1, self.x_dim]) * self.x,
            reduction_indices=1
        )  #sum over time since all others time stamps are 0
        x_last = tf.slice(
            x_last, [0, 1], [-1, -1])  #remove the delta of the last measurement
        x_hist = self.x * (
            1. - tf.tile(tf.expand_dims(self.rnn_mask2, axis=2), [1, 1, self.x_dim])
        )  #since all others time stamps are 0 and measurements are 0-padded
        x_hist = tf.slice(x_hist, [0, 0, 0], [-1, (self.max_length - 1), -1])

        # do same thing for missing indicator
        # NOTE: as for x_last, the first two mi_last assignments are
        # overwritten by the masked reduce_sum below.
        mi_last = tf.slice(self.x_mi, [0, (self.max_length - 1), 1], [-1, -1, -1])  #current measurement
        mi_last = tf.reshape(mi_last, [-1, (self.x_dim_cont + self.x_dim_bin) ])  #remove the delta of the last measurement

        mi_last = tf.reduce_sum(
            tf.tile(tf.expand_dims(self.rnn_mask2, axis=2), [1, 1, self.x_dim]) * self.x_mi,
            reduction_indices=1
        )  #sum over time since all others time stamps are 0
        mi_last = tf.slice(
            mi_last, [0, 1], [-1, -1])  #remove the delta of the last measurement
        mi_hist = self.x_mi * (
            1. - tf.tile(tf.expand_dims(self.rnn_mask2, axis=2), [1, 1, self.x_dim])
        )  #since all others time stamps are 0 and measurements are 0-padded
        mi_hist = tf.slice(mi_hist, [0, 0, 0], [-1, (self.max_length - 1), -1])

        all_hist = tf.concat([x_hist, mi_hist], axis=2)
        all_last = tf.concat([x_last, mi_last], axis=1)

        #extract inputs for the temporal attention: mask (to incorporate only the measured time) and x_{M}
        # NOTE: seq_length is recomputed here from the history only; the
        # value is not read again in this function.
        seq_length = get_seq_length(x_hist)
        rnn_mask_att = tf.cast(
            tf.not_equal(tf.reduce_sum(x_hist, reduction_indices=2), 0),
            dtype=tf.float32
        )  #[mb_size, max_length-1], 1:measurements 0:no measurements

        ##### SHARED SUBNETWORK: RNN w/ TEMPORAL ATTENTION
        #change the input tensor to TensorArray format with [max_length, mb_size, x_dim]
        inputs_ta = tf.TensorArray(dtype=tf.float32, size=self.max_length - 1).unstack(
            _transpose_batch_time(all_hist), name='Shared_Input')

        #create a cell with RNN hyper-parameters (RNN types, #layers, #nodes, activation functions, keep probability)
        cell = utils.create_rnn_cell(self.h_dim1, self.num_layers_RNN, self.keep_prob, self.RNN_type, self.RNN_active_fn)

        #define the loop_state TensorArray for information from rnn time steps
        loop_state_ta = (
            tf.TensorArray(size=self.max_length - 1, dtype=tf.float32),  #e values (e_{j})
            tf.TensorArray(size=self.max_length - 1, dtype=tf.float32))  #hidden states (h_{j})

        rnn_outputs_ta, self.rnn_final_state, loop_state_ta = tf.nn.raw_rnn(
            cell, loop_fn_att)
        #rnn_outputs_ta : TensorArray
        #rnn_final_state : Tensor
        #rnn_states_ta : (TensorArray, TensorArray)

        rnn_outputs = _transpose_batch_time(rnn_outputs_ta.stack())
        # rnn_outputs = tf.reshape(rnn_outputs, [-1, self.max_length-1, self.h_dim1])

        rnn_states = _transpose_batch_time(loop_state_ta[1].stack())

        att_weight = _transpose_batch_time(
            loop_state_ta[0].stack())  #e_{j}
        att_weight = tf.reshape(att_weight, [
            -1, self.max_length - 1
        ]) * rnn_mask_att  # masking to set 0 for the unmeasured e_{j}

        #get a_{j} = e_{j}/sum_{l=1}^{M-1}e_{l}
        # `div` and `_EPSILON` are defined outside this function; _EPSILON is
        # added to the denominator before dividing.
        self.att_weight = div(
            att_weight,
            (tf.reduce_sum(att_weight, axis=1, keepdims=True) + _EPSILON))  #softmax (tf.exp is done, previously)

        # 1) expand att_weight to hidden state dimension, 2) c = \sum_{j=1}^{M} a_{j} x h_{j}
        self.context_vec = tf.reduce_sum(tf.tile(
            tf.reshape(self.att_weight, [-1, self.max_length - 1, 1]),
            [1, 1, self.num_layers_RNN * self.h_dim1]) * rnn_states,
                                         axis=1)

        # Mean and (exponentiated) std heads on the RNN outputs; self.z is
        # drawn as mean + std * standard normal noise.
        self.z_mean = FC_Net(rnn_outputs, self.x_dim, activation_fn=None, weights_initializer=self.initial_W, scope="RNN_out_mean1")
        self.z_std = tf.exp(
            FC_Net(rnn_outputs, self.x_dim, activation_fn=None, weights_initializer=self.initial_W, scope="RNN_out_std1"))

        epsilon = tf.random_normal(
            [self.mb_size, self.max_length - 1, self.x_dim],
            mean=0.0,
            stddev=1.0,
            dtype=tf.float32)
        self.z = self.z_mean + self.z_std * epsilon

        ##### CS-SPECIFIC SUBNETWORK w/ FCNETS
        inputs = tf.concat([x_last, self.context_vec], axis=1)

        #1 layer for combining inputs
        h = FC_Net(inputs, self.h_dim2, activation_fn=self.FC_active_fn, weights_initializer=self.initial_W, scope="Layer1")
        h = tf.nn.dropout(h, keep_prob=self.keep_prob)

        # (num_layers_CS-1) layers for cause-specific (num_Event subNets)
        out = []
        for _ in range(self.num_Event):
            cs_out = utils.create_FCNet(h, (self.num_layers_CS), self.h_dim2, self.FC_active_fn, self.h_dim2, self.FC_active_fn, self.initial_W, self.reg_W, self.keep_prob)
            out.append(cs_out)
        out = tf.stack(out, axis=1)  # stack referenced on subject
        out = tf.reshape(out, [-1, self.num_Event * self.h_dim2])
        out = tf.nn.dropout(out, keep_prob=self.keep_prob)

        # One softmax over all num_Event * num_Category outputs, then
        # reshaped to [-1, num_Event, num_Category].
        out = FC_Net(out, self.num_Event * self.num_Category, activation_fn=tf.nn.softmax, weights_initializer=self.initial_W, weights_regularizer=self.reg_W_out, scope="Output")

        self.out = tf.reshape(out, [-1, self.num_Event, self.num_Category])

        ##### GET LOSS FUNCTIONS
        # NOTE(review): these methods presumably set self.LOSS_1 / LOSS_2 /
        # LOSS_3, which are read right below -- confirm in their definitions.
        self.loss_Log_Likelihood()  #get loss1: Log-Likelihood loss
        self.loss_Ranking()  #get loss2: Ranking loss
        self.loss_RNN_Prediction()  #get loss3: RNN prediction loss

        self.LOSS_TOTAL = self.a * self.LOSS_1 + self.b * self.LOSS_2 + self.c * self.LOSS_3 + tf.losses.get_regularization_loss(
        )
        # Burn-in objective: RNN prediction loss plus regularization only.
        self.LOSS_BURNIN = self.LOSS_3 + tf.losses.get_regularization_loss(
        )

        self.solver = tf.train.AdamOptimizer(
            learning_rate=self.lr_rate).minimize(self.LOSS_TOTAL)
        self.solver_burn_in = tf.train.AdamOptimizer(
            learning_rate=self.lr_rate).minimize(self.LOSS_BURNIN)
def __init__(
        self,
        architecture,
        max_seq_len,
        human_size,
        rnn_size,  # hidden recurrent layer size
        num_layers,
        max_gradient_norm,
        stddev,
        batch_size,
        learning_rate,
        learning_rate_decay_factor,
        summaries_dir,
        loss_to_use,
        number_of_actions,
        one_hot=True,
        residual_velocities=False,
        dtype=tf.float32):
    """Create the model.

    Args:
      architecture: only "basic" builds the recurrent network in this
        constructor.
      max_seq_len: number of time steps of the ground truth; the input
        placeholder holds ``max_seq_len + 1`` steps.
      human_size: size of one pose vector.
      rnn_size: number of units in the rnn.
      num_layers: number of rnns to stack.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      stddev: standard deviation of the Gaussian noise added to the inputs
        while ``self.is_training`` is fed as True.
      batch_size: the size of the batches used during training; stored on
        the instance. The graph itself takes the batch size from the inputs.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when
        ``self.learning_rate_decay_op`` is run.
      summaries_dir: where to log progress for tensorboard.
      loss_to_use: [supervised, sampling_based]. Not read by this
        constructor; kept for interface compatibility.
      number_of_actions: number of classes we have.
      one_hot: whether to use one_hot encoding during train/test (sup models).
      residual_velocities: whether to use a residual connection that models
        velocities.
      dtype: the data type to use to store internal variables.
    """
    self.HUMAN_SIZE = human_size
    self.input_size = self.HUMAN_SIZE + number_of_actions if one_hot else self.HUMAN_SIZE

    print("One hot is ", one_hot)
    print("Input size is %d" % self.input_size)

    # Summary writers for train and test runs
    self.train_writer = tf.summary.FileWriter(
        os.path.normpath(os.path.join(summaries_dir, 'train')))
    self.test_writer = tf.summary.FileWriter(
        os.path.normpath(os.path.join(summaries_dir, 'test')))

    self.max_seq_len = max_seq_len
    self.rnn_size = rnn_size
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=dtype)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # === Create the RNN that will keep the state ===
    print('rnn_size = {0}'.format(rnn_size))
    cell = tf.contrib.rnn.GRUCell(self.rnn_size)

    if num_layers > 1:
        cell = tf.contrib.rnn.MultiRNNCell([
            tf.contrib.rnn.GRUCell(self.rnn_size) for _ in range(num_layers)
        ])

    # === Transform the inputs ===
    with tf.name_scope("inputs_gts"):
        inputs = tf.placeholder(
            dtype,
            shape=[None, self.max_seq_len + 1, self.input_size],
            name="inputs")
        gts = tf.placeholder(
            dtype,
            shape=[None, self.max_seq_len, self.input_size],
            name="gts")
        seq_len = tf.placeholder(tf.int32, shape=[None], name="seq_len")

        self.inputs = inputs
        self.gts = gts
        self.seq_len = seq_len
        '''
        inputs = tf.transpose(inputs, [1, 0, 2])
        gts = tf.transpose(gts, [1, 0, 2])

        inputs = tf.reshape(inputs, [-1, self.input_size])
        gts = tf.reshape(gts, [-1, self.input_size])

        inputs = tf.split(inputs, self.max_seq_len, axis=0)
        gts = tf.split(gts, self.max_seq_len, axis=0)
        '''
        # From here on both tensors are indexed by time on the first axis.
        inputs = _transpose_batch_time(inputs)
        gts = _transpose_batch_time(gts)

    # === Add space decoder ===
    cell = rnn_cell_extensions.LinearSpaceDecoderWrapper(
        cell, self.input_size)

    # Finally, wrap everything in a residual layer if we want to model velocities
    if residual_velocities:
        cell = rnn_cell_extensions.ResidualWrapper(cell)

    # Store the outputs here
    outputs = []

    self.stddev = stddev

    def addGN(inputs):
        # Add zero-mean Gaussian noise with standard deviation self.stddev.
        noise = tf.random_normal(shape=tf.shape(inputs),
                                 mean=0.0,
                                 stddev=self.stddev,
                                 dtype=tf.float32)
        return inputs + noise

    self.is_training = tf.placeholder(dtype=tf.bool)

    # Build the RNN
    if architecture == "basic":
        # Trainable initial state and initial "previous output".
        # NOTE(review): np.zeros([1, cell.state_size]) needs an integer
        # state_size, and the width 63 is hard-coded -- confirm both match
        # the configurations this model is used with.
        cell_init_state = tf.Variable(np.zeros([1, cell.state_size]),
                                      trainable=True,
                                      dtype=tf.float32)
        init_input = tf.Variable(np.zeros([63]),
                                 trainable=True,
                                 dtype=tf.float32)
        output_ta = tf.TensorArray(size=self.max_seq_len, dtype=tf.float32)

        def loop_fn(time, cell_output, cell_state, loop_state):
            emit_output = cell_output
            if cell_output is None:
                # time == 0: tile the learned initial state/input over the
                # batch and pair the input with the first frame.
                #next_cell_state = cell.zero_state(self.batch_size, tf.float32)
                next_cell_state = tf.tile(cell_init_state,
                                          [tf.shape(inputs[0])[0], 1])
                next_input = tf.cond(
                    self.is_training, lambda: tf.concat([
                        tf.tile(tf.expand_dims(init_input, 0),
                                [tf.shape(inputs[0])[0], 1]),
                        addGN(inputs[time])
                    ],
                                                        axis=1),
                    lambda: tf.concat([
                        tf.tile(tf.expand_dims(init_input, 0),
                                [tf.shape(inputs[0])[0], 1]), inputs[time]
                    ],
                                      axis=1))
                next_loop_state = output_ta
            else:
                # Feed the previous output back together with the current
                # frame (noisy while training) and record the output.
                next_cell_state = cell_state
                next_input = tf.cond(
                    self.is_training, lambda: tf.concat(
                        [cell_output, addGN(inputs[time])], axis=1),
                    lambda: tf.concat([cell_output, inputs[time]], axis=1))
                next_loop_state = loop_state.write(time - 1, cell_output)

            finished = (time > self.max_seq_len - 1)
            #finished = False
            return (finished, next_input, next_cell_state, emit_output,
                    next_loop_state)

        # Basic RNN does not have a loop function in its API, so copying here.
        with vs.variable_scope("raw_rnn"):
            _, _, loop_state_ta = tf.nn.raw_rnn(cell, loop_fn)
            #outputs = _transpose_batch_time(loop_state_ta.stack())
            # Outputs stay time-major, like the transposed `gts`.
            outputs = loop_state_ta.stack()

    self.outputs = outputs

    # Per-step validity masks, transposed to time-major and tiled over the
    # feature axis. mask2 is one step shorter for the frame differences.
    mask1 = tf.tile(
        tf.expand_dims(
            tf.transpose(
                tf.sequence_mask(self.seq_len,
                                 dtype=tf.float32,
                                 maxlen=self.max_seq_len)), -1),
        [1, 1, self.input_size])
    mask2 = tf.tile(
        tf.expand_dims(
            tf.transpose(
                tf.sequence_mask(self.seq_len - 1,
                                 dtype=tf.float32,
                                 maxlen=self.max_seq_len - 1)), -1),
        [1, 1, self.input_size])

    with tf.name_scope("loss_pos"):
        loss_pos = tf.reduce_mean(
            tf.square(
                tf.subtract(tf.multiply(outputs, mask1),
                            tf.multiply(gts, mask1))))
    with tf.name_scope("loss_smooth"):
        # Currently not part of self.loss (see the commented line below).
        loss_smooth = tf.reduce_mean(
            tf.square(
                tf.multiply(tf.subtract(outputs[1:], outputs[:-1]), mask2)))
    #self.loss = tf.add(loss_pos, loss_smooth*1000)
    self.loss = loss_pos
    self.loss_summary = tf.summary.scalar('loss/loss', self.loss)

    # Per-sample diagnostic: position error plus smoothness term, averaged
    # over the time and feature axes.
    self.loss_each_data = tf.reduce_mean(tf.square(tf.subtract(tf.multiply(gts,mask1), tf.multiply(outputs,mask1))), axis=[0,2]) \
        + tf.reduce_mean(tf.square(tf.multiply(tf.subtract( outputs[1:], outputs[:-1]),mask2)),axis=[0,2])

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()

    # Use the learning-rate VARIABLE, not the initial Python float, so that
    # running self.learning_rate_decay_op actually changes the step size.
    opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)

    # Update all the trainable parameters
    gradients = tf.gradients(self.loss, params)

    clipped_gradients, norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.gradient_norms = norm
    self.updates = opt.apply_gradients(zip(clipped_gradients, params),
                                       global_step=self.global_step)

    self.learning_rate_summary = tf.summary.scalar(
        'learning_rate/learning_rate', self.learning_rate)

    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)
def __call__(self, input, conditioned_lst, reuse=False):
    """
    Use this to construct tensorflow network graph.

    At each step the stacked LSTM either receives the ground-truth frame
    (when the matching entry of ``conditioned_lst`` is True) or its own
    previous output.

    :param input: tf.Placeholder
        the shape of tensor should be (batch_size, time_steps, feature_size)
    :param conditioned_lst: tf.Placeholder
        the shape of tensor should be (time_steps)
    :param reuse: Bool
        if reuse variable
    :return: network out tensor
        the shape of tensor should be [batch_size, time_steps, feature_size]
    """
    with tf.variable_scope(self.name) as scope:
        if reuse:
            scope.reuse_variables()

        cells = []
        for i in range(self.nb_lstm_layers):
            cells.append(
                tf.nn.rnn_cell.BasicLSTMCell(self.nb_lstm_units,
                                             name='lstm_{}'.format(i)))
        stacked_cell = tf.nn.rnn_cell.MultiRNNCell(cells)

        n_batch = self.batch_size
        zero_state = stacked_cell.zero_state(batch_size=n_batch,
                                             dtype=tf.float32)

        # raw_rnn expects time major inputs as TensorArrays
        n_steps = self.nb_time_steps
        frames_ta = tf.TensorArray(dtype=tf.float32,
                                   size=n_steps,
                                   clear_after_read=False,
                                   name='Inputs').unstack(
                                       _transpose_batch_time(input))

        # The model's output at each step has the same width as its input.
        feat_dim = input.get_shape()[-1].value

        use_truth_ta = tf.TensorArray(dtype=tf.bool,
                                      size=n_steps,
                                      clear_after_read=False,
                                      name='Conditioned').unstack(
                                          conditioned_lst)

        def loop_fn(time, cell_output, cell_state, loop_state):
            done_mask = (time >= n_steps)
            all_done = tf.reduce_all(done_mask)

            if cell_output is None:
                # First call: start from the zero state, feed the first
                # frame, and declare the per-item emit shape.
                return (done_mask, frames_ta.read(time), zero_state,
                        tf.zeros([feat_dim]), None)

            prediction = tf.layers.dense(cell_output,
                                         feat_dim,
                                         reuse=tf.AUTO_REUSE)
            if self.layer_norm:
                prediction = layers.layer_norm(prediction,
                                               center=True,
                                               scale=True)
            prediction = tf.nn.relu(prediction)

            def pick_input():
                # Ground truth when conditioned, otherwise own prediction.
                return tf.cond(use_truth_ta.read(time),
                               lambda: frames_ta.read(time),
                               lambda: prediction)

            step_input = tf.cond(
                all_done,
                lambda: tf.zeros([n_batch, feat_dim], dtype=tf.float32),
                pick_input)

            # loop state not used in this example
            return done_mask, step_input, cell_state, prediction, None

        emitted_ta, _, _ = tf.nn.raw_rnn(stacked_cell, loop_fn)
        return _transpose_batch_time(emitted_ta.stack())
def p2_encoder2_rnn(self, input_encoder, temperature, units_lstm, train,
                    hap_lens, reuse):
    """Run an LSTM with ``tf.nn.raw_rnn`` that samples one "ancestor" per step.

    At every step after the first, the cell output is projected with
    ``w_proj``/``b_proj``, scored against the ancestor table ``Anc`` via
    ``self.att_dot``, and a sample is drawn from the resulting distribution
    (relaxed one-hot when ``train`` is truthy, hard one-hot otherwise). The
    sample is concatenated with the current input and fed to the next step.

    Args:
        input_encoder: batch-major input; transposed with perm [1, 0, 2]
            before being indexed by time.
        temperature: temperature of the relaxed one-hot distribution.
        units_lstm: number of LSTM units.
        train: Python truth value selecting the sampling branch at graph
            construction time.
        hap_lens: compared with ``time`` to mark finished elements.
        reuse: ``reuse`` argument of the variable scopes.

    Returns:
        ``(X_sampled, dist_params, query_vecs)``: the stacked samples,
        distributions and projected queries, each passed through
        ``_transpose_batch_time``.

    NOTE(review): the many ``print`` calls are debugging output emitted while
    the graph is built.
    """
    with tf.variable_scope('enc_p2', reuse=reuse):
        # Ancestors
        Anc = tf.get_variable(name='Ancs',
                              shape=[self.len_ancs, self.dim_ancs])
        w_proj = tf.get_variable(shape=[units_lstm, self.dim_ancs],
                                 dtype=tf.float32,
                                 name='w_proj')
        b_proj = tf.get_variable(shape=[self.dim_ancs],
                                 dtype=tf.float32,
                                 name='b_proj')
        cell = tf.contrib.rnn.LSTMCell(units_lstm)
        inputs = tf.transpose(input_encoder, perm=[1, 0, 2])
        # Had to concat these zeros, kind of awkward, not sure why.
        # NOTE(review): presumably because loop_fn still indexes inputs at
        # `time` on its final call -- confirm.
        inputs = tf.concat([
            inputs,
            tf.zeros(
                [1, self.batch_size, tf.shape(inputs)[-1]], dtype=tf.float32)
        ],
                           axis=0)
        # Loop state: (samples, distributions, projected queries).
        output_ta = (tf.TensorArray(size=self.max_hap_len, dtype=tf.float32),
                     tf.TensorArray(size=self.max_hap_len, dtype=tf.float32),
                     tf.TensorArray(size=self.max_hap_len, dtype=tf.float32))
        #inputs_ta = tf.TensorArray(dynamic_size=False,dtype=tf.float32,size=self.max_hap_len,clear_after_read=False)
        #inputs_ta.unstack(inputs)
        print(input_encoder)
        print(output_ta)
        print(tf.transpose(input_encoder,
                           perm=[1, 0, 2]))  #take out when using placeholders
        print('here')

        def loop_fn(time, cell_output, cell_state, loop_state):
            print('cell_output {}'.format(cell_output))
            print('cell_state {}'.format(cell_state))
            #print(inputs_ta)
            emit_output = cell_output  # don't care about this one, only care about loop_state in this case because loop_state doesn't have to be same shape as rnn output
            if cell_output is None:  # time == 0
                print('here1')
                next_cell_state = cell.zero_state(self.batch_size, tf.float32)
                print('here2')
                print(time)
                # No sample exists yet: use zeros in its place.
                next_anc = tf.concat([
                    tf.zeros(shape=[self.batch_size, self.len_ancs],
                             dtype=tf.float32), inputs[[time]]
                ],
                                     axis=-1)  #inputs_ta.read(time) ], axis=-1)
                print('here2.5')
                print('here3')
                next_loop_state = output_ta
            else:
                print('here4')
                next_cell_state = cell_state
                # w_proj and b_proj are the tensors captured above; no
                # get_variable call happens inside this scope.
                with tf.variable_scope('enc_p2', reuse=True):
                    pre_next_anc1 = tf.nn.relu(
                        tf.matmul(cell_output, w_proj) + b_proj)
                print('here5')
                anc_distribution = self.att_dot(query=pre_next_anc1,
                                                values=Anc)
                if train:
                    # Relaxed one-hot sample.
                    dist = tf.contrib.distributions.ExpRelaxedOneHotCategorical(
                        temperature=temperature, probs=anc_distribution)
                    next_anc_sample = dist.sample()
                    #next_anc_sample = anc_distribution
                    next_anc = tf.concat([next_anc_sample, inputs[[time]]],
                                         axis=-1)
                else:
                    # Hard categorical sample, one-hot encoded.
                    dist = tf.contrib.distributions.Categorical(
                        probs=anc_distribution)
                    next_anc_sample = tf.cast(tf.one_hot(dist.sample(),
                                                         depth=self.len_ancs,
                                                         axis=-1),
                                              dtype=tf.float32)
                    #next_anc_sample=anc_distribution
                    next_anc = tf.concat([next_anc_sample, inputs[[time]]],
                                         axis=-1)
                print('ANC DIST {}'.format(anc_distribution))
                print('here6')
                print(next_anc_sample)
                # this is sent as input to the next iteration of the cell
                #inputs_ta.read(time)], axis=-1)
                print('here7')
                # output to store for the iteration
                next_loop_state = (loop_state[0].write(time - 1,
                                                       next_anc_sample),
                                   loop_state[1].write(time - 1,
                                                       anc_distribution),
                                   loop_state[2].write(time - 1,
                                                       pre_next_anc1))
            print('out_loop')
            # this gives us a vector in the size of the batch, telling us which elements have finished
            elements_finished = time >= hap_lens
            print(elements_finished)
            # because we are not interested in the state
            print('next_anc {}'.format(next_anc))
            print('next_cell {}'.format(next_cell_state))
            return (elements_finished, next_anc, next_cell_state, emit_output,
                    next_loop_state)

        with tf.variable_scope('state', reuse=reuse):
            # Only the loop state is kept; emitted outputs and the final
            # state are discarded.
            _, _, loop_state_ta = tf.nn.raw_rnn(cell, loop_fn)
        print('Anc_O {}'.format(_transpose_batch_time(
            loop_state_ta[0].stack())))
        print('params_O {}'.format(
            _transpose_batch_time(loop_state_ta[1].stack())))
        X_sampled = _transpose_batch_time(loop_state_ta[0].stack())
        dist_params = _transpose_batch_time(loop_state_ta[1].stack())
        query_vecs = _transpose_batch_time(loop_state_ta[2].stack())
        return X_sampled, dist_params, query_vecs
def dynamic_crnn(cell,
                 inputs,
                 gate_vector,
                 sequence_length=None,
                 initial_state=None,
                 dtype=None,
                 parallel_iterations=None,
                 swap_memory=False,
                 time_major=False,
                 scope=None):
    """Dynamically unroll `cell` over `inputs`, forwarding `gate_vector`.

    Validates the arguments, brings the inputs into time-major layout,
    prepares the initial state and then delegates the actual loop to
    `_dynamic_crnn_loop`.

    Args:
        cell: Object accepted by `_like_rnncell`.
        inputs: Tensor or nested structure of tensors. Batch-major unless
            `time_major` is true.
        gate_vector: Passed unchanged to `_dynamic_crnn_loop`.
        sequence_length: Optional lengths; cast to int32 and required to be
            of rank 1 when the rank is statically known.
        initial_state: Optional initial state. When omitted,
            `cell.zero_state(batch_size, dtype)` is used.
        dtype: Required when `initial_state` is not given.
        parallel_iterations: Forwarded to the loop; 32 when falsy.
        swap_memory: Forwarded to the loop.
        time_major: When false, inputs are transposed on the way in and
            outputs are transposed back on the way out.
        scope: Variable scope for the loop; defaults to "rnn".

    Returns:
        Tuple `(outputs, final_state)` as produced by `_dynamic_crnn_loop`,
        with `outputs` in the same layout as `inputs`.

    Raises:
        TypeError: If `cell` is rejected by `_like_rnncell`.
        ValueError: If `sequence_length` has a known rank other than 1, or
            if neither `initial_state` nor `dtype` is given.
    """
    if not _like_rnncell(cell):
        raise TypeError("cell must be an instance of RNNCell")

    batch_major = not time_major
    flat_inputs = nest.flatten(inputs)
    if batch_major:
        # (B,T,D) => (T,B,D); convert everything first, then transpose.
        as_tensors = [ops.convert_to_tensor(item) for item in flat_inputs]
        flat_inputs = tuple(
            rnn._transpose_batch_time(item) for item in as_tensors)

    if not parallel_iterations:
        parallel_iterations = 32

    has_lengths = sequence_length is not None
    if has_lengths:
        sequence_length = math_ops.to_int32(sequence_length)
        length_rank = sequence_length.get_shape().ndims
        if length_rank not in (None, 1):
            raise ValueError(
                "sequence_length must be a vector of length batch_size, "
                "but saw shape: %s" % sequence_length.get_shape())
        # Identity op only so the tensor is easy to find in the graph.
        sequence_length = array_ops.identity(sequence_length,
                                             name="sequence_length")

    with vs.variable_scope(scope or "rnn") as varscope:
        if varscope.caching_device is None:
            varscope.set_caching_device(lambda op: op.device)

        batch_size = rnn._best_effort_input_batch_size(flat_inputs)

        if initial_state is None:
            if not dtype:
                raise ValueError(
                    "If there is no initial_state, you must give a dtype.")
            state = cell.zero_state(batch_size, dtype)
        else:
            state = initial_state

        def _shape_check(tensor, expected):
            # Runtime assertion that `tensor` has exactly shape `expected`.
            actual_shape = array_ops.shape(tensor)
            expected_shape = array_ops.stack(expected)
            shapes_match = math_ops.reduce_all(
                math_ops.equal(actual_shape, expected_shape))
            return rnn.control_flow_ops.Assert(shapes_match, [
                "Expected shape for Tensor %s is " % tensor.name,
                expected_shape, " but saw shape: ", actual_shape
            ])

        if has_lengths:
            # Tie the shape validation to every later use of the lengths.
            length_check = _shape_check(sequence_length, [batch_size])
            with ops.control_dependencies([length_check]):
                sequence_length = array_ops.identity(sequence_length,
                                                     name="CheckSeqLen")

        inputs = nest.pack_sequence_as(structure=inputs,
                                       flat_sequence=flat_inputs)

        outputs, final_state = _dynamic_crnn_loop(
            cell,
            inputs,
            state,
            parallel_iterations=parallel_iterations,
            gate_vector=gate_vector,
            swap_memory=swap_memory,
            sequence_length=sequence_length,
            dtype=dtype)

        if batch_major:
            # (T,B,D) => (B,T,D)
            outputs = nest.map_structure(rnn._transpose_batch_time, outputs)

        return (outputs, final_state)
def sampling_rnn(self, cell, initial_state, input_, seq_lengths):
    """Unroll `cell` with `tf.nn.raw_rnn`, feeding each output back in.

    The input to every step is the external input for that step concatenated
    (axis 1) with the previous emitted output. On the very first step there
    is no previous output, so zeros of width `self.k` are used instead.

    Args:
        cell: RNN cell handed to `tf.nn.raw_rnn`.
        initial_state: State used for the first cell call.
        input_: External inputs; passed through `_transpose_batch_time`
            before being unstacked into a TensorArray.
        seq_lengths: Compared against the loop counter to decide which
            elements are finished; `seq_lengths + 1` is also used as the
            size of the input TensorArray.
            NOTE(review): confirm the shape callers pass here, since one
            value serves both purposes.

    Returns:
        Tuple `(outputs, final_state)`: the stacked emitted outputs after
        `_transpose_batch_time`, and the final cell state from `raw_rnn`.
    """
    # raw_rnn consumes time-major inputs through a TensorArray.
    ta_size = seq_lengths+1
    inputs_ta = tf.TensorArray(dtype=tf.float32,
                               size=ta_size,
                               clear_after_read=False)
    inputs_ta = inputs_ta.unstack(_transpose_batch_time(input_))

    output_dim = self.k  # width of what is emitted at each step
    # Width of one step's cell input: external input plus fed-back output.
    input_dim = input_.get_shape()[-1].value + output_dim

    def loop_fn(time, cell_output, cell_state, loop_state):
        """Loop body for `raw_rnn`.

        :param time: current time step
        :param cell_output: cell output of the previous step, or None on the
            initial call made before the cell has run
        :param cell_state: cell state of the previous step
        :param loop_state: unused
        :return: (elements_finished, next_input, next_cell_state,
            emit_output, next_loop_state)
        """
        initial_call = cell_output is None
        if initial_call:
            next_cell_state = initial_state
            # On the initial call this only tells TF what later emits look
            # like.
            emit_output = tf.zeros([output_dim])
        else:
            # cell_output belongs to step time - 1 and is emitted untouched.
            next_cell_state = cell_state
            emit_output = cell_output

        elements_finished = (time >= seq_lengths)
        all_finished = tf.reduce_all(elements_finished)

        external_input = inputs_ta.read(time)
        if initial_call:
            # Nothing to feed back yet: pad the feedback slot with zeros.
            feedback = tf.zeros([self.batch_size,output_dim])
        else:
            # Feed the previous output back in next to the external input.
            feedback = emit_output
        candidate_input = tf.concat([external_input, feedback], axis=1)

        def _zero_input():
            # Used once every element is finished.
            return tf.zeros([self.batch_size, input_dim], dtype=tf.float32)

        def _fed_back_input():
            return candidate_input

        next_input = tf.cond(all_finished, _zero_input, _fed_back_input)
        # tf.cond loses the static shape of the last dimension; restore it.
        next_input.set_shape([None, input_dim])

        return (elements_finished, next_input, next_cell_state, emit_output,
                None)

    outputs_ta, final_state, _ = tf.nn.raw_rnn(cell, loop_fn)
    outputs = _transpose_batch_time(outputs_ta.stack())
    return outputs, final_state
def seq2seq_f(encoder_inputs,
              decoder_inputs,
              targets,
              last_input,
              track_padding_vec=None):
    """Build the encoder/decoder graph and the per-step loss.

    The encoder is run with `tf.nn.dynamic_rnn`. Its final state seeds a
    decoder driven by `tf.nn.raw_rnn`, whose loop function projects each cell
    output through the MDN head, samples from it, turns the sample into the
    next decoder input and records the loss for that step.

    This function relies on names from its enclosing scope (`self`,
    `output_ta`, `MDN`, `MDN_output_function`, `scaling_layer`, `parameters`,
    `pad_output_function` and the `_upscale_sampled_output`,
    `_pad_missing_output_with_zeros`, `_apply_scaling_and_input_layer`,
    `_padding_bool_to_logits` helpers).

    Args:
        encoder_inputs: List of per-step tensors, stacked along axis 1 before
            being fed to the encoder.
        decoder_inputs: List of per-step tensors. Only element 0 is fed to
            the decoder; its length also bounds how many entries of `targets`
            and `track_padding_vec` are copied into TensorArrays.
        targets: List of per-step targets for the loss.
        last_input: Written into loop-state slot 2 on the initial call, where
            the default branch reads it back as the previous data point.
        track_padding_vec: Optional list of per-step boolean padding flags.
            When given, the mixture loss is masked with it and a padding
            logit loss may be added. When `None`, no padding tensors are read
            or written.

    Returns:
        Tuple `(output_sampled, losses, final_state, MDN_output,
        track_padding_output)`. `track_padding_output` is stacked from
        loop-state slot 4, which is only written when `track_padding_vec` is
        supplied, so it should not be evaluated otherwise.
    """
    target_input_ta = tf.TensorArray(dtype=tf.float32, size=len(targets))
    for j in range(len(decoder_inputs)):
        target_input_ta = target_input_ta.write(j, targets[j])

    use_track_padding = track_padding_vec is not None
    if use_track_padding:
        track_padding_ta = tf.TensorArray(dtype=tf.bool,
                                          size=len(track_padding_vec))
        for j in range(len(decoder_inputs)):
            track_padding_ta = track_padding_ta.write(
                j, track_padding_vec[j])

    # Run the encoder and keep its last internal state; the decoder loop
    # below starts from that state.
    with tf.variable_scope('seq2seq_encoder'):
        # encoder_inputs is a list of len(time) tensors of shape
        # (batch, RNN dim).
        reordered_encoder_inputs = tf.stack(encoder_inputs, axis=1)
        _, last_enc_state = tf.nn.dynamic_rnn(
            self._RNN_layers,
            inputs=reordered_encoder_inputs,
            dtype=tf.float32)

    def loop_fn(time, cell_output, cell_state, loop_state):
        """raw_rnn loop function: the heart of this network."""
        emit_output = cell_output
        if cell_output is None:
            # Initial call: start from the encoder state. The first decoder
            # input is the data for time T_0 and already has the input layer
            # applied.
            next_cell_state = last_enc_state
            next_input = decoder_inputs[0]
            next_loop_state = (output_ta[0], output_ta[1],
                               output_ta[2].write(time, last_input),
                               output_ta[3], output_ta[4])
        else:
            next_cell_state = cell_state
            projected_output = MDN_output_function(cell_output)
            # Take a single sample of the MDN. It may be ignored later,
            # depending on the use case.
            sampled = MDN.sample(
                projected_output,
                temperature=self.parameters['sample_temperature'])
            upscale_sampled = _upscale_sampled_output(sampled)
            if self.parameters['no_feedforward']:
                # Zero-feed variant: the next step receives zeros of size
                # (batch, input width).
                next_sampled_input = tf.zeros(
                    [upscale_sampled.shape[0], scaling_layer[0].shape[0]],
                    dtype=tf.float32)
            elif self.parameters['input_mask'][2:4] == [0, 0]:
                next_sampled_input = _pad_missing_output_with_zeros(
                    upscale_sampled)
            else:
                # Feed the sample back as the next input, computed inside
                # the graph from the previous data point.
                next_sampled_input = MDN.compute_derivates(
                    loop_state[2].read(time - 1),
                    upscale_sampled,
                    self.parameters['input_columns'],
                    self.parameters['velocity_threshold'],
                    subsample_rate=self.parameters['subsample'])
            # read() is called exactly once per step on this array.
            target_ta = target_input_ta.read(time - 1)
            next_datapoint = next_sampled_input
            next_input = _apply_scaling_and_input_layer(next_datapoint)
            loss = MDN.lossfunc_wrapper(target_ta, projected_output)

            if use_track_padding:
                # track_padding_ta only exists in this branch, so it must
                # only be read here.
                timewise_track_padding = track_padding_ta.read(time - 1)
                timewise_track_padding_logits = _padding_bool_to_logits(
                    timewise_track_padding)
                # Use the padding flag as a mask on the mixture loss: where
                # the ground truth marks this step as padding, the loss
                # becomes zero.
                loss = tf.multiply(
                    loss,
                    tf.minimum(
                        tf.to_float(
                            parameters['padding_loss_mixture_weight']),
                        tf.expand_dims(
                            # Cast explicitly: hyperparameter search
                            # sometimes makes this a float64.
                            tf.to_float(
                                tf.logical_not(timewise_track_padding)),
                            axis=-1),
                        name='mixture_loss'))
                # What the network itself predicts about padding.
                padding_output = pad_output_function(cell_output)
                # Skip the logit loss entirely when its weight is zero.
                if abs(parameters['padding_loss_logit_weight']) > 1e-12:
                    # The cross entropy is normalised by the number of
                    # prediction steps. expand_dims keeps tf.add from
                    # broadcasting (100,) + (100, 1) into (100, 100).
                    loss = tf.add(
                        loss,
                        tf.expand_dims(
                            tf.multiply(
                                tf.divide(
                                    tf.nn.softmax_cross_entropy_with_logits(
                                        logits=padding_output,
                                        labels=timewise_track_padding_logits),
                                    self.prediction_steps),
                                tf.to_float(
                                    parameters['padding_loss_logit_weight'])),
                            axis=-1,
                            name="padding_logit_loss"))
            else:
                padding_output = None

            # A TensorArray cannot store None, so slot 4 is passed through
            # untouched when there is no padding output.
            # Slot 2 is written at `time`, not `time - 1`: it holds the data
            # point that the next step reads back as its predecessor.
            next_loop_state = (
                loop_state[0].write(time - 1, next_sampled_input),
                loop_state[1].write(time - 1, loss),
                loop_state[2].write(time, next_datapoint),
                loop_state[3].write(
                    time - 1,
                    MDN.upscale_and_resolve_mixtures(projected_output,
                                                     scaling_layer)),
                loop_state[4] if padding_output is None else
                loop_state[4].write(time - 1, padding_output))

        # Every batch element finishes after the same number of steps.
        elements_finished = (time >= self.prediction_steps)
        return (elements_finished, next_input, next_cell_state, emit_output,
                next_loop_state)

    with tf.variable_scope('seq2seq_decoder'):
        from tensorflow.python.ops.rnn import _transpose_batch_time
        _, final_state, loop_state_ta = tf.nn.raw_rnn(
            self._RNN_layers, loop_fn)
        output_sampled = _transpose_batch_time(loop_state_ta[0].stack())
        losses = _transpose_batch_time(loop_state_ta[1].stack())
        MDN_output = _transpose_batch_time(loop_state_ta[3].stack())
        track_padding_output = _transpose_batch_time(
            loop_state_ta[4].stack())

    return (output_sampled, losses, final_state, MDN_output,
            track_padding_output)
def dynamic_rnn(cell,
                inputs,
                sequence_length=None,
                initial_state=None,
                dtype=None,
                parallel_iterations=None,
                swap_memory=False,
                time_major=True,
                scope=None):
    """Create an OpHint-wrapped recurrent network by dynamically unrolling.

    The inputs and outputs are registered with a "TfLiteDynamicRnn" OpHint,
    and the unrolling itself is delegated to `_dynamic_rnn_loop`.

    Only the time-major layout is supported: `time_major` defaults to `True`
    and the function asserts that it is truthy.

    Example:

    ```python
    rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
    initial_state = rnn_cell.zero_state(batch_size, dtype=tf.float32)
    # input_data is time-major.
    outputs, state = dynamic_rnn(rnn_cell, input_data,
                                 initial_state=initial_state,
                                 dtype=tf.float32)
    ```

    Args:
        cell: An RNN cell; checked with `rnn_cell_impl.assert_like_rnncell`.
        inputs: The RNN inputs, a tensor or (possibly nested) tuple of
            tensors in time-major layout.
        sequence_length: (optional) Lengths; cast to int32 and required to
            be of rank 1 when the rank is statically known.
        initial_state: (optional) Initial state for the RNN. When omitted,
            it is obtained from `cell.get_initial_state` if the cell has
            one, otherwise from `cell.zero_state`.
        dtype: (optional) Data type for the initial state and expected
            output. Required if `initial_state` is not provided.
        parallel_iterations: Forwarded to the loop; 32 when falsy.
        swap_memory: Forwarded to the loop.
        time_major: Must be truthy. The batch-major transposes further down
            are only reachable when assertions are disabled.
        scope: VariableScope for the created subgraph; defaults to "rnn".

    Returns:
        A pair `(outputs, state)`: the outputs of `_dynamic_rnn_loop` after
        being registered as the OpHint output, and the final state.

    Raises:
        AssertionError: If `time_major` is falsy (while assertions are
            enabled).
        TypeError: If `cell` is not an instance of RNNCell.
        ValueError: If `sequence_length` has a known rank other than 1, or
            if neither `initial_state` nor `dtype` is given.
        RuntimeError: If not using control flow v2.
    """
    # Currently only the time_major == True case is supported.
    assert time_major

    # TODO(b/123051275): We need to check if the cells are TfLiteLSTMCells or
    # TfLiteRNNCells.
    rnn_cell_impl.assert_like_rnncell("cell", cell)

    if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
        raise RuntimeError("OpHint dynamic rnn only supports control flow v2.")

    # In the mappings below the child output index is 2 for an LstmCell and
    # 1 for an RnnCell, so -1 is used to mean "the last one".
    inputs_outputs_mappings = {
        "parent_first_child_input": [{
            "parent_ophint_input_index": 0,
            "first_child_ophint_input_index": 0
        }],
        "parent_last_child_output": [{
            "parent_output_index": 0,
            "child_output_index": -1
        }],
        "internal_children_input_output": [{
            "child_input_index": 0,
            "child_output_index": -1
        }],
    }
    tflite_wrapper = op_hint.OpHint(
        "TfLiteDynamicRnn",
        level=2,
        children_inputs_mappings=inputs_outputs_mappings)

    batch_major = not time_major
    with vs.variable_scope(scope or "rnn") as varscope:
        # Unless the parent scope already decided on a caching device, cache
        # variables on the same device as the ops that use them.
        if _should_cache() and varscope.caching_device is None:
            varscope.set_caching_device(lambda op: op.device)

        inputs = tflite_wrapper.add_input(inputs,
                                          name="input",
                                          index_override=0)

        # Internal calculations use [time, batch, depth].
        flat_inputs = nest.flatten(inputs)
        if batch_major:
            # (batch, time, depth) => (time, batch, depth)
            as_tensors = [ops.convert_to_tensor(item) for item in flat_inputs]
            flat_inputs = tuple(
                _transpose_batch_time(item) for item in as_tensors)

        if not parallel_iterations:
            parallel_iterations = 32

        if sequence_length is not None:
            sequence_length = math_ops.cast(sequence_length, dtypes.int32)
            if sequence_length.shape.rank not in (None, 1):
                raise ValueError(
                    "sequence_length must be a vector of length batch_size, "
                    "but saw shape: %s" % sequence_length.shape)
            # Identity op only so the tensor is easy to find in the graph.
            sequence_length = array_ops.identity(sequence_length,
                                                 name="sequence_length")

        batch_size = _best_effort_input_batch_size(flat_inputs)

        if initial_state is None:
            if not dtype:
                raise ValueError(
                    "If there is no initial_state, you must give a dtype.")
            make_initial_state = getattr(cell, "get_initial_state", None)
            if make_initial_state is not None:
                state = make_initial_state(inputs=None,
                                           batch_size=batch_size,
                                           dtype=dtype)
            else:
                state = cell.zero_state(batch_size, dtype)
        else:
            state = initial_state

        def _shape_check(tensor, expected):
            # Runtime assertion that `tensor` has exactly shape `expected`.
            actual_shape = array_ops.shape(tensor)
            expected_shape = array_ops.stack(expected)
            shapes_match = math_ops.reduce_all(
                math_ops.equal(actual_shape, expected_shape))
            return control_flow_ops.Assert(shapes_match, [
                "Expected shape for Tensor %s is " % tensor.name,
                expected_shape, " but saw shape: ", actual_shape
            ])

        if not context.executing_eagerly() and sequence_length is not None:
            # Tie the shape validation to every later use of the lengths.
            length_check = _shape_check(sequence_length, [batch_size])
            with ops.control_dependencies([length_check]):
                sequence_length = array_ops.identity(sequence_length,
                                                     name="CheckSeqLen")

        inputs = nest.pack_sequence_as(structure=inputs,
                                       flat_sequence=flat_inputs)

        outputs, final_state = _dynamic_rnn_loop(
            cell,
            inputs,
            state,
            parallel_iterations=parallel_iterations,
            swap_memory=swap_memory,
            sequence_length=sequence_length,
            dtype=dtype)

        # _dynamic_rnn_loop always returns [time, batch, depth].
        if batch_major:
            # (time, batch, depth) => (batch, time, depth)
            outputs = nest.map_structure(_transpose_batch_time, outputs)

        outputs = tflite_wrapper.add_output(outputs, name="outputs")
        return outputs, final_state
def _build_net(self):
    """Build the whole TensorFlow graph of the model under scope `self.name`.

    In order, this creates: the placeholders; the sequence masks; the
    `selector` and `predictor` fully connected heads; a `tf.nn.raw_rnn`
    encoder whose loop function applies both heads to every hidden state;
    the cluster sampling step; the losses (`LOSS_MLE`, `LOSS_1`,
    `LOSS_1_AC`, `LOSS_2`, `LOSS_3`, `LOSS_S`); and one Adam solver per
    training phase.

    Everything is stored as attributes on `self`; nothing is returned.

    The heads are first called from inside `raw_rnn`, so their variables
    live under `<self.name>/rnn/...`. That is why later calls re-enter
    `tf.variable_scope('rnn', reuse=True)` and why the variable collections
    are looked up under `self.name + '/rnn/...'`.
    """
    with tf.variable_scope(self.name):
        self.mb_size = tf.placeholder(tf.int32, [], name='batch_size')
        self.lr_rate1 = tf.placeholder(tf.float32, name='learning_rate1')
        self.lr_rate2 = tf.placeholder(tf.float32, name='learning_rate2')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_probability')

        # Input and Output
        self.x = tf.placeholder(tf.float32, [None, self.max_length, self.x_dim], name='inputs')
        self.y = tf.placeholder(tf.float32, [None, self.max_length, self.y_dim], name='labels_onehot')

        # Embedding
        # The variable's initial value is the placeholder E, so E presumably
        # has to be fed when the variable is initialised -- TODO confirm.
        self.E = tf.placeholder(tf.float32, [self.K, self.z_dim], name='embeddings_input')
        self.EE = tf.Variable(self.E, name='embeddings_var')
        self.embeddings = tf.nn.tanh(self.EE)

        self.s = tf.placeholder(tf.int32, [None], name='cluster_label')
        self.s_onehot = tf.one_hot(self.s, self.K)

        # LOSS PARAMETERS
        # NOTE(review): gamma and delta are declared here but not referenced
        # again within this function.
        self.alpha = tf.placeholder(tf.float32, name='alpha')  # for sample-wise entropy
        self.beta = tf.placeholder(tf.float32, name='beta')  # for prediction loss (i.e., mle)
        self.gamma = tf.placeholder(tf.float32, name='gamma')  # for batch-wise entropy
        self.delta = tf.placeholder(tf.float32, name='delta')  # for embedding

        ''' ### CREATE RNN MASK - This is to flexibly handle sequences with different length - rnn_mask1: last observation; [mb_size, max_length] - rnn_mask2: all available observations; [mb_size, max_length] '''

        # CREATE RNN MASK:
        seq_length = get_seq_length(self.x)
        tmp_range = tf.expand_dims(tf.range(0, self.max_length, 1), axis=0)

        # 1.0 only at position seq_length - 1 (the last observation).
        self.rnn_mask1 = tf.cast(tf.equal(tmp_range, tf.expand_dims(seq_length - 1, axis=1)), tf.float32)
        # 1.0 at every position <= seq_length - 1 (all available observations).
        self.rnn_mask2 = tf.cast(tf.less_equal(tmp_range, tf.expand_dims(seq_length - 1, axis=1)), tf.float32)

        ### DEFINE SELECTOR
        def selector(x_, o_dim_=self.K, num_layers_=2, h_dim_=self.h_dim_h, activation_fn=self.fc_activate_fn, reuse=tf.AUTO_REUSE):
            # Fully connected head with a softmax output of width o_dim_.
            # With num_layers_ == 1 it is a single output layer; otherwise
            # num_layers_ - 1 hidden layers (each followed by dropout)
            # precede the output layer.
            out_fn = tf.nn.softmax
            with tf.variable_scope('selector', reuse=reuse):
                if num_layers_ == 1:
                    out = tf.contrib.layers.fully_connected(inputs=x_, num_outputs=o_dim_, activation_fn=out_fn, scope='selector_out')
                else:  # num_layers > 1
                    for tmp_layer in range(num_layers_ - 1):
                        if tmp_layer == 0:
                            net = x_
                        net = tf.contrib.layers.fully_connected(inputs=net, num_outputs=h_dim_, activation_fn=activation_fn, scope='selector_' + str(tmp_layer))
                        net = tf.nn.dropout(net, keep_prob=self.keep_prob)
                    out = tf.contrib.layers.fully_connected(inputs=net, num_outputs=o_dim_, activation_fn=out_fn, scope='selector_out')
            return out

        ### DEFINE PREDICTOR
        def predictor(x_, o_dim_=self.y_dim, o_type_=self.y_type, num_layers_=1, h_dim_=self.h_dim_g, activation_fn=self.fc_activate_fn, reuse=tf.AUTO_REUSE):
            # Same layout as `selector`, but the output activation depends
            # on o_type_: none for 'continuous', softmax for 'categorical',
            # sigmoid for 'binary'. Any other value raises.
            if o_type_ == 'continuous':
                out_fn = None
            elif o_type_ == 'categorical':
                out_fn = tf.nn.softmax  # for classification task
            elif o_type_ == 'binary':
                out_fn = tf.nn.sigmoid
            else:
                raise Exception('Wrong output type. The value {}!!'.format(o_type_))

            with tf.variable_scope('predictor', reuse=reuse):
                if num_layers_ == 1:
                    out = tf.contrib.layers.fully_connected(inputs=x_, num_outputs=o_dim_, activation_fn=out_fn, scope='predictor_out')
                else:  # num_layers > 1
                    for tmp_layer in range(num_layers_ - 1):
                        if tmp_layer == 0:
                            net = x_
                        net = tf.contrib.layers.fully_connected(inputs=net, num_outputs=h_dim_, activation_fn=activation_fn, scope='predictor_' + str(tmp_layer))
                        net = tf.nn.dropout(net, keep_prob=self.keep_prob)
                    out = tf.contrib.layers.fully_connected(inputs=net, num_outputs=o_dim_, activation_fn=out_fn, scope='predictor_out')
            return out

        ### DEFINE LOOP FUNCTION FOR ENCODER (f-g, f-h relations are created here)
        def loop_fn(time, cell_output, cell_state, loop_state):
            # `cell`, `inputs_ta` and `loop_state_ta` are assigned further
            # down, before raw_rnn calls this function.
            emit_output = cell_output
            if cell_output is None:  # time == 0
                next_cell_state = cell.zero_state(self.mb_size, tf.float32)
                next_loop_state = loop_state_ta
            else:
                next_cell_state = cell_state
                tmp_z = utils.create_concat_state_h(next_cell_state, self.num_layers_f, self.rnn_type)
                tmp_y = predictor(tmp_z, self.y_dim, self.y_type, self.num_layers_g, self.h_dim_g, self.fc_activate_fn)
                tmp_pi = selector(tmp_z, self.K, self.num_layers_h, self.h_dim_h, self.fc_activate_fn)

                # Values for the step that just ran are stored at time - 1.
                next_loop_state = (loop_state[0].write(time - 1, tmp_z),  # save all the hidden states
                                   loop_state[1].write(time - 1, tmp_y),  # save all the output
                                   loop_state[2].write(time - 1, tmp_pi))  # save all the selector_net output (i.e., pi)

            # Break-point: no more recurrence after max_length. The same
            # comparison is used for every batch element.
            elements_finished = (time >= self.max_length)
            finished = tf.reduce_all(elements_finished)
            # Once finished, feed zeros instead of reading past the inputs.
            next_input = tf.cond(finished, lambda: tf.zeros([self.mb_size, self.x_dim], dtype=tf.float32), lambda: inputs_ta.read(time))
            return (elements_finished, next_input, next_cell_state, emit_output, next_loop_state)

        ''' ##### CREATE RNN NETWORK - (INPUT) inputs_ta: TensorArray with [max_length, mb_size, x_dim] #x_dim included delta - (OUTPUT) . zs = rnn states (h) in LSTM/GRU ; [mb_size, max_length z_dim] . y_hats = output of predictor taking zs as inputs; [mb_size, max_length, y_dim] . pis = output of selector ; [mb_size, max_length, K] '''
        inputs = self.x
        inputs_ta = tf.TensorArray(dtype=tf.float32, size=self.max_length).unstack(_transpose_batch_time(inputs), name='rnn_input')
        cell = utils.create_rnn_cell(self.h_dim_f, self.num_layers_f, self.keep_prob, self.rnn_type, self.rnn_activate_fn)

        # Loop-state TensorArrays collecting information from every RNN step.
        loop_state_ta = (
            tf.TensorArray(size=self.max_length, dtype=tf.float32, clear_after_read=False),  # zs (j=1,...,J)
            tf.TensorArray(size=self.max_length, dtype=tf.float32, clear_after_read=False),  # y_hats (j=1,...,J)
            tf.TensorArray(size=self.max_length, dtype=tf.float32, clear_after_read=False)  # pis (j=1,...,J)
        )

        _, _, loop_state_ta = tf.nn.raw_rnn(cell, loop_fn)

        self.zs = _transpose_batch_time(loop_state_ta[0].stack())
        self.y_hats = _transpose_batch_time(loop_state_ta[1].stack())
        self.pis = _transpose_batch_time(loop_state_ta[2].stack())

        ### SAMPLING PROCESS
        # Categorical distribution over the K clusters, with every
        # (sample, step) pair flattened into its own row.
        s_dist = tf.distributions.Categorical(probs=tf.reshape(self.pis, [-1, self.K]))
        s_sample = s_dist.sample()

        # One-hot encoding of the sampled cluster index.
        mask_e = tf.cast(tf.equal(tf.expand_dims(tf.range(0, self.K, 1), axis=0), tf.expand_dims(s_sample, axis=1)), tf.float32)
        # Embedding row of the sampled cluster.
        z_bars = tf.matmul(mask_e, self.embeddings)
        # log(pi) entry of the sampled cluster.
        pi_sample = tf.reduce_sum(mask_e * tf.reshape(log(self.pis), [-1, self.K]), axis=1)

        with tf.variable_scope('rnn', reuse=True):
            y_bars = predictor(z_bars, self.y_dim, self.y_type, self.num_layers_g, self.h_dim_g, self.fc_activate_fn)

        # Restore the [sample, step, ...] layout that was flattened above.
        self.z_bars = tf.reshape(z_bars, [-1, self.max_length, self.z_dim])
        self.y_bars = tf.reshape(y_bars, [-1, self.max_length, self.y_dim])
        self.pi_sample = tf.reshape(pi_sample, [-1, self.max_length])
        self.s_sample = tf.reshape(s_sample, [-1, self.max_length])

        ### DEFINE LOSS FUNCTIONS
        #\ell_{1}: KL divergence loss for regression and binary/categorical-classification task
        def loss_1(y_true_, y_pred_, y_type_=self.y_type):
            # Loss summed over the last axis: squared error for
            # 'continuous', cross entropy for 'categorical', binary cross
            # entropy for 'binary'. Any other value raises.
            if y_type_ == 'continuous':
                tmp_loss = tf.reduce_sum((y_true_ - y_pred_)**2, axis=-1)
            elif y_type_ == 'categorical':
                tmp_loss = -tf.reduce_sum(y_true_ * log(y_pred_), axis=-1)
            elif y_type_ == 'binary':
                tmp_loss = -tf.reduce_sum(y_true_ * log(y_pred_) + (1. - y_true_) * log(1. - y_pred_), axis=-1)
            else:
                raise Exception('Wrong output type. The value {}!!'.format(y_type_))
            return tmp_loss

        # batch-wise entropy
        # NOTE(review): mean_pis is computed but not referenced again within
        # this function -- confirm whether a batch-wise entropy term was
        # meant to use it.
        tmp_pis = tf.tile(tf.expand_dims(self.rnn_mask2, axis=2), [1, 1, self.K]) * self.pis
        mean_pis = tf.reduce_sum(tf.reduce_sum(tmp_pis, axis=1), axis=0) / tf.reduce_sum(tf.reduce_sum(self.rnn_mask2, axis=1), axis=0, keepdims=True)

        ## LOSS_MLE: MLE prediction loss (for initialization)
        self.LOSS_MLE = tf.reduce_mean(tf.reduce_sum(self.rnn_mask2 * loss_1(self.y, self.y_hats, self.y_type), axis=1))

        ## LOSS1: predictive clustering loss
        self.LOSS_1 = tf.reduce_mean(tf.reduce_sum(self.rnn_mask2 * loss_1(self.y, self.y_bars, self.y_type), axis=1))
        # Same loss weighted by the log-probability of the sampled cluster.
        self.LOSS_1_AC = tf.reduce_mean(tf.reduce_sum(self.rnn_mask2 * self.pi_sample * loss_1(self.y, self.y_bars, self.y_type), axis=1))

        ## LOSS2: sample-wise entropy loss
        self.LOSS_2 = tf.reduce_mean(-tf.reduce_sum(self.rnn_mask2 * tf.reduce_sum(self.pis * log(self.pis), axis=2), axis=1))

        predictor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name + '/rnn/predictor')
        selecter_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name + '/rnn/selector')
        embedding_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name + '/embeddings_var')
        # Every other global variable in the graph counts as an encoder
        # variable.
        encoder_vars = [vars_ for vars_ in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) if vars_ not in predictor_vars + selecter_vars + embedding_vars]

        ### EMBEDDING TRAINING
        with tf.variable_scope('rnn', reuse=True):
            Ey = predictor(self.embeddings, self.y_dim, self.y_type, self.num_layers_g, self.h_dim_g, self.fc_activate_fn)

        ## LOSS3: embedding separation loss (prevents embedding from collapsing)
        # NOTE(review): the denominator (K - 1) * (K - 2) is zero when
        # K == 2, and the loop visits K * (K - 1) / 2 pairs -- confirm the
        # intended normalisation.
        self.LOSS_3 = 0
        for i in range(self.K):
            for j in range(i + 1, self.K):
                # Negative because this quantity should increase.
                self.LOSS_3 += -loss_1(Ey[i, :], Ey[j, :], y_type_=self.y_type) / ((self.K - 1) * (self.K - 2))

        ### DEFINE OPTIMIZATION SOLVERS
        self.solver_MLE = tf.train.AdamOptimizer(self.lr_rate1).minimize(self.LOSS_MLE, var_list=encoder_vars + predictor_vars)
        self.solver_L1_critic = tf.train.AdamOptimizer(self.lr_rate1).minimize(self.LOSS_1, var_list=encoder_vars + predictor_vars)
        self.solver_L1_actor = tf.train.AdamOptimizer(self.lr_rate2).minimize(self.LOSS_1_AC + self.alpha * self.LOSS_2, var_list=encoder_vars + selecter_vars)
        self.solver_E = tf.train.AdamOptimizer(self.lr_rate1).minimize(self.LOSS_1 + self.beta * self.LOSS_3, var_list=embedding_vars)

        ### INITIALIZE SELECTOR
        self.zz = tf.placeholder(tf.float32, [None, self.z_dim])
        with tf.variable_scope('rnn', reuse=True):
            # Predictor output for a given z, exposed for inspection.
            self.yy = predictor(self.zz, self.y_dim, self.y_type, self.num_layers_g, self.h_dim_g, self.fc_activate_fn)
            self.s_out = selector(self.zz, self.K, self.num_layers_h, self.h_dim_h, self.fc_activate_fn)

        ## LOSS_S: selector initialization (cross-entropy wrt initialized class)
        self.LOSS_S = tf.reduce_mean(-tf.reduce_sum(self.s_onehot * log(self.s_out), axis=1))
        self.solver_S = tf.train.AdamOptimizer(self.lr_rate1).minimize(self.LOSS_S, var_list=selecter_vars)
def _build_net(self):
    """Build the TF1 graph: placeholders, RNN encoder, intensity net, losses, solvers.

    Everything is created under ``tf.variable_scope(self.name)``.  The method
    returns nothing; it populates attributes on ``self``:

    * placeholders: ``mb_size``, ``lr_rate``, ``keep_prob``, ``K``, ``M``,
      ``D``, ``X``, ``Mask``, ``MU``, ``S``, ``delta_range``, ``alpha``,
      ``beta``, ``beta_ms``, ``gamma``, ``ZZ``, ``D_IN``, ``Z_IN``;
    * tensors: ``Z``, ``Y_stacked``, ``PSI_stacked``, ``YY_stacked``,
      ``Y_pred``, ``dist_z_homo``, ``dist_z_hetero``, ``dist_y_homo``,
      ``dist_y_hetero``;
    * losses: ``loss_MLE``, ``loss_CLU``, ``loss_COM``, ``loss_CLU_COM``,
      ``loss_TOTAL``;
    * train ops: ``solver_MLE``, ``solver_CLUSTER``, ``solver_TOTAL``.

    NOTE(review): the source of this method had lost its indentation; the
    nesting below (everything inside the ``self.name`` scope) is
    reconstructed -- confirm against the original file.
    """
    with tf.variable_scope(self.name):
        # --- scalar run-time controls -------------------------------------
        self.mb_size = tf.placeholder(tf.int32, [], name='batch_size')
        self.lr_rate = tf.placeholder(tf.float32, name='learning_rate')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_probability')
        self.K = tf.placeholder(tf.int32, [], name='num_Cluster')

        # --- data ---------------------------------------------------------
        self.M = tf.placeholder(tf.float32, shape=[None, self.max_length, self.num_Event], name='M_onehot')
        self.D = tf.placeholder(tf.float32, shape=[None, self.max_length, 1], name='delta')
        self.X = tf.placeholder(tf.float32, shape=[None, self.num_Feature], name='X')
        self.Mask = tf.placeholder(tf.float32, shape=[None, self.max_length], name='rnn_mask')

        self.MU = tf.placeholder(tf.float32, [None, self.z_dim], name='MU')  # this will become [K, z_dim]
        self.S = tf.placeholder(tf.int64, [None], name='S')
        S_one_hot = tf.one_hot(self.S, self.K, name='S_one_hot')

        self.delta_range = tf.placeholder(tf.float32, [self.L], name='delta_range')

        # --- loss weights -------------------------------------------------
        self.alpha = tf.placeholder(tf.float32, name='alpha')
        self.beta = tf.placeholder(tf.float32, name='beta')
        self.beta_ms = tf.placeholder(tf.float32, name='beta_ms', shape=[self.num_Event - 1])  # (set [1, ..., 1] as a default)
        self.gamma = tf.placeholder(tf.float32, name='gamma')

        # --- network-construction helpers ---------------------------------
        def prediction_network_softplus(h, delta, reuse=tf.AUTO_REUSE):  # version 0
            """Map (state ``h``, elapsed time ``delta``) to ``num_Event`` softplus outputs."""
            with tf.variable_scope('prediction_net', reuse=reuse):
                tmp = tf.contrib.layers.fully_connected(
                    inputs=tf.concat([h, delta], axis=1), num_outputs=self.h_dim2, activation_fn=None)  # layer 1
                tmp = tf.nn.dropout(tmp, keep_prob=self.keep_prob)
                tmp = tf.contrib.layers.fully_connected(
                    inputs=tmp, num_outputs=self.h_dim2, activation_fn=tf.nn.relu)  # layer 2
                tmp = tf.nn.dropout(tmp, keep_prob=self.keep_prob)
                tmp = tf.contrib.layers.fully_connected(
                    inputs=tmp, num_outputs=self.num_Event, activation_fn=None)  # output layer
                out = tf.nn.softplus(tmp)
            return out

        ### DEFINE LOOP FUNCTION FOR RAW_RNN w/ TEMPORAL ATTENTION
        def loop_fn_MPP(time, cell_output, cell_state, loop_state):
            """``loop_fn`` for ``tf.nn.raw_rnn``; the loop state carries three TensorArrays."""
            emit_output = cell_output

            if cell_output is None:  # time == 0
                next_cell_state = cell.zero_state(self.mb_size, tf.float32)
                next_loop_state = (
                    tf.TensorArray(size=self.max_length, dtype=tf.float32),  # lambda(t_{j})
                    tf.TensorArray(size=self.max_length, dtype=tf.float32),  # lambda(t_{j-1})
                    tf.TensorArray(size=self.max_length, dtype=tf.float32),  # hidden states (h_{j})
                )
            else:
                next_cell_state = cell_state
                tmp_h = utils.create_concat_state(
                    next_cell_state, self.num_layers_RNN, self.RNN_type, BiRNN=None)

                def intensities_at(read_index):
                    # Outputs of the prediction net at the delta stored at
                    # ``read_index`` (l_next) and at delta = 0 (l_curr).
                    d_next = tf.reshape(inputs_ta.read(read_index)[:, 0], shape=[-1, 1])
                    l_next = prediction_network_softplus(tmp_h, d_next)
                    l_curr = prediction_network_softplus(tmp_h, tf.zeros_like(d_next))
                    return l_curr, l_next

                # At time == max_length there is no input left to read, so the
                # previous index is read instead to prevent an indexing error;
                # that last value is not included in the loss.
                l_curr, l_next = tf.cond(
                    tf.equal(time, self.max_length),
                    lambda: intensities_at(time - 1),
                    lambda: intensities_at(time))

                next_loop_state = (
                    loop_state[0].write(time - 1, l_next),  # save lambda(t_{j})
                    loop_state[1].write(time - 1, l_curr),  # save lambda(t_{j-1})
                    loop_state[2].write(time - 1, tmp_h),   # save all the hidden states
                )

            # Break-point: no more recurrence once every sequence is exhausted.
            elements_finished = (time >= seq_length)
            finished = tf.reduce_all(elements_finished)

            def fn_input_embedding():
                embedding = tf.concat([inputs_ta.read(time), self.X], axis=1)
                embedding = tf.contrib.layers.fully_connected(
                    inputs=embedding, num_outputs=self.h_dim2, activation_fn=tf.nn.relu)
                return embedding

            next_input = tf.cond(
                finished,
                lambda: tf.zeros([self.mb_size, self.h_dim2], dtype=tf.float32),
                lambda: fn_input_embedding())

            return (elements_finished, next_input, next_cell_state, emit_output, next_loop_state)

        ### INPUTS
        inputs = tf.concat([self.D, self.M], axis=2, name='inputs')
        inputs_ta = tf.TensorArray(
            dtype=tf.float32, size=self.max_length, clear_after_read=False
        ).unstack(_transpose_batch_time(inputs), name='inputs_ta')
        seq_length = get_seq_length(inputs)

        ### RNNS
        cell = utils.create_rnn_cell(
            self.h_dim1, self.num_layers_RNN, self.keep_prob, self.RNN_type, self.RNN_active_fn)
        _, rnn_final_state, loop_state_ta = tf.nn.raw_rnn(cell, loop_fn_MPP)

        next_lambdas = _transpose_batch_time(loop_state_ta[0].stack())
        curr_lambdas = _transpose_batch_time(loop_state_ta[1].stack())
        rnn_states = _transpose_batch_time(loop_state_ta[2].stack())

        # Masked sum of the stored hidden states over time.
        self.Z = tf.reduce_sum(
            rnn_states * tf.tile(tf.expand_dims(self.Mask, axis=2), [1, 1, self.z_dim]), axis=1)

        ### LOSS - MLE
        # After putting (m_{0}, t_{0}) with m_{0} = [1,0,0,...] (auxiliary event
        # type) and t_{0} = 0 in front of every sequence, there is no need for an
        # additional loss term for the first event.  Instead, m=0, t=0 (i.e. the
        # first index of event and timing) is simply not considered below.
        tmp_MLE1 = tf.reduce_sum(
            tf.reduce_sum(tf.log(next_lambdas[:, :-1, 1:] + 1e-8) * self.M[:, 1:, 1:], axis=2),
            axis=1)  # do not consider m=0 (this is the indicator for BOS)

        # 0.5 (not 1/2): integer division would silently turn the factor into 0
        # under Python 2.  Summing over the event axis in one op also keeps
        # tmp_MLE2 defined when num_Event == 1.  self.D[:, 1:, :] has a trailing
        # axis of size 1 and broadcasts over the event types.
        mean_lambdas = 0.5 * (next_lambdas[:, :-1, 1:] + curr_lambdas[:, :-1, 1:])
        tmp_MLE2 = tf.reduce_sum(mean_lambdas * self.D[:, 1:, :], axis=[1, 2])

        self.loss_MLE = - tf.reduce_mean(tmp_MLE1 - tmp_MLE2)

        ### LOSS - CLUSTERING
        # DISTANCE IN THE LATENT SPACE
        Z_expanded = tf.tile(tf.expand_dims(self.Z, axis=1), [1, self.K, 1])  # [None, num_Cluster, z_dim]
        MU_expanded = tf.tile(tf.expand_dims(self.MU, axis=0), [self.mb_size, 1, 1])  # [None, num_Cluster, z_dim]
        dist_z_expanded = tf.reduce_sum((Z_expanded - MU_expanded) ** 2, axis=2)  # [None, num_Cluster]

        dist_z_homo = tf.reduce_sum(dist_z_expanded * S_one_hot, axis=1)  # [None]
        dist_z_hetero = tf.reduce_sum(dist_z_expanded * (1. - S_one_hot), axis=1)  # [None]

        self.dist_z_homo = tf.reduce_mean(dist_z_homo, axis=0)
        self.dist_z_hetero = tf.reduce_mean(dist_z_hetero, axis=0)

        # DISTANCE IN THE OUTPUT SPACE (LAMBDA)
        Y = []
        PSI = []
        for idx in range(self.L):
            tmp_d1 = self.delta_range[idx] * tf.ones_like(tf.reshape(self.Z[:, 0], shape=[-1, 1]))
            tmp_d2 = self.delta_range[idx] * tf.ones_like(tf.reshape(self.MU[:, 0], shape=[-1, 1]))
            # Re-enter the scope raw_rnn used so the prediction net is shared.
            with tf.variable_scope('rnn', reuse=True):
                Y.append(prediction_network_softplus(self.Z, tmp_d1))
                PSI.append(prediction_network_softplus(self.MU, tmp_d2))

        self.Y_stacked = tf.stack(Y, axis=2)
        self.PSI_stacked = tf.stack(PSI, axis=2)

        Y_stacked_expanded = tf.tile(
            tf.expand_dims(self.Y_stacked, axis=1), [1, self.K, 1, 1])  # [None, num_Cluster, num_Event, L]
        PSI_stacked_expanded = tf.tile(
            tf.expand_dims(self.PSI_stacked, axis=0), [self.mb_size, 1, 1, 1])  # [None, num_Cluster, num_Event, L]

        tmp = (Y_stacked_expanded - PSI_stacked_expanded) ** 2

        # Labelled a trapezoidal approximation by the original author.
        # NOTE(review): both end points are subtracted in full, whereas the
        # textbook trapezoid rule subtracts only half of each -- confirm which
        # one is intended before changing the scale of this loss.
        dist_y_expanded_ms = self.delta_range[-1] / (self.L - 1) * (
            tf.reduce_sum(tmp, axis=3) - tmp[:, :, :, 0] - tmp[:, :, :, -1])
        dist_y_expanded = tf.reduce_sum(dist_y_expanded_ms[:, :, 1:] * self.beta_ms, axis=2)

        dist_y_homo = tf.reduce_sum(dist_y_expanded * S_one_hot, axis=1)  # [None]
        dist_y_hetero = tf.reduce_sum(dist_y_expanded * (1. - S_one_hot), axis=1)  # [None]

        self.dist_y_homo = tf.reduce_mean(dist_y_homo, axis=0)
        self.dist_y_hetero = tf.reduce_mean(dist_y_hetero, axis=0)

        ### FOR USER-DEFINED DISTANCE MEASURE
        self.ZZ = tf.placeholder(tf.float32, shape=[None, self.z_dim])
        YY = []
        for idx in range(self.L):
            tmp_d1 = self.delta_range[idx] * tf.ones_like(tf.reshape(self.ZZ[:, 0], shape=[-1, 1]))
            with tf.variable_scope('rnn', reuse=True):
                YY.append(prediction_network_softplus(self.ZZ, tmp_d1))
        self.YY_stacked = tf.stack(YY, axis=2)

        ### FOR THINNING-ALGORITHM
        self.D_IN = tf.placeholder(tf.float32, shape=[None], name='delta_in')
        self.Z_IN = tf.placeholder(tf.float32, shape=[None, self.z_dim])

        tmp_d_in = tf.reshape(self.D_IN * tf.ones_like(self.Z_IN[:, 0]), shape=[-1, 1])
        with tf.variable_scope('rnn', reuse=True):
            self.Y_pred = prediction_network_softplus(self.Z_IN, tmp_d_in)

        ### VARIABLE GROUPS
        global_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        # tf.get_collection matches ``scope`` against the *start* of each
        # variable name, so the prefix has to include the enclosing scope(s);
        # a bare 'rnn/prediction_net' matches nothing here and would leave the
        # prediction network inside enc_vars.
        enclosing_scope = tf.get_variable_scope().name
        pred_scope = 'rnn/prediction_net'
        if enclosing_scope:
            pred_scope = enclosing_scope + '/' + pred_scope
        pred_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=pred_scope)
        enc_vars = [tmp_var for tmp_var in global_vars if tmp_var not in pred_vars]

        ### LOSSES AND SOLVERS
        self.loss_CLU = self.alpha * self.dist_z_homo
        self.loss_COM = self.beta * self.dist_y_homo - self.gamma * self.dist_y_hetero
        self.loss_CLU_COM = self.loss_CLU + self.loss_COM

        self.loss_TOTAL = self.loss_MLE + self.loss_CLU + self.loss_COM

        self.solver_MLE = tf.train.AdamOptimizer(
            self.lr_rate, beta1=0.9, beta2=0.999).minimize(self.loss_MLE, var_list=global_vars)
        self.solver_CLUSTER = tf.train.AdamOptimizer(
            self.lr_rate, beta1=0.9, beta2=0.999).minimize(self.loss_CLU_COM, var_list=enc_vars)
        self.solver_TOTAL = tf.train.AdamOptimizer(
            self.lr_rate, beta1=0.9, beta2=0.999).minimize(self.loss_TOTAL, var_list=global_vars)
def dynamic_rnn(cell,
                inputs,
                sequence_length=None,
                initial_state=None,
                dtype=None,
                parallel_iterations=None,
                swap_memory=False,
                time_major=True,
                scope=None):
    """Unroll `cell` over `inputs` dynamically, wrapped in a "TfLiteDynamicRnn" OpHint.

    Only the time-major layout is accepted: `time_major` defaults to `True`
    and the function asserts on it, so `inputs` and the returned outputs are
    laid out as `[max_time, batch_size, ...]`.

    Args:
      cell: The RNN cell; checked with `rnn_cell_impl.assert_like_rnncell`.
      inputs: A tensor, or a (possibly nested) structure of tensors, in
        time-major layout.  The structure is flattened for the loop and packed
        back before being handed to `_dynamic_rnn_loop`.
      sequence_length: (optional) Per-example lengths.  Cast to int32; its
        static rank must be unknown or 1.  In graph mode a runtime assertion
        checks that its shape equals `[batch_size]`.
      initial_state: (optional) Initial state for the RNN.  When omitted it
        is built from `cell.get_initial_state` if the cell has one, otherwise
        from `cell.zero_state`; `dtype` is then required.
      dtype: (optional) Data type of the initial state and expected output.
      parallel_iterations: Forwarded to `_dynamic_rnn_loop`; a falsy value
        is replaced by 32.
      swap_memory: Forwarded to `_dynamic_rnn_loop`.
      time_major: Must be truthy (see above).
      scope: VariableScope for the created subgraph; defaults to "rnn".

    Returns:
      A pair `(outputs, final_state)` as produced by `_dynamic_rnn_loop`,
      with `outputs` registered as the OpHint output.

    Raises:
      AssertionError: If `time_major` is falsy.
      TypeError: If `cell` is not an RNNCell (as documented for
        `assert_like_rnncell`).
      RuntimeError: If control flow v2 is not enabled.
      ValueError: If `sequence_length` has a static rank other than 1, or if
        neither `initial_state` nor `dtype` is given.
    """
    # Currently only support time_major == True case.
    assert time_major

    # TODO(b/123051275): We need to check if the cells are TfLiteLSTMCells or
    # TfLiteRNNCells.
    rnn_cell_impl.assert_like_rnncell("cell", cell)

    if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
        raise RuntimeError("OpHint dynamic rnn only supports control flow v2.")

    # How the parent OpHint's inputs/outputs map onto its child ops.  For an
    # LstmCell the child output index is 2 and for an RnnCell it is 1, so -1
    # is used to mean "the last one".
    children_mappings = {
        "parent_first_child_input": [{
            "parent_ophint_input_index": 0,
            "first_child_ophint_input_index": 0
        }],
        "parent_last_child_output": [{
            "parent_output_index": 0,
            "child_output_index": -1
        }],
        "internal_children_input_output": [{
            "child_input_index": 0,
            "child_output_index": -1
        }],
    }
    hint = op_hint.OpHint(
        "TfLiteDynamicRnn",
        level=2,
        children_inputs_mappings=children_mappings)

    with vs.variable_scope(scope or "rnn") as rnn_scope:
        # The caching device is either inherited from the parent scope or set
        # so cached Variables share the placement of the rest of the RNN.
        if _should_cache() and rnn_scope.caching_device is None:
            rnn_scope.set_caching_device(lambda op: op.device)

        inputs = hint.add_input(inputs, name="input", index_override=0)

        # The loop below always works on [time, batch, depth] tensors.
        flat_inputs = nest.flatten(inputs)

        if not time_major:
            # (batch, time, depth) => (time, batch, depth)
            flat_inputs = [ops.convert_to_tensor(tensor) for tensor in flat_inputs]
            flat_inputs = tuple(
                _transpose_batch_time(tensor) for tensor in flat_inputs)

        if not parallel_iterations:
            parallel_iterations = 32

        if sequence_length is not None:
            sequence_length = math_ops.to_int32(sequence_length)
            static_rank = sequence_length.get_shape().rank
            if static_rank not in (None, 1):
                raise ValueError(
                    "sequence_length must be a vector of length batch_size, "
                    "but saw shape: %s" % sequence_length.get_shape())
            # Identity op just to make the tensor easy to find in the graph.
            sequence_length = array_ops.identity(
                sequence_length, name="sequence_length")

        batch_size = _best_effort_input_batch_size(flat_inputs)

        rnn_state = initial_state
        if rnn_state is None:
            if not dtype:
                raise ValueError(
                    "If there is no initial_state, you must give a dtype.")
            make_initial_state = getattr(cell, "get_initial_state", None)
            if make_initial_state is None:
                rnn_state = cell.zero_state(batch_size, dtype)
            else:
                rnn_state = make_initial_state(
                    inputs=None, batch_size=batch_size, dtype=dtype)

        def _assert_has_shape(x, shape):
            # Runtime check that the dynamic shape of `x` equals `shape`.
            actual = array_ops.shape(x)
            expected = array_ops.stack(shape)
            shapes_match = math_ops.reduce_all(math_ops.equal(actual, expected))
            return control_flow_ops.Assert(shapes_match, [
                "Expected shape for Tensor %s is " % x.name, expected,
                " but saw shape: ", actual
            ])

        if sequence_length is not None and not context.executing_eagerly():
            # Perform some shape validation
            shape_check = _assert_has_shape(sequence_length, [batch_size])
            with ops.control_dependencies([shape_check]):
                sequence_length = array_ops.identity(
                    sequence_length, name="CheckSeqLen")

        inputs = nest.pack_sequence_as(
            structure=inputs, flat_sequence=flat_inputs)

        outputs, final_state = _dynamic_rnn_loop(
            cell,
            inputs,
            rnn_state,
            parallel_iterations=parallel_iterations,
            swap_memory=swap_memory,
            sequence_length=sequence_length,
            dtype=dtype)

        # _dynamic_rnn_loop always emits [time, batch, depth]; batch-major
        # callers would need the outputs transposed back.
        if not time_major:
            # (time, batch, depth) => (batch, time, depth)
            outputs = nest.map_structure(_transpose_batch_time, outputs)
        outputs = hint.add_output(outputs, name="outputs")

        return outputs, final_state
def _build_forward(self):
    """Build the forward graph: embeddings, encoders, span logits and an attention decoder.

    Stages, in order of construction:

    1. optional character-CNN and word embeddings for context ``self.x`` /
       ``self.cx`` and question ``self.q`` / ``self.cq`` (scope "emb");
    2. optional highway network shared between context and question;
    3. bidirectional LSTM encoders (scope "prepro") producing ``u`` (question)
       and ``h`` (context);
    4. attention / modelling layers (scope "main") producing ``p0``, ``g0``,
       ``g1``, ``g2`` and two sets of span logits;
    5. a ``tf.nn.raw_rnn`` decoder with additive attention over ``g1`` that
       either emits the attention distribution (``self.pointer_gen`` true) or
       the cell output projected through ``W_dense`` / ``b_dense``.

    Side effects on ``self``: entries 'x', 'q', 'xx', 'qq', 'u', 'h' of
    ``tensor_dict`` and the attributes ``flat_yp``, ``flat_yp2``,
    ``decoder_logits_train``, ``index_start`` and ``index_end``.

    NOTE(review): the source of this method had lost its indentation.  The
    nesting used below -- in particular that the decoder section sits inside
    the "main" variable scope and that only the five ``tf.get_variable`` calls
    sit inside "rnn_decoder" -- is a reconstruction and determines variable
    names; confirm against the original file.
    """
    config = self.config
    # Static sizes from the config.  W, EW and WOW aside, several of these are
    # immediately replaced by dynamic shapes below.
    N, M, JX, JQ, VW, VC, d, W ,EW, WOW= \
        config.batch_size, config.max_num_sents, config.max_sent_size, \
        config.max_ques_size, config.len_new_emb_mat, config.char_vocab_size, config.hidden_size, \
        config.max_word_size,config.word_vocab_size-config.vw_wo_entity_size,config.vw_wo_entity_size
    JX = tf.shape(self.x)[2]  # words
    JQ = tf.shape(self.q)[1]  # words
    M = tf.shape(self.x)[1]
    dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

    with tf.variable_scope("emb"):
        if config.use_char_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')

            with tf.variable_scope("char"):
                Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
                Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]
                Acx = tf.reshape(Acx, [-1, JX, W, dc])
                Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                # Comma-separated config strings -> lists of ints.
                filter_sizes = list(map(int, config.out_channel_dims.split(',')))
                heights = list(map(int, config.filter_heights.split(',')))
                assert sum(filter_sizes) == dco, (filter_sizes, dco)
                with tf.variable_scope("conv"):
                    xx = multi_conv1d(Acx, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    if config.share_cnn_weights:
                        # Question reuses the context CNN variables (scope "xx").
                        tf.get_variable_scope().reuse_variables()
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="xx")
                    else:
                        qq = multi_conv1d(Acq, filter_sizes, heights, "VALID", self.is_train, config.keep_prob, scope="qq")
                    xx = tf.reshape(xx, [-1, M, JX, dco])
                    qq = tf.reshape(qq, [-1, JQ, dco])

        if config.use_word_emb:
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                if config.mode == 'train':
                    # NOTE(review): the positional arguments of
                    # tf.random_normal_initializer are (mean, stddev), so this
                    # is mean -0.5 / stddev 0.5, not a [-0.5, 0.5] range --
                    # confirm this is intended.
                    init_word_emb = tf.random_normal_initializer(-0.5, 0.5)
                    #entity_emb_mat = tf.get_variable("entity_emb_mat", dtype='float', shape=[EW, EW], initializer=get_initializer(config.onehot_encoded))
                    #entity_emb_out = _linear(entity_emb_mat, dw, True, bias_initializer=tf.constant_initializer(0.0))
                    #word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw], initializer=get_initializer(config.emb_mat))
                    word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, dw], initializer=init_word_emb)
                    #word_emb_mat = tf.concat(axis=0,values=[word_emb_mat, entity_emb_out])
                else:
                    word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, dw], dtype='float')
                #if config.use_glove_for_unk:
                #    word_emb_mat = tf.concat(axis=0, values=[word_emb_mat, self.new_emb_mat])

            with tf.name_scope("word"):
                Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d] i.e. [batch size, max sentences, max words, embedding size]
                Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d] i.e. [batch size, max words, embedding size]
                self.tensor_dict['x'] = Ax
                self.tensor_dict['q'] = Aq
            if config.use_char_emb:
                xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]
            else:
                xx = Ax
                qq = Aq

    # highway network
    if config.highway:
        with tf.variable_scope("highway"):
            xx = highway_network(xx, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)
            # Question goes through the same highway variables.
            tf.get_variable_scope().reuse_variables()
            qq = highway_network(qq, config.highway_num_layers, True, wd=config.wd, is_train=self.is_train)

    self.tensor_dict['xx'] = xx
    self.tensor_dict['qq'] = qq
    #xx = tf.Print(xx,[tf.shape(xx),xx],message="DHRUV xx=",summarize=20)

    # Four pairs of forward/backward LSTM cells, each with a dropout-wrapped
    # twin (the d_* names) that is switched by self.is_train.
    cell_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell_fw = SwitchableDropoutWrapper(cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell_bw = SwitchableDropoutWrapper(cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell2_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell2_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell2_fw = SwitchableDropoutWrapper(cell2_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell2_bw = SwitchableDropoutWrapper(cell2_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell3_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell3_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell3_fw = SwitchableDropoutWrapper(cell3_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell3_bw = SwitchableDropoutWrapper(cell3_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    cell4_fw = BasicLSTMCell(d, state_is_tuple=True)
    cell4_bw = BasicLSTMCell(d, state_is_tuple=True)
    d_cell4_fw = SwitchableDropoutWrapper(cell4_fw, self.is_train, input_keep_prob=config.input_keep_prob)
    d_cell4_bw = SwitchableDropoutWrapper(cell4_bw, self.is_train, input_keep_prob=config.input_keep_prob)
    # Sequence lengths are the number of set mask entries.
    x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N,M]
    q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

    with tf.variable_scope("prepro"):
        (fw_u, bw_u), ((_, fw_u_f), (_, bw_u_f)) = bidirectional_dynamic_rnn(d_cell_fw, d_cell_bw, qq, q_len, dtype='float', scope='u1')  # [N, J, d], [N, d]
        u = tf.concat(axis=2, values=[fw_u, bw_u])
        if config.share_lstm_weights:
            # Context encoder reuses the question encoder's variables ('u1').
            tf.get_variable_scope().reuse_variables()
            (fw_h, bw_h), (fw_s, bw_s) = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='u1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        else:
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell_fw, cell_bw, xx, x_len, dtype='float', scope='h1')  # [N, M, JX, 2d]
            h = tf.concat(axis=3, values=[fw_h, bw_h])  # [N, M, JX, 2d]
        self.tensor_dict['u'] = u
        self.tensor_dict['h'] = h

    with tf.variable_scope("main"):
        if config.dynamic_att:  # not true
            p0 = h
            u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]), [N * M, JQ, 2 * d])
            q_mask = tf.reshape(tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]), [N * M, JQ])
            first_cell_fw = AttentionCell(cell2_fw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            first_cell_bw = AttentionCell(cell2_bw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            second_cell_fw = AttentionCell(cell3_fw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
            second_cell_bw = AttentionCell(cell3_bw, u, mask=q_mask, mapper='sim', input_keep_prob=self.config.input_keep_prob, is_train=self.is_train)
        else:
            p0 = attention_layer(config, self.is_train, h, u, h_mask=self.x_mask, u_mask=self.q_mask, scope="p0", tensor_dict=self.tensor_dict)  # p0 seems to be G in paper
            first_cell_fw = d_cell2_fw
            second_cell_fw = d_cell3_fw
            first_cell_bw = d_cell2_bw
            second_cell_bw = d_cell3_bw
        #p1 = tf.reshape(p0,[N , M*JX, 8*d])
        (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(first_cell_fw, first_cell_bw, p0, x_len, dtype='float', scope='g0')  # [N, M, JX, 2d]
        g0 = tf.concat(axis=3, values=[fw_g0, bw_g0])
        # The final states of this layer seed the decoder further down.
        (fw_g1, bw_g1), (my_fw_final_state, my_bw_final_state) = bidirectional_dynamic_rnn(second_cell_fw, second_cell_bw, g0, x_len, dtype='float', scope='g1')  # [N, M, JX, 2d]
        g1 = tf.concat(axis=3, values=[fw_g1, bw_g1])  # g1 seems to be M in paper
        #g1= tf.reshape(g1,[N, M , JX, 2*d]) #reshaping here again, since g1 is used ahead
        # Debug prints of the dynamic shapes (first 5 evaluations only).
        g1 = tf.Print(g1, [tf.shape(g1)], message="g1 shape", first_n=5, summarize=200)
        p0 = tf.Print(p0, [tf.shape(p0)], message="p0 shape", first_n=5, summarize=200)
        # Flattened view of g1 used as the decoder's attention memory.
        g11 = tf.reshape(g1, [N, -1, 2 * d])
        # Concatenate forward and backward final states into one LSTM state.
        my_encoder_final_state_c = tf.concat(values=(my_fw_final_state.c, my_bw_final_state.c), axis=1, name="my_encoder_final_state_c")
        my_encoder_final_state_h = tf.concat(values=(my_fw_final_state.h, my_bw_final_state.h), axis=1, name="my_encoder_final_state_h")
        my_encoder_final_state = tf.contrib.rnn.LSTMStateTuple(c=my_encoder_final_state_c, h=my_encoder_final_state_h)

        #compute indices for finding span as the second task in multi task learning
        logits = get_logits([g1, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits1')
        logits = tf.Print(logits, [tf.shape(logits)], message="logits shape", first_n=5, summarize=200)
        a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), tf.reshape(logits, [N, M * JX]))
        a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), [1, M, JX, 1])

        (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(d_cell4_fw, d_cell4_bw, tf.concat(axis=3, values=[p0, g1, a1i, g1 * a1i]), x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
        g2 = tf.concat(axis=3, values=[fw_g2, bw_g2])
        logits2 = get_logits([g2, p0], d, True, wd=config.wd, input_keep_prob=config.input_keep_prob, mask=self.x_mask, is_train=self.is_train, func=config.answer_func, scope='logits2')

        flat_logits = tf.reshape(logits, [-1, M * JX])
        flat_logits = tf.Print(flat_logits, [tf.shape(flat_logits), flat_logits], message="flat_logits shape and contents", first_n=5, summarize=200)
        self.flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
        flat_logits2 = tf.reshape(logits2, [-1, M * JX])
        self.flat_yp2 = tf.nn.softmax(flat_logits2)

        # ------------------------------------------------------------------
        # Decoder
        # ------------------------------------------------------------------
        tgt_vocab_size = config.len_new_emb_mat  # hparam
        # FIXME: Obtain embeddings differently?
        print("length is", config.len_new_emb_mat)
        nodes = d
        # Look up embedding
        decoder_emb_inp = tf.nn.embedding_lookup(word_emb_mat, self.decoder_inputs)  # [batch_size, max words, embedding_size]
        with tf.variable_scope("rnn_decoder", reuse=tf.AUTO_REUSE):
            init = tf.random_normal_initializer(0.0, 0.5)
            # Output projection (used when self.pointer_gen is false).
            W_dense = tf.get_variable(name="W_dense", shape=[2 * nodes, tgt_vocab_size], dtype=tf.float32, initializer=init)
            b_dense = tf.get_variable(name="b_dense", shape=[tgt_vocab_size], dtype=tf.float32, initializer=tf.zeros_initializer)
            # Additive-attention parameters for the decoder and encoder sides.
            W_att_dec = tf.get_variable(name="W_att_dec", shape=[2 * nodes, 2 * nodes], dtype=tf.float32, initializer=init)
            W_att_enc = tf.get_variable(name="W_att_enc1", shape=[1, 1, 2 * nodes, 2 * nodes], dtype=tf.float32, initializer=init)
            v_blend = tf.get_variable(name="v_blend", shape=[1, 2 * nodes], dtype=tf.float32, initializer=init)

        # Embedding of token id 0, fed as input once every sequence is done.
        pad_time_slice = tf.fill([N], 0, name='PAD')
        pad_step_embedded = tf.nn.embedding_lookup(word_emb_mat, pad_time_slice)
        decoder_cell = tf.contrib.rnn.BasicLSTMCell(2 * nodes, state_is_tuple=True)  # doesnt work without the factor of 2??
        '''Loop transition function is a mapping (time, previous_cell_output, previous_cell_state, previous_loop_state) -> (elements_finished, input, cell_state, output, loop_state). It is called before RNNCell to prepare its inputs and state. Everything is a Tensor except for initial call at time=0 when everything is None (except time).'''

        def execute_pointer_network(attn_dist):
            # Find the position that received the highest attention and return
            # the token of self.x stored there, to feed the next decoder step.
            index_pos = tf.argmax(attn_dist, axis=1)
            index_pos = tf.expand_dims(index_pos, 1)
            # Build (batch index, 0, position) triples for gather_nd on self.x.
            index_pos = tf.concat([
                tf.reshape(tf.range(start=0, limit=N, dtype=tf.int64), [N, 1]),
                tf.zeros([N, 1], tf.int64),
                index_pos
            ], axis=1)
            index_pos = tf.cast(tf.gather_nd(params=self.x, indices=index_pos), dtype=tf.int64)
            return index_pos

        def execute_normal_decoder(previous_output, W_dense, b_dense):
            # Greedy choice over the projected previous cell output.
            output_logits = tf.add(tf.matmul(previous_output, W_dense), b_dense)
            return tf.argmax(output_logits, axis=1)

        def loop_fn_initial():
            initial_elements_finished = (0 >= self.target_sequence_length)  # all False at the initial step
            #initial_input = tf.concat([decoder_emb_inp[:,0], my_encoder_final_state_h], 1)
            initial_input = decoder_emb_inp[:, 0]
            initial_cell_state = my_encoder_final_state
            # setting the correct shapes, as it is used to determine the emit structure
            initial_cell_output = tf.cond(self.pointer_gen, lambda: tf.zeros([M * JX], tf.float32), lambda: tf.zeros([2 * nodes], tf.float32))
            initial_loop_state = None  # we don't need to pass any additional information
            return (initial_elements_finished, initial_input, initial_cell_state, initial_cell_output, initial_loop_state)

        # Insert a singleton axis so the memory can go through tf.nn.conv2d.
        encoder_output = tf.expand_dims(g11, axis=2)

        def loop_fn_transition(time, previous_output, previous_state, previous_loop_state):

            def get_next_input():
                # compute Bahdanau style attention
                # performing convolution or reshaping input to (-1,2*d) and then doing matmul, is essentially the same operation
                # see matrix_mult.py...conv2d might be faster??
                # https://stackoverflow.com/questions/38235555/tensorflow-matmul-of-input-matrix-with-batch-data
                encoder_features = tf.nn.conv2d(encoder_output, W_att_enc, [1, 1, 1, 1], "SAME")  # shape (batch_size,max_enc_steps,1,attention_vec_size)
                dec_portion = tf.matmul(previous_state.h, W_att_dec)
                decoder_features = tf.expand_dims(tf.expand_dims(dec_portion, 1), 1)  # reshape to (batch_size, 1, 1, attention_vec_size)
                # broadcasting will allow the two features to get added
                # NOTE(review): as the name says, these scores are not masked
                # before the softmax.
                e_not_masked = tf.reduce_sum(v_blend * tf.nn.tanh(encoder_features + decoder_features), [2, 3])  # calculate e, (batch_size, max_enc_steps)
                # The shape of output of a softmax is the same as the input: it just normalizes the values.
                attn_dist = tf.nn.softmax(e_not_masked)  # (batch_size, max_enc_steps)
                attn_dist = tf.Print(attn_dist, [tf.shape(attn_dist)], message="attn_dist", first_n=5, summarize=200)
                # Multiplying all the 2d vectors with same attn_dist values, and finally keeping 1 2d vector for every batch example
                context_vector = tf.reduce_sum(tf.reshape(attn_dist, [N, -1, 1, 1]) * encoder_output, [1, 2])  # shape (batch_size, attn_size).
                context_vector = tf.reshape(context_vector, [-1, 2 * nodes])
                #next_input = tf.cond(self.is_train, lambda: tf.concat(
                #    [tf.reshape(decoder_emb_inp[:, time], (N, dw)), context_vector], 1),
                #    lambda: tf.concat([tf.nn.embedding_lookup(word_emb_mat, prediction), context_vector], 1))
                #output_logits = tf.add(tf.matmul(previous_output, W_dense), b_dense)
                prediction = tf.cond(self.pointer_gen, lambda: execute_pointer_network(attn_dist), lambda: execute_normal_decoder(previous_output, W_dense, b_dense))
                with tf.variable_scope("modified_dec_inputs", reuse=tf.AUTO_REUSE):
                    # Training feeds the given decoder input at this step;
                    # otherwise the embedding of the previous prediction is fed.
                    next_input = tf.cond(
                        self.is_train,
                        lambda: _linear(args=[context_vector] + [tf.reshape(decoder_emb_inp[:, time], (N, dw))], output_size=dw, bias=True),
                        lambda: _linear([context_vector] + [tf.nn.embedding_lookup(word_emb_mat, prediction)], dw, True))
                return next_input, attn_dist

            elements_finished = (time >= self.target_sequence_length)  # this operation produces boolean tensor of [batch_size]
            # defining if corresponding sequence has ended
            finished = tf.reduce_all(elements_finished)  # -> boolean scalar
            #input = tf.cond(finished, lambda: tf.concat([pad_step_embedded, my_encoder_final_state_h], 1),get_next_input)
            # NOTE(review): `input` shadows the Python builtin of the same name.
            input, attn_distribution = tf.cond(finished, lambda: (pad_step_embedded, tf.zeros([N, M * JX], tf.float32)), get_next_input)
            attn_distribution = tf.Print(attn_distribution, [tf.shape(attn_distribution)], message="attn_distribution", first_n=5, summarize=200)
            state = previous_state
            # Pointer mode emits the attention distribution instead of the cell output.
            output = tf.cond(self.pointer_gen, lambda: attn_distribution, lambda: previous_output)
            output = tf.Print(output, [tf.shape(output)], message="OUTPUT", first_n=5, summarize=200)
            loop_state = None
            return (elements_finished, input, state, output, loop_state)

        def loop_fn(time, previous_output, previous_state, previous_loop_state):
            # Dispatch between the initial call (state is None) and later steps.
            if previous_state is None:  # time == 0
                assert previous_output is None and previous_state is None
                return loop_fn_initial()
            else:
                return loop_fn_transition(time, previous_output, previous_state, previous_loop_state)

        decoder_outputs_ta, decoder_final_state, _ = tf.nn.raw_rnn(decoder_cell, loop_fn)
        decoder_outputs = decoder_outputs_ta.stack()
        decoder_outputs = tf.Print(decoder_outputs, [tf.shape(decoder_outputs)], message="decoder_outputs", first_n=5, summarize=200)
        # To do output projection, we have to temporarily flatten decoder_outputs from [max_steps, batch_size, hidden_dim] to
        # [max_steps*batch_size, hidden_dim], as tf.matmul needs rank-2 tensors at most.
        decoder_max_steps, decoder_batch_size, decoder_dim = tf.unstack(tf.shape(decoder_outputs))
        decoder_outputs_flat = tf.reshape(decoder_outputs, (-1, decoder_dim))
        # if pointer networks, no need to pass through dense layer
        decoder_logits_flat = tf.cond(self.pointer_gen, lambda: decoder_outputs_flat, lambda: tf.add(tf.matmul(decoder_outputs_flat, W_dense), b_dense))
        decoder_logits = tf.cond(
            self.pointer_gen,
            lambda: tf.reshape(decoder_logits_flat, (decoder_max_steps, decoder_batch_size, decoder_dim)),
            lambda: tf.reshape(decoder_logits_flat, (decoder_max_steps, decoder_batch_size, tgt_vocab_size)))
        # raw_rnn output is time-major; switch to batch-major.
        decoder_logits = _transpose_batch_time(decoder_logits)
        #decoder_prediction = tf.argmax(decoder_logits, -1)
        #self.decoder_logits_train = final_outputs.rnn_output
        self.decoder_logits_train = decoder_logits
        self.index_start = flat_logits
        self.index_end = flat_logits2
def model_fn(features, labels, mode, params, config):
    """Estimator ``model_fn`` for an autoregressive character RNN on ``tf.nn.raw_rnn``.

    In TRAIN/EVAL mode the network is teacher-forced: the input at step ``t``
    is the one-hot encoding of the observed character at step ``t - 1`` (all
    zeros at step 0), and the loss is the negative mean log of the probability
    the model assigns to the observed characters. In every other mode the
    network samples a character at each step and feeds the sample back in as
    the next input.

    Args:
        features: In TRAIN/EVAL mode, a 4-tuple ``(X_s, X_l, X_r, X_u)`` of
            which only ``X_s`` and ``X_l`` are used. ``X_s`` is presumably a
            ``[batch, time]`` integer tensor of character ids and ``X_l`` the
            per-sequence lengths -- TODO confirm against the input_fn.
            Not read in other modes.
        labels: Unused; present to satisfy the Estimator signature.
        mode: A ``ModeKeys`` value selecting training/evaluation or sampling.
        params: Hyper-parameter object. Reads ``num_char``, ``num_hidden``,
            ``cell`` (``'lstm'`` or ``'sru'``) and ``learning_rate``; outside
            TRAIN/EVAL also ``infer_batch_size`` and ``infer_seq_length``.
        config: Unused; present to satisfy the Estimator signature.

    Returns:
        A ``tf.estimator.EstimatorSpec``. In TRAIN/EVAL it carries the loss,
        an RMSProp train op and a chief-only logging hook; otherwise it
        carries the sampled character ids as ``predictions``.

    Raises:
        KeyError: If ``params.cell`` is neither ``'lstm'`` nor ``'sru'``.
    """
    # The graph is specialised at construction time on this Python boolean.
    is_supervised = mode in (ModeKeys.TRAIN, ModeKeys.EVAL)

    cur_batch_D = params.num_char  # vocabulary size

    if is_supervised:
        X_s, X_l, _, _ = features
        cur_batch_B = tf.shape(X_s)[0]
        cur_batch_T = tf.shape(X_s)[1]
        Xs_embd = tf.one_hot(X_s, cur_batch_D)
        # Time-major TensorArray so a single step can be read inside loop_fn.
        X_ta = tf.TensorArray(size=cur_batch_T, dtype=tf.float32).unstack(
            _transpose_batch_time(Xs_embd), 'TBD_Formatted_X')
    else:
        cur_batch_B = params.infer_batch_size
        cur_batch_T = params.infer_seq_length

    acell = {
        'lstm': lambda: LSTMCell(params.num_hidden),
        'sru': lambda: SRUCell(params.num_hidden)
    }[params.cell]()

    output_layer_info = {
        'units': cur_batch_D,  # this is the size of vocabulary
        'name': 'out_to_character',
        # Linear on purpose: the output of this layer is consumed as
        # ``logits=`` by both Categorical and the cross-entropy op, each of
        # which applies its own normalisation. A softmax here would be
        # normalised a second time and flatten the distribution.
        'activation': None,
    }

    with tf.variable_scope('Shared_Dense', reuse=False) as dense_layer_scope:
        # Build the layer once on a dummy input so its variables exist;
        # get_logits() below re-enters this scope with reuse=True.
        zeros_placeholder = tf.zeros([1, acell.output_size])
        tf.layers.dense(zeros_placeholder, **output_layer_info)

    def get_logits(cell_out):
        # Project cell outputs onto the vocabulary with the shared layer.
        with tf.variable_scope(dense_layer_scope, reuse=True):
            return tf.layers.dense(cell_out, **output_layer_info)

    def get_dist(cell_out):
        return Categorical(logits=get_logits(cell_out),
                           name='categorical_dist',
                           allow_nan_stats=False,
                           dtype=tf.int32)

    def get_sample(cell_out):
        # One-hot so the sample can be fed straight back in as the next input.
        return tf.one_hot(get_dist(cell_out).sample(), cur_batch_D)

    def get_prob(cell_out, obs):
        # Probability the model assigns to the observed ids ``obs``.
        return get_dist(cell_out).prob(obs)

    with tf.variable_scope('Initial_State'):
        # Learned initial state: one row, tiled across the batch.
        h_init = tf.tile(
            tf.get_variable('init_state_h', [1, params.num_hidden],
                            initializer=tf.random_uniform_initializer(0)),
            [cur_batch_B, 1])
        c_init = tf.tile(
            tf.get_variable('init_state_c', [1, params.num_hidden],
                            initializer=tf.random_uniform_initializer(0)),
            [cur_batch_B, 1])
        # NOTE(review): an LSTMStateTuple is used for the 'sru' cell as well;
        # confirm SRUCell accepts this state structure.
        cell_init_state = LSTMStateTuple(c_init, h_init)

    first_step = tf.zeros(shape=[cur_batch_B, cur_batch_D],
                          dtype=tf.float32,
                          name='first_character')

    with tf.name_scope('NADE'):
        output_ta = tf.TensorArray(size=cur_batch_T, dtype=tf.float32)

        def loop_fn(time, cell_output, cell_state, loop_state):
            emit_output = cell_output  # == None for time == 0
            if cell_output is None:
                # Initial call: provide start state, first input and the
                # TensorArray that will collect the per-step inputs.
                next_cell_state = cell_init_state
                next_step = first_step
                next_loop_state = output_ta
            else:
                # Pass the last state through to the next step.
                next_cell_state = cell_state
                if is_supervised:
                    # Teacher forcing: feed the observed character.
                    next_step = X_ta.read(time - 1)
                else:
                    # Free running: feed back a sample from the model.
                    next_step = get_sample(cell_output)
                next_loop_state = loop_state.write(time - 1, next_step)

            if is_supervised:
                elements_finished = (time >= X_l)
            else:
                elements_finished = (time >= cur_batch_T)
            return (elements_finished, next_step, next_cell_state,
                    emit_output, next_loop_state)

        output_ta, _, loop_state_ta = tf.nn.raw_rnn(acell, loop_fn)

    with tf.name_scope('Output'):
        outputs = _transpose_batch_time(output_ta.stack())
        logits = get_logits(outputs)

    if is_supervised:
        # NOTE(review): the mean runs over every emitted time step; padded
        # positions do not appear to be masked -- confirm this is intended.
        logp_loss = -tf.reduce_mean(tf.log(1e-6 + get_prob(outputs, X_s)))
        # Logged for monitoring only; the optimiser minimises logp_loss.
        xentropy_loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=Xs_embd,
                                                    logits=logits),
            name='xtropy_loss')
        train_op = tf.train.RMSPropOptimizer(
            learning_rate=params.learning_rate).minimize(
                loss=logp_loss, global_step=tf.train.get_global_step())
        # Hand the hook the tensor itself rather than its string name so the
        # lookup cannot break if the op name gets uniquified or scoped.
        logging_hook = tf.train.LoggingTensorHook(
            tensors={"xtropy_loss": xentropy_loss}, every_n_iter=100)
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=logp_loss,
                                          train_op=train_op,
                                          training_chief_hooks=[logging_hook])

    # Sampling mode: the loop state holds the one-hot samples, time-major.
    X_sampled = tf.argmax(_transpose_batch_time(loop_state_ta.stack()),
                          axis=2)
    return tf.estimator.EstimatorSpec(mode=mode, predictions=X_sampled)