def decoder_teacher_forcing(
        self,
        encoder_output,
        target=None,
        encoder_end_state=None
):
    """Decode with teacher forcing and return the length-masked logits.

    The target is wrapped with ``self.GO_SYMBOL`` / ``self.END_SYMBOL``,
    embedded with ``self.decoder_embedding`` and fed to a
    ``tfa.seq2seq.BasicDecoder`` built around ``self.decoder_rnncell``.

    Args:
        encoder_output: Encoder output. Its first dimension is used as the
            batch size and, when ``self.attention_mechanism`` is set, it is
            installed as the attention memory.
        target: Target sequences, concatenated along axis 1 with the GO and
            END symbols. It is used unconditionally, so the ``None`` default
            is only a placeholder.
        encoder_end_state: Encoder final state, forwarded to
            ``self.build_decoder_initial_state``.

    Returns:
        The decoder's ``rnn_output`` where every time step at or beyond the
        sequence length reported by the decoder is multiplied by zero.
    """
    # ================ Setup ================
    # Use the dynamic shape: the static batch dimension is None whenever the
    # batch size is not known at trace time.
    batch_size = tf.shape(encoder_output)[0]

    # Prepare target for decoding
    target_sequence_length = sequence_length_2D(target)
    start_tokens = tf.tile([self.GO_SYMBOL], [batch_size])
    end_tokens = tf.tile([self.END_SYMBOL], [batch_size])
    if self.is_timeseries:
        start_tokens = tf.cast(start_tokens, tf.float32)
        end_tokens = tf.cast(end_tokens, tf.float32)
    targets_with_go_and_eos = tf.concat([
        tf.expand_dims(start_tokens, 1),
        target,  # todo tf2: right now cast to tf.int32, fails if tf.int64
        tf.expand_dims(end_tokens, 1)], 1)
    target_sequence_length_with_eos = target_sequence_length + 1

    # Decoder Embeddings
    decoder_emb_inp = self.decoder_embedding(targets_with_go_and_eos)

    # Setting up decoder memory from encoder output
    if self.attention_mechanism is not None:
        encoder_sequence_length = sequence_length_3D(encoder_output)
        self.attention_mechanism.setup_memory(
            encoder_output,
            memory_sequence_length=encoder_sequence_length
        )

    decoder_initial_state = self.build_decoder_initial_state(
        batch_size,
        encoder_state=encoder_end_state,
        dtype=tf.float32
    )

    decoder = tfa.seq2seq.BasicDecoder(
        self.decoder_rnncell,
        sampler=self.sampler,
        output_layer=self.dense_layer
    )

    # BasicDecoderOutput
    outputs, final_state, generated_sequence_lengths = decoder(
        decoder_emb_inp,
        initial_state=decoder_initial_state,
        sequence_length=target_sequence_length_with_eos
    )

    logits = outputs.rnn_output
    # Dynamic time dimension for the same reason as batch_size above.
    mask = tf.sequence_mask(
        generated_sequence_lengths,
        maxlen=tf.shape(logits)[1],
        dtype=tf.float32
    )
    logits = logits * mask[:, :, tf.newaxis]

    return logits  # , outputs, final_state, generated_sequence_lengths
def call(self, y_true, y_pred):
    """Compute the masked, averaged sequence loss.

    Args:
        y_true: Target ids, cast to ``tf.int64``; indexed as a rank-2
            tensor ``[batch_size, sequence_size]``.
        y_pred: Mapping whose ``LOGITS`` entry is indexed as a rank-3
            tensor ``[batch_size, sequence_size, num_classes]``.

    Returns:
        The sum of ``self.loss_function`` over the unmasked time steps
        divided by the number of unmasked time steps.
    """
    logits = y_pred[LOGITS]
    labels = tf.cast(y_true, dtype=tf.int64)

    # Right-pad whichever tensor is shorter along the time axis with zeros
    # so that both share the same number of steps.
    logits_steps = tf.shape(logits)[1]
    labels_steps = tf.shape(labels)[1]
    missing_in_logits = tf.maximum(0, labels_steps - logits_steps)
    missing_in_labels = tf.maximum(0, logits_steps - labels_steps)
    logits = tf.pad(logits, [[0, 0], [0, missing_in_logits], [0, 0]])
    labels = tf.pad(labels, [[0, 0], [0, missing_in_labels]])

    # A step is valid if it lies within the longer of the two sequences,
    # plus one extra step for EOS, capped at the padded length.
    padded_steps = tf.shape(labels)[1]
    valid_steps = tf.maximum(
        sequence_length_2D(labels),
        sequence_length_3D(logits)
    )
    valid_steps = tf.minimum(valid_steps + 1, padded_steps)
    step_mask = tf.sequence_mask(
        valid_steps,
        maxlen=padded_steps,
        dtype=tf.float32
    )

    # compute loss based on valid time steps
    masked_loss = self.loss_function(labels, logits) * step_mask
    return tf.reduce_sum(masked_loss) / tf.reduce_sum(step_mask)
def reduce_last(sequence, **kwargs):
    """Return, for each batch row, the vector at its last valid time step.

    Args:
        sequence: Tensor indexed as ``[batch, time, ...]``; per-row lengths
            come from ``sequence_length_3D``.
        **kwargs: Accepted and ignored.

    Returns:
        ``sequence[i, max(length_i - 1, 0)]`` gathered for every row ``i``.
    """
    lengths = sequence_length_3D(sequence)
    # Clamp at 0 so rows with zero length still yield a valid index.
    last_step = tf.maximum(lengths - 1, 0)
    row_ids = tf.range(tf.shape(sequence)[0])
    gather_indices = tf.stack([row_ids, last_step], axis=1)
    return tf.gather_nd(sequence, gather_indices)
def call(self, inputs, training=None, mask=None):
    """Select the output at the last valid time step of every batch row.

    Args:
        inputs: Tensor indexed as ``[batch, time, ...]``; per-row lengths
            come from ``sequence_length_3D``.
        training: Unused.
        mask: Unused.

    Returns:
        ``inputs[i, max(length_i - 1, 0)]`` gathered for every row ``i``.
    """
    lengths = sequence_length_3D(inputs)
    # Clamp at 0 so rows with zero length still yield a valid index.
    last_step = tf.maximum(lengths - 1, 0)
    row_ids = tf.range(tf.shape(inputs)[0])
    return tf.gather_nd(inputs, tf.stack([row_ids, last_step], axis=1))
def call(self, y_true, y_pred):
    """Compute the masked, averaged sequence loss.

    Padding is done with dynamic shapes so the method also works when the
    time or batch dimension is statically unknown, and ``y_true`` is cast
    (rather than converted) so tensors of another integer dtype are
    accepted.

    Args:
        y_true: Target ids, cast to ``tf.int64``; indexed as a rank-2
            tensor ``[batch_size, sequence_size]``.
        y_pred: Mapping whose ``LOGITS`` entry is indexed as a rank-3
            tensor ``[batch_size, sequence_size, num_classes]``.

    Returns:
        The sum of ``self.loss_function`` over the unmasked time steps
        divided by the number of unmasked time steps.
    """
    y_pred = y_pred[LOGITS]
    # tf.cast accepts both array-likes and tensors of any numeric dtype;
    # tf.convert_to_tensor raises if y_true is already a non-int64 tensor.
    y_true = tf.cast(y_true, dtype=tf.int64)

    # pad the shorter sequence (zero padding on the right of the time axis)
    y_pred_seq_len = tf.shape(y_pred)[1]
    y_true_seq_len = tf.shape(y_true)[1]
    y_pred_pad_len = tf.maximum(0, y_true_seq_len - y_pred_seq_len)
    y_true_pad_len = tf.maximum(0, y_pred_seq_len - y_true_seq_len)
    y_pred = tf.pad(y_pred, [[0, 0], [0, y_pred_pad_len], [0, 0]])
    y_true = tf.pad(y_true, [[0, 0], [0, y_true_pad_len]])

    max_steps = tf.shape(y_true)[1]
    longest_sequence_length = tf.maximum(
        sequence_length_2D(y_true),
        sequence_length_3D(y_pred)
    )
    longest_sequence_length += 1  # for EOS
    longest_sequence_length = tf.minimum(longest_sequence_length, max_steps)
    mask = tf.sequence_mask(
        longest_sequence_length,
        maxlen=max_steps,
        dtype=tf.float32
    )

    # compute loss based on valid time steps
    loss = self.loss_function(y_true, y_pred)
    loss = loss * mask
    loss = tf.reduce_sum(loss) / tf.reduce_sum(mask)
    return loss
def decoder_teacher_forcing(
        self,
        encoder_output,
        target=None,
        encoder_end_state=None
):
    """Decode with teacher forcing using the custom ``BasicDecoder``.

    Args:
        encoder_output: Encoder output. Its dynamic first dimension is the
            batch size and, when ``self.attention_mechanism`` is set, it is
            installed as the attention memory.
        target: Target sequences, concatenated along axis 1 with the GO and
            END symbols. It is used unconditionally.
        encoder_end_state: Encoder final state, forwarded to
            ``self.build_decoder_initial_state``.

    Returns:
        A dict with ``LOGITS`` (the decoder ``rnn_output`` with one extra
        zero step appended on the time axis) and ``PROJECTION_INPUT`` (the
        decoder's ``projection_input``). The logits are not masked by the
        generated sequence lengths.
    """
    # ================ Setup ================
    batch_size = tf.shape(encoder_output)[0]

    # Build [GO, target..., END] and the matching lengths
    target_lengths = sequence_length_2D(target)
    go_tokens = tf.tile([self.GO_SYMBOL], [batch_size])
    eos_tokens = tf.tile([self.END_SYMBOL], [batch_size])
    if self.is_timeseries:
        go_tokens = tf.cast(go_tokens, tf.float32)
        eos_tokens = tf.cast(eos_tokens, tf.float32)
    wrapped_target = tf.concat(
        [
            tf.expand_dims(go_tokens, 1),
            target,  # right now cast to tf.int32, fails if tf.int64
            tf.expand_dims(eos_tokens, 1),
        ],
        1
    )
    lengths_with_eos = target_lengths + 1

    # Decoder Embeddings
    embedded_target = self.decoder_embedding(wrapped_target)

    # Attention memory comes from the encoder output
    if self.attention_mechanism is not None:
        self.attention_mechanism.setup_memory(
            encoder_output,
            memory_sequence_length=sequence_length_3D(encoder_output)
        )

    initial_state = self.build_decoder_initial_state(
        batch_size,
        encoder_state=encoder_end_state,
        dtype=tf.float32
    )

    # use Ludwig custom BasicDecoder
    decoder = BasicDecoder(
        self.decoder_rnncell,
        sampler=self.sampler,
        output_layer=self.dense_layer
    )

    # BasicDecoderOutput, final state, generated sequence lengths
    outputs, _final_state, _generated_lengths = decoder(
        embedded_target,
        initial_state=initial_state,
        sequence_length=lengths_with_eos
    )

    # append a trailing 0, useful for those datapoints that reach maximum
    # length and don't have a eos at the end
    padded_logits = tf.pad(outputs.rnn_output, [[0, 0], [0, 1], [0, 0]])

    # EXPECTED SIZE OF RETURNED TENSORS
    # logits: shape[batch_size, seq_size, num_classes] used for evaluation
    # projection_input: shape[batch_size, seq_size, state_size] for sampled
    # softmax
    return {
        LOGITS: padded_logits,
        PROJECTION_INPUT: outputs.projection_input
    }
def __call__(self, output_feature, targets, hidden, hidden_size,
             regularizer, is_timeseries=False):
    """Project every time step of ``hidden`` onto the output classes.

    Args:
        output_feature: Dict read for ``'num_classes'`` and ``'name'``.
            When ``is_timeseries`` is true its ``'num_classes'`` entry is
            overwritten with 1.
        targets: Target sequences, only used to compute their lengths.
        hidden: Rank-3 tensor ``[batch x sequence x hidden]``.
        hidden_size: Size of the last dimension of ``hidden``.
        regularizer: Regularizer for the class weights; dropped when
            ``self.regularize`` is false.
        is_timeseries: If true, predictions are the reshaped logits and
            probabilities are zeros.

    Returns:
        A 9-tuple: predictions, probabilities, prediction lengths,
        probabilities (again), target lengths, logits, hidden,
        class weights, class biases.

    Raises:
        ValueError: If ``hidden`` is not rank 3.
    """
    logging.info(' hidden shape: {0}'.format(hidden.shape))
    hidden_rank = len(hidden.shape)
    if hidden_rank != 3:
        raise ValueError(
            'Decoder inputs rank is {}, but should be 3 [batch x sequence x hidden] '
            'when using a tagger sequential decoder. '
            'Consider setting reduce_output to null / None if a sequential encoder / combiner is used.'
            .format(hidden_rank))

    if is_timeseries:
        output_feature['num_classes'] = 1

    if not self.regularize:
        regularizer = None

    # Time dimension is captured before the optional attention step
    sequence_length = tf.shape(hidden)[1]

    if self.attention:
        hidden, hidden_size = feed_forward_memory_attention(
            hidden, hidden, hidden_size)
    targets_sequence_length = sequence_length_2D(targets)

    num_classes = output_feature['num_classes']
    initializer_obj = get_initializer(self.initializer)
    class_weights = tf.get_variable(
        'weights',
        initializer=initializer_obj([hidden_size, num_classes]),
        regularizer=regularizer)
    logging.debug(' weights: {0}'.format(class_weights))

    class_biases = tf.get_variable('biases', [num_classes])
    logging.debug(' biases: {0}'.format(class_biases))

    # Flatten batch and time, apply the linear layer, then restore the
    # sequence structure.
    flat_hidden = tf.reshape(hidden, [-1, hidden_size])
    flat_logits = tf.matmul(flat_hidden, class_weights) + class_biases
    logits = tf.reshape(flat_logits, [-1, sequence_length, num_classes])
    logging.debug(' logits: {0}'.format(logits))

    if is_timeseries:
        probabilities_sequence = tf.zeros_like(logits)
        predictions_sequence = tf.reshape(logits, [-1, sequence_length])
    else:
        feature_name = output_feature['name']
        probabilities_sequence = tf.nn.softmax(
            logits,
            name='probabilities_{}'.format(feature_name))
        predictions_sequence = tf.argmax(
            logits,
            -1,
            name='predictions_{}'.format(feature_name),
            output_type=tf.int32)

    predictions_sequence_length = sequence_length_3D(hidden)

    return (
        predictions_sequence,
        probabilities_sequence,
        predictions_sequence_length,
        probabilities_sequence,
        targets_sequence_length,
        logits,
        hidden,
        class_weights,
        class_biases,
    )
def __call__(
        self,
        inputs,  # encoder outputs
        training=None,
        mask=None,
        **kwargs
):
    """Concatenate all input feature encodings along the hidden dimension.

    The main sequence feature (``self.main_sequence_feature``, or the first
    input whose ``'encoder_output'`` is rank 3) defines the sequence
    length. Rank-3 inputs are concatenated as they are, rank-2 inputs are
    tiled along the time axis first. The result is masked with the main
    feature's sequence length and passed through ``self.reduce_sequence``.

    Args:
        inputs: Dict mapping feature name to a dict that contains at least
            ``'encoder_output'``.
        training: Unused here.
        mask: Unused here.
        **kwargs: Unused here.

    Returns:
        Dict with ``'combiner_output'``. When there is exactly one input,
        its other entries (everything except ``'encoder_output'``) are
        copied into the returned dict as well.

    Raises:
        Exception: If no main sequence feature can be determined.
        ValueError: If a rank-3 input has a different static sequence
            length than the main feature, or an input is neither rank 2
            nor rank 3.
    """
    if (self.main_sequence_feature is None or
            self.main_sequence_feature not in inputs):
        for if_name, if_outputs in inputs.items():
            # todo: when https://github.com/uber/ludwig/issues/810 is closed
            #       convert following test from using shape to use explicit
            #       if_outputs['type'] values for sequence features
            if len(if_outputs['encoder_output'].shape) == 3:
                self.main_sequence_feature = if_name
                break

    if self.main_sequence_feature is None:
        raise Exception(
            'No sequence feature available for sequence combiner')

    main_sequence_feature_encoding = inputs[self.main_sequence_feature]
    representation = main_sequence_feature_encoding['encoder_output']
    representations = [representation]

    sequence_max_length = representation.shape[1]
    sequence_length = sequence_length_3D(representation)

    # ================ Concat ================
    for if_name, if_outputs in inputs.items():
        if if_name != self.main_sequence_feature:
            if_representation = if_outputs['encoder_output']
            if len(if_representation.shape) == 3:
                # The following check makes sense when
                # both representations have a specified
                # sequence length dimension. If they do not,
                # then this check is simply checking if None == None
                # and will not catch discrepancies in the different
                # feature length dimension. Those errors will show up
                # at training time. Possible solutions to this is
                # to enforce a length second dimension in
                # sequential feature placeholders, but that
                # does not work with BucketedBatcher that requires
                # the second dimension to be undefined in order to be
                # able to trim the data points and speed up computation.
                # So for now we are keeping things like this, make sure
                # to write in the documentation that training time
                # dimensions mismatch may occur if the sequential
                # features have different lengths for some data points.
                if if_representation.shape[1] != representation.shape[1]:
                    raise ValueError(
                        'The sequence length of the input feature {} '
                        'is {} and is different from the sequence '
                        'length of the main sequence feature {} which '
                        'is {}.\n Shape of {}: {}, shape of {}: {}.\n'
                        'Sequence lengths of all sequential features '
                        'must be the same in order to be concatenated '
                        'by the sequence concat combiner. '
                        'Try to impose the same max sequence length '
                        'as a preprocessing parameter to both features '
                        'or to reduce the output of {}.'.format(
                            if_name,
                            if_representation.shape[1],
                            self.main_sequence_feature,
                            representation.shape[1],
                            if_name,
                            if_representation.shape,
                            # the second shape belongs to the main feature
                            self.main_sequence_feature,
                            representation.shape,
                            if_name))
                # this assumes all sequence representations have the
                # same sequence length, 2nd dimension
                representations.append(if_representation)

            elif len(if_representation.shape) == 2:
                multipliers = tf.constant([1, sequence_max_length, 1])
                tiled_representation = tf.tile(
                    tf.expand_dims(if_representation, 1),
                    multipliers
                )
                representations.append(tiled_representation)

            else:
                # Use the dict key for the name: the per-feature dict is
                # not guaranteed to carry a 'name' entry.
                raise ValueError(
                    'The representation of {} has rank {} and cannot be'
                    ' concatenated by a sequence concat combiner. '
                    'Only rank 2 and rank 3 tensors are supported.'.format(
                        if_name,
                        len(if_representation.shape)))

    hidden = tf.concat(representations, 2)
    logger.debug(' concat_hidden: {0}'.format(hidden))

    # ================ Mask ================
    # todo future: maybe modify this with TF2 mask mechanics
    sequence_mask = tf.sequence_mask(sequence_length, sequence_max_length)
    hidden = tf.multiply(
        hidden,
        tf.cast(tf.expand_dims(sequence_mask, -1), dtype=tf.float32)
    )

    # ================ Reduce ================
    hidden = self.reduce_sequence(hidden)

    return_data = {'combiner_output': hidden}

    if len(inputs) == 1:
        # Single input: pass its extra entries through unchanged
        only_input = next(iter(inputs.values()))
        for key, value in only_input.items():
            if key != 'encoder_output':
                return_data[key] = value

    return return_data
def recurrent_decoder(encoder_outputs, targets, max_sequence_length, vocab_size,
                      cell_type='rnn', state_size=256, embedding_size=50,
                      num_layers=1, attention_mechanism=None, beam_width=1,
                      projection=True, tied_target_embeddings=True,
                      embeddings=None, initializer=None, regularizer=None,
                      is_timeseries=False):
    """Build a recurrent decoder graph for training and prediction.

    Args:
        encoder_outputs: Encoder output tensor, used as the decoder's
            initial state and, with attention, as the attention memory.
        targets: Target sequences, wrapped with GO / END symbols.
        max_sequence_length: Maximum number of decoding iterations.
        vocab_size: Number of target symbols; ``vocab_size`` itself is used
            as the GO symbol id and 0 as the END symbol id.
        cell_type: Name passed to ``get_cell_fun``.
        state_size: Size of the decoder state.
        embedding_size: Size of the target embeddings.
        num_layers: Number of stacked cells, must be >= 1.
        attention_mechanism: ``None``, ``'bahdanau'`` or ``'luong'``.
        beam_width: Beam width for prediction; values > 1 use beam search.
        projection: Whether to project ``encoder_outputs`` to
            ``state_size`` when the sizes differ.
        tied_target_embeddings: Whether the class weights are the
            transposed target embeddings.
        embeddings: Optional existing embedding matrix to extend with a GO
            row.
        initializer: Initializer specification for new variables.
        regularizer: Regularizer for new variables.
        is_timeseries: Whether targets are real valued.

    Returns:
        A tuple ``(predictions_sequence, predictions_sequence_scores,
        predictions_sequence_length_with_eos,
        targets_sequence_length_with_eos, eval_logits, train_logits,
        class_weights, class_biases)``.

    Raises:
        ValueError: For ``beam_width > 1`` with timeseries, for
            ``num_layers <= 0`` or an unsupported attention mechanism.
    """
    with tf.variable_scope('rnn_decoder', reuse=tf.AUTO_REUSE,
                           regularizer=regularizer):

        # ================ Setup ================
        if beam_width > 1 and is_timeseries:
            raise ValueError('Invalid beam_width: {}'.format(beam_width))

        GO_SYMBOL = vocab_size
        END_SYMBOL = 0
        batch_size = tf.shape(encoder_outputs)[0]

        # ================ Projection ================
        # Project the encoder outputs to the size of the decoder state
        encoder_outputs_size = encoder_outputs.shape[-1]
        if projection and encoder_outputs_size != state_size:
            with tf.variable_scope('projection'):
                encoder_output_rank = len(encoder_outputs.shape)
                if encoder_output_rank > 2:
                    sequence_length = tf.shape(encoder_outputs)[1]
                    encoder_outputs = tf.reshape(encoder_outputs,
                                                 [-1, encoder_outputs_size])
                    encoder_outputs = fc_layer(encoder_outputs,
                                               encoder_outputs.shape[-1],
                                               state_size,
                                               activation=None,
                                               initializer=initializer)
                    encoder_outputs = tf.reshape(
                        encoder_outputs,
                        [-1, sequence_length, state_size])
                else:
                    encoder_outputs = fc_layer(encoder_outputs,
                                               encoder_outputs.shape[-1],
                                               state_size,
                                               activation=None,
                                               initializer=initializer)

        # ================ Targets sequence ================
        # Calculate the length of inputs and the batch size
        with tf.variable_scope('sequence'):
            targets_sequence_length = sequence_length_2D(targets)
            start_tokens = tf.tile([GO_SYMBOL], [batch_size])
            end_tokens = tf.tile([END_SYMBOL], [batch_size])
            if is_timeseries:
                start_tokens = tf.cast(start_tokens, tf.float32)
                end_tokens = tf.cast(end_tokens, tf.float32)
            targets_with_go_and_eos = tf.concat([
                tf.expand_dims(start_tokens, 1),
                targets,
                tf.expand_dims(end_tokens, 1)
            ], 1)
            logging.debug(
                ' targets_with_go: {0}'.format(targets_with_go_and_eos))
            # the EOS symbol is 0 so it's not increasing the real length
            # of the sequence
            targets_sequence_length_with_eos = targets_sequence_length + 1

        # ================ Embeddings ================
        if is_timeseries:
            targets_embedded = tf.expand_dims(targets_with_go_and_eos, -1)
            targets_embeddings = None
        else:
            with tf.variable_scope('embedding'):
                if embeddings is not None:
                    embedding_size = embeddings.shape.as_list()[-1]
                    if tied_target_embeddings:
                        state_size = embedding_size
                elif tied_target_embeddings:
                    embedding_size = state_size

                if embeddings is not None:
                    embedding_go = tf.get_variable(
                        'embedding_GO',
                        initializer=tf.random_uniform([1, embedding_size],
                                                      -1.0, 1.0))
                    targets_embeddings = tf.concat(
                        [embeddings, embedding_go], axis=0)
                else:
                    initializer_obj = get_initializer(initializer)
                    targets_embeddings = tf.get_variable(
                        'embeddings',
                        initializer=initializer_obj(
                            [vocab_size + 1, embedding_size]),
                        regularizer=regularizer)
                logging.debug(
                    ' targets_embeddings: {0}'.format(targets_embeddings))

                targets_embedded = tf.nn.embedding_lookup(
                    targets_embeddings,
                    targets_with_go_and_eos,
                    name='decoder_input_embeddings')
        logging.debug(' targets_embedded: {0}'.format(targets_embedded))

        # ================ Class prediction ================
        if tied_target_embeddings:
            class_weights = tf.transpose(targets_embeddings)
        else:
            initializer_obj = get_initializer(initializer)
            class_weights = tf.get_variable(
                'class_weights',
                initializer=initializer_obj([state_size, vocab_size + 1]),
                regularizer=regularizer)
        logging.debug(' class_weights: {0}'.format(class_weights))
        class_biases = tf.get_variable('class_biases', [vocab_size + 1])
        logging.debug(' class_biases: {0}'.format(class_biases))
        projection_layer = Projection(class_weights, class_biases)

        # ================ RNN ================
        initial_state = encoder_outputs
        with tf.variable_scope('rnn_cells') as vs:
            # Cell
            cell_fun = get_cell_fun(cell_type)

            if num_layers == 1:
                cell = cell_fun(state_size)
                if cell_type.startswith('lstm'):
                    initial_state = LSTMStateTuple(c=initial_state,
                                                   h=initial_state)
            elif num_layers > 1:
                cell = MultiRNNCell(
                    [cell_fun(state_size) for _ in range(num_layers)],
                    state_is_tuple=True)
                if cell_type.startswith('lstm'):
                    initial_state = LSTMStateTuple(c=initial_state,
                                                   h=initial_state)
                initial_state = tuple([initial_state] * num_layers)
            else:
                raise ValueError(
                    'num_layers in recurrent decoser: {}. '
                    'Number of layers in a recurrenct decoder cannot be <= 0'.
                    format(num_layers))

            # Attention
            if attention_mechanism is not None:
                if attention_mechanism == 'bahdanau':
                    attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                        num_units=state_size,
                        memory=encoder_outputs,
                        memory_sequence_length=sequence_length_3D(
                            encoder_outputs))
                elif attention_mechanism == 'luong':
                    attention_mechanism = tf.contrib.seq2seq.LuongAttention(
                        num_units=state_size,
                        memory=encoder_outputs,
                        memory_sequence_length=sequence_length_3D(
                            encoder_outputs))
                else:
                    raise ValueError(
                        'Attention mechanism {} not supported'.format(
                            attention_mechanism))
                cell = tf.contrib.seq2seq.AttentionWrapper(
                    cell, attention_mechanism,
                    attention_layer_size=state_size)
                initial_state = cell.zero_state(dtype=tf.float32,
                                                batch_size=batch_size)
                initial_state = initial_state.clone(
                    cell_state=reduce_sequence(encoder_outputs, 'last'))

            for v in tf.global_variables():
                if v.name.startswith(vs.name):
                    logging.debug(' {}: {}'.format(v.name, v))

        # ================ Decoding ================
        def decode(initial_state, cell, helper, beam_width=1,
                   projection_layer=None):
            # The decoder itself
            if beam_width > 1:
                # Tile inputs for beam search decoder
                beam_initial_state = tf.contrib.seq2seq.tile_batch(
                    initial_state, beam_width)
                decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell=cell,
                    embedding=targets_embeddings,
                    start_tokens=start_tokens,
                    end_token=END_SYMBOL,
                    initial_state=beam_initial_state,
                    beam_width=beam_width,
                    output_layer=projection_layer)
            else:
                decoder = BasicDecoder(cell=cell,
                                       helper=helper,
                                       initial_state=initial_state,
                                       output_layer=projection_layer)

            # The decoding operation
            outputs = tf.contrib.seq2seq.dynamic_decode(
                decoder=decoder,
                output_time_major=False,
                impute_finished=False if beam_width > 1 else True,
                maximum_iterations=max_sequence_length)

            return outputs

        # ================ Decoding helpers ================
        if is_timeseries:
            train_helper = TimeseriesTrainingHelper(
                inputs=targets_embedded,
                sequence_length=targets_sequence_length_with_eos)
            final_outputs_pred, final_state_pred, final_sequence_lengths_pred = decode(
                initial_state,
                cell,
                train_helper,
                projection_layer=projection_layer)
            eval_logits = final_outputs_pred.rnn_output
            train_logits = final_outputs_pred.projection_input
            predictions_sequence = tf.reshape(eval_logits, [batch_size, -1])
            # Mirror the greedy branch, where the scores are the decoder's
            # rnn_output; without this the name is unbound when the
            # function logs and returns it below.
            predictions_sequence_scores = eval_logits
            predictions_sequence_length_with_eos = final_sequence_lengths_pred

        else:
            train_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=targets_embedded,
                sequence_length=targets_sequence_length_with_eos)
            final_outputs_train, final_state_train, final_sequence_lengths_train = decode(
                initial_state,
                cell,
                train_helper,
                projection_layer=projection_layer)
            eval_logits = final_outputs_train.rnn_output
            train_logits = final_outputs_train.projection_input
            # train_predictions = final_outputs_train.sample_id

            pred_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding=targets_embeddings,
                start_tokens=start_tokens,
                end_token=END_SYMBOL)
            final_outputs_pred, final_state_pred, final_sequence_lengths_pred = decode(
                initial_state,
                cell,
                pred_helper,
                beam_width,
                projection_layer=projection_layer)

            if beam_width > 1:
                predictions_sequence = final_outputs_pred.beam_search_decoder_output.predicted_ids[:, :, 0]
                # final_outputs_pred..predicted_ids[:,:,0] would work too,
                # but it contains -1s for padding
                predictions_sequence_scores = final_outputs_pred.beam_search_decoder_output.scores[:, :, 0]
                predictions_sequence_length_with_eos = final_sequence_lengths_pred[:, 0]
            else:
                predictions_sequence = final_outputs_pred.sample_id
                predictions_sequence_scores = final_outputs_pred.rnn_output
                predictions_sequence_length_with_eos = final_sequence_lengths_pred

    logging.debug(' train_logits: {0}'.format(train_logits))
    logging.debug(' eval_logits: {0}'.format(eval_logits))
    logging.debug(' predictions_sequence: {0}'.format(predictions_sequence))
    logging.debug(' predictions_sequence_scores: {0}'.format(
        predictions_sequence_scores))

    return predictions_sequence, predictions_sequence_scores, predictions_sequence_length_with_eos, \
        targets_sequence_length_with_eos, eval_logits, train_logits, class_weights, class_biases
def __call__(self, input_sequence, regularizer, dropout_rate,
             is_training=True):
    """Run the (optionally bidirectional) RNN stack over a sequence.

    Args:
        input_sequence: Input tensor; per-row lengths are computed with
            ``sequence_length_3D``.
        regularizer: Regularizer for the ``rnn_stack`` variable scope;
            dropped when ``self.regularize`` is false.
        dropout_rate: Dropout rate; dropout is applied only when
            ``self.dropout`` is truthy and this is not ``None``.
        is_training: Forwarded to ``tf.layers.dropout``.

    Returns:
        A pair ``(rnn_output, size)`` where ``rnn_output`` is the RNN
        output reduced with ``self.reduce_output`` and ``size`` is its
        static last dimension.
    """
    if not self.regularize:
        regularizer = None

    # Per-row lengths of the input sequence
    lengths = sequence_length_3D(input_sequence)

    make_cell = get_cell_fun(self.cell_type)

    # main RNN operation
    with tf.variable_scope('rnn_stack', reuse=tf.AUTO_REUSE,
                           regularizer=regularizer) as vs:
        if self.bidirectional:
            forward_cells = [
                make_cell(self.state_size) for _ in range(self.num_layers)
            ]
            backward_cells = [
                make_cell(self.state_size) for _ in range(self.num_layers)
            ]
            rnn_outputs, _state_fw, _state_bw = \
                tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                    cells_fw=forward_cells,
                    cells_bw=backward_cells,
                    dtype=tf.float32,
                    sequence_length=lengths,
                    inputs=input_sequence)
        else:
            stacked_cells = MultiRNNCell(
                [make_cell(self.state_size) for _ in range(self.num_layers)],
                state_is_tuple=True)
            rnn_outputs, _state = tf.nn.dynamic_rnn(
                stacked_cells,
                input_sequence,
                sequence_length=lengths,
                dtype=tf.float32)

        for variable in tf.global_variables():
            if variable.name.startswith(vs.name):
                logging.debug(' {}: {}'.format(variable.name, variable))
        logging.debug(' rnn_outputs: {0}'.format(rnn_outputs))

        rnn_output = reduce_sequence(rnn_outputs, self.reduce_output)
        logging.debug(' reduced_rnn_output: {0}'.format(rnn_output))

    # dropout
    if self.dropout and dropout_rate is not None:
        rnn_output = tf.layers.dropout(rnn_output, rate=dropout_rate,
                                       training=is_training)
        logging.debug(' dropout_rnn: {0}'.format(rnn_output))

    output_size = rnn_output.shape.as_list()[-1]
    return rnn_output, output_size
def concat_dependencies(self, hidden, other_features_hidden):
    """Concatenate the hidden tensors of dependencies onto ``hidden``.

    For every name in ``self.dependencies`` the matching tensor from
    ``other_features_hidden`` is adapted to the rank of ``hidden``:

    * sequence / sequence: used as is (sequence lengths must match),
    * sequence / vector: the vector is tiled along time and masked with the
      sequence length of ``hidden``,
    * vector / sequence: the sequence is reduced with
      ``self.dependency_reducers[dependency]``,
    * vector / vector: used as is.

    Args:
        hidden: Hidden tensor of this feature.
        other_features_hidden: Dict mapping feature name to hidden tensor.

    Returns:
        ``hidden`` concatenated with the adapted dependency tensors on the
        last axis, or ``hidden`` unchanged when there are no dependencies.

    Raises:
        ValueError: If the concatenation fails; the original error is
            attached as the cause.
    """
    if len(self.dependencies) > 0:
        dependencies_hidden = []
        for dependency in self.dependencies:
            # the dependent feature is ensured to be present in final_hidden
            # because we did the topological sort of the features before
            dependency_final_hidden = other_features_hidden[dependency]

            if len(hidden.shape) > 2:
                if len(dependency_final_hidden.shape) > 2:
                    # matrix matrix -> concat
                    assert hidden.shape[1] == \
                        dependency_final_hidden.shape[1]
                    dependencies_hidden.append(dependency_final_hidden)
                else:
                    # matrix vector -> tile concat
                    sequence_max_length = hidden.shape[1]
                    multipliers = tf.concat(
                        [[1], [sequence_max_length], [1]],
                        0
                    )
                    tiled_representation = tf.tile(
                        tf.expand_dims(dependency_final_hidden, 1),
                        multipliers
                    )

                    # todo future: maybe modify this with TF2 mask mechanics
                    sequence_length = sequence_length_3D(hidden)
                    mask = tf.sequence_mask(
                        sequence_length,
                        sequence_max_length
                    )
                    tiled_representation = tf.multiply(
                        tiled_representation,
                        tf.cast(mask[:, :, tf.newaxis], dtype=tf.float32)
                    )

                    dependencies_hidden.append(tiled_representation)

            else:
                if len(dependency_final_hidden.shape) > 2:
                    # vector matrix -> reduce concat
                    reducer = self.dependency_reducers[dependency]
                    dependencies_hidden.append(
                        reducer(dependency_final_hidden)
                    )
                else:
                    # vector vector -> concat
                    dependencies_hidden.append(dependency_final_hidden)

        try:
            hidden = tf.concat([hidden] + dependencies_hidden, -1)
        except Exception as exc:
            # Catch Exception rather than everything so KeyboardInterrupt /
            # SystemExit still propagate, and keep the original error as
            # the cause for debugging.
            raise ValueError(
                'Shape mismatch while concatenating dependent features of '
                '{}: {}. Concatenating the feature activations tensor {} '
                'with activation tensors of dependencies: {}. The error is '
                'likely due to a mismatch of the second dimension (sequence'
                ' length) or a difference in ranks. Likely solutions are '
                'setting the maximum_sequence_length of all sequential '
                'features to be the same, or reduce the output of some '
                'features, or disabling the bucketing setting '
                'bucketing_field to None / null, as activating it will '
                'reduce the length of the field the bucketing is performed '
                'on.'.format(
                    self.feature_name,
                    self.dependencies,
                    hidden,
                    dependencies_hidden
                )
            ) from exc

    return hidden
def decoder_beam_search(self, encoder_output, encoder_end_state=None,
                        training=None):
    """Run beam search by stepping the decoder manually (work in progress).

    NOTE(review): the in-code todos below mark this implementation as
    WRONG. As written, the returned logits and probabilities are all
    zeros, ``lengths`` is created as zeros and never updated (so
    ``last_predictions`` always gathers index 0 of every row), and the
    ``finished`` flags returned by ``decoder.step`` are ignored, so the
    loop always runs ``self.max_sequence_length`` steps.

    Args:
        encoder_output: Encoder output; its static first dimension is used
            as the batch size, and it is tiled ``self.beam_width`` times.
        encoder_end_state: Encoder final state, tiled and forwarded to
            ``self.build_decoder_initial_state``.
        training: Forwarded to ``decoder.step``.

    Returns:
        Tuple ``(logits, lengths, predictions, last_predictions,
        probabilities)``.
    """
    # ================ Setup ================
    # static batch size: it is also used below to build numpy arrays
    batch_size = encoder_output.shape[0]
    encoder_sequence_length = sequence_length_3D(encoder_output)

    # ================ predictions =================
    # decoder_input = tf.expand_dims([self.GO_SYMBOL] * batch_size, 1)
    start_tokens = tf.fill([batch_size], self.GO_SYMBOL)
    end_token = self.END_SYMBOL

    # code sequence based on example found here
    # https://www.tensorflow.org/addons/api_docs/python/tfa/seq2seq/BeamSearchDecoder
    # every per-example tensor is repeated beam_width times
    tiled_encoder_output = tfa.seq2seq.tile_batch(
        encoder_output,
        multiplier=self.beam_width)

    tiled_encoder_end_state = tfa.seq2seq.tile_batch(
        encoder_end_state,
        multiplier=self.beam_width)

    tiled_encoder_sequence_length = tfa.seq2seq.tile_batch(
        encoder_sequence_length,
        multiplier=self.beam_width)

    if self.attention_mechanism is not None:
        self.attention_mechanism.setup_memory(
            tiled_encoder_output,
            memory_sequence_length=tiled_encoder_sequence_length)

    decoder_initial_state = self.build_decoder_initial_state(
        batch_size * self.beam_width,
        encoder_state=tiled_encoder_end_state,
        dtype=tf.float32)

    decoder = tfa.seq2seq.beam_search_decoder.BeamSearchDecoder(
        cell=self.decoder_rnncell,
        beam_width=self.beam_width,
        output_layer=self.dense_layer)

    # ================ generate logits ==================
    maximum_iterations = self.max_sequence_length

    # initialize inference decoder
    decoder_embedding_matrix = self.decoder_embedding.variables[0]
    (
        first_finished,
        first_inputs,
        first_state
    ) = decoder.initialize(
        decoder_embedding_matrix,
        start_tokens=start_tokens,
        end_token=end_token,
        # following construct required to work around inconsistent handling
        # of encoder_end_state by tfa
        initial_state=decoder_initial_state \
            if len(decoder_initial_state) != 1 \
            else decoder_initial_state[0]
    )

    inputs = first_inputs
    state = first_state

    # create empty logits tensor (zero time steps, grown inside the loop)
    logits = tf.convert_to_tensor(np.array([]).reshape(
        [batch_size, 0, self.num_classes]), dtype=tf.float32)
    # create empty predictions tensor (zero time steps, grown in the loop)
    predictions = tf.convert_to_tensor(
        np.array([]).reshape([batch_size, 0]),
        dtype=tf.int32  # todo tf2 need to change to tf.int64
    )
    # create lengths tensor
    # NOTE(review): never updated after this point
    lengths = tf.zeros([batch_size], dtype=tf.int32)

    # beam search
    for j in range(maximum_iterations):
        outputs, next_state, next_inputs, finished = decoder.step(
            j, inputs, state, training=training)
        inputs = next_inputs
        state = next_state

        # logits don't work, temporary workaround: append a zero step
        one_logit = tf.zeros([batch_size, 1, self.num_classes])
        logits = tf.concat([logits, one_logit], axis=1)

        # keep only beam 0 of the predicted ids for this step
        one_predicted_token = tf.expand_dims(outputs.predicted_ids[:, 0],
                                             axis=1)
        predictions = tf.concat([predictions, one_predicted_token], axis=1)

    # todo tf2: we should first run all the iterations and only at the end
    #  collect logits and predictions. The current implementation is WRONG

    # todo tf2: solve cases when predictions become 0 and then return
    #  to be a number, which confuses the last_predictions later
    last_predictions = tf.gather_nd(
        predictions,
        tf.stack([
            tf.range(tf.shape(predictions)[0]),
            tf.maximum(lengths - 1, 0)
        ], axis=1),
        name='last_predictions_{}'.format(self.name))

    probabilities = tf.zeros_like(logits)

    return logits, lengths, predictions, last_predictions, probabilities
def decoder_beam_search(self, encoder_output, encoder_end_state=None,
                        training=None):
    """Decode with ``tfa`` beam search via ``dynamic_decode``.

    Args:
        encoder_output: Encoder output; its static first dimension is used
            as the batch size, and it is tiled ``self.beam_width`` times.
        encoder_end_state: Encoder final state, tiled and forwarded to
            ``self.build_decoder_initial_state``.
        training: Not used by this method.

    Returns:
        Tuple ``(logits, lengths, predictions, last_predictions,
        probabilities)``, all taken from beam index 0: ``logits`` are that
        beam's scores, ``probabilities`` their softmax, and
        ``last_predictions`` the prediction at ``max(lengths - 1, 0)``.
    """
    # ================ Setup ================
    batch_size = encoder_output.shape[0]
    encoder_sequence_length = sequence_length_3D(encoder_output)

    # ================ predictions =================
    start_tokens = tf.fill([batch_size], self.GO_SYMBOL)
    end_token = self.END_SYMBOL

    # code sequence based on example found here
    # https://www.tensorflow.org/addons/api_docs/python/tfa/seq2seq/BeamSearchDecoder
    # Repeat every per-example tensor beam_width times.
    tiled_encoder_output = tfa.seq2seq.tile_batch(
        encoder_output, multiplier=self.beam_width)
    tiled_encoder_end_state = tfa.seq2seq.tile_batch(
        encoder_end_state, multiplier=self.beam_width)
    tiled_encoder_sequence_length = tfa.seq2seq.tile_batch(
        encoder_sequence_length, multiplier=self.beam_width)

    if self.attention_mechanism is not None:
        self.attention_mechanism.setup_memory(
            tiled_encoder_output,
            memory_sequence_length=tiled_encoder_sequence_length)

    decoder_initial_state = self.build_decoder_initial_state(
        batch_size * self.beam_width,
        encoder_state=tiled_encoder_end_state,
        dtype=tf.float32)

    decoder = tfa.seq2seq.beam_search_decoder.BeamSearchDecoder(
        cell=self.decoder_rnncell,
        beam_width=self.beam_width,
        output_layer=self.dense_layer,
        output_all_scores=True,
    )

    # ================ generate logits ==================
    # initialize inference decoder
    decoder_embedding_matrix = self.decoder_embedding.variables[0]

    # following construct required to work around inconsistent handling
    # of encoder_end_state by tfa
    if len(decoder_initial_state) != 1:
        beam_initial_state = decoder_initial_state
    else:
        beam_initial_state = decoder_initial_state[0]

    init_kwargs = {
        'start_tokens': start_tokens,
        'end_token': end_token,
        'initial_state': beam_initial_state,
    }

    # beam search
    decoder_output, decoder_state, decoder_lengths = \
        tfa.seq2seq.dynamic_decode(
            decoder=decoder,
            output_time_major=False,
            impute_finished=False,
            maximum_iterations=self.max_sequence_length,
            decoder_init_input=decoder_embedding_matrix,
            decoder_init_kwargs=init_kwargs,
        )

    # Keep beam index 0 only
    beam_output = decoder_output.beam_search_decoder_output
    predictions = beam_output.predicted_ids[:, :, 0]
    logits = beam_output.scores[:, :, 0, :]
    lengths = decoder_lengths[:, 0]

    row_ids = tf.range(tf.shape(predictions)[0])
    last_step = tf.maximum(lengths - 1, 0)
    last_predictions = tf.gather_nd(
        predictions,
        tf.stack([row_ids, last_step], axis=1),
        name='last_predictions_{}'.format(self.name))

    probabilities = tf.nn.softmax(logits)

    return logits, lengths, predictions, last_predictions, probabilities
def __call__(self, feature_encodings, regularizer, dropout_rate, **kwargs):
    """Concatenate all feature encodings onto the main sequence feature.

    The main sequence feature is ``self.main_sequence_feature`` when it is
    present in ``feature_encodings``; otherwise the first feature whose
    ``'type'`` is in ``SEQUENCE_TYPES`` is selected (and stored on
    ``self``). Every other feature is appended along the last axis:
    rank 3 sequence representations as they are, rank 2 representations
    tiled over the time axis and masked with the main sequence length.
    The result is masked where the main representation is all zeros and
    then reduced with ``reduce_sequence``.

    :param feature_encodings: mapping from feature name to a dict that
        provides the ``'type'``, ``'representation'``, ``'size'`` and
        ``'name'`` entries read below.
    :param regularizer: not referenced by this method.
    :param dropout_rate: not referenced by this method.
    :return: tuple ``(hidden, hidden_size)``.
    :raises Exception: if no usable main sequence feature is available.
    :raises ValueError: if a sequence feature has a different static
        sequence length than the main one, or a representation is neither
        rank 2 nor rank 3.
    """
    if (self.main_sequence_feature is None or
            self.main_sequence_feature not in feature_encodings):
        for fe_name, fe_properties in feature_encodings.items():
            if fe_properties['type'] in SEQUENCE_TYPES:
                self.main_sequence_feature = fe_name
                break

    # The membership test also covers a configured name that is missing
    # from feature_encodings and could not be replaced by the search
    # above; without it the lookup below fails with a bare KeyError.
    if (self.main_sequence_feature is None or
            self.main_sequence_feature not in feature_encodings):
        raise Exception(
            'No sequence feature available for sequence combiner')

    main_sequence_feature_encoding = \
        feature_encodings[self.main_sequence_feature]

    representation = main_sequence_feature_encoding['representation']
    representations_size = representation.shape[2].value
    representations = [representation]

    scope_name = 'sequence_concat_combiner'
    sequence_length = sequence_length_3D(representation)

    with tf.variable_scope(scope_name):
        # ================ Concat ================
        for fe_name, fe_properties in feature_encodings.items():
            # Names are compared by value: identity (`is`) on strings is
            # not guaranteed to hold for equal names.
            if fe_name == self.main_sequence_feature:
                continue

            fe_representation = fe_properties['representation']
            fe_rank = len(fe_representation.shape)

            if fe_properties['type'] in SEQUENCE_TYPES and fe_rank == 3:
                # The following check makes sense when
                # both representations have a specified
                # sequence length dimension. If they do not,
                # then this check is simply checking if None == None
                # and will not catch discrepancies in the different
                # feature length dimension. Those errors will show up
                # at training time. Possible solutions to this is
                # to enforce a length second dimension in
                # sequential feature placeholders, but that
                # does not work with BucketedBatcher that requires
                # the second dimension to be undefined in order to be
                # able to trim the data points and speed up computation.
                # So for now we are keeping things like this, make sure
                # to write in the documentation that training time
                # dimensions mismatch may occur if the sequential
                # features have different lengths for some data points.
                if fe_representation.shape[1] != representation.shape[1]:
                    raise ValueError(
                        'The sequence length of the input feature {} '
                        'is {} and is different from the sequence '
                        'length of the main sequence feature {} which '
                        'is {}.\n Shape of {}: {}, shape of {}: {}.\n'
                        'Sequence lengths of all sequential features '
                        'must be the same in order to be concatenated '
                        'by the sequence concat combiner. '
                        'Try to impose the same max sequence length '
                        'as a preprocessing parameter to both features '
                        'or to reduce the output of {}.'.format(
                            fe_properties['name'],
                            fe_representation.shape[1],
                            self.main_sequence_feature,
                            # sequence length of the main feature, not
                            # its hidden size
                            representation.shape[1],
                            fe_properties['name'],
                            fe_representation.shape,
                            # second shape belongs to the main feature
                            self.main_sequence_feature,
                            representation.shape,
                            fe_properties['name']))
                # this assumes all sequence representations have the
                # same sequence length, 2nd dimension
                representations.append(fe_representation)

            elif fe_rank == 2:
                # repeat the rank 2 representation at every time step
                sequence_max_length = tf.shape(representation)[1]
                multipliers = tf.concat(
                    [[1], tf.expand_dims(sequence_max_length, -1), [1]],
                    0)
                tiled_representation = tf.tile(
                    tf.expand_dims(fe_representation, 1),
                    multipliers)
                logger.debug(' tiled_representation: {0}'.format(
                    tiled_representation))

                # zero the tiled values past each main sequence length
                mask = tf.sequence_mask(sequence_length,
                                        sequence_max_length)
                tiled_representation = tf.multiply(
                    tiled_representation,
                    tf.cast(tf.expand_dims(mask, -1), dtype=tf.float32))

                representations.append(tiled_representation)

            else:
                raise ValueError(
                    'The representation of {} has rank {} and cannot be'
                    ' concatenated by a sequence concat combiner. '
                    'Only rank 2 and rank 3 tensors are supported.'.
                    format(fe_properties['name'], fe_rank))

            representations_size += fe_properties['size']

        hidden = tf.concat(representations, 2)
        logger.debug(' concat_hidden: {0}'.format(hidden))
        hidden_size = representations_size

        # ================ Mask ================
        # 1.0 where the main representation has any non-zero value at a
        # time step, 0.0 elsewhere
        mask_matrix = tf.cast(tf.sign(
            tf.reduce_sum(tf.abs(representation), -1, keep_dims=True)),
            dtype=tf.float32)
        hidden = tf.multiply(hidden, mask_matrix)

        # ================ Reduce ================
        hidden = reduce_sequence(hidden, self.reduce_output)
        logger.debug(' reduced_concat_hidden: {0}'.format(hidden))

        hidden = tf.identity(hidden, name=scope_name)

    return hidden, hidden_size
def decoder_greedy(self, encoder_output, encoder_end_state=None,
                   training=None):
    """Greedy decoding driven by an explicit step-by-step loop.

    A ``BasicDecoder`` with a ``GreedyEmbeddingSampler`` is stepped exactly
    ``self.max_sequence_length`` times; the logits and sampled ids of each
    step are appended along the time axis.

    :param encoder_output: encoder output; its first dimension is read as
        the batch size.
    :param encoder_end_state: encoder final state used to build the
        decoder initial state.
    :param training: forwarded to every ``decoder.step`` call.
    :return: tuple ``(logits, lengths, predictions, last_predictions,
        probabilities)``; ``logits`` are zeroed past ``lengths`` while
        ``probabilities`` are computed from the unmasked logits.
    """
    # ================ Setup ================
    batch_size = encoder_output.shape[0]

    # ================ predictions =================
    sampler = tfa.seq2seq.GreedyEmbeddingSampler()

    go_column = tf.expand_dims([self.GO_SYMBOL] * batch_size, 1)
    go_tokens = tf.fill([batch_size], self.GO_SYMBOL)
    eos_token = self.END_SYMBOL
    # The embedded GO column is not consumed below; the call is preserved
    # as in the original (presumably it ensures the embedding variables
    # exist before they are read - TODO confirm).
    self.decoder_embedding(go_column)

    if self.attention_mechanism is not None:
        encoder_sequence_length = sequence_length_3D(encoder_output)
        self.attention_mechanism.setup_memory(
            encoder_output,
            memory_sequence_length=encoder_sequence_length)

    decoder_initial_state = self.build_decoder_initial_state(
        batch_size,
        encoder_state=encoder_end_state,
        dtype=tf.float32)

    step_decoder = tfa.seq2seq.BasicDecoder(
        cell=self.decoder_rnncell,
        sampler=sampler,
        output_layer=self.dense_layer)

    # ================ generate logits ==================
    max_steps = self.max_sequence_length

    # initialize inference decoder
    embedding_matrix = self.decoder_embedding.variables[0]
    _, step_inputs, step_state = step_decoder.initialize(
        embedding_matrix,
        start_tokens=go_tokens,
        end_token=eos_token,
        initial_state=decoder_initial_state)

    # accumulators start with zero time steps and grow by one per step
    logits = tf.zeros([batch_size, 0, self.num_classes], dtype=tf.float32)
    predictions = tf.zeros(
        [batch_size, 0],
        dtype=tf.int32  # todo tf2 need to change to tf.int64
    )

    # per-row count of steps taken before the sampler reports finished
    lengths = tf.zeros([batch_size], dtype=tf.int32)
    seen_finished = tf.zeros([batch_size], dtype=tf.bool)

    # build up logits
    for step in range(max_steps):
        step_output, step_state, step_inputs, step_finished = \
            step_decoder.step(
                step, step_inputs, step_state, training=training)

        logits = tf.concat(
            [logits, tf.expand_dims(step_output.rnn_output, axis=1)],
            axis=1)
        predictions = tf.concat(
            [predictions, tf.expand_dims(step_output.sample_id, axis=1)],
            axis=1)

        # a row stops counting from the step at which it is first finished
        seen_finished = tf.logical_or(seen_finished, step_finished)
        lengths += tf.cast(tf.logical_not(seen_finished), dtype=tf.int32)

    probabilities = tf.nn.softmax(
        logits, name='probabilities_{}'.format(self.name))

    predictions = tf.cast(
        predictions, tf.int64, name='predictions_{}'.format(self.name))

    # index of the last counted step per row, clamped at 0
    row_ids = tf.range(tf.shape(predictions)[0])
    last_step_ids = tf.maximum(lengths - 1, 0)
    last_predictions = tf.gather_nd(
        predictions,
        tf.stack([row_ids, last_step_ids], axis=1),
        name='last_predictions_{}'.format(self.name))

    # mask logits
    step_mask = tf.sequence_mask(
        lengths, maxlen=logits.shape[1], dtype=tf.float32)
    logits = logits * step_mask[:, :, tf.newaxis]

    return logits, lengths, predictions, last_predictions, probabilities
def decoder_beam_search(
        self,
        encoder_output,
        encoder_end_state=None,
        training=None
):
    """Run beam search decoding and return predictions of the first beam.

    Predictions and probabilities are padded along the time axis up to
    ``self.max_sequence_length`` when decoding produced fewer steps; the
    padded probabilities are filled with ``1.0 / self.vocab_size``.

    :param encoder_output: encoder output; its first dimension is read as
        the batch size.
    :param encoder_end_state: encoder final state, tiled and used to build
        the decoder initial state.
    :param training: not referenced by this method.
    :return: tuple ``(None, lengths, predictions, last_predictions,
        probabilities)``; no logits are produced by this variant.
    """
    # ================ Setup ================
    batch_size = encoder_output.shape[0]
    encoder_sequence_length = sequence_length_3D(encoder_output)

    # ================ predictions =================
    go_column = tf.expand_dims([self.GO_SYMBOL] * batch_size, 1)
    go_tokens = tf.fill([batch_size], self.GO_SYMBOL)
    eos_token = self.END_SYMBOL
    # The embedded GO column is not consumed below; the call is preserved
    # as in the original (presumably it ensures the embedding weights
    # exist before they are read - TODO confirm).
    self.decoder_embedding(go_column)

    # code sequence based on example found here
    # https://www.tensorflow.org/addons/api_docs/python/tfa/seq2seq/BeamSearchDecoder
    beam_encoder_output = tfa.seq2seq.tile_batch(
        encoder_output, multiplier=self.beam_width)
    beam_encoder_end_state = tfa.seq2seq.tile_batch(
        encoder_end_state, multiplier=self.beam_width)
    beam_encoder_sequence_length = tfa.seq2seq.tile_batch(
        encoder_sequence_length, multiplier=self.beam_width)

    if self.attention_mechanism is not None:
        self.attention_mechanism.setup_memory(
            beam_encoder_output,
            memory_sequence_length=beam_encoder_sequence_length)

    decoder_initial_state = self.build_decoder_initial_state(
        batch_size * self.beam_width,
        encoder_state=beam_encoder_end_state,
        dtype=tf.float32)

    beam_decoder = tfa.seq2seq.beam_search_decoder.BeamSearchDecoder(
        cell=self.decoder_rnncell,
        beam_width=self.beam_width,
        output_layer=self.dense_layer,
        output_all_scores=True,
    )

    # ================ generate logits ==================
    max_steps = self.max_sequence_length

    # initialize inference decoder
    embedding_matrix = self.decoder_embedding.weights[0]

    # beam search (the final decoder state is not needed afterwards)
    decoder_output, _, decoder_lengths = tfa.seq2seq.dynamic_decode(
        decoder=beam_decoder,
        output_time_major=False,
        impute_finished=False,
        maximum_iterations=max_steps,
        decoder_init_input=embedding_matrix,
        decoder_init_kwargs=dict(
            start_tokens=go_tokens,
            end_token=eos_token,
            initial_state=decoder_initial_state,
        ),
    )

    # only the beam at this index is returned
    top_beam = 0
    predictions = decoder_output.predicted_ids[:, :, top_beam]
    probabilities = extract_sequence_probabilities(
        decoder_output, self.beam_width, sequence_id=top_beam)

    # pad the time axis up to max_sequence_length when decoding was shorter
    pad_steps = self.max_sequence_length - tf.shape(predictions)[1]
    if pad_steps > 0:
        predictions = tf.pad(predictions, [[0, 0], [0, pad_steps]])
        probabilities = tf.pad(
            probabilities,
            [[0, 0], [0, pad_steps], [0, 0]],
            constant_values=1.0 / self.vocab_size)

    # -1 because they include pad
    lengths = decoder_lengths[:, 0] - 1

    # index of the last step per row, clamped at 0
    row_ids = tf.range(tf.shape(predictions)[0])
    last_step_ids = tf.maximum(lengths - 1, 0)
    last_predictions = tf.gather_nd(
        predictions,
        tf.stack([row_ids, last_step_ids], axis=1),
        name='last_predictions_{}'.format(self.name))

    # EXPECTED SIZE OF RETURNED TENSORS
    # lengths: shape [batch_size]
    # predictions: shape [batch_size, seq_size]
    # last_predictions: shape [batch_size]
    # probabilities: shape [batch_size, seq_size, num_classes]
    return None, lengths, predictions, last_predictions, probabilities
def decoder_greedy(
        self,
        encoder_output,
        encoder_end_state=None,
        training=None
):
    """Greedy decoding through ``tfa.seq2seq.dynamic_decode`` with padding.

    Predictions and logits are padded along the time axis up to
    ``self.max_sequence_length`` when decoding produced fewer steps.

    :param encoder_output: encoder output; its first dimension is read as
        the batch size.
    :param encoder_end_state: encoder final state used to build the
        decoder initial state.
    :param training: not referenced by this method.
    :return: tuple ``(logits, lengths, predictions, last_predictions,
        probabilities)``.
    """
    # ================ Setup ================
    batch_size = encoder_output.shape[0]

    # ================ predictions =================
    sampler = tfa.seq2seq.GreedyEmbeddingSampler()

    go_column = tf.expand_dims([self.GO_SYMBOL] * batch_size, 1)
    go_tokens = tf.fill([batch_size], self.GO_SYMBOL)
    eos_token = self.END_SYMBOL
    # The embedded GO column is not consumed below; the call is preserved
    # as in the original (presumably it ensures the embedding weights
    # exist before they are read - TODO confirm).
    self.decoder_embedding(go_column)

    if self.attention_mechanism is not None:
        encoder_sequence_length = sequence_length_3D(encoder_output)
        self.attention_mechanism.setup_memory(
            encoder_output,
            memory_sequence_length=encoder_sequence_length)

    decoder_initial_state = self.build_decoder_initial_state(
        batch_size,
        encoder_state=encoder_end_state,
        dtype=tf.float32)

    greedy_decoder = tfa.seq2seq.BasicDecoder(
        cell=self.decoder_rnncell,
        sampler=sampler,
        output_layer=self.dense_layer)

    # ================ generate sequence ==================
    max_steps = self.max_sequence_length

    # initialize inference decoder
    embedding_matrix = self.decoder_embedding.weights[0]

    # the final decoder state is not needed afterwards
    decoder_output, _, decoder_lengths = tfa.seq2seq.dynamic_decode(
        decoder=greedy_decoder,
        output_time_major=False,
        impute_finished=False,
        maximum_iterations=max_steps,
        decoder_init_input=embedding_matrix,
        decoder_init_kwargs=dict(
            start_tokens=go_tokens,
            end_token=eos_token,
            initial_state=decoder_initial_state,
        ),
    )

    predictions = decoder_output.sample_id
    pad_steps = self.max_sequence_length - tf.shape(predictions)[1]
    if pad_steps > 0:
        predictions = tf.pad(predictions, [[0, 0], [0, pad_steps]])
    # logits are bound on every path; a zero pad leaves them unchanged
    logits = tf.pad(
        decoder_output.rnn_output,
        [[0, 0], [0, pad_steps], [0, 0]])

    # -1 because they include the EOS symbol
    lengths = decoder_lengths - 1

    probabilities = tf.nn.softmax(
        logits, name='probabilities_{}'.format(self.name))

    predictions = tf.cast(
        predictions, tf.int64, name='predictions_{}'.format(self.name))

    # index of the last step per row (-1 because of EOS), clamped at 0
    row_ids = tf.range(tf.shape(predictions)[0])
    last_step_ids = tf.maximum(lengths - 1, 0)
    last_predictions = tf.gather_nd(
        predictions,
        tf.stack([row_ids, last_step_ids], axis=1),
        name='last_predictions_{}'.format(self.name))

    # EXPECTED SIZE OF RETURNED TENSORS
    # logits: shape [batch_size, seq_size, num_classes]
    # lengths: shape [batch_size]
    # predictions: shape [batch_size, seq_size]
    # last_predictions: shape [batch_size]
    # probabilities: shape [batch_size, seq_size, num_classes]
    return logits, lengths, predictions, last_predictions, probabilities
def decoder_greedy(self, encoder_output, encoder_end_state=None,
                   training=None):
    """Greedy decoding through ``tfa.seq2seq.dynamic_decode`` with masking.

    The logits returned are zeroed at time steps past the lengths reported
    by ``dynamic_decode``; the probabilities are the softmax of the
    unmasked logits.

    :param encoder_output: encoder output; its first dimension is read as
        the batch size.
    :param encoder_end_state: encoder final state used to build the
        decoder initial state.
    :param training: not referenced by this method.
    :return: tuple ``(logits, lengths, predictions, last_predictions,
        probabilities)``.
    """
    # ================ Setup ================
    batch_size = encoder_output.shape[0]

    # ================ predictions =================
    sampler = tfa.seq2seq.GreedyEmbeddingSampler()

    go_column = tf.expand_dims([self.GO_SYMBOL] * batch_size, 1)
    go_tokens = tf.fill([batch_size], self.GO_SYMBOL)
    eos_token = self.END_SYMBOL
    # The embedded GO column is not consumed below; the call is preserved
    # as in the original (presumably it ensures the embedding variables
    # exist before they are read - TODO confirm).
    self.decoder_embedding(go_column)

    if self.attention_mechanism is not None:
        encoder_sequence_length = sequence_length_3D(encoder_output)
        self.attention_mechanism.setup_memory(
            encoder_output,
            memory_sequence_length=encoder_sequence_length)

    decoder_initial_state = self.build_decoder_initial_state(
        batch_size,
        encoder_state=encoder_end_state,
        dtype=tf.float32)

    greedy_decoder = tfa.seq2seq.BasicDecoder(
        cell=self.decoder_rnncell,
        sampler=sampler,
        output_layer=self.dense_layer)

    # ================ generate logits ==================
    max_steps = self.max_sequence_length

    # initialize inference decoder
    embedding_matrix = self.decoder_embedding.variables[0]

    # the final decoder state is not needed afterwards
    decoder_output, _, decoder_lengths = tfa.seq2seq.dynamic_decode(
        decoder=greedy_decoder,
        output_time_major=False,
        impute_finished=False,
        maximum_iterations=max_steps,
        decoder_init_input=embedding_matrix,
        decoder_init_kwargs=dict(
            start_tokens=go_tokens,
            end_token=eos_token,
            initial_state=decoder_initial_state,
        ),
    )

    logits = decoder_output.rnn_output
    lengths = decoder_lengths

    probabilities = tf.nn.softmax(
        logits, name='probabilities_{}'.format(self.name))

    predictions = tf.cast(
        decoder_output.sample_id,
        tf.int64,
        name='predictions_{}'.format(self.name))

    # index of the last step per row, clamped at 0
    row_ids = tf.range(tf.shape(predictions)[0])
    last_step_ids = tf.maximum(lengths - 1, 0)
    last_predictions = tf.gather_nd(
        predictions,
        tf.stack([row_ids, last_step_ids], axis=1),
        name='last_predictions_{}'.format(self.name))

    # mask logits
    step_mask = tf.sequence_mask(
        lengths, maxlen=logits.shape[1], dtype=tf.float32)
    logits = logits * step_mask[:, :, tf.newaxis]

    return logits, lengths, predictions, last_predictions, probabilities