def get_start_end_seq_mask(seq_len=5, length=3):
    '''
    :param seq_len: n_ctx
    :param length: ans_avg_len
    :return:
    '''
    s = tf.constant(np.array(range(seq_len)))  # [0, 1, ..., seq_len-1]
    s = tf.expand_dims(s, axis=-1)  # [[0], [1], ..., [seq_len-1]]
    s = tf.tile(s, [1, length])  # [[0, 0, 0], [1, 1, 1], ..., [seq_len-1, seq_len-1, seq_len-1]]
    s = tf.concat(tf.unstack(s, axis=0), axis=0)  # [0, 0, 0, 1, 1, 1, 2, 2, 2, ..., 4, 4, 4]

    gap = tf.constant(np.array(range(length)))  # [0, 1, 2]
    gap = tf.tile(gap, [seq_len])  # [0, 1, 2, 0, 1, 2, ..., 0, 1, 2]

    e = s + gap  # [0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, ...]

    # One-hot mask for each start position: mask(s + 1) - mask(s).
    s_mask = tf.cast(tf.sequence_mask(s + 1, seq_len, dtype=tf.int32), tf.float32)
    s_mask_ = tf.cast(tf.sequence_mask(s, seq_len, dtype=tf.int32), tf.float32)
    s_mask = s_mask - s_mask_

    # One-hot mask for each end position.
    e_mask = tf.cast(tf.sequence_mask(e + 1, seq_len, dtype=tf.int32), tf.float32)
    e_mask_ = tf.cast(tf.sequence_mask(e, seq_len, dtype=tf.int32), tf.float32)
    e_mask = e_mask - e_mask_

    # res = e_mask - s_mask
    res = e_mask + s_mask
    res = res / tf.reduce_sum(res, axis=-1, keepdims=True)
    res = 2.0 * res
    return res
def solve_problem_1():
    input = tf.constant([range(5), np.array(range(5)) + 1, np.array(range(5)) + 2])
    '''
    input = [
        [0, 1, 2, 3, 4],
        [1, 2, 3, 4, 5],
        [2, 3, 4, 5, 6]
    ]
    '''
    input = tf.constant([
        [0.99, 0.8, 0.7, 0.5, 0.5],
        [0.2, 0.3, 0.6, 0.7, 0.8],
        [0.1, 0.1, 0.1, 0.5, 1]
    ])
    sess = tf.Session()
    mask = tf.cast(tf.cast(tf.greater(input, 3), tf.int32), tf.float32)

    start_label = tf.constant(np.array([0, 2, 3]))
    start_label = tf.sequence_mask(start_label, 5, dtype=tf.int32)  # does not include the index itself
    end_label = tf.constant(np.array([2, 4, 3]))
    end_label = tf.sequence_mask(end_label + 1, 5, dtype=tf.int32)
    # cast so tf.losses.log_loss receives float labels matching the predictions
    res = tf.cast(end_label - start_label, tf.float32)
    log_loss = tf.losses.log_loss(res, input)
    print(sess.run([mask, start_label, end_label, res, log_loss]))
def pad_with_identity(x, sequence_length, max_sequence_length, identity_values=0):
    """Pads a tensor with identity values up to :obj:`max_sequence_length`.

    Args:
      x: A ``tf.Tensor`` of shape ``[batch_size, max(sequence_length), depth]``.
      sequence_length: The true sequence length of :obj:`x`.
      max_sequence_length: The sequence length up to which the tensor must contain
        :obj:`identity_values`.
      identity_values: The identity value.

    Returns:
      A ``tf.Tensor`` of shape ``[batch_size, max(max_sequence_length), depth]``.
    """
    maxlen = tf.reduce_max(max_sequence_length)

    mask = tf.sequence_mask(sequence_length, maxlen=maxlen, dtype=x.dtype)
    mask = tf.expand_dims(mask, axis=-1)

    mask_combined = tf.sequence_mask(max_sequence_length, dtype=x.dtype)
    mask_combined = tf.expand_dims(mask_combined, axis=-1)

    identity_mask = mask_combined * (1.0 - mask)

    x = pad_in_time(x, maxlen - tf.shape(x)[1])
    x = x * mask + (identity_mask * identity_values)

    return x
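# A quick usage sketch for pad_with_identity above. `pad_in_time` is referenced
# but not shown in this snippet; the definition below is an assumption that simply
# zero-pads the time dimension, matching how it is used here and in reduce_sequence
# further down.
import numpy as np
import tensorflow as tf

def pad_in_time(x, padding_length):
    # Pad the time (second) dimension of a [batch, time, depth] tensor with zeros.
    return tf.pad(x, [[0, 0], [0, padding_length], [0, 0]])

x = tf.ones([2, 3, 4])                     # batch of 2 sequences, 3 steps, depth 4
sequence_length = tf.constant([2, 3])      # true lengths of x
max_sequence_length = tf.constant([4, 5])  # pad up to these combined lengths
padded = pad_with_identity(x, sequence_length, max_sequence_length, identity_values=1)
with tf.Session() as sess:
    print(sess.run(padded).shape)          # (2, 5, 4)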
def sequence_mask(input_lengths, max_len=None, expand=True):
    if max_len is None:
        max_len = tf.reduce_max(input_lengths)

    if expand:
        return tf.expand_dims(tf.sequence_mask(input_lengths, max_len, dtype=tf.float32), axis=-1)
    return tf.sequence_mask(input_lengths, max_len, dtype=tf.float32)
def sequence_mask(lengths, r, expand=True):
    '''Returns a 2-D or 3-D tensorflow sequence mask depending on the argument 'expand'
    '''
    max_len = tf.reduce_max(lengths)
    max_len = _round_up_tf(max_len, tf.convert_to_tensor(r))
    if expand:
        return tf.expand_dims(tf.sequence_mask(lengths, maxlen=max_len, dtype=tf.float32), axis=-1)
    return tf.sequence_mask(lengths, maxlen=max_len, dtype=tf.float32)
def testNormal(self):
    with self.test_session():
        res = tf.sequence_mask(tf.constant([1, 3, 2]), 5)
        self.assertAllEqual(res.get_shape(), [3, 5])
        self.assertAllEqual(res.eval(), [[True, False, False, False, False],
                                         [True, True, True, False, False],
                                         [True, True, False, False, False]])

        # test dtype and default maxlen:
        res = tf.sequence_mask(tf.constant([0, 1, 4]), dtype=tf.float32)
        self.assertAllEqual(res.get_shape().as_list(), [3, None])
        self.assertAllEqual(res.eval(), [[0.0, 0.0, 0.0, 0.0],
                                         [1.0, 0.0, 0.0, 0.0],
                                         [1.0, 1.0, 1.0, 1.0]])
def attend(x, sequence_length=None, method="ave", context=None, feature_dim=None,
           mask_zero=False, maxlen=None, epsilon=1e-8, bn=True, training=False,
           seed=0, reuse=True, name="attend"):
    if method == "ave":
        if mask_zero:
            # None * step_dim
            mask = tf.sequence_mask(sequence_length, maxlen)
            mask = tf.reshape(mask, (-1, tf.shape(x)[1], 1))
            mask = tf.cast(mask, tf.float32)
            z = tf.reduce_sum(x * mask, axis=1)
            l = tf.reduce_sum(mask, axis=1)
            # in some cases especially in the early stages of training the sum may be almost zero
            z /= tf.cast(l + epsilon, tf.float32)
        else:
            z = tf.reduce_mean(x, axis=1)
    elif method == "sum":
        if mask_zero:
            # None * step_dim
            mask = tf.sequence_mask(sequence_length, maxlen)
            mask = tf.reshape(mask, (-1, tf.shape(x)[1], 1))
            mask = tf.cast(mask, tf.float32)
            z = tf.reduce_sum(x * mask, axis=1)
        else:
            z = tf.reduce_sum(x, axis=1)
    elif method == "max":
        if mask_zero:
            # None * step_dim
            mask = tf.sequence_mask(sequence_length, maxlen)
            mask = tf.expand_dims(mask, axis=-1)
            mask = tf.tile(mask, (1, 1, tf.shape(x)[2]))
            masked_data = tf.where(tf.equal(mask, tf.zeros_like(mask)),
                                   tf.ones_like(x) * -np.inf, x)  # if masked assume value is -inf
            z = tf.reduce_max(masked_data, axis=1)
        else:
            z = tf.reduce_max(x, axis=1)
    elif method == "attention":
        if context is not None:
            step_dim = tf.shape(x)[1]
            context = tf.expand_dims(context, axis=1)
            context = tf.tile(context, [1, step_dim, 1])
            y = tf.concat([x, context], axis=-1)
        else:
            y = x
        a = attention(y, feature_dim, sequence_length, mask_zero, maxlen, seed=seed)
        z = tf.reduce_sum(x * a, axis=1)
    if bn:
        # training=False has slightly better performance
        z = tf.layers.BatchNormalization()(z, training=False)
        # z = batch_normalization(z, training=training, name=name)
    return z
def attention(x, feature_dim, sequence_length, mask_zero=False, maxlen=None, epsilon=1e-8, seed=0):
    input_shape = tf.shape(x)
    step_dim = input_shape[1]
    # feature_dim = input_shape[2]
    x = tf.reshape(x, [-1, feature_dim])
    """
    "The last dimension of the inputs to `Dense` should be defined. Found `None`."
    We can't use `tf.layers.Dense` here as
        eij = tf.layers.Dense(1)(x)
    see: https://github.com/tensorflow/tensorflow/issues/13348
    workaround: specify the feature_dim as input
    """
    eij = tf.layers.Dense(1, activation=tf.nn.tanh,
                          kernel_initializer=tf.glorot_uniform_initializer(seed=seed),
                          dtype=tf.float32,
                          bias_initializer=tf.zeros_initializer())(x)
    eij = tf.reshape(eij, [-1, step_dim])
    a = tf.exp(eij)

    # apply mask after the exp. will be re-normalized next
    if mask_zero:
        # None * step_dim
        mask = tf.sequence_mask(sequence_length, maxlen)
        mask = tf.cast(mask, tf.float32)
        a = a * mask

    # in some cases especially in the early stages of training the sum may be almost zero
    a /= tf.cast(tf.reduce_sum(a, axis=1, keep_dims=True) + epsilon, tf.float32)

    a = tf.expand_dims(a, axis=-1)
    return a
def check_dtypes(lengths_dtype, maxlen_dtype):
    res = tf.sequence_mask(tf.constant([1, 3, 2], dtype=lengths_dtype),
                           tf.constant(5, dtype=maxlen_dtype))
    self.assertAllEqual(res.get_shape(), [3, 5])
    self.assertAllEqual(res.eval(), [[True, False, False, False, False],
                                     [True, True, True, False, False],
                                     [True, True, False, False, False]])
def _mask_by_length(t, length):
    """Mask t, 3-D [batch, time, dim], by length, 1-D [batch,]."""
    maxlen = t.get_shape().as_list()[1]
    mask = tf.sequence_mask(length, maxlen=maxlen)
    mask = tf.expand_dims(tf.cast(mask, tf.float32), -1)
    # shape(mask) = (batch, num_timesteps, 1)
    return t * mask
def calculate_outputs(self, x):
    h = lstm_layer(x, self.history_length, self.lstm_size, scope='lstm-1')
    h = tf.concat([h, x], axis=2)

    h_final = time_distributed_dense_layer(h, 50, activation=tf.nn.relu, scope='dense-1')

    n_components = 1
    params = time_distributed_dense_layer(h_final, n_components * 2, scope='dense-2', activation=None)
    ps, mixing_coefs = tf.split(params, 2, axis=2)

    # this is implemented incorrectly, but it still helped...
    mixing_coefs = tf.nn.softmax(mixing_coefs - tf.reduce_min(mixing_coefs, 2, keep_dims=True))
    ps = tf.nn.sigmoid(ps)

    labels = tf.tile(tf.expand_dims(self.next_is_ordered, 2), (1, 1, n_components))
    losses = tf.reduce_sum(mixing_coefs * log_loss(labels, ps), axis=2)

    sequence_mask = tf.cast(tf.sequence_mask(self.history_length, maxlen=100), tf.float32)
    avg_loss = tf.reduce_sum(losses * sequence_mask) / tf.cast(tf.reduce_sum(self.history_length), tf.float32)

    final_temporal_idx = tf.stack([tf.range(tf.shape(self.history_length)[0]),
                                   self.history_length - 1], axis=1)
    self.final_states = tf.gather_nd(h_final, final_temporal_idx)

    self.prediction_tensors = {
        'user_ids': self.user_id,
        'product_ids': self.product_id,
        'final_states': self.final_states
    }

    return avg_loss
def get_mention_emb(self, text_emb, text_outputs, mention_starts, mention_ends):
    mention_emb_list = []

    mention_start_emb = tf.gather(text_outputs, mention_starts)  # [num_mentions, emb]
    mention_emb_list.append(mention_start_emb)

    mention_end_emb = tf.gather(text_outputs, mention_ends)  # [num_mentions, emb]
    mention_emb_list.append(mention_end_emb)

    mention_width = 1 + mention_ends - mention_starts  # [num_mentions]
    if self.config["use_features"]:
        mention_width_index = mention_width - 1  # [num_mentions]
        mention_width_emb = tf.gather(
            tf.get_variable("mention_width_embeddings",
                            [self.config["max_mention_width"], self.config["feature_size"]]),
            mention_width_index)  # [num_mentions, emb]
        mention_width_emb = tf.nn.dropout(mention_width_emb, self.dropout)
        mention_emb_list.append(mention_width_emb)

    if self.config["model_heads"]:
        mention_indices = tf.expand_dims(tf.range(self.config["max_mention_width"]), 0) + \
                          tf.expand_dims(mention_starts, 1)  # [num_mentions, max_mention_width]
        mention_indices = tf.minimum(util.shape(text_outputs, 0) - 1,
                                     mention_indices)  # [num_mentions, max_mention_width]
        mention_text_emb = tf.gather(text_emb, mention_indices)  # [num_mentions, max_mention_width, emb]
        self.head_scores = util.projection(text_outputs, 1)  # [num_words, 1]
        mention_head_scores = tf.gather(self.head_scores, mention_indices)  # [num_mentions, max_mention_width, 1]
        mention_mask = tf.expand_dims(
            tf.sequence_mask(mention_width, self.config["max_mention_width"], dtype=tf.float32),
            2)  # [num_mentions, max_mention_width, 1]
        mention_attention = tf.nn.softmax(mention_head_scores + tf.log(mention_mask),
                                          dim=1)  # [num_mentions, max_mention_width, 1]
        mention_head_emb = tf.reduce_sum(mention_attention * mention_text_emb, 1)  # [num_mentions, emb]
        mention_emb_list.append(mention_head_emb)

    mention_emb = tf.concat(mention_emb_list, 1)  # [num_mentions, emb]
    return mention_emb
def _create_position_embedding(embedding_dim, num_positions, lengths, maxlen):
    """Creates position embeddings.

    Args:
      embedding_dim: Dimensionality of the embeddings. An integer.
      num_positions: The number of positions to be embedded. For example,
        if you have inputs of length up to 100, this should be 100. An integer.
      lengths: The lengths of the inputs to create position embeddings for.
        An int32 tensor of shape `[batch_size]`.
      maxlen: The maximum length of the input sequence to create position
        embeddings for. An int32 tensor.

    Returns:
      A tensor of shape `[batch_size, maxlen, embedding_dim]` that contains
      embeddings for each position. All elements past `lengths` are zero.
    """
    # Create constant position encodings
    position_encodings = tf.constant(
        position_encoding(num_positions, embedding_dim),
        name="position_encoding")

    # Slice to size of current sequence
    pe_slice = position_encodings[:maxlen, :]
    # Replicate encodings for each element in the batch
    batch_size = tf.shape(lengths)[0]
    pe_batch = tf.tile([pe_slice], [batch_size, 1, 1])

    # Mask out positions that are padded
    positions_mask = tf.sequence_mask(
        lengths=lengths, maxlen=maxlen, dtype=tf.float32)
    positions_embed = pe_batch * tf.expand_dims(positions_mask, 2)

    return positions_embed
def reduce_sequence(self, inputs, sequence_lengths):
    axis = self.axis % inputs[0].shape.ndims

    if axis == 2:
        padded, combined_length = pad_n_with_identity(inputs, sequence_lengths)
        return self.reduce(padded), combined_length
    elif axis == 1:
        # Pad all input tensors up to maximum combined length.
        combined_length = tf.add_n(sequence_lengths)
        maxlen = tf.reduce_max(combined_length)
        padded = [pad_in_time(x, maxlen - tf.shape(x)[1]) for x in inputs]

        current_length = None
        accumulator = None

        for elem, length in zip(padded, sequence_lengths):
            # Make sure paddings are 0 vectors as it is required for the next step.
            mask = tf.sequence_mask(length, maxlen=maxlen, dtype=elem.dtype)
            elem = elem * tf.expand_dims(mask, -1)

            if accumulator is None:
                accumulator = elem
                current_length = length
            else:
                accumulator += roll_sequence(elem, current_length)
                current_length += length

        return accumulator, combined_length
    else:
        raise ValueError("Unsupported concatenation on axis {}".format(axis))
def prepare_train_eval(self, t_out, out_seq_len, labels, lr, train_op=None, loss=None):
    if not loss:
        weights = tf.sequence_mask(out_seq_len, dtype=t_out.dtype)
        loss = tf.contrib.seq2seq.sequence_loss(
            t_out,
            labels,
            weights,
            average_across_batch=self.average_across_batch,
        )

    if not train_op:
        train_op = tf.contrib.layers.optimize_loss(
            loss,
            tf.train.get_global_step(),
            optimizer='SGD',
            learning_rate=lr,
            summaries=['loss', 'learning_rate'])

    return tf.estimator.EstimatorSpec(
        mode=self.mode,
        loss=loss,
        train_op=train_op,
    )
def call(self, inputs, **kwargs):
    query_key_keylen_list = inputs
    queries, keys, keys_length = query_key_keylen_list
    hist_len = keys.get_shape()[1]
    attention_score = LocalActivationUnit(
        self.hidden_size, self.activation, 0, 1, False, 1024,)([queries, keys])
    outputs = tf.transpose(attention_score, (0, 2, 1))
    key_masks = tf.sequence_mask(keys_length, hist_len)

    if self.weight_normalization:
        paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
    else:
        paddings = tf.zeros_like(outputs)

    outputs = tf.where(key_masks, outputs, paddings)

    if self.weight_normalization:
        outputs = tf.nn.softmax(outputs)

    outputs = tf.matmul(outputs, keys)
    return outputs
def _compute_metrics(self, features, labels, predictions):
    length = self._get_features_length(features)
    weights = tf.sequence_mask(
        length, maxlen=tf.shape(labels["tags"])[1], dtype=tf.float32)

    eval_metric_ops = {}
    eval_metric_ops["accuracy"] = tf.metrics.accuracy(
        labels["tags"], predictions["tags"], weights=weights)

    if self.tagging_scheme in ("bioes",):
        flag_fn = None
        if self.tagging_scheme == "bioes":
            flag_fn = flag_bioes_tags

        gold_flags, predicted_flags = tf.py_func(
            flag_fn,
            [labels["tags"], predictions["tags"], length],
            [tf.bool, tf.bool],
            stateful=False)

        precision_metric = tf.metrics.precision(gold_flags, predicted_flags)
        recall_metric = tf.metrics.recall(gold_flags, predicted_flags)

        precision = precision_metric[0]
        recall = recall_metric[0]
        f1 = (2 * precision * recall) / (recall + precision)

        eval_metric_ops["precision"] = precision_metric
        eval_metric_ops["recall"] = recall_metric
        eval_metric_ops["f1"] = (f1, tf.no_op())

    return eval_metric_ops
def mkMask(input_tensor, maxLen):
    shape_of_input = tf.shape(input_tensor)
    shape_of_output = tf.concat(axis=0, values=[shape_of_input, [maxLen]])

    oneDtensor = tf.reshape(input_tensor, shape=(-1,))
    flat_mask = tf.sequence_mask(oneDtensor, maxlen=maxLen)
    return tf.reshape(flat_mask, shape_of_output)
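# mkMask above generalizes tf.sequence_mask to length tensors of any rank by
# flattening, masking, and reshaping back. A minimal check of that behaviour:
import tensorflow as tf

lengths = tf.constant([[1, 2], [3, 0]])  # rank-2 length tensor
mask = mkMask(lengths, maxLen=4)         # shape [2, 2, 4], dtype bool
with tf.Session() as sess:
    print(sess.run(mask))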
def cross_entropy_sequence_loss(logits,
                                labels,
                                sequence_length,
                                label_smoothing=0.0,
                                average_in_time=False,
                                mode=tf.estimator.ModeKeys.TRAIN):
    """Computes the cross entropy loss of sequences.

    Args:
      logits: The unscaled probabilities.
      labels: The true labels.
      sequence_length: The length of each sequence.
      label_smoothing: The label smoothing value.
      average_in_time: If ``True``, also average the loss in the time dimension.
      mode: A ``tf.estimator.ModeKeys`` mode.

    Returns:
      A tuple (cumulated loss, loss normalizer, token-level normalizer).
    """
    batch_size = tf.shape(logits)[0]
    max_time = tf.shape(logits)[1]

    cross_entropy = _softmax_cross_entropy(logits, labels, label_smoothing, mode)
    weights = tf.sequence_mask(
        sequence_length, maxlen=max_time, dtype=cross_entropy.dtype)
    loss = tf.reduce_sum(cross_entropy * weights)
    loss_token_normalizer = tf.reduce_sum(weights)

    if average_in_time or mode != tf.estimator.ModeKeys.TRAIN:
        loss_normalizer = loss_token_normalizer
    else:
        loss_normalizer = tf.cast(batch_size, loss.dtype)

    return loss, loss_normalizer, loss_token_normalizer
def attention(queries, keys, keys_length):
    '''
    queries:     [B, H]
    keys:        [B, T, H]
    keys_length: [B]
    '''
    queries_hidden_units = queries.get_shape().as_list()[-1]
    queries = tf.tile(queries, [1, tf.shape(keys)[1]])
    queries = tf.reshape(queries, [-1, tf.shape(keys)[1], queries_hidden_units])
    din_all = tf.concat([queries, keys, queries - keys, queries * keys], axis=-1)
    d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid, name='f1_att')
    d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid, name='f2_att')
    d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3_att')
    d_layer_3_all = tf.reshape(d_layer_3_all, [-1, 1, tf.shape(keys)[1]])
    outputs = d_layer_3_all

    # Mask
    key_masks = tf.sequence_mask(keys_length, tf.shape(keys)[1])  # [B, T]
    key_masks = tf.expand_dims(key_masks, 1)  # [B, 1, T]
    paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
    outputs = tf.where(key_masks, outputs, paddings)  # [B, 1, T]

    # Scale
    outputs = outputs / (keys.get_shape().as_list()[-1] ** 0.5)

    # Activation
    outputs = tf.nn.softmax(outputs)  # [B, 1, T]

    # Weighted sum
    outputs = tf.matmul(outputs, keys)  # [B, 1, H]

    return outputs
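# The masking trick used in the two attention snippets above: replace the scores
# at padded positions with a very large negative number so they contribute ~0
# probability after softmax. A minimal standalone illustration:
import tensorflow as tf

scores = tf.constant([[1.0, 2.0, 3.0]])
mask = tf.sequence_mask([2], maxlen=3)            # [[True, True, False]]
paddings = tf.ones_like(scores) * (-2 ** 32 + 1)
weights = tf.nn.softmax(tf.where(mask, scores, paddings))
with tf.Session() as sess:
    print(sess.run(weights))                      # third weight is ~0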
def create_variables_for_optimization(self):
    with tf.name_scope("optimization"):
        with tf.name_scope("masker"):
            self.mask = tf.sequence_mask(self.seq_len, self.num_step)
            self.mask = tf.reshape(tf.cast(self.mask, tf.float32), (-1,))

        if self.loss_function == "cross_entropy":
            self.pl_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logit, labels=self.actions_flatten)
        elif self.loss_function == "l2":
            self.one_hot_actions = tf.one_hot(self.actions_flatten, self.num_actions)
            self.pl_loss = tf.reduce_mean((self.probs - self.one_hot_actions) ** 2, axis=1)
        else:
            raise ValueError("loss function type is not defined")

        self.pl_loss = tf.multiply(self.pl_loss, self.mask)
        self.pl_loss = tf.reduce_mean(tf.multiply(self.pl_loss, self.returns_flatten))

        self.entropy = tf.multiply(self.entropy, self.mask)
        self.entropy = tf.reduce_mean(self.entropy)

        self.loss = self.pl_loss - self.entropy_bonus * self.entropy

        self.trainable_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                     scope="policy_network")
        self.gradients = self.optimizer.compute_gradients(self.loss,
                                                          var_list=self.trainable_variables)
        self.clipped_gradients = [(tf.clip_by_norm(grad, self.max_gradient), var)
                                  for grad, var in self.gradients]
        self.train_op = self.optimizer.apply_gradients(self.clipped_gradients,
                                                       self.global_step)
        self.grad_norm = tf.global_norm([grad for grad, var in self.gradients])
        self.var_norm = tf.global_norm(self.trainable_variables)
def make_positions(sequence_length, maximum_length=None):
    """Builds a sequence of positions.

    The first position is 1 as the 0 index is reserved to padding positions.

    Args:
      sequence_length: The length of each sequence as a ``tf.Tensor`` of shape
        :math:`[B]`.
      maximum_length: Optional size of the returned time dimension. Otherwise
        it is the maximum of :obj:`sequence_length`.

    Returns:
      The sequence of positions as a ``tf.Tensor`` of shape :math:`[B, T]`.
    """
    if maximum_length is None:
        maximum_length = tf.reduce_max(sequence_length)

    batch_size = tf.shape(sequence_length)[0]

    # Make 0 the position of padding.
    position = tf.range(maximum_length) + 1
    position = tf.tile(position, [batch_size])
    position = tf.reshape(position, [batch_size, -1])

    mask = tf.sequence_mask(
        sequence_length, maxlen=maximum_length, dtype=position.dtype)

    position = position * mask
    return position
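# Expected behaviour of make_positions above: positions start at 1 and the
# integer mask zeroes out padding positions.
import tensorflow as tf

positions = make_positions(tf.constant([2, 4]), maximum_length=5)
with tf.Session() as sess:
    print(sess.run(positions))
    # [[1 2 0 0 0]
    #  [1 2 3 4 0]]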
def NLL(self, y, lengths, pis, mus, sigmas, rho, es, eps=1e-8):
    sigma_1, sigma_2 = tf.split(sigmas, 2, axis=2)
    y_1, y_2, y_3 = tf.split(y, 3, axis=2)
    mu_1, mu_2 = tf.split(mus, 2, axis=2)

    norm = 1.0 / (2 * np.pi * sigma_1 * sigma_2 * tf.sqrt(1 - tf.square(rho)))
    Z = tf.square((y_1 - mu_1) / (sigma_1)) + \
        tf.square((y_2 - mu_2) / (sigma_2)) - \
        2 * rho * (y_1 - mu_1) * (y_2 - mu_2) / (sigma_1 * sigma_2)

    exp = -1.0 * Z / (2 * (1 - tf.square(rho)))
    gaussian_likelihoods = tf.exp(exp) * norm
    gmm_likelihood = tf.reduce_sum(pis * gaussian_likelihoods, 2)
    gmm_likelihood = tf.clip_by_value(gmm_likelihood, eps, np.inf)

    bernoulli_likelihood = tf.squeeze(tf.where(tf.equal(tf.ones_like(y_3), y_3), es, 1 - es))

    nll = -(tf.log(gmm_likelihood) + tf.log(bernoulli_likelihood))

    sequence_mask = tf.logical_and(
        tf.sequence_mask(lengths, maxlen=tf.shape(y)[1]),
        tf.logical_not(tf.is_nan(nll)),
    )
    nll = tf.where(sequence_mask, nll, tf.zeros_like(nll))
    num_valid = tf.reduce_sum(tf.cast(sequence_mask, tf.float32), axis=1)

    sequence_loss = tf.reduce_sum(nll, axis=1) / tf.maximum(num_valid, 1.0)
    element_loss = tf.reduce_sum(nll) / tf.maximum(tf.reduce_sum(num_valid), 1.0)
    return sequence_loss, element_loss
def crossentropy(logits, targets, sequence_length):
    """ Computes cross entropy loss of a batch of data (not averaged by batch_size).

    Args:
        logits: The logits Tensor with shape [timesteps, batch_size, vocab_size].
        targets: The gold labels Tensor with shape [timesteps, batch_size].
        sequence_length: The length of `targets`, [batch_size, ]

    Returns: Loss sum and weight sum (the number of samples in the batch).
    """
    # [timesteps, batch_size]
    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=targets)
    # [timesteps, batch_size]
    loss_mask = tf.transpose(
        tf.sequence_mask(
            lengths=tf.to_int32(sequence_length),
            maxlen=tf.to_int32(tf.shape(targets)[0]),
            dtype=tf.float32), [1, 0])
    losses = losses * loss_mask
    loss_sum = tf.reduce_sum(losses)
    return loss_sum, tf.to_float(tf.shape(sequence_length)[0])
def smoothing_crossentropy_avgall(logits, targets, sequence_length):
    """ Computes cross entropy loss of a batch of data with label smoothing.

    The final loss is averaged by the length of each sequence and then
    averaged by the batch size.

    Args:
        logits: The logits Tensor with shape [timesteps, batch_size, vocab_size].
        targets: The gold labels Tensor with shape [timesteps, batch_size].
        sequence_length: The length of `targets`, [batch_size, ]

    Returns: Loss sum and weight sum.
    """
    soft_targets, normalizing = label_smoothing(targets, logits.get_shape().as_list()[-1])
    losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                     labels=soft_targets) - normalizing
    # [timesteps, batch_size]
    loss_mask = tf.transpose(
        tf.sequence_mask(
            lengths=tf.to_int32(sequence_length),
            maxlen=tf.to_int32(tf.shape(targets)[0]),
            dtype=tf.float32), [1, 0])
    losses = losses * loss_mask

    # average loss
    avg_length = tf.to_float(sequence_length)
    loss_by_time = tf.reduce_sum(losses, axis=0) / avg_length
    loss_sum = tf.reduce_sum(loss_by_time)
    return loss_sum, tf.to_float(tf.shape(sequence_length)[0])
def filter(predictions, actual_lengths):
    # predictions:    batch_size * max_time_steps * num_classes
    # actual_lengths: list of actual sequence length in a batch
    max_length = tf.shape(predictions)[1]
    mask = tf.sequence_mask(actual_lengths, max_length, dtype=tf.bool)
    predictions_cls = tf.argmax(predictions, 2, name='predictions_cls')
    invalid_cls = tf.zeros(shape=tf.shape(predictions_cls), dtype=tf.int64) - 1
    return tf.where(mask, predictions_cls, invalid_cls, name='filter_predictions_cls')
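# filter above keeps the argmax class at valid steps and writes -1 at padded
# steps. A small check with 2 sequences of 3 steps over 2 classes:
import tensorflow as tf

predictions = tf.constant([[[0.9, 0.1], [0.2, 0.8], [0.5, 0.5]],
                           [[0.3, 0.7], [0.6, 0.4], [0.1, 0.9]]])
filtered = filter(predictions, actual_lengths=[2, 3])
with tf.Session() as sess:
    print(sess.run(filtered))
    # [[ 0  1 -1]
    #  [ 1  0  1]]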
def softmax(inputs, length, max_length):
    inputs = tf.cast(inputs, tf.float32)
    max_axis = tf.reduce_max(inputs, 2, keep_dims=True)
    inputs = tf.exp(inputs - max_axis)
    length = tf.reshape(length, [-1])
    mask = tf.reshape(tf.cast(tf.sequence_mask(length, max_length), tf.float32),
                      tf.shape(inputs))
    inputs *= mask
    _sum = tf.reduce_sum(inputs, reduction_indices=2, keep_dims=True) + 1e-9
    return inputs / _sum
def BOW_encoder(ids_, ns_, V, embed_dim, hidden_dims, dropout_rate=0,
                is_training=None, **unused_kw):
    """Construct a bag-of-words encoder.

    You don't need to define any variables directly in this function, but you should:
      - Build the embeddings (using embedding_layer(...))
      - Apply the mask to zero-out padding indices, and sum the embeddings
        for each example
      - Build a stack of hidden layers (using fully_connected_layers(...))

    Note that this function returns the final encoding h_ as well as the masked
    embeddings xs_. The latter is used for L2 regularization, so that we can
    penalize the norm of only those vectors that were actually used for each
    example.

    Args:
      ids_: [batch_size, max_len] Tensor of int32, integer ids
      ns_: [batch_size] Tensor of int32, (clipped) length of each sequence
      V: (int) vocabulary size
      embed_dim: (int) embedding dimension
      hidden_dims: list(int) dimensions of the output of each layer
      dropout_rate: (float) rate to use for dropout
      is_training: (bool) if true, is in training mode

    Returns:
      (h_, xs_)
      h_: [batch_size, hidden_dims[-1]] Tensor of float32, the activations of
        the last layer constructed by this function.
      xs_: [batch_size, max_len, embed_dim] Tensor of float32, the per-word
        embeddings as returned by embedding_layer and with the mask applied
        to zero-out the pad indices.
    """
    assert is_training is not None, "is_training must be explicitly set to True or False"

    # Embedding layer should produce:
    #   xs_: [batch_size, max_len, embed_dim]
    with tf.variable_scope("Embedding_Layer"):
        #### YOUR CODE HERE ####
        xs_ = None  # replace with a call to embedding_layer

        #### END(YOUR CODE) ####

    #### YOUR CODE HERE ####
    # Mask off the padding indices with zeros
    # mask_: [batch_size, max_len, 1] with values of 0.0 or 1.0
    mask_ = tf.expand_dims(tf.sequence_mask(ns_, xs_.shape[1], dtype=tf.float32), -1)

    # Multiply xs_ by the mask to zero-out pad indices.

    # Sum embeddings: [batch_size, max_len, embed_dim] -> [batch_size, embed_dim]

    # Build a stack of fully-connected layers

    #### END(YOUR CODE) ####
    return h_, xs_
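# A minimal self-contained sketch of the computation the scaffold above asks
# for. Hedged: the course helpers embedding_layer(...) and fully_connected_layers(...)
# are not shown in this collection, so plain TF ops stand in for them and the
# hidden-layer activation (tanh) is an assumption.
import tensorflow as tf

def bow_encoder_sketch(ids_, ns_, V, embed_dim, hidden_dims):
    W_embed = tf.get_variable("W_embed", [V, embed_dim])
    xs_ = tf.nn.embedding_lookup(W_embed, ids_)                      # [batch, max_len, embed_dim]
    mask_ = tf.expand_dims(tf.sequence_mask(ns_, tf.shape(ids_)[1],
                                            dtype=tf.float32), -1)   # [batch, max_len, 1]
    xs_ = xs_ * mask_                                                # zero out pad embeddings
    h_ = tf.reduce_sum(xs_, axis=1)                                  # [batch, embed_dim]
    for i, dim in enumerate(hidden_dims):
        h_ = tf.layers.dense(h_, dim, activation=tf.tanh, name="fc_%d" % i)
    return h_, xs_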
def _mask_by_length(t, length):
    """Mask t, 3-D [batch, time, dim], by length, 1-D [batch,]."""
    maxlen = t.get_shape().as_list()[1]

    # Subtract 1 from length to prevent the perturbation from going on 'eos'
    mask = tf.sequence_mask(length - 1, maxlen=maxlen)
    mask = tf.expand_dims(tf.cast(mask, tf.float32), -1)
    # shape(mask) = (batch, num_timesteps, 1)
    return t * mask
def define_computation_graph(source_vocab_size: int, target_vocab_size: int, batch_size: int):
    tf.reset_default_graph()

    # Placeholders for inputs and outputs
    encoder_inputs = tf.placeholder(shape=(batch_size, None), dtype=tf.int32, name='encoder_inputs')
    decoder_targets = tf.placeholder(shape=(batch_size, None), dtype=tf.int32, name='decoder_targets')
    decoder_inputs = tf.placeholder(shape=(batch_size, None), dtype=tf.int32, name='decoder_inputs')

    with tf.variable_scope("Embeddings"):
        source_embedding = tf.get_variable('source_embedding', [source_vocab_size, C.EMBEDDING_SIZE])
        # Note: the target embedding must be sized by the *target* vocabulary.
        target_embedding = tf.get_variable('target_embedding', [target_vocab_size, C.EMBEDDING_SIZE])
        encoder_inputs_embedded = tf.nn.embedding_lookup(source_embedding, encoder_inputs)
        decoder_inputs_embedded = tf.nn.embedding_lookup(target_embedding, decoder_inputs)

    with tf.variable_scope("Encoder"):
        encoder_cell = tf.contrib.rnn.LSTMCell(C.HIDDEN_SIZE)
        initial_state = encoder_cell.zero_state(batch_size, tf.float32)
        encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
            encoder_cell, encoder_inputs_embedded,
            initial_state=initial_state, dtype=tf.float32)

    with tf.variable_scope("Decoder"):
        decoder_cell = tf.contrib.rnn.LSTMCell(C.HIDDEN_SIZE)
        decoder_outputs, decoder_final_state = tf.nn.dynamic_rnn(
            decoder_cell, decoder_inputs_embedded,
            initial_state=encoder_final_state, dtype=tf.float32)

    with tf.variable_scope("Logits"):
        decoder_logits = tf.contrib.layers.linear(decoder_outputs, target_vocab_size)

    with tf.variable_scope("Loss"):
        one_hot_labels = tf.one_hot(decoder_targets, depth=target_vocab_size, dtype=tf.float32)
        stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=one_hot_labels, logits=decoder_logits)
        # mask padded positions
        target_lengths = compute_lengths(decoder_targets)
        target_weights = tf.sequence_mask(lengths=target_lengths, maxlen=None,
                                          dtype=decoder_logits.dtype)
        weighted_cross_entropy = stepwise_cross_entropy * target_weights
        loss = tf.reduce_mean(weighted_cross_entropy)

    with tf.variable_scope('Optimizer'):
        train_step = tf.train.AdamOptimizer(learning_rate=C.LEARNING_RATE).minimize(loss)

    # Logging of cost scalar (@tensorboard)
    tf.summary.scalar('loss', loss)
    summary = tf.summary.merge_all()

    return encoder_inputs, decoder_targets, decoder_inputs, loss, train_step, decoder_logits, summary
def buildNetwork(self):
    with tf.name_scope('inputs'):
        self.getInputs()

    with tf.variable_scope('sampling', reuse=tf.AUTO_REUSE):
        """ sample from the rnn for nSample times """
        # ce_loss, rewards: [batch_size*n_samples]
        # probs, valid, skip_flag: [batch_size*n_samples, maxSteps]
        # n_corrects: single number, number of correct predictions in batch_size*n_samples
        ce_loss, rewards, predicted_skips, probs, valid, self.n_corrects, skip_flag, transfering_loss = \
            self.get_loss_and_rewards()

        # [batch_size, n_samples]
        ce_loss = tf.reshape(ce_loss, shape=[self.batch_size, self.n_samples], name='ce_loss')
        rewards = tf.reshape(rewards, shape=[self.batch_size, self.n_samples], name='rewards')
        transfering_loss = tf.reshape(transfering_loss,
                                      shape=[self.batch_size, self.n_samples],
                                      name='transfering_loss')

        # [batch_size, n_samples, maxSteps]
        probs = tf.reshape(probs,
                           shape=[self.batch_size, self.n_samples, self.args.maxSteps],
                           name='probs_ori')
        valid = tf.reshape(valid,
                           shape=[self.batch_size, self.n_samples, self.args.maxSteps],
                           name='valid_ori')
        skip_flag = tf.reshape(skip_flag,
                               shape=[self.batch_size, self.n_samples, self.args.maxSteps],
                               name='skip_flag_ori')
        predicted_skips = tf.reshape(predicted_skips,
                                     shape=[self.batch_size, self.n_samples, self.args.maxSteps],
                                     name='predicted_skips_ori')

        probs = tf.add(probs, 1e-5, name='probs')
        valid = tf.cast(valid, tf.float32, name='valid_ori')

        # mask out steps exceeding the length of each sample
        # [batch_size, n_samples]
        length = tf.reshape(self.length, shape=[self.batch_size, self.n_samples], name='length')
        skip_flag_mask = tf.sequence_mask(lengths=length,
                                          maxlen=self.args.maxSteps,
                                          dtype=tf.float32,
                                          name='skip_flag_mask')
        # [batch_size, n_samples, maxSteps]
        skip_flag = tf.multiply(skip_flag, skip_flag_mask, name='skip_flag')
        # [batch_size, n_samples]
        n_skips = tf.reduce_sum(skip_flag, axis=-1, name='n_skips')
        # [batch_size, n_samples]
        skip_rate = tf.divide(n_skips, tf.cast(length, tf.float32), name='skip_rate')
        self.skip_rate = tf.reshape(skip_rate, shape=[-1])

        # [batch_size, n_samples]
        # number of valid decisions made in each sample
        # for sentence whose length <= min_read, n_valids would be 0
        n_valids = tf.reduce_sum(valid, axis=-1, name='n_valids')
        self.n_valids_sum = tf.reduce_sum(n_valids, name='n_valids_sum')

    with tf.name_scope('rewards'):
        # [batch_size, n_samples]
        sparse_rewards = tf.reduce_sum(predicted_skips, axis=-1, name='sparse_rewards')
        sparse_rewards = tf.multiply(self.args.sparse, tf.cast(sparse_rewards, tf.float32))
        rewards = tf.add(tf.cast(sparse_rewards, tf.float32),
                         tf.cast(rewards, tf.float32),
                         name='rewards')

    with tf.name_scope('pg_loss'):
        # [batch_size, ]
        rewards_mean, rewards_var = tf.nn.moments(rewards, axes=-1, name='rewards_moments')
        # [batch_size, 1]
        rewards_mean = tf.expand_dims(rewards_mean, axis=-1)
        # [batch_size, n_samples]
        rewards_mean = tf.tile(rewards_mean, multiples=[1, self.n_samples], name='rewards_mean')
        # [batch_size, n_samples]
        rewards_norm = tf.subtract(rewards, rewards_mean, name='rewards_norm')
        # [batch_size, n_samples, maxSteps]
        rewards_norm_tiled = tf.tile(tf.expand_dims(rewards_norm, axis=-1),
                                     multiples=[1, 1, self.args.maxSteps])
        # mask out steps that are not valid
        # [batch_size, n_samples, maxSteps]
        rewards_norm_tiled = tf.multiply(rewards_norm_tiled, valid, name='rewards_norm_tiled')
        rewards_norm_tiled = tf.stop_gradient(rewards_norm_tiled)
        # [batch_size, n_samples, maxSteps]
        pg_loss_ori = tf.multiply(rewards_norm_tiled, tf.log(probs), name='pg_loss_ori')

        # each sampled sequence is averaged over its valid steps
        # [batch_size, n_samples]
        pg_loss_sum = tf.reduce_sum(pg_loss_ori, axis=-1, name='pg_loss_sum')
        # [batch_size, n_samples]
        # some n_valids is 0, resulting in nan in pg_loss_avg; replace the zero
        # denominator so the average over valid steps becomes 0 there
        n_valids = tf.where(tf.equal(tf.cast(n_valids, tf.int32), 0),
                            tf.ones_like(n_valids) * 1e10,
                            n_valids,
                            name='n_valids_final')
        pg_loss_avg = tf.divide(pg_loss_sum, n_valids, name='pg_loss_avg')
        # average over samples
        # [batch_size]
        pg_loss = tf.reduce_mean(pg_loss_avg, axis=-1, name='pg')
        pg_loss = tf.subtract(0.0, pg_loss, name='pg_loss')
        # pg_loss = tf.Print(pg_loss, data=[tf.reduce_sum(pg_loss)])

    with tf.name_scope('gradients'):
        # average over samples
        # [batch_size]
        ce_loss = tf.reduce_mean(ce_loss, axis=-1, name='ce_loss')
        transfering_loss = tf.reduce_mean(transfering_loss, axis=-1, name='transfering_loss')

        # mask out transfering_loss and pg_loss
        is_transfering = tf.cast(self.is_transfering, tf.float32)
        # disable transfering_loss when RL begins
        transfering_loss = tf.multiply(is_transfering, transfering_loss)
        pg_loss = tf.multiply((1.0 - is_transfering), pg_loss)

        trainable_params = tf.trainable_variables()

        # ce_params = []
        # pg_params = []
        #
        # for param in trainable_params:
        #     if param.name == 'sampling/loop/skip_lstm_cell/skip_kernel:0' \
        #             or param.name == 'sampling/loop/skip_lstm_cell/skip_bias:0':
        #         pg_params.append(param)
        #     else:
        #         ce_params.append(param)

        # TODO: should we use gradients from pg_loss for params other than skip_kernel and skip_bias?
        # Yes, lower level nets should also be optimized for prediction of skips

        # add sparse_loss
        # sparse_loss = tf.Print(sparse_loss, data=[tf.reduce_sum(sparse_loss)])

        # when testing upper bound, we only care about ce_loss
        # self.loss = tf.reduce_sum(ce_loss + pg_loss + transfering_loss, name='loss')
        self.loss = tf.reduce_sum(ce_loss, name='loss')
        gradients_all = tf.gradients(self.loss, trainable_params)
        # gradients_ce = tf.gradients(ce_loss, ce_params)
        # gradients_pg = tf.gradients(pg_loss, pg_params)
        # gradients_sparse = tf.gradients(sparse_loss, trainable_params)

        opt = tf.train.AdamOptimizer(learning_rate=self.args.learningRate,
                                     beta1=0.9, beta2=0.999, epsilon=1e-08)
        # all_params = ce_params + pg_params
        # all_gradients = gradients_ce + gradients_pg
        self.optOp = opt.apply_gradients(zip(gradients_all, trainable_params))

    print('RL model built!')
def build_network(is_training):
    train_output_embed, enc_state = encoder_net(image, 'encode_features', is_training)
    # vocab_size: the total vocabulary size of the input data, i.e. the number of
    # distinct word classes (not the total word count);
    # embed_dim: the desired dimensionality of the embedding matrix
    embeddings = tf.get_variable(name='embed_matrix', shape=[4, 4])
    output_embed = embedding_ops.embedding_lookup(embeddings, train_output)
    start_tokens = tf.zeros([40], dtype=tf.int64)
    train_helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
        output_embed, train_length, embeddings, sample_rate)
    # Helper for the inference stage: take the argmax of the output logits to get
    # an id, then feed it through the embedding layer as the input of the next
    # time step.
    # start_tokens: int32 vector shaped [batch_size], the start tokens
    #   (the token_id fed as the first input of each sequence in the batch).
    # end_token: int32 scalar, the token that marks end of decoding.
    pred_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embeddings,
        start_tokens=tf.to_int32(start_tokens),
        end_token=1)  # ids of the GO and EOS tokens
    train_outputs = decode(train_helper, train_output_embed, 'decode', enc_state)
    pred_outputs = decode(pred_helper, train_output_embed, 'decode', enc_state, reuse=True)
    train_decode_result = train_outputs[0].rnn_output[:, :-1, :]
    pred_decode_result = pred_outputs[0].rnn_output
    mask = tf.cast(tf.sequence_mask(40 * [train_length[0] - 1], train_length[0]), tf.float32)
    att_loss = tf.contrib.seq2seq.sequence_loss(train_outputs[0].rnn_output,
                                                target_output, weights=mask)
    loss = tf.reduce_mean(att_loss)
    return loss, train_decode_result, pred_decode_result
def train_CRNN():
    print('Run CRNN chord recognition on %s-%d...' % (hp.dataset, hp.test_set_id))

    # Load training and testing data
    train_data, test_data = load_data_symbol(
        dir=hp.dataset + '_preprocessed_data_MIREX_Mm.pickle',
        test_set_id=hp.test_set_id,
        sequence_with_overlap=hp.train_sequence_with_overlap)

    n_train_sequences = train_data['pianoroll'].shape[0]
    n_test_sequences = test_data['pianoroll'].shape[0]
    n_iterations_per_epoch = int(math.ceil(n_train_sequences / hp.n_batches))
    print('n_train_sequences =', n_train_sequences)
    print('n_test_sequences =', n_test_sequences)
    print('n_iterations_per_epoch =', n_iterations_per_epoch)
    print(hp)

    with tf.name_scope('placeholder'):
        x_p = tf.placeholder(tf.int32, [None, hp.n_steps, 88], name="pianoroll")
        x_len = tf.placeholder(tf.int32, [None], name="seq_lens")
        y_tc = tf.placeholder(tf.int32, [None, hp.n_steps], name="tchord")
        dropout = tf.placeholder(dtype=tf.float32, name="dropout_rate")
        is_training = tf.placeholder(dtype=tf.bool, name="is_training")
        global_step = tf.placeholder(dtype=tf.int32, name='global_step')

    with tf.name_scope('model'):
        x_in = tf.cast(x_p, tf.float32)
        source_mask = tf.sequence_mask(lengths=x_len, maxlen=hp.n_steps,
                                       dtype=tf.float32)  # [n_batches, n_steps]
        input_embed = crm.CRNN(x_in, x_len, dropout, is_training, hp)

    with tf.variable_scope("output_projection"):
        input_embed = tf.layers.dropout(input_embed, rate=dropout, training=is_training)
        chord_logits = tf.layers.dense(input_embed, hp.n_chord_classes)

    with tf.name_scope('loss'):
        loss = tf.losses.softmax_cross_entropy(
            onehot_labels=tf.one_hot(y_tc, hp.n_chord_classes),
            logits=chord_logits,
            weights=source_mask)
        valid = tf.reduce_sum(source_mask)

        summary_loss = tf.Variable(0.0, trainable=False, dtype=tf.float32)
        summary_valid = tf.Variable(0, trainable=False, dtype=tf.float32)
        update_loss = tf.assign(summary_loss, summary_loss + valid * loss)
        update_valid = tf.assign(summary_valid, summary_valid + valid)
        mean_loss = tf.assign(summary_loss, summary_loss / summary_valid)
        clr_summary_loss = summary_loss.initializer
        clr_summary_valid = summary_valid.initializer
        tf.summary.scalar('Loss_total', summary_loss)

    with tf.name_scope('evaluation'):
        chord_mask = tf.cast(source_mask, tf.bool)
        chord_mask = tf.logical_and(chord_mask, tf.less(y_tc, tquality_dict['O'] * 12))
        pred_tc = tf.argmax(chord_logits, axis=2, output_type=tf.int32)
        pred_tc_correct = tf.equal(pred_tc, y_tc)
        pred_tc_correct_mask = tf.boolean_mask(tensor=pred_tc_correct, mask=chord_mask)
        correct = tf.reduce_sum(tf.cast(pred_tc_correct_mask, tf.float32))
        total = tf.cast(tf.size(pred_tc_correct_mask), tf.float32)

        summary_count = tf.Variable([0.0 for _ in range(2)], trainable=False, dtype=tf.float32)
        summary_score = tf.Variable(0.0, trainable=False, dtype=tf.float32)
        update_count = tf.assign(summary_count, summary_count + [correct, total])
        acc_tc = summary_count[0] / summary_count[1]
        compute_score = tf.assign(summary_score, summary_score + acc_tc)
        clr_summary_count = summary_count.initializer
        clr_summary_score = summary_score.initializer
        tf.summary.scalar('Accuracy_tchord', summary_score)

    with tf.name_scope('optimization'):
        # Apply warm-up learning rate
        warm_up_steps = tf.constant(4000, dtype=tf.float32)
        gstep = tf.cast(global_step, dtype=tf.float32)
        learning_rate = pow(hp.input_embed_size, -0.5) * tf.minimum(
            tf.pow(gstep, -0.5), gstep * tf.pow(warm_up_steps, -1.5))
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                           beta1=0.9, beta2=0.98, epsilon=1e-9)
        # update moving_mean and moving_variance of batch normalization
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        train_op = optimizer.minimize(loss)
        train_op = tf.group([train_op, update_ops])

    # Graph location and summary writers
    print('Saving graph to: %s' % hp.graph_location)
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(hp.graph_location + '\\train')
    test_writer = tf.summary.FileWriter(hp.graph_location + '\\test')
    train_writer.add_graph(tf.get_default_graph())
    test_writer.add_graph(tf.get_default_graph())
    saver = tf.train.Saver(max_to_keep=1)

    # Training
    print('Train the model...')
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        startTime = time.time()  # start time of training
        best_score = [0.0, 0.0]
        in_succession = 0
        best_epoch = 0
        for step in range(hp.n_training_steps):
            # Training
            if step == 0:
                indices = range(n_train_sequences)
                batch_indices = [indices[x:x + hp.n_batches]
                                 for x in range(0, len(indices), hp.n_batches)]

            if step >= 2 * n_iterations_per_epoch and step % n_iterations_per_epoch == 0:
                # Shuffle training data
                indices = random.sample(range(n_train_sequences), n_train_sequences)
                batch_indices = [indices[x:x + hp.n_batches]
                                 for x in range(0, len(indices), hp.n_batches)]

            batch = (train_data['pianoroll'][batch_indices[step % len(batch_indices)]],
                     train_data['len'][batch_indices[step % len(batch_indices)]],
                     train_data['tchord'][batch_indices[step % len(batch_indices)]],
                     train_data['root'][batch_indices[step % len(batch_indices)]],
                     train_data['tquality'][batch_indices[step % len(batch_indices)]])

            train_run_list = [train_op, update_valid, update_loss, update_count,
                              loss, pred_tc, chord_mask]
            train_feed_dict = {x_p: batch[0],
                               x_len: batch[1],
                               y_tc: batch[2],
                               dropout: hp.drop,
                               is_training: True,
                               global_step: step + 1}
            _, _, _, _, train_loss, train_pred_tc, train_chord_mask = sess.run(
                train_run_list, feed_dict=train_feed_dict)

            if step == 0:
                print('*~ loss %.4f ~*' % (train_loss))

            # Display training log & Testing
            if step > 0 and step % n_iterations_per_epoch == 0:
                sess.run([mean_loss, compute_score])
                train_summary, train_loss, train_score = sess.run(
                    [merged, summary_loss, summary_score])
                sess.run([clr_summary_valid, clr_summary_loss,
                          clr_summary_count, clr_summary_score])
                train_writer.add_summary(train_summary, step)
                print("---- step %d, epoch %d: train_loss: %.4f, evaluation: tc %.4f ----"
                      % (step, step // n_iterations_per_epoch, train_loss, train_score))
                display_len = 64
                print('len =', batch[1][0])
                print('y_root'.ljust(7, ' '),
                      ''.join([[k for k, v in root_dict.items() if v == b][0].rjust(3, ' ')
                               for b in batch[3][0, :display_len]]))
                print('y_tq'.ljust(7, ' '),
                      ''.join([[k for k, v in tquality_dict.items() if v == b][0].rjust(3, ' ')
                               for b in batch[4][0, :display_len]]))
                print('valid'.ljust(7, ' '),
                      ''.join(['y'.rjust(3, ' ') if b else 'n'.rjust(3, ' ')
                               for b in train_chord_mask[0, :display_len]]))
                print('y_tc'.ljust(7, ' '),
                      ''.join([str(b).rjust(3, ' ') for b in batch[2][0, :display_len]]))
                print('pred_tc'.ljust(7, ' '),
                      ''.join([str(b).rjust(3, ' ') for b in train_pred_tc[0, :display_len]]))

                # Testing
                test_run_list = [update_valid, update_loss, update_count, pred_tc, chord_mask]
                test_feed_dict = {x_p: test_data['pianoroll'],
                                  x_len: test_data['len'],
                                  y_tc: test_data['tchord'],
                                  dropout: 0.0,
                                  is_training: False}
                _, _, _, test_pred_tc, test_chord_mask = sess.run(
                    test_run_list, feed_dict=test_feed_dict)
                sess.run([mean_loss, compute_score])
                test_summary, test_loss, test_score = sess.run(
                    [merged, summary_loss, summary_score])
                sess.run([clr_summary_valid, clr_summary_loss,
                          clr_summary_count, clr_summary_score])
                test_writer.add_summary(test_summary, step)

                sq = crm.segmentation_quality(test_data['tchord'], test_pred_tc,
                                              test_data['len'])

                print("==== step %d, epoch %d: test_loss: %.4f, evaluation: tc %.4f, sq %.4f ===="
                      % (step, step // n_iterations_per_epoch, test_loss, test_score, sq))
                sample_id = random.randint(0, n_test_sequences - 1)
                print('len =', test_data['len'][sample_id])
                print('y_root'.ljust(7, ' '),
                      ''.join([[k for k, v in root_dict.items() if v == b][0].rjust(3, ' ')
                               for b in test_data['root'][sample_id, :display_len]]))
                print('y_tq'.ljust(7, ' '),
                      ''.join([[k for k, v in tquality_dict.items() if v == b][0].rjust(3, ' ')
                               for b in test_data['tquality'][sample_id, :display_len]]))
                print('valid'.ljust(7, ' '),
                      ''.join(['y'.rjust(3, ' ') if b else 'n'.rjust(3, ' ')
                               for b in test_chord_mask[sample_id, :display_len]]))
                print('y_tc'.ljust(7, ' '),
                      ''.join([str(b).rjust(3, ' ')
                               for b in test_data['tchord'][sample_id, :display_len]]))
                print('pred_tc'.ljust(7, ' '),
                      ''.join([str(b).rjust(3, ' ')
                               for b in test_pred_tc[sample_id, :display_len]]))

                if step > 0 and test_score + sq > sum(best_score):
                    best_score = [test_score, sq]
                    best_epoch = step // n_iterations_per_epoch
                    in_succession = 0
                    # Save variables of the model
                    print('*saving variables...\n')
                    saver.save(sess, hp.graph_location + '\\CRNN_chord_recognition_'
                               + hp.dataset + '_' + str(hp.test_set_id) + '.ckpt')
                else:
                    in_succession += 1
                    if in_succession > hp.n_in_succession:
                        print('Early stopping.')
                        break

        elapsed_time = time.time() - startTime
        print('\nCRNN chord symbol recognition on %s-%d:' % (hp.dataset, hp.test_set_id))
        print('training time = %.2f hr' % (elapsed_time / 3600))
        print('best epoch = ', best_epoch)
        print('best score =', np.round(best_score, 4))
def mask_logits(self, logits, sequence_lengths):
    # maxlen is inferred as the maximum of sequence_lengths, so the last
    # dimension of logits must match that maximum.
    mask = tf.sequence_mask(sequence_lengths, dtype=tf.float32)
    mask_value = -1e32
    return logits + mask_value * (1 - mask)
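# mask_logits above applies the mask additively: valid positions get +0 and
# padded positions get -1e32, so they vanish after a later softmax. An inline
# check of the same computation:
import tensorflow as tf

logits = tf.constant([[1.0, 2.0, 3.0]])
mask = tf.sequence_mask([2], maxlen=3, dtype=tf.float32)
masked = logits + -1e32 * (1 - mask)
with tf.Session() as sess:
    print(sess.run(tf.nn.softmax(masked)))  # last probability is ~0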
def compute_loss(self, pred_dict, gt_dict, config, is_eval, is_nn, P_in=None):
    '''
    Input:
        pred_dict should contain:
            - W: BxNxK, segmentation parts. Allow zero rows to indicate unassigned points.
            - nocs_per_point: BxNx3, nocs per point
            - confi_per_point: type per points - This should be logit of shape BxNxT
              if is_eval=False, and actual value of shape BxN otherwise - can contain -1
            - parameters - a dict, each entry is a BxKx... tensor
        gt_dict should be obtained from calling create_gt_dict
        P_in - BxNx3 is the input point cloud, used only when is_eval=True

    Returns:
        {loss_dict, matching_indices} + stats from calculate_eval_stats(), where
        - loss_dict contains:
            - nocs_loss: B, averaged over all N points
            - type_loss: B, averaged over all N points.
              This is cross entropy loss during training, and accuracy during test time
            - miou_loss: BxK, mean IoU loss for each matched parts
            - residue_loss: BxK, residue loss for each part
            - parameter_loss: BxK, parameter loss for each part
            - avg_miou_loss: B
            - avg_residue_loss: B
            - avg_parameter_loss: B
        - matching_indices: BxK, where (b,k)th ground truth primitive is matched
          with (b, matching_indices[b, k])
    '''
    # dimension tensors
    W = pred_dict['W']  # B*N*K (K parts)
    batch_size = tf.shape(W)[0]
    n_points = tf.shape(W)[1]
    n_max_parts = W.get_shape()[2]  # n_max_parts should not be dynamic, fixed number of parts
    # n_registered_primitives = fitter_factory.get_n_registered_primitives()

    if is_eval and is_nn:
        # at loss, want W to be binary and filtered (if is from nn)
        W = nn_filter_W(W)

    # note that I_gt can contain -1, indicating part of unknown primitive type
    I_gt = gt_dict['cls_per_point']  # BxN
    # only count known primitive type parts, as -1 will be ignored
    n_parts_gt = tf.reduce_max(I_gt, axis=1) + 1
    # BxK, mask_gt[b, k] = 1 iff instance k is present in the ground truth batch b
    mask_gt = tf.sequence_mask(n_parts_gt, maxlen=n_max_parts)

    matching_indices = tf.stop_gradient(
        tf.py_func(hungarian_matching, [W, I_gt], Tout=tf.int32))  # BxK into K parts
    # miou_loss = loss.compute_miou_loss(W, I_gt, matching_indices)
    # losses all have dimension BxK, here is for segmentation
    miou_loss = loss.compute_miou_loss(W, I_gt)

    nocs_loss = loss.compute_nocs_loss(pred_dict['nocs_per_point'],
                                       gt_dict['nocs_per_point'],
                                       pred_dict['confi_per_point'],
                                       num_parts=n_max_parts,
                                       mask_array=gt_dict['mask_array_per_point'],
                                       TYPE_L=config.get_nocs_loss(),
                                       MULTI_HEAD=True, SELF_SU=False)  # todo

    if self.is_mixed:
        gocs_loss = loss.compute_nocs_loss(pred_dict['gocs_per_point'],
                                           gt_dict['gocs_per_point'],
                                           pred_dict['confi_per_point'],
                                           num_parts=n_max_parts,
                                           mask_array=gt_dict['mask_array_per_point'],
                                           TYPE_L=config.get_nocs_loss(),
                                           MULTI_HEAD=True, SELF_SU=False)  # todo

    heatmap_loss = loss.compute_vect_loss(pred_dict['heatmap_per_point'],
                                          gt_dict['heatmap_per_point'],
                                          confidence=gt_dict['joint_cls_mask'],
                                          TYPE_L=config.get_nocs_loss())
    unitvec_loss = loss.compute_vect_loss(pred_dict['unitvec_per_point'],
                                          gt_dict['unitvec_per_point'],
                                          confidence=gt_dict['joint_cls_mask'],
                                          TYPE_L=config.get_nocs_loss())
    orient_loss = loss.compute_vect_loss(pred_dict['joint_axis_per_point'],
                                         gt_dict['orient_per_point'],
                                         confidence=gt_dict['joint_cls_mask'],
                                         TYPE_L=config.get_nocs_loss())

    J_gt = gt_dict['index_per_point']  # BxN
    inds_pred = pred_dict['index_per_point']
    # losses all have dimension BxK, here is for segmentation
    miou_joint_loss = loss.compute_miou_loss(inds_pred, J_gt)

    # here we need to add input GT masks for different array
    loss_dict = {
        'nocs_loss': nocs_loss,
        'miou_loss': miou_loss,
        'heatmap_loss': heatmap_loss,
        'unitvec_loss': unitvec_loss,
        'orient_loss': orient_loss,
        'index_loss': miou_joint_loss,
    }
    if self.is_mixed:
        loss_dict['gocs_loss'] = gocs_loss

    result = {'loss_dict': loss_dict, 'matching_indices': matching_indices}
    """
    if is_eval:
        result.update(
            calculate_eval_stats(
                W=W,
                matching_indices=matching_indices,
                mask_gt=mask_gt,
                P_in=P_in,
                confi_per_point=pred_dict['confi_per_point'],
            )
        )
    """
    return result
def _compute_logits(self, mfcc, mfcc_lens, training):
    logits = self.model(mfcc, mask=tf.sequence_mask(mfcc_lens), training=training)
    return tf.transpose(logits, [1, 0, 2])
def build_tagging_graph(self, inputs, hidden_layers, channels, num_tags, use_crf, lamd,
                        dropout_emb, dropout_hidden, kernel_size, use_bn, use_wn, active_type):
    """
    Build a deep neural model for sequence tagging.
    """
    stag_ids = tf.placeholder(dtype=INT_TYPE, shape=[None, None], name='stag_ids')
    seq_lengths = tf.placeholder(dtype=INT_TYPE, shape=[None], name='seq_lengths')

    # Default is not train.
    is_train = tf.placeholder(dtype=tf.bool, shape=[], name='is_train')

    masks = tf.cast(tf.sequence_mask(seq_lengths), FLOAT_TYPE)

    # Dropout on embedding output.
    if dropout_emb:
        inputs = tf.cond(is_train,
                         lambda: tf.nn.dropout(inputs, 1 - dropout_emb),
                         lambda: inputs)

    hidden_output = inputs
    pre_channels = inputs.get_shape()[-1].value
    for i in xrange(hidden_layers):
        k = kernel_size
        cur_channels = channels[i]
        filter_w = tf.get_variable('filter_w_%d' % i, shape=[k, pre_channels, cur_channels],
                                   dtype=FLOAT_TYPE)
        filter_v = tf.get_variable('filter_v_%d' % i, shape=[k, pre_channels, cur_channels],
                                   dtype=FLOAT_TYPE)
        bias_b = tf.get_variable('bias_b_%d' % i, shape=[cur_channels],
                                 initializer=tf.zeros_initializer(dtype=FLOAT_TYPE))
        bias_c = tf.get_variable('bias_c_%d' % i, shape=[cur_channels],
                                 initializer=tf.zeros_initializer(dtype=FLOAT_TYPE))

        # Weight normalization.
        if use_wn:
            epsilon = 1e-12
            g_w = tf.get_variable('g_w_%d' % i, shape=[k, 1, cur_channels], dtype=FLOAT_TYPE)
            g_v = tf.get_variable('g_v_%d' % i, shape=[k, 1, cur_channels], dtype=FLOAT_TYPE)
            # Perform wn
            filter_w = g_w * filter_w / (tf.sqrt(
                tf.reduce_sum(filter_w ** 2, 1, keep_dims=True)) + epsilon)
            filter_v = g_v * filter_v / (tf.sqrt(
                tf.reduce_sum(filter_v ** 2, 1, keep_dims=True)) + epsilon)

        w = tf.nn.conv1d(hidden_output, filter_w, 1, 'SAME') + bias_b
        v = tf.nn.conv1d(hidden_output, filter_v, 1, 'SAME') + bias_c

        if use_bn:
            # Normalize each branch on its own output (the original code passed
            # the branches to each other, which was almost certainly a slip).
            w = layers.batch_norm(inputs=w, decay=0.9, is_training=is_train,
                                  center=True, scale=True, scope='BatchNorm_w_%d' % i)
            v = layers.batch_norm(inputs=v, decay=0.9, is_training=is_train,
                                  center=True, scale=True, scope='BatchNorm_v_%d' % i)

        if active_type == 'glu':
            hidden_output = w * tf.nn.sigmoid(v)
        elif active_type == 'relu':
            hidden_output = tf.nn.relu(w)
        elif active_type == 'gtu':
            hidden_output = tf.tanh(w) * tf.nn.sigmoid(v)
        elif active_type == 'tanh':
            hidden_output = tf.tanh(w)
        elif active_type == 'linear':
            hidden_output = w
        elif active_type == 'bilinear':
            hidden_output = w * v

        # Mask paddings.
        hidden_output = hidden_output * tf.expand_dims(masks, -1)

        # Dropout on hidden output.
        if dropout_hidden:
            hidden_output = tf.cond(is_train,
                                    lambda: tf.nn.dropout(hidden_output, 1 - dropout_hidden),
                                    lambda: hidden_output)

        pre_channels = cur_channels

    # Un-scaled log probabilities.
    scores = layers.fully_connected(hidden_output, num_tags, tf.identity)

    if use_crf:
        cost, transitions = crf.crf_log_likelihood(inputs=scores,
                                                   tag_indices=stag_ids,
                                                   sequence_lengths=seq_lengths)
        cost = -tf.reduce_mean(cost)
    else:
        reshaped_scores = tf.reshape(scores, [-1, num_tags])
        reshaped_stag_ids = tf.reshape(stag_ids, [-1])
        real_distribution = layers.one_hot_encoding(reshaped_stag_ids, num_tags)
        cost = tf.nn.softmax_cross_entropy_with_logits(logits=reshaped_scores,
                                                       labels=real_distribution)
        cost = tf.reduce_sum(tf.reshape(cost, tf.shape(stag_ids)) * masks) / \
            tf.cast(tf.shape(inputs)[0], FLOAT_TYPE)

    # Calculate L2 penalty.
    l2_penalty = 0
    if lamd > 0:
        for v in tf.trainable_variables():
            if '/B:' not in v.name and '/biases:' not in v.name:
                l2_penalty += lamd * tf.nn.l2_loss(v)
    train_cost = cost + l2_penalty

    # Summary cost.
    tf.summary.scalar('cost', cost)
    summaries = tf.summary.merge_all()

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    if update_ops:
        updates = tf.group(*update_ops)
        with tf.control_dependencies([updates]):
            cost = tf.identity(cost)

    return stag_ids, seq_lengths, is_train, cost, train_cost, scores, summaries
import tensorflow as tf

a = tf.sequence_mask([1, 2, 3], 5)        # a 1-D lengths tensor yields a 2-D mask
b = tf.sequence_mask([[1, 2], [3, 4]])    # a 2-D lengths tensor yields a 3-D mask
a = tf.cast(a, tf.float32)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(a))
    print(sess.run(b))
"""
[[1. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0.]
 [1. 1. 1. 0. 0.]]
Explanation: maxlen is 5, so there are 5 columns; lengths has three elements
[1, 2, 3], so there are 3 rows, whose first 1, 2, and 3 elements respectively
are True. After the cast, True/False become 1.0/0.0.

[[[ True False False False]
  [ True  True False False]]

 [[ True  True  True False]
  [ True  True  True  True]]]
Explanation: since maxlen is not given, it defaults to the maximum value in
lengths, which is 4, so there are 4 columns. lengths is a 2-D array that can be
viewed as two 1-D lengths vectors, so the output is the stack of the two 2-D
masks those vectors would produce.
"""
def __init__(self, batch, config, is_train=True, image_features=None):
    self.batch = batch
    self.config = config
    self.image_dir = config.image_dir
    self.is_train = is_train

    # word_weight_dir is only for answer accuracy visualization
    self.word_weight_dir = getattr(config, 'vlmap_word_weight_dir', None)
    if self.word_weight_dir is None:
        log.warn('word_weight_dir is None')

    self.losses = {}
    self.report = {}
    self.mid_result = {}
    self.output = {}
    self.heavy_output = {}
    self.vis_image = {}

    self.vocab = cPickle.load(open(config.vocab_path, 'rb'))
    self.answer_dict = cPickle.load(
        open(os.path.join(config.tf_record_dir, 'answer_dict.pkl'), 'rb'))
    self.num_answer = len(self.answer_dict['vocab'])
    self.num_train_answer = self.answer_dict['num_train_answer']
    self.train_answer_mask = tf.expand_dims(tf.sequence_mask(
        self.num_train_answer, maxlen=self.num_answer, dtype=tf.float32), axis=0)
    self.test_answer_mask = 1.0 - self.train_answer_mask
    self.obj_answer_mask = tf.expand_dims(tf.constant(
        self.answer_dict['is_object'], dtype=tf.float32), axis=0)
    self.attr_answer_mask = tf.expand_dims(tf.constant(
        self.answer_dict['is_attribute'], dtype=tf.float32), axis=0)

    self.glove_map = modules.LearnGloVe(self.vocab)
    self.answer_exist_mask = modules.AnswerExistMask(
        self.answer_dict, self.word_weight_dir)

    if self.config.debug:
        self.features, self.spatials, self.normal_boxes, self.num_boxes, \
            self.max_box_num, self.vfeat_dim = get_dummy_data()
    elif image_features is None:
        log.infov('loading image features...')
        with h5py.File(config.vfeat_path, 'r') as f:
            self.features = np.array(f.get('image_features'))
            log.infov('feature done')
            self.spatials = np.array(f.get('spatial_features'))
            log.infov('spatials done')
            self.normal_boxes = np.array(f.get('normal_boxes'))
            log.infov('normal_boxes done')
            self.num_boxes = np.array(f.get('num_boxes'))
            log.infov('num_boxes done')
            self.max_box_num = int(f['data_info']['max_box_num'].value)
            self.vfeat_dim = int(f['data_info']['vfeat_dim'].value)
        log.infov('done')
    else:
        self.features = image_features['features']
        self.spatials = image_features['spatials']
        self.normal_boxes = image_features['normal_boxes']
        self.num_boxes = image_features['num_boxes']
        self.max_box_num = image_features['max_box_num']
        self.vfeat_dim = image_features['vfeat_dim']

    self.build()
def _build_task_termination(self): """ Build task-specific nodes, losses, and optimizers. """ logger = logging.getLogger("%s.Network._build_task_termination" % self.config.name) logger.debug("Building task termination") for task in self.config.tasks: input_layer = self._shared_layers_output[task.name] logger.debug( "Building task termination for task %s on top of shared layers", task.name) logger.debug("Building %d hidden layers", len(task.hidden_layers)) for idx, hidden_layer in enumerate(task.hidden_layers): assert isinstance(hidden_layer, HiddenLayerConfig) logger.debug( "Building %d. hidden layer with %d units and activation %s", idx + 1, hidden_layer.units, hidden_layer.activation) input_layer = tf.compat.v1.layers.dense( input_layer, hidden_layer.units, activation=ACTIVATION_MAPPING[hidden_layer.activation], name="hidden_layer-%s-%d" % (task.name, idx + 1)) input_layer = tf.nn.dropout(input_layer, 1 - (task.dropout_keep_probability)) # Projection for prediction num_classes = len(task.data_reader.get_labels()) logger.debug( "Build projection layer to map network output to classes. There are %d classes", num_classes) self._projections[task.name] = tf.compat.v1.layers.dense( input_layer, num_classes, name="projection_layer-%s" % task.name) # Loss and prediction logger.debug("Attaching classifier") if task.classifier == CLASSIFIER_CRF: # CRF logger.debug("CRF classifier") # Prediction is performed via Viterbi decoding -> no prediction layer necessary self._predictions[task.name] = None with tf.compat.v1.variable_scope("crf_log_likelihood_%s" % task.name): log_likelihood, self._transition_params[ task.name] = tfa.text.crf_log_likelihood( self._projections[task.name], self._inputs_label[task.name], self._input_sequence_length) self._losses[task.name] = tf.reduce_mean( input_tensor=-log_likelihood) else: # Softmax logger.debug("Softmax classifier") self._predictions[task.name] = tf.cast( tf.argmax(input=self._projections[task.name], axis=-1), tf.int32) # Transition params are not required for softmax self._transition_params[task.name] = None labels = tf.one_hot(self._inputs_label[task.name], len(task.data_reader.get_labels())) # NOTE: this is for testing soft-label capability only (should be disabled!) 
# labels = tf.multiply(labels, 10.0) # Multiply with 10 so that true label has a higher weight # labels = tf.add(labels, 1.0) # Add one so that multiplication with random values has effect # noise = tf.random_uniform( # tf.shape(labels) # ) # labels = tf.multiply(labels, noise) # Element-wise multiplication with noise # labels = tf.nn.softmax(labels) # Perform softmax to restore the valid probability distribution losses = tf.nn.softmax_cross_entropy_with_logits( logits=self._projections[task.name], labels=tf.stop_gradient(labels), name="softmax_%s" % task.name) # Add Mask for padded sentences mask = tf.sequence_mask(self._input_sequence_length, name="softmax_mask_%s" % task.name) losses = tf.boolean_mask(tensor=losses, mask=mask, name="softmax_mask_layer_%s" % task.name) self._losses[task.name] = tf.reduce_mean(input_tensor=losses) # Optimizer logger.debug("Attaching optimizer") optimizer_function = OPTIMIZER_MAPPING[ self.config.training.optimizer] optimizer = optimizer_function( **self.config.training.optimizer_params) gradients, variables = list( zip(*optimizer.compute_gradients(self._losses[task.name]))) if self.config.training.use_gradient_clipping: logger.debug( "Adding node for performing gradient clipping for task %s.", task.name) gradients, self._gradient_norms[ task.name] = tf.clip_by_global_norm( gradients, self.config.training.clip_norm) else: self._gradient_norms[task.name] = tf.linalg.global_norm( gradients) self._operations_train[task.name] = optimizer.apply_gradients( list(zip(gradients, variables)))
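# A self-contained sketch (TF 1.x assumed; shapes and lengths are toy values)
# of the padding-aware softmax loss used above: tf.sequence_mask marks the real
# time steps and tf.boolean_mask drops the per-token losses at padded positions
# before the mean, so padding does not dilute the loss.
import tensorflow as tf

logits = tf.random_normal([2, 4, 3])                     # [batch, time, classes]
labels = tf.one_hot([[0, 1, 2, 0], [2, 1, 0, 0]], depth=3)
lengths = tf.constant([4, 2])                            # 2nd sequence: 2 real steps

losses = tf.nn.softmax_cross_entropy_with_logits(
    logits=logits, labels=tf.stop_gradient(labels))      # [batch, time]
mask = tf.sequence_mask(lengths)                         # [batch, time], bool
losses = tf.boolean_mask(losses, mask)                   # keeps 4 + 2 = 6 entries
loss = tf.reduce_mean(losses)

with tf.Session() as sess:
    print(sess.run(loss))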
def __init__(self, word_vocab_enc, word_vocab_dec, options=None, mode='ce_train'):
    # here 'mode', whose value can be:
    #  'ce_train', 'rl_train', 'evaluate', 'evaluate_bleu', 'decode'.
    # it is different from 'mode_gen' in generator_utils.py
    # value of 'mode_gen' can be ['ce_loss', 'rl_loss', 'greedy' or 'sample']
    self.mode = mode

    # is_training controls whether to use dropout
    is_training = True if mode in ('ce_train', ) else False

    self.options = options
    self.word_vocab_enc = word_vocab_enc
    self.word_vocab_dec = word_vocab_dec

    self.create_placeholders(options)

    # encode the input instance
    # encoder.graph_hidden [batch, node_num, vsize]
    # encoder.graph_cell [batch, node_num, vsize]
    with tf.variable_scope('linamr_encoder'):
        self.linamr_encoder = encoder_utils.SeqEncoder(options, word_vocab=word_vocab_enc)
        self.linamr_hidden_dim, self.linamr_hiddens, self.linamr_decinit = \
            self.linamr_encoder.encode(is_training=is_training)
        self.linamr_words = self.linamr_encoder.in_passage_words
        self.linamr_lengths = self.linamr_encoder.passage_lengths
        self.linamr_mask = self.linamr_encoder.passage_mask

    with tf.variable_scope('src_encoder'):
        self.src_encoder = encoder_utils.SeqEncoder(options, word_vocab=word_vocab_enc)
        self.src_hidden_dim, self.src_hiddens, self.src_decinit = \
            self.src_encoder.encode(is_training=is_training)
        self.src_words = self.src_encoder.in_passage_words
        self.src_lengths = self.src_encoder.passage_lengths
        self.src_mask = self.src_encoder.passage_mask

    # ============== Choices of initializing decoder state =============
    if options.way_init_decoder == 'src':
        new_c, new_h = self.src_decinit.c, self.src_decinit.h
    elif options.way_init_decoder == 'linamr':
        new_c, new_h = self.linamr_decinit.c, self.linamr_decinit.h
    elif options.way_init_decoder == 'zero':
        new_c = tf.zeros([self.src_encoder.batch_size, options.gen_hidden_size])
        new_h = tf.zeros([self.src_encoder.batch_size, options.gen_hidden_size])
    else:
        assert False, 'way to initialize decoder (%s) not supported' % options.way_init_decoder
    self.init_decoder_state = tf.contrib.rnn.LSTMStateTuple(new_c, new_h)

    # prepare src-side input for decoder
    loss_weights = tf.sequence_mask(self.answer_len, options.max_answer_len,
                                    dtype=tf.float32)  # [batch_size, gen_steps]

    with variable_scope.variable_scope("generator"):
        # create generator
        self.generator = generator_utils.CovAttenGen(self, options, word_vocab_dec,
                                                     is_training=is_training)
        # calculate encoder_features
        with variable_scope.variable_scope("encoder_feats"):
            self.linamr_features = self.generator.calculate_encoder_features(
                self.linamr_hiddens, self.linamr_hidden_dim)
        with variable_scope.variable_scope("src_feats"):
            self.src_features = self.generator.calculate_encoder_features(
                self.src_hiddens, self.src_hidden_dim)

        if mode == 'decode':
            self.context_encoder_t_1 = tf.placeholder(
                tf.float32, [None, self.linamr_hidden_dim],
                name='context_encoder_t_1')  # [batch_size, encoder_dim]
            self.context_src_t_1 = tf.placeholder(
                tf.float32, [None, self.src_hidden_dim],
                name='context_src_t_1')  # [batch_size, src_dim]
            if options.use_coverage:
                self.coverage_t_1 = tf.placeholder(
                    tf.float32, [None, None],
                    name='coverage_t_1')  # [batch_size, encoder_dim]
            else:
                self.coverage_t_1 = None
            self.word_t = tf.placeholder(tf.int32, [None], name='word_t')  # [batch_size]
            (self.state_t, self.context_encoder_t, self.context_src_t, self.coverage_t,
             self.attn_dist_t, self.output_t, self.topk_log_probs, self.topk_ids,
             self.greedy_prediction, self.multinomial_prediction) = \
                self.generator.decode_mode(
                    word_vocab_dec, options.beam_size, self.init_decoder_state,
                    self.context_encoder_t_1, self.context_src_t_1, self.coverage_t_1,
                    self.word_t, self.linamr_hiddens, self.linamr_features,
                    self.linamr_mask, self.src_hiddens, self.src_features, self.src_mask)
            # not building training op for this mode
            return
        elif mode == 'evaluate_bleu':
            _, _, self.greedy_words = self.generator.train_mode(
                word_vocab_dec, self.linamr_hidden_dim, self.linamr_hiddens,
                self.linamr_features, self.linamr_mask, self.src_hidden_dim,
                self.src_hiddens, self.src_features, self.src_mask,
                self.init_decoder_state, self.answer_inp, self.answer_ref,
                loss_weights, mode_gen='greedy')
            # not building training op for this mode
            return
        elif mode in ('ce_train', 'evaluate', ):
            self.accu, self.loss, _ = self.generator.train_mode(
                word_vocab_dec, self.linamr_hidden_dim, self.linamr_hiddens,
                self.linamr_features, self.linamr_mask, self.src_hidden_dim,
                self.src_hiddens, self.src_features, self.src_mask,
                self.init_decoder_state, self.answer_inp, self.answer_ref,
                loss_weights, mode_gen='ce_loss')
            if mode == 'evaluate':
                return  # not building training op for evaluation

    with tf.device('/gpu:1'):
        if options.optimize_type == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(learning_rate=options.learning_rate)
        elif options.optimize_type == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate=options.learning_rate)
        clipper = 50 if "max_gradient_norm" not in options.__dict__ else options.max_gradient_norm
        print("MAX gradient norm {}".format(clipper))
        tvars = tf.trainable_variables()
        if options.lambda_l2 > 0.0:
            l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tvars if v.get_shape().ndims > 1])
            self.loss = self.loss + options.lambda_l2 * l2_loss
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), clipper)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        extra_train_ops = []
        train_ops = [self.train_op] + extra_train_ops
        self.train_op = tf.group(*train_ops)
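# A toy sketch (TF 1.x assumed; sizes and the stand-in per-step loss are made
# up) of the loss_weights construction above: tf.sequence_mask over the answer
# lengths gives [batch, max_answer_len] 0/1 weights so cross-entropy at padded
# decoder steps contributes nothing, and normalizing by the weight sum averages
# over real steps only.
import tensorflow as tf

max_answer_len = 5
answer_len = tf.constant([3, 5])                      # true decoder lengths
step_ce = tf.random_normal([2, max_answer_len]) ** 2  # stand-in per-step CE

loss_weights = tf.sequence_mask(answer_len, max_answer_len, dtype=tf.float32)
masked_ce = step_ce * loss_weights
# normalize by the number of real steps, not by max_answer_len
loss = tf.reduce_sum(masked_ce) / tf.reduce_sum(loss_weights)

with tf.Session() as sess:
    print(sess.run(loss))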
def add_decoder_op(self):
    # reshape inputs to a list of words
    input_mask = tf.sequence_mask(self.sequence_lengths)
    encoder_output_h, encoder_output_c = self.encoder_output
    decoder_input_h = tf.boolean_mask(encoder_output_h, input_mask)
    decoder_input_c = tf.boolean_mask(encoder_output_c, input_mask)
    initial_state = tf.contrib.rnn.LSTMStateTuple(h=decoder_input_h, c=decoder_input_c)
    batch_size = tf.shape(decoder_input_h)[0]
    projection_layer = tf.layers.Dense(self.config.ntags, use_bias=True,
                                       name="decoder_proj")
    decoder_cell = tf.contrib.rnn.LSTMCell(
        num_units=2 * self.config.hidden_size_lstm
    )  # num_units = encoder backward and forward hidden states concatenated

    if (self.config.analysis_embeddings == "attention_tag"
            or self.config.analysis_embeddings == "attention_category"):
        self.logger.warning("Using attention %s" % self.config.analysis_embeddings)
        # shape: [words X analysis-number X attention-embedding-size]
        analysis_attention_embeddings = tf.boolean_mask(
            self.analysis_attention_embeddings, input_mask)
        analysis_lengths = tf.boolean_mask(self.analysis_lengths, input_mask)  # shape: [words]
        if self.config.attention_mechanism == 'luong':
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(
                num_units=2 * self.config.hidden_size_lstm,
                memory=analysis_attention_embeddings,
                memory_sequence_length=analysis_lengths,
                scale=False)
        elif self.config.attention_mechanism == 'bahdanau':
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                num_units=2 * self.config.hidden_size_lstm,
                memory=analysis_attention_embeddings,
                memory_sequence_length=analysis_lengths)
        else:
            raise ValueError("Invalid attention mechanism '%s'"
                             % self.config.attention_mechanism)
        decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
            decoder_cell,
            attention_mechanism,
            attention_layer_size=2 * self.config.hidden_size_lstm)
        initial_state = decoder_cell.zero_state(
            dtype=tf.float32, batch_size=batch_size).clone(cell_state=initial_state)

    start_tokens = tf.tile([self.sos_id], [batch_size])
    # shift tags one step to the left and prepend 'sos' token.
    tag_ids_train = tf.concat(
        [tf.expand_dims(start_tokens, 1), self.tag_ids[:, :-1]], 1)
    tags_train_embedded = tf.nn.embedding_lookup(self.tag_embeddings, tag_ids_train)
    tags_train_embedded = tf.layers.dropout(
        tags_train_embedded,
        rate=1 - self.config.tag_embeddings_dropout,
        training=self.training_phase)

    # Training
    if self.config.trainer == "basic":
        train_helper = tf.contrib.seq2seq.TrainingHelper(
            inputs=tags_train_embedded,
            sequence_length=self.tag_lengths,  # `tag-length` covers <sos-token, actual tags, eos-token>
        )
    elif self.config.trainer == "scheduled":
        train_helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
            inputs=tags_train_embedded,
            sequence_length=self.tag_lengths,  # `tag-length` covers <sos-token, actual tags, eos-token>
            embedding=lambda ids: tf.nn.embedding_lookup(self.tag_embeddings, ids),
            sampling_probability=self.config.scheduled_trainer_sampling_prob)
    else:
        raise ValueError("Invalid trainer specified: '%s'" % self.config.trainer)
    train_decoder = tf.contrib.seq2seq.BasicDecoder(
        decoder_cell,
        train_helper,
        initial_state=initial_state,
        output_layer=projection_layer)
    decoder_outputs, final_state, decoder_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
        train_decoder, impute_finished=False)
    # logits = decoder_outputs.rnn_output
    logits = decoder_outputs[0]
    logits = tf.verify_tensor_all_finite(logits, "Logits not finite")
    # from padded training tags extracts actual-tags + eos-token:
    weights = tf.to_float(tf.not_equal(tag_ids_train, self.eos_id))
    weights = tf.to_float(tf.not_equal(weights, self.pad_id))
    loss = tf.contrib.seq2seq.sequence_loss(logits=logits,
                                            targets=self.tag_ids,
                                            weights=weights,
                                            name="sequence_loss",
                                            average_across_timesteps=False)
    self.loss = tf.reduce_sum(loss)

    # Scoring
    # 1. Score given labels
    scoring_helper = tf.contrib.seq2seq.TrainingHelper(
        inputs=tags_train_embedded, sequence_length=self.tag_lengths)
    scoring_decoder = tf.contrib.seq2seq.BasicDecoder(
        decoder_cell,
        scoring_helper,
        initial_state=initial_state,
        output_layer=projection_layer)
    scoring_outputs, _, scoring_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
        scoring_decoder)
    scoring_logits = scoring_outputs.rnn_output
    scoring_logits = tf.verify_tensor_all_finite(
        scoring_logits, "Scoring logits not finite")
    logits_flat = tf.reshape(scoring_logits, [-1, tf.shape(scoring_logits)[2]])
    softmax_scores_flat = tf.nn.softmax(logits_flat, dim=-1)
    tag_ids_train_flat = tf.reshape(self.tag_ids, [-1])
    indices = tf.concat([
        tf.expand_dims(tf.range(0, tf.shape(tag_ids_train_flat)[0]), 1),
        tf.expand_dims(tag_ids_train_flat, 1)
    ], axis=1)
    tag_softmax_scores_flat = tf.gather_nd(softmax_scores_flat, indices)
    tag_softmax_scores = tf.reshape(tag_softmax_scores_flat, [batch_size, -1])
    tag_mask = tf.sequence_mask(self.tag_lengths, tf.shape(tag_softmax_scores)[1])
    tag_softmax_scores = tf.multiply(tag_softmax_scores,
                                     tf.cast(tag_mask, tf.float32))
    tag_softmax_scores += tf.cast(tf.logical_not(tag_mask), tf.float32)
    scores = np.e ** -tf.div(
        tf.reduce_sum(tf.log(tag_softmax_scores), axis=-1),
        tf.cast(self.tag_lengths, tf.float32))
    self.labels_scores = scores

    # 2. Score best labels
    max_tag_softmax_scores = tf.reduce_max(tf.nn.softmax(scoring_logits, dim=-1),
                                           axis=-1)
    max_tag_mask = tf.sequence_mask(self.tag_lengths,
                                    tf.shape(max_tag_softmax_scores)[1])
    max_tag_softmax_scores = tf.multiply(max_tag_softmax_scores,
                                         tf.cast(max_tag_mask, tf.float32))
    max_tag_softmax_scores += tf.cast(tf.logical_not(max_tag_mask), tf.float32)
    max_scores = np.e ** -tf.div(
        tf.reduce_sum(tf.log(max_tag_softmax_scores), axis=-1),
        tf.cast(self.tag_lengths, tf.float32))
    self.labels_max_scores = max_scores
    self.labels_max_ids = tf.argmax(scoring_logits, axis=-1)

    # Inference
    infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embedding=self.tag_embeddings,
        start_tokens=start_tokens,
        end_token=self.eos_id)
    infer_decoder = tf.contrib.seq2seq.BasicDecoder(
        decoder_cell,
        infer_helper,
        initial_state=initial_state,
        output_layer=projection_layer)
    final_outputs, final_state, final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
        infer_decoder,
        maximum_iterations=self.config.decoder_maximum_iterations,
        impute_finished=True)
    decoder_logits = final_outputs.rnn_output
    decoder_logits = tf.verify_tensor_all_finite(
        decoder_logits, "Decoder Logits not finite")
    with tf.control_dependencies([
            tf.assert_rank(decoder_logits, 3),
            tf.assert_none_equal(tf.reduce_sum(decoder_logits), 0.),
            tf.assert_equal(
                tf.cast(tf.argmax(decoder_logits, axis=-1), tf.int32),
                final_outputs.sample_id)
    ]):
        decoder_logits = tf.identity(decoder_logits)
    self.decoder_logits = decoder_logits
    self.labels_pred = final_outputs.sample_id
    self.labels_pred_lengths = final_sequence_lengths
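# A toy sketch (TF 1.x assumed; probabilities and lengths are made up) of the
# masked per-sequence scoring trick above: padded positions are forced to
# probability 1.0 (mask out, then add the mask's complement), so log(1) = 0
# drops them from the sum; dividing by the true length and exponentiating
# gives a length-normalized, perplexity-style score per sequence.
import numpy as np
import tensorflow as tf

probs = tf.constant([[0.9, 0.8, 0.5, 0.3],    # last 2 steps are padding
                     [0.7, 0.6, 0.5, 0.4]])
lengths = tf.constant([2, 4])

mask = tf.sequence_mask(lengths, tf.shape(probs)[1])
probs = probs * tf.cast(mask, tf.float32)
probs += tf.cast(tf.logical_not(mask), tf.float32)   # pads become exactly 1.0

scores = np.e ** -tf.div(tf.reduce_sum(tf.log(probs), axis=-1),
                         tf.cast(lengths, tf.float32))

with tf.Session() as sess:
    print(sess.run(scores))   # one score per sequence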
lr, target_sequence_length, max_target_sequence_length, source_sequence_length, len(da.source_letter_to_int), len(da.target_letter_to_int), encoding_embedding_size, decoding_embedding_size, rnn_size, num_layers, batch_size) training_logits = tf.identity(training_decoder_output.rnn_output, 'logits') predicting_logits = tf.identity(predicting_decoder_output.sample_id, name='predictions') masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks') with tf.name_scope("optimization"): # Loss function cost = tf.contrib.seq2seq.sequence_loss( training_logits, targets, masks) # Optimizer optimizer = tf.train.AdamOptimizer(lr) # Gradient Clipping gradients = optimizer.compute_gradients(cost) capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
def build_training_graph(self, input_tensors): target_index = input_tensors[reader.TARGET_INDEX_KEY] target_lengths = input_tensors[reader.TARGET_LENGTH_KEY] path_source_indices = input_tensors[reader.PATH_SOURCE_INDICES_KEY] node_indices = input_tensors[reader.NODE_INDICES_KEY] path_target_indices = input_tensors[reader.PATH_TARGET_INDICES_KEY] valid_context_mask = input_tensors[reader.VALID_CONTEXT_MASK_KEY] path_source_lengths = input_tensors[reader.PATH_SOURCE_LENGTHS_KEY] path_lengths = input_tensors[reader.PATH_LENGTHS_KEY] path_target_lengths = input_tensors[reader.PATH_TARGET_LENGTHS_KEY] with tf.variable_scope('model'): subtoken_vocab = tf.get_variable('SUBTOKENS_VOCAB', shape=(self.subtoken_vocab_size, self.config.EMBEDDINGS_SIZE), dtype=tf.float32, initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_OUT', uniform=True)) target_words_vocab = tf.get_variable('TARGET_WORDS_VOCAB', shape=(self.target_vocab_size, self.config.EMBEDDINGS_SIZE), dtype=tf.float32, initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_OUT', uniform=True)) nodes_vocab = tf.get_variable('NODES_VOCAB', shape=(self.nodes_vocab_size, self.config.EMBEDDINGS_SIZE), dtype=tf.float32, initializer=tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_OUT', uniform=True)) # (batch, max_contexts, decoder_size) batched_contexts = self.compute_contexts(subtoken_vocab=subtoken_vocab, nodes_vocab=nodes_vocab, source_input=path_source_indices, nodes_input=node_indices, target_input=path_target_indices, valid_mask=valid_context_mask, path_source_lengths=path_source_lengths, path_lengths=path_lengths, path_target_lengths=path_target_lengths) batch_size = tf.shape(target_index)[0] outputs, final_states = self.decode_outputs(target_words_vocab=target_words_vocab, target_input=target_index, batch_size=batch_size, batched_contexts=batched_contexts, valid_mask=valid_context_mask) step = tf.Variable(0, trainable=False) logits = outputs.rnn_output # (batch, max_output_length, dim * 2 + rnn_size) crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target_index, logits=logits) target_words_nonzero = tf.sequence_mask(target_lengths + 1, maxlen=self.config.MAX_TARGET_PARTS + 1, dtype=tf.float32) loss = tf.reduce_sum(crossent * target_words_nonzero) / tf.to_float(batch_size) if self.config.USE_MOMENTUM: learning_rate = tf.train.exponential_decay(0.01, step * self.config.BATCH_SIZE, self.num_training_examples, 0.95, staircase=True) optimizer = tf.train.MomentumOptimizer(learning_rate, 0.95, use_nesterov=True) train_op = optimizer.minimize(loss, global_step=step) else: params = tf.trainable_variables() gradients = tf.gradients(loss, params) clipped_gradients, _ = tf.clip_by_global_norm(gradients, clip_norm=5) optimizer = tf.train.AdamOptimizer() train_op = optimizer.apply_gradients(zip(clipped_gradients, params)) self.saver = tf.train.Saver(max_to_keep=10) return train_op, loss
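# A minimal sketch (TF 1.x assumed; ids, lengths, and the vocabulary size are
# toy values) of the target-mask construction above: the lengths are extended
# by 1 so the EOS position is also trained on, and the summed cross-entropy is
# normalized by batch size rather than by token count.
import tensorflow as tf

MAX_TARGET_PARTS = 4
target_index = tf.constant([[5, 3, 0, 0, 0], [2, 7, 9, 0, 0]])  # padded ids
target_lengths = tf.constant([2, 3])
logits = tf.random_normal([2, MAX_TARGET_PARTS + 1, 12])

crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=target_index, logits=logits)
target_words_nonzero = tf.sequence_mask(
    target_lengths + 1, maxlen=MAX_TARGET_PARTS + 1, dtype=tf.float32)
loss = tf.reduce_sum(crossent * target_words_nonzero) / tf.to_float(
    tf.shape(target_index)[0])

with tf.Session() as sess:
    print(sess.run(loss))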
def train_attention_modified():
    # build the graph
    train_graph = tf.Graph()
    with train_graph.as_default():
        # get the model inputs
        input_keywords_ids, input_pretexts_ids, targets, lr, target_sequence_length, \
            max_target_sequence_length, input_keywords_length, input_pretexts_length = \
            get_inputs_modified()
        training_decoder_output, predict_output = seq2seq_model_modified(
            input_keywords_ids, input_pretexts_ids, targets, lr,
            target_sequence_length, max_target_sequence_length,
            input_keywords_length, input_pretexts_length, len(word2id),
            len(word2id), encoding_embedding_size, decoding_embedding_size,
            rnn_size, num_layers)
        training_logits = tf.identity(training_decoder_output.rnn_output, 'logits')
        predicting_logits = tf.identity(predict_output.sample_id, name='predictions')
        masks = tf.sequence_mask(target_sequence_length,
                                 max_target_sequence_length,
                                 dtype=tf.float32,
                                 name='masks')
        with tf.name_scope("optimization"):
            # Loss function
            cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets, masks)
            # Optimizer
            optimizer = tf.train.AdamOptimizer(lr)
            # Gradient Clipping
            gradients = optimizer.compute_gradients(cost)
            capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var)
                                for grad, var in gradients if grad is not None]
            train_op = optimizer.apply_gradients(capped_gradients)

    # split the dataset into train and validation
    train_keywords = keywords_int[300 * batch_size:]
    train_pretexts = pretexts_int[300 * batch_size:]
    train_target = curlines_int[300 * batch_size:]
    # hold out data for validation
    valid_keywords = keywords_int[:300 * batch_size]
    valid_pretexts = pretexts_int[:300 * batch_size]
    valid_target = curlines_int[:300 * batch_size]
    (valid_targets_batch, valid_keywords_batch, valid_pretexts_batch,
     valid_targets_lengths, valid_keywords_lengths, valid_pretexts_length) = next(
        getbatches_modified(valid_target, valid_keywords, valid_pretexts,
                            batch_size, word2id['<PAD>']))

    display_step = 50  # print the loss every 50 batches
    checkpoint = "./model/trained_model_attention.ckpt"
    checkpoint_path = './model/trained_model_attention_qijue_epoch'
    with tf.Session(graph=train_graph) as sess:
        sess.run(tf.global_variables_initializer())
        for epoch_i in range(1, epochs + 1):
            for batch_i, (targets_batch, keywords_batch, pretexts_batch,
                          targets_lengths, batch_keywords_lengths,
                          batch_pretexts_lengths) in enumerate(
                    getbatches_modified(train_target, train_keywords,
                                        train_pretexts, batch_size,
                                        word2id['<PAD>'])):
                _, loss = sess.run(
                    [train_op, cost], {
                        input_keywords_ids: keywords_batch,
                        input_pretexts_ids: pretexts_batch,
                        targets: targets_batch,
                        lr: learning_rate,
                        target_sequence_length: targets_lengths,
                        input_pretexts_length: batch_pretexts_lengths,
                        input_keywords_length: batch_keywords_lengths
                    })
                if batch_i % display_step == 0:
                    # compute the validation loss
                    validation_loss = sess.run(
                        [cost], {
                            input_keywords_ids: valid_keywords_batch,
                            input_pretexts_ids: valid_pretexts_batch,
                            targets: valid_targets_batch,
                            lr: learning_rate,
                            target_sequence_length: valid_targets_lengths,
                            input_keywords_length: valid_keywords_lengths,
                            input_pretexts_length: valid_pretexts_length
                        })
                    print(
                        'Epoch {:>3}/{} Batch {:>4}/{} - Training Loss: {:>6.3f} - Validation loss: {:>6.3f}'
                        .format(epoch_i, epochs, batch_i,
                                len(train_target) // batch_size, loss,
                                validation_loss[0]))
            checkpoint = checkpoint_path + str(epoch_i) + '.ckpt'
            saver = tf.train.Saver()
            saver.save(sess, checkpoint)
        print('Model Trained and Saved')
def score(self, features_file, predictions_file, checkpoint_path=None, output_file=None):
    """Scores existing predictions.

    Args:
      features_file: The input file.
      predictions_file: The predictions file to score.
      checkpoint_path: Path of a specific checkpoint to use. If ``None``,
        the latest is used.
      output_file: The file where the scores are saved. Otherwise, they will
        be printed on the standard output.

    Raises:
      ValueError: if no checkpoint is found or if the model is not a sequence
        to sequence or language model.
    """
    if not isinstance(self._model, (models.LanguageModel, models.SequenceToSequence)):
        raise ValueError("scoring only works for sequence to sequence or language models")

    if checkpoint_path is None:
        checkpoint_path = tf.train.latest_checkpoint(self._config["model_dir"])
    elif tf.gfile.IsDirectory(checkpoint_path):
        checkpoint_path = tf.train.latest_checkpoint(checkpoint_path)
    if checkpoint_path is None:
        raise ValueError("could not find a trained model in %s" % self._config["model_dir"])

    model = copy.deepcopy(self._model)
    with tf.Graph().as_default():
        dataset = model.examples_inputter.make_evaluation_dataset(
            features_file,
            predictions_file,
            self._config["score"]["batch_size"],
            num_threads=self._config["score"].get("num_threads"),
            prefetch_buffer_size=self._config["score"].get("prefetch_buffer_size"))
        iterator = dataset.make_initializable_iterator()
        features, labels = iterator.get_next()
        labels["alignment"] = None  # Add alignment key to force the model to return attention.
        outputs, _ = model(
            features, labels, self._config["params"], tf.estimator.ModeKeys.EVAL)

        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=outputs["logits"], labels=labels["ids_out"])
        weights = tf.sequence_mask(labels["length"], dtype=cross_entropy.dtype)
        masked_cross_entropy = cross_entropy * weights
        scores = tf.reduce_sum(masked_cross_entropy, axis=1)
        results = {
            "cross_entropy": cross_entropy,
            "score": scores,
            "tokens": labels["tokens"],
            "length": labels["length"] - 1  # -1 for the special token.
        }
        if "attention" in outputs:
            results["attention"] = outputs["attention"]

        if output_file:
            stream = io.open(output_file, encoding="utf-8", mode="w")
        else:
            stream = sys.stdout

        output_tokenizer = (
            self._model.labels_inputter.tokenizer if not self._model.unsupervised
            else self._model.features_inputter.tokenizer)
        with tf.train.MonitoredSession(
                session_creator=tf.train.ChiefSessionCreator(
                    checkpoint_filename_with_path=checkpoint_path,
                    config=self._session_config)) as sess:
            sess.run(iterator.initializer)
            while not sess.should_stop():
                for batch in misc.extract_batches(sess.run(results)):
                    tokens = batch["tokens"][:batch["length"]]
                    sentence = output_tokenizer.detokenize(tokens)
                    token_level_scores = None
                    attention = None
                    if self._config["score"].get("with_token_level"):
                        token_level_scores = batch["cross_entropy"][:batch["length"]]
                    if "attention" in batch:
                        attention = batch["attention"][:batch["length"]]
                    alignment_type = self._config["score"].get("with_alignments")
                    sentence = format_translation_output(
                        sentence,
                        score=batch["score"],
                        token_level_scores=token_level_scores,
                        attention=attention,
                        alignment_type=alignment_type)
                    misc.print_bytes(tf.compat.as_bytes(sentence), stream=stream)

        if output_file:
            stream.close()
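# A toy sketch (TF 1.x assumed; the cross-entropy values and lengths are made
# up) of the scoring reduction above: per-token cross-entropy is zeroed past
# each sequence's length with tf.sequence_mask (maxlen inferred from the
# lengths), and the row sums give one score per example.
import tensorflow as tf

cross_entropy = tf.constant([[0.5, 0.2, 9.0, 9.0],   # junk past length 2
                             [0.1, 0.3, 0.4, 0.2]])
length = tf.constant([2, 4])

weights = tf.sequence_mask(length, dtype=cross_entropy.dtype)
scores = tf.reduce_sum(cross_entropy * weights, axis=1)

with tf.Session() as sess:
    print(sess.run(scores))   # [0.7, 1.0]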
def call(self, inputs, mask=None, training=None, **kwargs):
    if self.supports_masking:
        queries, keys = inputs
        query_masks, key_masks = mask
        query_masks = tf.cast(query_masks, tf.float32)
        key_masks = tf.cast(key_masks, tf.float32)
    else:
        queries, keys, query_masks, key_masks = inputs
        query_masks = tf.sequence_mask(query_masks, self.seq_len_max, dtype=tf.float32)
        key_masks = tf.sequence_mask(key_masks, self.seq_len_max, dtype=tf.float32)
        query_masks = tf.squeeze(query_masks, axis=1)
        key_masks = tf.squeeze(key_masks, axis=1)

    if self.use_positional_encoding:
        queries = positional_encoding(queries)
        keys = positional_encoding(keys)

    querys = tf.tensordot(queries, self.W_Query, axes=(-1, 0))  # None T_q D*head_num
    keys = tf.tensordot(keys, self.W_key, axes=(-1, 0))
    values = tf.tensordot(keys, self.W_Value, axes=(-1, 0))

    # head_num*None T_q D
    querys = tf.concat(tf.split(querys, self.head_num, axis=2), axis=0)
    keys = tf.concat(tf.split(keys, self.head_num, axis=2), axis=0)
    values = tf.concat(tf.split(values, self.head_num, axis=2), axis=0)

    # head_num*None T_q T_k
    outputs = tf.matmul(querys, keys, transpose_b=True)
    outputs = outputs / (keys.get_shape().as_list()[-1] ** 0.5)

    key_masks = tf.tile(key_masks, [self.head_num, 1])  # (h*N, T_k)
    # (h*N, T_q, T_k)
    key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1])
    paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)  # (h*N, T_q, T_k)
    outputs = tf.where(tf.equal(key_masks, 1), outputs, paddings)
    if self.blinding:
        outputs = tf.matrix_set_diag(
            outputs, tf.ones_like(outputs)[:, :, 0] * (-2 ** 32 + 1))

    outputs -= tf.reduce_max(outputs, axis=-1, keep_dims=True)
    outputs = tf.nn.softmax(outputs)
    query_masks = tf.tile(query_masks, [self.head_num, 1])  # (h*N, T_q)
    # (h*N, T_q, T_k)
    query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]])

    outputs *= query_masks
    outputs = self.dropout(outputs, training=training)

    # Weighted sum: (h*N, T_q, C/h)
    result = tf.matmul(outputs, values)
    result = tf.concat(tf.split(result, self.head_num, axis=0), axis=2)

    if self.use_res:
        # tf.tensordot(queries, self.W_Res, axes=(-1, 0))
        result += queries
    if self.use_layer_norm:
        result = self.ln(result)

    if self.use_feed_forward:
        fw1 = tf.nn.relu(tf.tensordot(result, self.fw1, axes=[-1, 0]))
        fw1 = self.dropout(fw1, training=training)
        fw2 = tf.tensordot(fw1, self.fw2, axes=[-1, 0])
        if self.use_res:
            result += fw2
        if self.use_layer_norm:
            result = self.ln(result)

    return tf.reduce_mean(result, axis=1, keep_dims=True)
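# A self-contained sketch (TF 1.x assumed; batch/time sizes are toy values) of
# the key-mask pattern above: attention logits at positions beyond each key
# sequence's length are replaced with a large negative constant before the
# softmax, so padded keys receive ~zero attention weight.
import tensorflow as tf

scores = tf.random_normal([2, 3, 4])       # [batch, T_q, T_k] raw logits
key_lengths = tf.constant([2, 4])

key_masks = tf.sequence_mask(key_lengths, 4, dtype=tf.float32)   # [batch, T_k]
key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, 3, 1])     # [batch, T_q, T_k]
paddings = tf.ones_like(scores) * (-2 ** 32 + 1)
scores = tf.where(tf.equal(key_masks, 1), scores, paddings)
attn = tf.nn.softmax(scores)               # padded keys get ~0 weight

with tf.Session() as sess:
    print(sess.run(attn)[0])               # last 2 columns ~0 for batch item 0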
def _create_network(self): self.X = tf.placeholder(tf.int32, [self.batch_size, None], name="X") # input smiles self.Y = tf.placeholder(tf.int32, [self.batch_size, None], name="Y") # reconstructed smiles self.S = tf.placeholder(tf.float32, [self.batch_size, self.sample_size], name="S") # seed self.L = tf.placeholder(tf.int32, [self.batch_size], "L") # actual length of SMILES self.N = tf.placeholder(tf.float32, [self.batch_size, self.latent_size], "N") # randomness on latent vectors self.P = tf.placeholder(tf.float32, [self.batch_size, self.property_task], "P") # properties mol_onehot = tf.one_hot(tf.cast(self.X, tf.int32), self.vocab_size) mol_onehot = tf.cast(mol_onehot, tf.float32) self.prefn = [self.latent_size, self.latent_size, self.property_task] self.disfn = [self.latent_size, self.latent_size, 1] self.genfn = [self.latent_size, self.latent_size, self.latent_size] decoded_rnn_size = [self.latent_size] encoded_rnn_size = [self.latent_size] with tf.variable_scope('decode'): decode_cell = [] for i in decoded_rnn_size[:]: decode_cell.append(tf.nn.rnn_cell.LSTMCell(i)) self.decoder = tf.nn.rnn_cell.MultiRNNCell(decode_cell) with tf.variable_scope('encode'): encode_cell = [] for i in encoded_rnn_size[:]: encode_cell.append(tf.nn.rnn_cell.LSTMCell(i)) self.encoder = tf.nn.rnn_cell.MultiRNNCell(encode_cell) self.initial_state = self.decoder.zero_state(self.batch_size, tf.float32) self.weights = {} self.biases = {} self.weights['softmax'] = tf.get_variable("softmaxw", initializer=tf.contrib.layers.xavier_initializer(),\ shape=[decoded_rnn_size[-1], self.vocab_size]) self.biases['softmax'] = tf.get_variable( "softmaxb", initializer=tf.contrib.layers.xavier_initializer(), shape=[self.vocab_size]) for i in range(len(self.disfn)): name = 'disfw' + str(i + 1) if i == 0: self.weights[name] = tf.get_variable(name, initializer=tf.contrib.layers.xavier_initializer(),\ shape=[self.latent_size, self.disfn[i]]) else: self.weights[name] = tf.get_variable(name, initializer=tf.contrib.layers.xavier_initializer(),\ shape=[self.disfn[i-1], self.disfn[i]]) name = 'disfb' + str(i + 1) self.biases[name] = tf.get_variable( name, initializer=tf.zeros_initializer(), shape=[self.disfn[i]]) for i in range(len(self.prefn)): name = 'clyfw' + str(i + 1) if i == 0: self.weights[name] = tf.get_variable(name, initializer=tf.contrib.layers.xavier_initializer(),\ shape=[self.latent_size, self.prefn[i]]) else: self.weights[name] = tf.get_variable(name, initializer=tf.contrib.layers.xavier_initializer(),\ shape=[self.prefn[i-1], self.prefn[i]]) name = 'clyfb' + str(i + 1) self.biases[name] = tf.get_variable( name, initializer=tf.zeros_initializer(), shape=[self.prefn[i]]) for i in range(len(self.genfn)): name = 'genfw' + str(i + 1) if i == 0: self.weights[name] = tf.get_variable(name, initializer=tf.contrib.layers.xavier_initializer(),\ shape=[self.sample_size, self.genfn[i]]) else: self.weights[name] = tf.get_variable(name, initializer=tf.contrib.layers.xavier_initializer(),\ shape=[self.genfn[i-1], self.genfn[i]]) name = 'genfb' + str(i + 1) self.biases[name] = tf.get_variable( name, initializer=tf.zeros_initializer(), shape=[self.genfn[i]]) self.mol_encoded0 = self.total_encoder(mol_onehot) self.mol_encoded = tf.nn.l2_normalize(self.mol_encoded0, dim=-1) self.latent_vector = self.generator(self.S) d_real_logits = self.discriminator(self.mol_encoded) d_fake_logits = self.discriminator(self.latent_vector, reuse=True) predicted_property = self.predictor(self.mol_encoded) self.mol_encoded += self.N self.mol_decoded_softmax, 
mol_decoded_logits = self.total_decoder(
    self.mol_encoded, mol_onehot, self.P)

weights = tf.sequence_mask(self.L, tf.shape(self.X)[1])
weights = tf.cast(weights, tf.int32)
weights = tf.cast(weights, tf.float32)

self.reconstr_loss = tf.reduce_mean(
    tf.contrib.seq2seq.sequence_loss(logits=mol_decoded_logits,
                                     targets=self.Y,
                                     weights=weights))
self.g_loss = -tf.reduce_mean(d_fake_logits)
self.en_loss = (tf.reduce_mean(d_real_logits))
self.d_loss = (-tf.reduce_mean(d_real_logits) + tf.reduce_mean(d_fake_logits))
self.en_classified_loss = -tf.reduce_mean(
    tf.square(predicted_property - self.P))  # need to be modified
self.classified_loss = tf.reduce_mean(
    tf.square(predicted_property - self.P))  # need to be modified

# Loss
self.lr = tf.Variable(0.0, trainable=False)
tvars = tf.trainable_variables()
ae_list = [
    var for var in tvars
    if 'decode' in var.name or 'encode' in var.name or 'softmax' in var.name
]
en_list = [var for var in tvars if 'encode' in var.name]
gen_list = [var for var in tvars if 'gen' in var.name]
dis_list = [var for var in tvars if 'dis' in var.name]
pre_list = [var for var in tvars if 'cly' in var.name]
print(np.sum([np.prod(v.shape) for v in ae_list]))
print(np.sum([np.prod(v.shape) for v in en_list]))
print(np.sum([np.prod(v.shape) for v in dis_list]))
print(np.sum([np.prod(v.shape) for v in gen_list]))
print(np.sum([np.prod(v.shape) for v in pre_list]))
print(np.sum([np.prod(v.shape) for v in tvars]))
name1 = [v.name for v in ae_list]
name2 = [v.name for v in en_list]
name3 = [v.name for v in dis_list]
name4 = [v.name for v in gen_list]
name5 = [v.name for v in pre_list]

optimizer1 = tf.train.GradientDescentOptimizer(1.0)
optimizer2 = tf.train.AdamOptimizer(1e-5)
optimizer3 = tf.train.AdamOptimizer(2e-6)
optimizer4 = tf.train.AdamOptimizer(1e-5)

self.opt1 = optimizer1.minimize(self.reconstr_loss, var_list=ae_list)
self.opt2 = optimizer1.minimize(self.en_loss, var_list=en_list)
# update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
self.opt3 = optimizer2.minimize(self.g_loss, var_list=gen_list)
self.opt4 = optimizer3.minimize(self.d_loss, var_list=dis_list)
self.opt5 = optimizer1.minimize(self.en_classified_loss, var_list=en_list)
self.opt6 = optimizer1.minimize(self.classified_loss, var_list=pre_list)
self.clip_dis = [
    p.assign(tf.clip_by_value(p, -0.01, 0.01)) for p in dis_list
]
self.mol_pred = tf.argmax(self.mol_decoded_softmax, axis=2)

self.sess = tf.Session()
init = tf.global_variables_initializer()
self.sess.run(init)
self.saver = tf.train.Saver(max_to_keep=None)
# tf.train.start_queue_runners(sess=self.sess)
print("Network Ready")
def sequence_sampled_softmax_cross_entropy(targets, train_logits, decoder_weights, decoder_biases, num_classes, **loss): batch_max_targets_sequence_length = tf.shape(targets)[1] targets_sequence_length = sequence_length_2D(tf.cast(targets, tf.int64)) batch_max_train_logits_sequence_length = tf.shape(train_logits)[1] logits_pad_len = tf.maximum( 0, batch_max_targets_sequence_length - batch_max_train_logits_sequence_length, ) targets_pad_len = tf.maximum( 0, batch_max_train_logits_sequence_length - batch_max_targets_sequence_length, ) padded_logits = tf.pad(train_logits, [[0, 0], [0, logits_pad_len], [0, 0]]) padded_targets = tf.pad(targets, [[0, 0], [0, targets_pad_len]]) output_exp = tf.cast(tf.reshape(padded_targets, [-1, 1]), tf.int64) sampled_values = sample_values_from_classes( output_exp, loss["sampler"], num_classes, loss["negative_samples"], loss["unique"], loss["class_counts"], loss["distortion"], ) if loss["sampler"] == "fixed_unigram": # regenerate sampled_values structure for specified samplers # to handle any zero values in true_expected_count tensor sampled_values = FixedUnigramCandidateSampler( sampled_values.sampled_candidates, # add smoothing constant EPSILON to handle any zero values tf.add(sampled_values.true_expected_count, EPSILON), sampled_values.sampled_expected_count, ) def _sampled_loss(labels, logits): labels = tf.cast(labels, tf.int64) labels = tf.reshape(labels, [-1, 1]) logits = tf.cast(logits, tf.float32) return tf.cast( tf.nn.sampled_softmax_loss( weights=tf.transpose(decoder_weights), biases=decoder_biases, labels=labels, inputs=logits, num_sampled=loss["negative_samples"], num_classes=num_classes, sampled_values=sampled_values, ), tf.float32, ) train_loss = tfa.seq2seq.sequence_loss( padded_logits, padded_targets, tf.sequence_mask( targets_sequence_length, tf.shape(padded_targets)[1], dtype=tf.float32, ), average_across_timesteps=True, average_across_batch=False, softmax_loss_function=_sampled_loss, ) return train_loss
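# A minimal sketch (TF 1.x assumed, using tf.pad only; shapes and ids are toy
# values) of the alignment step above: targets and logits that disagree on time
# length are right-padded to a common length, and loss weights come from a
# sequence mask over the true target lengths so the padding never contributes.
import tensorflow as tf

targets = tf.constant([[4, 2, 1]])              # [batch, 3]
train_logits = tf.random_normal([1, 5, 10])     # [batch, 5, classes]

t_len = tf.shape(targets)[1]
l_len = tf.shape(train_logits)[1]
padded_targets = tf.pad(targets, [[0, 0], [0, tf.maximum(0, l_len - t_len)]])
padded_logits = tf.pad(train_logits,
                       [[0, 0], [0, tf.maximum(0, t_len - l_len)], [0, 0]])
weights = tf.sequence_mask(tf.constant([3]), tf.shape(padded_targets)[1],
                           dtype=tf.float32)    # only the 3 real steps count

with tf.Session() as sess:
    print(sess.run([tf.shape(padded_targets), tf.shape(padded_logits), weights]))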
def model_fn(features, labels, mode, params): vectors = features['v'] * 3 mels = features['mel'] mels_len = features['mel_length'][:, 0] dim_neck = 32 bottleneck = 512 config = malaya_speech.config.fastspeech_config config['encoder_hidden_size'] = bottleneck + 80 config['decoder_hidden_size'] = bottleneck + dim_neck config = fastspeech.Config(vocab_size=1, **config) model = fastvc.model.Model(dim_neck, config, dim_speaker=bottleneck) encoder_outputs, mel_before, mel_after, codes = model( mels, vectors, vectors, mels_len) codes_ = model.call_second(mel_after, vectors, mels_len) loss_f = tf.losses.absolute_difference max_length = tf.cast(tf.reduce_max(mels_len), tf.int32) mask = tf.sequence_mask(lengths=mels_len, maxlen=max_length, dtype=tf.float32) mask = tf.expand_dims(mask, axis=-1) mel_loss_before = loss_f(labels=mels, predictions=mel_before, weights=mask) mel_loss_after = loss_f(labels=mels, predictions=mel_after, weights=mask) g_loss_cd = tf.losses.absolute_difference(codes, codes_) loss = mel_loss_before + mel_loss_after + g_loss_cd tf.identity(loss, 'total_loss') tf.identity(mel_loss_before, 'mel_loss_before') tf.identity(mel_loss_after, 'mel_loss_after') tf.identity(g_loss_cd, 'g_loss_cd') tf.summary.scalar('total_loss', loss) tf.summary.scalar('mel_loss_before', mel_loss_before) tf.summary.scalar('mel_loss_after', mel_loss_after) tf.summary.scalar('g_loss_cd', g_loss_cd) global_step = tf.train.get_or_create_global_step() if mode == tf.estimator.ModeKeys.TRAIN: train_op = train.optimizer.adamw.create_optimizer( loss, init_lr=0.001, num_train_steps=total_steps, num_warmup_steps=int(0.1 * total_steps), end_learning_rate=0.00005, weight_decay_rate=0.001, beta_1=0.9, beta_2=0.98, epsilon=1e-6, clip_norm=1.0, ) estimator_spec = tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op) elif mode == tf.estimator.ModeKeys.EVAL: estimator_spec = tf.estimator.EstimatorSpec( mode=tf.estimator.ModeKeys.EVAL, loss=loss) return estimator_spec
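# A toy sketch (TF 1.x assumed; batch, time, and mel sizes are made up) of the
# mel-loss masking above: a [batch, time] sequence mask is expanded to
# [batch, time, 1] so it broadcasts over the mel channels when used as the
# `weights` argument of tf.losses.absolute_difference.
import tensorflow as tf

mels = tf.random_normal([2, 6, 3])         # [batch, time, mel_bins]
mel_pred = tf.random_normal([2, 6, 3])
mels_len = tf.constant([6, 4])

mask = tf.sequence_mask(mels_len, maxlen=tf.reduce_max(mels_len),
                        dtype=tf.float32)
mask = tf.expand_dims(mask, axis=-1)       # broadcast over mel channels
loss = tf.losses.absolute_difference(labels=mels, predictions=mel_pred,
                                     weights=mask)

with tf.Session() as sess:
    print(sess.run(loss))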
def __init__(self, reversed_dict, article_max_len, summary_max_len, config, forward_only=False): self.vocabulary_size = len(reversed_dict) self.embedding_size = config['embedding_size'] self.num_hidden = config['num_hidden'] self.num_layers = config['num_layers'] self.learning_rate = config['learning_rate'] self.beam_width = config['beam_width'] if not forward_only: self.keep_prob = config['keep_prob'] else: self.keep_prob = 1.0 self.cell = tf.nn.rnn_cell.BasicLSTMCell with tf.variable_scope("decoder/projection"): self.projection_layer = tf.layers.Dense(self.vocabulary_size, use_bias=False) self.batch_size = tf.placeholder(tf.int32, (), name="batch_size") self.X = tf.placeholder(tf.int32, [None, article_max_len]) self.X_len = tf.placeholder(tf.int32, [None]) self.decoder_input = tf.placeholder(tf.int32, [None, summary_max_len]) self.decoder_len = tf.placeholder(tf.int32, [None]) self.decoder_target = tf.placeholder(tf.int32, [None, summary_max_len]) self.global_step = tf.Variable(0, trainable=False) with tf.name_scope("embedding"): if not forward_only and config['glove']: init_embeddings = tf.constant(get_init_embedding( reversed_dict, self.embedding_size), dtype=tf.float32) else: init_embeddings = tf.random_uniform( [self.vocabulary_size, self.embedding_size], -1.0, 1.0) self.embeddings = tf.get_variable("embeddings", initializer=init_embeddings) self.encoder_emb_inp = tf.transpose(tf.nn.embedding_lookup( self.embeddings, self.X), perm=[1, 0, 2]) self.decoder_emb_inp = tf.transpose(tf.nn.embedding_lookup( self.embeddings, self.decoder_input), perm=[1, 0, 2]) with tf.name_scope("encoder"): fw_cells = [ self.cell(self.num_hidden) for _ in range(self.num_layers) ] bw_cells = [ self.cell(self.num_hidden) for _ in range(self.num_layers) ] fw_cells = [rnn.DropoutWrapper(cell) for cell in fw_cells] bw_cells = [rnn.DropoutWrapper(cell) for cell in bw_cells] encoder_outputs, encoder_state_fw, encoder_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn( fw_cells, bw_cells, self.encoder_emb_inp, sequence_length=self.X_len, time_major=True, dtype=tf.float32) self.encoder_output = tf.concat(encoder_outputs, 2) encoder_state_c = tf.concat( (encoder_state_fw[0].c, encoder_state_bw[0].c), 1) encoder_state_h = tf.concat( (encoder_state_fw[0].h, encoder_state_bw[0].h), 1) self.encoder_state = rnn.LSTMStateTuple(c=encoder_state_c, h=encoder_state_h) with tf.name_scope("decoder"), tf.variable_scope( "decoder") as decoder_scope: decoder_cell = self.cell(self.num_hidden * 2) if not forward_only: attention_states = tf.transpose(self.encoder_output, [1, 0, 2]) attention_mechanism = tf.contrib.seq2seq.BahdanauAttention( self.num_hidden * 2, attention_states, memory_sequence_length=self.X_len, normalize=True) decoder_cell = tf.contrib.seq2seq.AttentionWrapper( decoder_cell, attention_mechanism, attention_layer_size=self.num_hidden * 2) initial_state = decoder_cell.zero_state( dtype=tf.float32, batch_size=self.batch_size) initial_state = initial_state.clone( cell_state=self.encoder_state) helper = tf.contrib.seq2seq.TrainingHelper( self.decoder_emb_inp, self.decoder_len, time_major=True) decoder = tf.contrib.seq2seq.BasicDecoder( decoder_cell, helper, initial_state) outputs, _, _ = tf.contrib.seq2seq.dynamic_decode( decoder, output_time_major=True, scope=decoder_scope) self.decoder_output = outputs.rnn_output self.logits = tf.transpose(self.projection_layer( self.decoder_output), perm=[1, 0, 2]) self.logits_reshape = tf.concat([ self.logits, tf.zeros([ self.batch_size, summary_max_len - 
tf.shape(self.logits)[1], self.vocabulary_size ]) ], axis=1) else: tiled_encoder_output = tf.contrib.seq2seq.tile_batch( tf.transpose(self.encoder_output, perm=[1, 0, 2]), multiplier=self.beam_width) tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch( self.encoder_state, multiplier=self.beam_width) tiled_seq_len = tf.contrib.seq2seq.tile_batch( self.X_len, multiplier=self.beam_width) attention_mechanism = tf.contrib.seq2seq.BahdanauAttention( self.num_hidden * 2, tiled_encoder_output, memory_sequence_length=tiled_seq_len, normalize=True) decoder_cell = tf.contrib.seq2seq.AttentionWrapper( decoder_cell, attention_mechanism, attention_layer_size=self.num_hidden * 2) initial_state = decoder_cell.zero_state( dtype=tf.float32, batch_size=self.batch_size * self.beam_width) initial_state = initial_state.clone( cell_state=tiled_encoder_final_state) decoder = tf.contrib.seq2seq.BeamSearchDecoder( cell=decoder_cell, embedding=self.embeddings, start_tokens=tf.fill([self.batch_size], tf.constant(2)), end_token=tf.constant(3), initial_state=initial_state, beam_width=self.beam_width, output_layer=self.projection_layer) outputs, _, _ = tf.contrib.seq2seq.dynamic_decode( decoder, output_time_major=True, maximum_iterations=summary_max_len, scope=decoder_scope) self.prediction = tf.transpose(outputs.predicted_ids, perm=[1, 2, 0]) with tf.name_scope("loss"): if not forward_only: crossent = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=self.logits_reshape, labels=self.decoder_target) weights = tf.sequence_mask(self.decoder_len, summary_max_len, dtype=tf.float32) self.loss = tf.reduce_sum(crossent * weights / tf.to_float(self.batch_size)) params = tf.trainable_variables() gradients = tf.gradients(self.loss, params) clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0) optimizer = tf.train.AdamOptimizer(self.learning_rate) self.update = optimizer.apply_gradients( zip(clipped_gradients, params), global_step=self.global_step)
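# A short sketch (TF 1.x assumed; lengths and width are toy values) of the
# fixed-width loss mask above: maxlen is pinned to the padded summary width
# rather than max(decoder_len), so the weight matrix always matches the
# [batch, summary_max_len] layout of the logits and targets.
import tensorflow as tf

summary_max_len = 6
decoder_len = tf.constant([3, 5])
weights = tf.sequence_mask(decoder_len, summary_max_len, dtype=tf.float32)

with tf.Session() as sess:
    print(sess.run(weights))
    # [[1. 1. 1. 0. 0. 0.]
    #  [1. 1. 1. 1. 1. 0.]]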
def build_graph(self): with tf.variable_scope('input'): self.inputs = tf.placeholder(tf.int32, [None, None], name='inputs') self.targets = tf.placeholder(tf.int32, [None, None], name='targets') self.learning_rate = tf.placeholder(tf.float32, name='learning_rate') self.target_sequence_length = tf.placeholder(tf.int32, (None,), name='target_sequence_length') self.max_target_sequence_length = tf.reduce_max(self.target_sequence_length, name='max_target_length') self.source_sequence_length = tf.placeholder(tf.int32, (None,), name='source_sequence_length') with tf.variable_scope('encoder'): encoder_embed_input = tf.contrib.layers.embed_sequence(self.inputs, len(self.source_letter_to_int), self.config.encoding_embedding_size) encoder_cell = tf.contrib.rnn.MultiRNNCell( [self.get_lstm_cell(self.config.rnn_size) for _ in range(self.config.rnn_layers)]) encoder_output, encoder_state = tf.nn.dynamic_rnn(encoder_cell, encoder_embed_input, sequence_length=self.source_sequence_length, dtype=tf.float32) with tf.variable_scope('decoder'): # 1. embedding decoder_input = self.process_decoder_input(self.targets, self.target_letter_to_int, self.config.batch_size) target_vocab_size = len(self.target_letter_to_int) decoder_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, self.config.decoding_embedding_size])) decoder_embed_input = tf.nn.embedding_lookup(decoder_embeddings, decoder_input) # decoder_embed_input = tf.contrib.layers.embed_sequence(decoder_input, target_vocab_size, self.config.decoding_embedding_size) # 2. construct the rnn num_units = self.config.rnn_size attention_states = encoder_output # tf.transpose(encoder_output, [1, 0, 2]) attention_mechanism = tf.contrib.seq2seq.LuongAttention(num_units, attention_states, memory_sequence_length=self.source_sequence_length) # cells = [] # for i in range(self.config.rnn_layers): # cell = self.get_lstm_cell(self.config.rnn_size) # cell = tf.contrib.seq2seq.AttentionWrapper(cell, # attention_mechanism, # attention_layer_size=num_units) # cells.append(cell) # decoder_cell = tf.contrib.rnn.MultiRNNCell(cells) decoder_cell = tf.contrib.rnn.MultiRNNCell( [self.get_lstm_cell(self.config.rnn_size) for _ in range(self.config.rnn_layers)]) decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism, attention_layer_size=num_units) attention_zero = decoder_cell.zero_state(self.config.batch_size, dtype=tf.float32) initial_state = attention_zero.clone(cell_state=encoder_state) # 3. 
output fully connected output_layer = Dense(target_vocab_size, kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1)) if self.mode == 'train': training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_embed_input, sequence_length=self.target_sequence_length, time_major=False) training_decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, training_helper, initial_state, output_layer) decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(training_decoder, impute_finished=True, maximum_iterations=self.max_target_sequence_length) else: start_tokens = tf.tile(tf.constant([self.target_letter_to_int[GO]], dtype=tf.int32), [self.config.batch_size], name='start_tokens') predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embeddings, start_tokens, self.target_letter_to_int[EOS]) predicting_decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, predicting_helper, initial_state, output_layer) decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(predicting_decoder, impute_finished=True, maximum_iterations=self.max_target_sequence_length) with tf.variable_scope('loss'): training_logits = tf.identity(decoder_output.rnn_output, 'logits') predicting_logits = tf.identity(decoder_output.sample_id, name='predictions') masks = tf.sequence_mask(self.target_sequence_length, self.max_target_sequence_length, dtype=tf.float32, name='masks') self.loss = tf.contrib.seq2seq.sequence_loss(training_logits, self.targets, masks) tf.summary.scalar("loss", self.loss) with tf.name_scope('optimize'): # optimizer = tf.train.AdamOptimizer(lr) # gradients = optimizer.compute_gradients(cost) # capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None] # train_op = optimizer.apply_gradients(capped_gradients) training_variables = tf.trainable_variables() grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, training_variables), 5) optimizer = tf.train.AdamOptimizer(self.learning_rate) self.train_op = optimizer.apply_gradients(zip(grads, training_variables), name='train_op')
def train_HT(): print('Run HT chord recognition on %s-%d...' % (hp.dataset, hp.test_set_id)) # Load training and testing data train_data, test_data = load_data_symbol( dir=hp.dataset + '_preprocessed_data_MIREX_Mm.pickle', test_set_id=hp.test_set_id, sequence_with_overlap=hp.train_sequence_with_overlap) n_train_sequences = train_data['pianoroll'].shape[0] n_test_sequences = test_data['pianoroll'].shape[0] n_iterations_per_epoch = int(math.ceil(n_train_sequences / hp.n_batches)) print('n_train_sequences =', n_train_sequences) print('n_test_sequences =', n_test_sequences) print('n_iterations_per_epoch =', n_iterations_per_epoch) print(hp) with tf.name_scope('placeholder'): x_p = tf.placeholder(tf.int32, [None, hp.n_steps, 88], name="pianoroll") x_len = tf.placeholder(tf.int32, [None], name="seq_lens") y_tc = tf.placeholder(tf.int32, [None, hp.n_steps], name="tchord") y_cc = tf.placeholder(tf.int32, [None, hp.n_steps], name="chord_change") dropout = tf.placeholder(dtype=tf.float32, name="dropout_rate") is_training = tf.placeholder(dtype=tf.bool, name="is_training") global_step = tf.placeholder(dtype=tf.int32, name='global_step') slope = tf.placeholder(dtype=tf.float32, name='annealing_slope') with tf.name_scope('model'): x_in = tf.cast(x_p, tf.float32) source_mask = tf.sequence_mask( lengths=x_len, maxlen=hp.n_steps, dtype=tf.float32) # [n_batches, n_steps] target_mask = source_mask # chord_change_logits, dec_input_embed, enc_weights, dec_weights = crm.HT(x_in, source_mask, target_mask, slope, dropout, is_training, hp) chord_change_logits, dec_input_embed, enc_weights, dec_weights, _, _ = crm.HTv2( x_in, source_mask, target_mask, slope, dropout, is_training, hp) with tf.variable_scope("output_projection"): dec_input_embed = tf.layers.dropout(dec_input_embed, rate=dropout, training=is_training) chord_logits = tf.layers.dense(dec_input_embed, hp.n_chord_classes, name='output_dense') with tf.name_scope('loss'): # Chord change loss_cc = 1.5 * tf.losses.sigmoid_cross_entropy( multi_class_labels=tf.cast(y_cc, tf.float32), logits=slope * chord_change_logits, weights=source_mask) # Chord symbol loss_tc = tf.losses.softmax_cross_entropy(onehot_labels=tf.one_hot( y_tc, hp.n_chord_classes), logits=chord_logits, weights=target_mask) # Total loss loss = loss_cc + loss_tc valid = tf.reduce_sum(target_mask) summary_loss = tf.Variable([0.0, 0.0, 0.0], trainable=False, dtype=tf.float32) summary_valid = tf.Variable(0, trainable=False, dtype=tf.float32) update_loss = tf.assign(summary_loss, summary_loss + valid * [loss, loss_cc, loss_tc]) update_valid = tf.assign(summary_valid, summary_valid + valid) mean_loss = tf.assign(summary_loss, summary_loss / summary_valid) clr_summary_loss = summary_loss.initializer clr_summary_valid = summary_valid.initializer tf.summary.scalar('Loss_total', summary_loss[0]) tf.summary.scalar('Loss_chord_change', summary_loss[1]) tf.summary.scalar('Loss_chord', summary_loss[2]) with tf.name_scope('evaluation'): chord_mask = tf.cast(target_mask, tf.bool) chord_mask = tf.logical_and(chord_mask, tf.less(y_tc, tquality_dict['O'] * 12)) # Chord change pred_cc = tf.cast(tf.round(tf.sigmoid(slope * chord_change_logits)), tf.int32) pred_cc_mask = tf.boolean_mask(pred_cc, tf.cast(source_mask, tf.bool)) y_cc_mask = tf.boolean_mask(y_cc, tf.cast(source_mask, tf.bool)) TP_cc, FP_cc, FN_cc = compute_pre_PRF(pred_cc_mask, y_cc_mask) # Chord pred_tc = tf.argmax(chord_logits, axis=2, output_type=tf.int32) pred_tc_correct = tf.equal(pred_tc, y_tc) pred_tc_correct_mask = 
tf.boolean_mask(tensor=pred_tc_correct, mask=chord_mask)
    correct = tf.reduce_sum(tf.cast(pred_tc_correct_mask, tf.float32))
    total = tf.cast(tf.size(pred_tc_correct_mask), tf.float32)
    summary_count = tf.Variable([0.0 for _ in range(5)], trainable=False, dtype=tf.float32)
    summary_score = tf.Variable([0.0 for _ in range(4)], trainable=False, dtype=tf.float32)
    update_count = tf.assign(summary_count,
                             summary_count + [correct, total, TP_cc, FP_cc, FN_cc])
    acc_tc = summary_count[0] / summary_count[1]
    P_cc, R_cc, F1_cc = comput_PRF_with_pre(summary_count[2], summary_count[3], summary_count[4])
    update_score = tf.assign(summary_score, summary_score + [acc_tc, P_cc, R_cc, F1_cc])
    clr_summary_count = summary_count.initializer
    clr_summary_score = summary_score.initializer
    tf.summary.scalar('Accuracy_tchord', summary_score[0])
    tf.summary.scalar('Precision_chord_change', summary_score[1])
    tf.summary.scalar('Recall_chord_change', summary_score[2])
    tf.summary.scalar('F1_chord_change', summary_score[3])

with tf.name_scope('optimization'):
    # Apply warm-up learning rate
    warm_up_steps = tf.constant(4000, dtype=tf.float32)
    gstep = tf.cast(global_step, dtype=tf.float32)
    learning_rate = pow(hp.input_embed_size, -0.5) * tf.minimum(
        tf.pow(gstep, -0.5), gstep * tf.pow(warm_up_steps, -1.5))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                       beta1=0.9, beta2=0.98, epsilon=1e-9)
    train_op = optimizer.minimize(loss)

# Graph location and summary writers
print('Saving graph to: %s' % hp.graph_location)
merged = tf.summary.merge_all()
train_writer = tf.summary.FileWriter(hp.graph_location + '\\train')
test_writer = tf.summary.FileWriter(hp.graph_location + '\\test')
train_writer.add_graph(tf.get_default_graph())
test_writer.add_graph(tf.get_default_graph())
saver = tf.train.Saver(max_to_keep=1)

# Training
print('Train the model...')
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    startTime = time.time()
    best_score = [0.0 for _ in range(5)]
    in_succession = 0
    best_epoch = 0
    annealing_slope = 1.0
    best_slope = 0.0
    for step in range(hp.n_training_steps):
        # Training
        if step == 0:
            indices = range(n_train_sequences)
            batch_indices = [indices[x:x + hp.n_batches]
                             for x in range(0, len(indices), hp.n_batches)]
        if step > 0 and step % n_iterations_per_epoch == 0:
            annealing_slope *= hp.annealing_rate
        if step >= n_iterations_per_epoch and step % n_iterations_per_epoch == 0:
            # Shuffle training data
            indices = random.sample(range(n_train_sequences), n_train_sequences)
            batch_indices = [indices[x:x + hp.n_batches]
                             for x in range(0, len(indices), hp.n_batches)]
        batch = (train_data['pianoroll'][batch_indices[step % len(batch_indices)]],
                 train_data['len'][batch_indices[step % len(batch_indices)]],
                 train_data['label']['chord_change'][batch_indices[step % len(batch_indices)]],
                 train_data['tchord'][batch_indices[step % len(batch_indices)]],
                 train_data['root'][batch_indices[step % len(batch_indices)]],
                 train_data['tquality'][batch_indices[step % len(batch_indices)]])
        train_run_list = [train_op, update_valid, update_loss, update_count,
                          loss, loss_cc, loss_tc, pred_cc, pred_tc, chord_mask,
                          enc_weights, dec_weights]
        train_feed_dict = {x_p: batch[0],
                           x_len: batch[1],
                           y_cc: batch[2],
                           y_tc: batch[3],
                           dropout: hp.drop,
                           is_training: True,
                           global_step: step + 1,
                           slope: annealing_slope}
        _, _, _, _, train_loss, train_loss_cc, train_loss_tc, train_pred_cc, train_pred_tc, \
            train_chord_mask, enc_w, dec_w = sess.run(train_run_list, feed_dict=train_feed_dict)
        if step == 0:
            print('*~ loss_cc %.4f, loss_tc %.4f ~*' % (train_loss_cc, train_loss_tc))

        # Display training log & Testing
        if step > 0 and step % n_iterations_per_epoch == 0:
            sess.run([mean_loss, update_score])
            train_summary, train_loss, train_score = sess.run(
                [merged, summary_loss, summary_score])
            sess.run([clr_summary_valid, clr_summary_loss, clr_summary_count, clr_summary_score])
            train_writer.add_summary(train_summary, step)
            print("---- step %d, epoch %d: train_loss: total %.4f, cc %.4f, tc %.4f, "
                  "evaluation: tc %.4f, cc (P %.4f, R %.4f, F1 %.4f) ----"
                  % (step, step // n_iterations_per_epoch,
                     train_loss[0], train_loss[1], train_loss[2],
                     train_score[0], train_score[1], train_score[2], train_score[3]))
            print('enc_w =', enc_w, 'dec_w =', dec_w)
            display_len = 64
            print('len =', batch[1][0])
            print('y_root'.ljust(7, ' '),
                  ''.join([[k for k, v in root_dict.items() if v == b][0].rjust(3, ' ')
                           for b in batch[4][0, :display_len]]))
            print('y_tq'.ljust(7, ' '),
                  ''.join([[k for k, v in tquality_dict.items() if v == b][0].rjust(3, ' ')
                           for b in batch[5][0, :display_len]]))
            print('valid'.ljust(7, ' '),
                  ''.join(['y'.rjust(3, ' ') if b else 'n'.rjust(3, ' ')
                           for b in train_chord_mask[0, :display_len]]))
            print('y_cc'.ljust(7, ' '),
                  ''.join([str(b).rjust(3, ' ') for b in batch[2][0, :display_len]]))
            print('pred_cc'.ljust(7, ' '),
                  ''.join([str(b).rjust(3, ' ') for b in train_pred_cc[0, :display_len]]))
            print('y_tc'.ljust(7, ' '),
                  ''.join([str(b).rjust(3, ' ') for b in batch[3][0, :display_len]]))
            print('pred_tc'.ljust(7, ' '),
                  ''.join([str(b).rjust(3, ' ') for b in train_pred_tc[0, :display_len]]))

            # Testing
            test_run_list = [update_valid, update_loss, update_count,
                             pred_cc, pred_tc, chord_mask]
            test_feed_dict = {x_p: test_data['pianoroll'],
                              x_len: test_data['len'],
                              y_cc: test_data['label']['chord_change'],
                              y_tc: test_data['tchord'],
                              dropout: 0.0,
                              is_training: False,
                              slope: annealing_slope}
            _, _, _, test_pred_cc, test_pred_tc, test_chord_mask = sess.run(
                test_run_list, feed_dict=test_feed_dict)
            sess.run([mean_loss, update_score])
            test_summary, test_loss, test_score = sess.run(
                [merged, summary_loss, summary_score])
            sess.run([clr_summary_valid, clr_summary_loss, clr_summary_count, clr_summary_score])
            test_writer.add_summary(test_summary, step)
            sq = crm.segmentation_quality(test_data['tchord'], test_pred_tc, test_data['len'])
            print("==== step %d, epoch %d: test_loss: total %.4f, cc %.4f, tc %.4f, "
                  "evaluation: tc %.4f, cc (P %.4f, R %.4f, F1 %.4f), sq %.4f ===="
                  % (step, step // n_iterations_per_epoch,
                     test_loss[0], test_loss[1], test_loss[2],
                     test_score[0], test_score[1], test_score[2], test_score[3], sq))
            sample_id = random.randint(0, n_test_sequences - 1)
            print('len =', test_data['len'][sample_id])
            print('y_root'.ljust(7, ' '),
                  ''.join([[k for k, v in root_dict.items() if v == b][0].rjust(3, ' ')
                           for b in test_data['root'][sample_id, :display_len]]))
            print('y_tq'.ljust(7, ' '),
                  ''.join([[k for k, v in tquality_dict.items() if v == b][0].rjust(3, ' ')
                           for b in test_data['tquality'][sample_id, :display_len]]))
            print('valid'.ljust(7, ' '),
                  ''.join(['y'.rjust(3, ' ') if b else 'n'.rjust(3, ' ')
                           for b in test_chord_mask[sample_id, :display_len]]))
            print('y_cc'.ljust(7, ' '),
                  ''.join([str(b).rjust(3, ' ')
                           for b in test_data['label']['chord_change'][sample_id, :display_len]]))
            print('pred_cc'.ljust(7, ' '),
                  ''.join([str(b).rjust(3, ' ') for b in test_pred_cc[sample_id, :display_len]]))
            print('y_tc'.ljust(7, ' '),
                  ''.join([str(b).rjust(3, ' ')
                           for b in test_data['tchord'][sample_id, :display_len]]))
            print('pred_tc'.ljust(7, ' '),
                  ''.join([str(b).rjust(3, ' ') for b in test_pred_tc[sample_id, :display_len]]))

            if step > 0 and (test_score[0] + sq) > (best_score[0] + best_score[-1]):
                best_score = np.concatenate([test_score, [sq]], axis=0)
                best_epoch = step // n_iterations_per_epoch
                best_slope = annealing_slope
                in_succession = 0
                # Save variables of the model
                print('*saving variables...\n')
                saver.save(sess, hp.graph_location + '\\HT_chord_recognition_'
                           + hp.dataset + '_' + str(hp.test_set_id) + '.ckpt')
            else:
                in_succession += 1
                if in_succession > hp.n_in_succession:
                    print('Early stopping.')
                    break

    # saver.save(sess, hp.graph_location + '\\HT_chord_recognition_train_model.ckpt')
    elapsed_time = time.time() - startTime
    print('\nHT chord symbol recognition on %s-%d:' % (hp.dataset, hp.test_set_id))
    print('training time = %.2f hr' % (elapsed_time / 3600))
    print('best epoch = ', best_epoch)
    print('best score =', np.round(best_score, 4))
    print('best slope =', best_slope)
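# The 'optimization' scope above uses the Transformer-style warm-up schedule
# lr = d_model^-0.5 * min(step^-0.5, step * warmup^-1.5). A minimal NumPy sketch
# of that schedule, assuming d_model=512 and the snippet's 4000 warm-up steps
# (both hypothetical stand-ins for hp.input_embed_size and warm_up_steps):
import numpy as np

def noam_lr(step, d_model=512, warmup=4000.0):
    """Rises roughly linearly for `warmup` steps, then decays as step^-0.5."""
    return d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

# The peak learning rate is reached exactly at step == warmup.
print(noam_lr(1), noam_lr(4000), noam_lr(40000))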
def model_fn(features, labels, mode, params):
    # For serving, features are a bit different
    if isinstance(features, dict):
        features = features['words'], features['nwords']

    # Read vocabs and inputs
    dropout = params['dropout']
    words, nwords = features
    training = (mode == tf.estimator.ModeKeys.TRAIN)
    vocab_words = tf.contrib.lookup.index_table_from_file(
        params['words'], num_oov_buckets=params['num_oov_buckets'])
    with Path(params['tags']).open() as f:
        indices = [idx for idx, tag in enumerate(f) if tag.strip() != 'O']
        num_tags = len(indices) + 1

    # Word Embeddings
    word_ids = vocab_words.lookup(words)
    glove = np.load(params['glove'])['embeddings']  # np.array
    variable = np.vstack([glove, [[0.] * params['dim']]])
    variable = tf.Variable(variable, dtype=tf.float32, trainable=False)
    embeddings = tf.nn.embedding_lookup(variable, word_ids)
    embeddings = tf.layers.dropout(embeddings, rate=dropout, training=training)

    # LSTM
    t = tf.transpose(embeddings, perm=[1, 0, 2])  # time-major
    lstm_cell_fw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
    lstm_cell_bw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
    lstm_cell_bw = tf.contrib.rnn.TimeReversedFusedRNN(lstm_cell_bw)
    output_fw, _ = lstm_cell_fw(t, dtype=tf.float32, sequence_length=nwords)
    output_bw, _ = lstm_cell_bw(t, dtype=tf.float32, sequence_length=nwords)
    output = tf.concat([output_fw, output_bw], axis=-1)
    output = tf.transpose(output, perm=[1, 0, 2])
    output = tf.layers.dropout(output, rate=dropout, training=training)

    # CRF
    logits = tf.layers.dense(output, num_tags)
    crf_params = tf.get_variable("crf", [num_tags, num_tags], dtype=tf.float32)
    pred_ids, _ = tf.contrib.crf.crf_decode(logits, crf_params, nwords)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Predictions
        reverse_vocab_tags = tf.contrib.lookup.index_to_string_table_from_file(
            params['tags'])
        pred_strings = reverse_vocab_tags.lookup(tf.to_int64(pred_ids))
        predictions = {'pred_ids': pred_ids, 'tags': pred_strings}
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)
    else:
        # Loss
        vocab_tags = tf.contrib.lookup.index_table_from_file(params['tags'])
        tags = vocab_tags.lookup(labels)
        log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
            logits, tags, nwords, crf_params)
        loss = tf.reduce_mean(-log_likelihood)

        # Metrics
        weights = tf.sequence_mask(nwords)
        metrics = {
            'acc': tf.metrics.accuracy(tags, pred_ids, weights),
            'precision': precision(tags, pred_ids, num_tags, indices, weights),
            'recall': recall(tags, pred_ids, num_tags, indices, weights),
            'f1': f1(tags, pred_ids, num_tags, indices, weights),
        }
        for metric_name, op in metrics.items():
            tf.summary.scalar(metric_name, op[1])

        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)
        elif mode == tf.estimator.ModeKeys.TRAIN:
            train_op = tf.train.AdamOptimizer().minimize(
                loss, global_step=tf.train.get_or_create_global_step())
            return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
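# In model_fn above, tf.sequence_mask(nwords) produces boolean weights so that
# tf.metrics.accuracy (and the precision/recall/f1 ops) ignore padded tokens.
# A small self-contained TF1 sketch of the same weighting idea, using toy
# tensors rather than the model's real data:
import tensorflow as tf

labels = tf.constant([[1, 2, 0], [3, 0, 0]])
preds = tf.constant([[1, 0, 0], [3, 0, 0]])
weights = tf.sequence_mask([2, 1], maxlen=3)  # only 3 of 6 positions are real tokens
acc, acc_op = tf.metrics.accuracy(labels, preds, weights=weights)
with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
    sess.run(acc_op)
    print(sess.run(acc))  # 2/3: mismatches at padded slots do not count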
def build_decoder(self):
    """Build the decoder."""
    with tf.variable_scope('decoder') as decoder_scope:
        # Building decoder_cell and decoder_initial_state
        (self.decoder_cell, self.decoder_initial_state) = self.build_decoder_cell()

        # Decoder embeddings
        if self.share_embedding:
            self.decoder_embeddings = self.encoder_embeddings
        else:
            with tf.device(_get_embed_device(self.target_vocab_size)):
                self.decoder_embeddings = tf.get_variable(
                    name='embeddings',
                    shape=(self.target_vocab_size, self.embedding_size),
                    initializer=self.initializer,
                    dtype=tf.float32)
                # On Using Very Large Target Vocabulary
                # for Neural Machine Translation
                # https://arxiv.org/pdf/1412.2007v2.pdf

        # Input projection layer to feed embedded inputs to the cell
        # ** Essential when use_residual=True to match input/output dims
        hidden_units = self.hidden_units
        if self.bidirectional:
            hidden_units *= 2
        input_layer = layers.Dense(hidden_units, dtype=tf.float32,
                                   use_bias=False, name='input_projection')
        self.output_layer = layers.Dense(self.target_vocab_size, dtype=tf.float32,
                                         use_bias=False, name='output_projection')

        if self.mode == 'train':
            # decoder_inputs_embedded:
            #     [batch_size, max_time_step + 1, embedding_size]
            self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.decoder_embeddings, ids=self.decoder_inputs_train)

            # Embedded inputs having gone through input projection layer
            self.decoder_inputs_embedded = input_layer(self.decoder_inputs_embedded)

            # Helper to feed inputs for training:
            # read inputs from dense ground truth vectors
            inputs = self.decoder_inputs_embedded
            if self.time_major:
                inputs = tf.transpose(inputs, (1, 0, 2))
            training_helper = seq2seq.TrainingHelper(
                inputs=inputs,
                sequence_length=self.decoder_inputs_length_train,
                time_major=self.time_major,
                name='training_helper')

            # Do not apply output_layer here during training, because that
            # would run the projection at every time step, which is slow.
            # Note: for this trick to work, the `scope` argument of
            # dynamic_decode must be set.
            training_decoder = seq2seq.BasicDecoder(
                cell=self.decoder_cell,
                helper=training_helper,
                initial_state=self.decoder_initial_state,
                # output_layer=self.output_layer
            )

            # Maximum decoder time_steps in current batch
            max_decoder_length = tf.reduce_max(self.decoder_inputs_length_train)

            # decoder_outputs_train: BasicDecoderOutput
            #     namedtuple(rnn_outputs, sample_id)
            # decoder_outputs_train.rnn_output:
            #     if output_time_major=False:
            #         [batch_size, max_time_step + 1, num_decoder_symbols]
            #     if output_time_major=True:
            #         [max_time_step + 1, batch_size, num_decoder_symbols]
            # decoder_outputs_train.sample_id: [batch_size], tf.int32
            (
                outputs,
                self.final_state,  # contains attention
                _                  # self.final_sequence_lengths
            ) = seq2seq.dynamic_decode(
                decoder=training_decoder,
                output_time_major=self.time_major,
                impute_finished=True,
                maximum_iterations=max_decoder_length,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope)

            # More efficient to do the projection
            # on the batch-time-concatenated tensor
            # logits_train:
            #     [batch_size, max_time_step + 1, num_decoder_symbols]
            # At training time, apply the output_layer projection to all time
            # steps at once; the official NMT repo reports a 10~20% speed-up
            # from this, and in practice my speed-up was even larger.
            self.decoder_logits_train = self.output_layer(outputs.rnn_output)

            # masks: masking for valid and padded time steps,
            # [batch_size, max_time_step + 1]
            self.masks = tf.sequence_mask(
                lengths=self.decoder_inputs_length_train,
                maxlen=max_decoder_length,
                dtype=tf.float32,
                name='masks')

            # Computes per-word average cross-entropy over a batch.
            # Internally calls
            # 'nn_ops.sparse_softmax_cross_entropy_with_logits' by default
            decoder_logits_train = self.decoder_logits_train
            if self.time_major:
                decoder_logits_train = tf.transpose(decoder_logits_train, (1, 0, 2))

            self.decoder_pred_train = tf.argmax(decoder_logits_train, axis=-1,
                                                name='decoder_pred_train')

            # The variables below are used for reinforcement-learning training
            self.train_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.decoder_targets_train,
                logits=decoder_logits_train)
            # self.train_entropy *= self.masks
            # print(self.train_entropy.shape)
            self.train_entropy_rewards = tf.multiply(self.train_entropy, self.rewards)
            # print('self.train_entropy_rewards.shape', self.train_entropy_rewards.shape)
            self.train_entropy_rewards *= self.masks

            # https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/contrib/seq2seq/python/ops/loss.py
            # if average_across_timesteps and average_across_batch:
            #     crossent = math_ops.reduce_sum(crossent)
            #     total_size = math_ops.reduce_sum(weights)
            #     total_size += 1e-12  # to avoid division by 0 for all-0 weights
            #     crossent /= total_size
            self.loss_without_rewards = tf.reduce_sum(self.train_entropy)
            self.loss_rewards = tf.reduce_sum(self.train_entropy_rewards)
            total_size = tf.reduce_sum(self.masks)
            total_size += 1e-12
            self.loss_without_rewards /= total_size
            self.loss_rewards /= total_size

            self.loss = seq2seq.sequence_loss(
                logits=decoder_logits_train,
                targets=self.decoder_targets_train,
                weights=self.masks,
                average_across_timesteps=True,
                average_across_batch=True)

            # Training summary for the current batch_loss
            tf.summary.scalar('loss', self.loss)

        elif self.mode == 'decode':
            # Inference mode, not training
            start_tokens = tf.fill([self.batch_size], WordSequence.START)
            end_token = WordSequence.END

            def embed_and_input_proj(inputs):
                """Wrapper that embeds inputs and applies the input projection layer."""
                return input_layer(
                    tf.nn.embedding_lookup(self.decoder_embeddings, inputs))

            if not self.use_beamsearch_decode:
                # Helper to feed inputs for greedy decoding:
                # uses the argmax of the output
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=embed_and_input_proj)
                # Basic decoder performs greedy decoding at each time step
                # print("building greedy decoder..")
                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=self.output_layer)
            else:
                # Beamsearch is used to approximately
                # find the most likely translation
                # print("building beamsearch decoder..")
                inference_decoder = BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.output_layer)

            # For GreedyDecoder, return
            # decoder_outputs_decode: BasicDecoderOutput instance
            #     namedtuple(rnn_outputs, sample_id)
            # decoder_outputs_decode.rnn_output:
            #     if output_time_major=False:
            #         [batch_size, max_time_step, num_decoder_symbols]
            #     if output_time_major=True:
            #         [max_time_step, batch_size, num_decoder_symbols]
            # decoder_outputs_decode.sample_id:
            #     if output_time_major=False:
            #         [batch_size, max_time_step], tf.int32
            #     if output_time_major=True:
            #         [max_time_step, batch_size], tf.int32

            # For BeamSearchDecoder, return
            # decoder_outputs_decode: FinalBeamSearchDecoderOutput instance
            #     namedtuple(predicted_ids, beam_search_decoder_output)
            # decoder_outputs_decode.predicted_ids:
            #     if output_time_major=False:
            #         [batch_size, max_time_step, beam_width]
            #     if output_time_major=True:
            #         [max_time_step, batch_size, beam_width]
            # decoder_outputs_decode.beam_search_decoder_output:
            #     BeamSearchDecoderOutput instance
            #     namedtuple(scores, predicted_ids, parent_ids)

            # A potential maximum-length choice mentioned in the official docs:
            # maximum_iterations = tf.round(tf.reduce_max(source_sequence_length) * 2)
            # https://www.tensorflow.org/tutorials/seq2seq
            if self.max_decode_step is not None:
                max_decode_step = self.max_decode_step
            else:
                # By default, decode up to 4x the input length
                max_decode_step = tf.round(
                    tf.reduce_max(self.encoder_inputs_length) * 4)

            (
                self.decoder_outputs_decode,
                self.final_state,
                _  # self.decoder_outputs_length_decode
            ) = seq2seq.dynamic_decode(
                decoder=inference_decoder,
                output_time_major=self.time_major,
                # impute_finished=True,  # error occurs
                maximum_iterations=max_decode_step,
                parallel_iterations=self.parallel_iterations,
                swap_memory=True,
                scope=decoder_scope)

            if not self.use_beamsearch_decode:
                # decoder_outputs_decode.sample_id:
                #     [batch_size, max_time_step]
                # Or use argmax to find decoder symbols to emit:
                # self.decoder_pred_decode = tf.argmax(
                #     self.decoder_outputs_decode.rnn_output,
                #     axis=-1, name='decoder_pred_decode')

                # Here, we use expand_dims to be compatible with
                # the result of the beamsearch decoder
                # decoder_pred_decode:
                #     [batch_size, max_time_step, 1] (output_major=False)
                # self.decoder_pred_decode = tf.expand_dims(
                #     self.decoder_outputs_decode.sample_id,
                #     -1
                # )
                dod = self.decoder_outputs_decode
                self.decoder_pred_decode = dod.sample_id
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0))
            else:
                # Use beam search to approximately
                # find the most likely translation
                # decoder_pred_decode:
                #     [batch_size, max_time_step, beam_width] (output_major=False)
                self.decoder_pred_decode = self.decoder_outputs_decode.predicted_ids
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0, 2))
                self.decoder_pred_decode = tf.transpose(
                    self.decoder_pred_decode, perm=[0, 2, 1])
                dod = self.decoder_outputs_decode
                self.beam_prob = dod.beam_search_decoder_output.scores
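# seq2seq.sequence_loss above computes the masked mean cross-entropy: per-step
# losses are multiplied by the tf.sequence_mask weights, summed, and divided by
# the number of real (unmasked) steps. A toy TF1 sketch of that reduction, with
# made-up logits/targets purely to show the arithmetic:
import tensorflow as tf

logits = tf.random_normal([2, 4, 5])           # [batch, time, vocab]
targets = tf.zeros([2, 4], dtype=tf.int32)
masks = tf.sequence_mask([4, 2], maxlen=4, dtype=tf.float32)

crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets, logits=logits)
manual = tf.reduce_sum(crossent * masks) / (tf.reduce_sum(masks) + 1e-12)
builtin = tf.contrib.seq2seq.sequence_loss(logits, targets, masks)
with tf.Session() as sess:
    print(sess.run([manual, builtin]))  # the two values match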
def build(self):
    print("Building the language model ... ")

    vocab_size = self.vocab_size
    state_size = self.state_size
    enc_layers = self.enc_layers

    with tf.name_scope("placeholders"):
        enc_inputs = tf.placeholder(tf.int32, [None, None], "enc_inputs")
        targets = tf.placeholder(tf.int32, [None, None], "targets")
        inp_lens = tf.placeholder(tf.int32, [None], "inp_lens")
        self.drop_out = tf.placeholder(tf.float32, (), "drop_out")

        self.enc_inputs = enc_inputs
        self.inp_lens = inp_lens
        self.targets = targets

    batch_size = tf.shape(enc_inputs)[0]
    max_len = tf.shape(enc_inputs)[1]

    with tf.variable_scope("embeddings"):
        embedding_matrix = tf.get_variable("embedding_matrix",
                                           [vocab_size, state_size])
        enc_inputs = tf.nn.embedding_lookup(embedding_matrix, enc_inputs)

    with tf.variable_scope("encoder"):
        # TODO: residual LSTM, layer normalization
        enc_cell = [create_cell("enc-%d" % i, state_size, self.drop_out)
                    for i in range(enc_layers)]
        enc_cell = tf.nn.rnn_cell.MultiRNNCell(enc_cell)
        enc_outputs, enc_state = tf.nn.dynamic_rnn(
            enc_cell, enc_inputs, sequence_length=inp_lens, dtype=tf.float32)

    enc_proj = tf.layers.Dense(vocab_size, name="enc_proj")
    enc_logits = enc_proj(enc_outputs)
    mask = tf.sequence_mask(inp_lens, max_len, dtype=tf.float32)
    loss = tf.contrib.seq2seq.sequence_loss(enc_logits, targets, mask)

    # get variables before optimizer
    all_variables = slim.get_variables_to_restore()
    lm_variables = [var for var in all_variables if var.name[:2] == "lm"]
    print("lm model, variable list:")
    for v in lm_variables:
        print("  %s" % v.name)
    self.model_saver = tf.train.Saver(lm_variables, max_to_keep=10)

    optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
    train_op = optimizer.minimize(loss)

    self.train_output = {"train_op": train_op,
                         "loss": loss,
                         "ppl": tf.exp(loss)}
    self.eval_output = {"loss": loss, "ppl": tf.exp(loss)}
    return
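# build() reports ppl = tf.exp(loss): since `loss` is the per-token masked
# cross-entropy in nats (sequence_loss averages over unmasked tokens),
# exponentiating it gives perplexity. A minimal NumPy sketch of the
# relationship, assuming a uniform model over a hypothetical vocabulary:
import numpy as np

V = 1000
per_token_loss = np.log(V)     # a uniform model assigns each token probability 1/V
print(np.exp(per_token_loss))  # perplexity == V == 1000.0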
def get_loss_and_rewards(self):
    """Run the RNN for one pass.
    :return: cross entropy loss and rewards
    """
    with tf.name_scope('lstm'):
        with tf.variable_scope('cell', reuse=False):
            def get_cell(hiddenSize, dropOutRate):
                print('Not using ACL style jumping!')
                cell = SkipLSTMCell(num_units=hiddenSize,
                                    state_is_tuple=True,
                                    min_read=self.args.minRead,
                                    max_skip=self.args.maxSkip,
                                    is_training=self.is_training,
                                    is_transfering=self.is_transfering)
                cell = tf.contrib.rnn.DropoutWrapper(cell,
                                                     input_keep_prob=dropOutRate,
                                                     output_keep_prob=dropOutRate)
                return cell

            # https://stackoverflow.com/questions/47371608/cannot-stack-lstm-with-multirnncell-and-dynamic-rnn
            cell = get_cell(self.args.hiddenSize, self.dropOutRate)

        state = self.init_state()
        outputs = []
        skips_remain = []
        n_skips = []
        probs = []
        valid = []
        predicted_logits = []
        with tf.variable_scope("loop", reuse=tf.AUTO_REUSE):
            for time_step in range(self.args.maxSteps):
                # state: "c", "h", "r", "s", "n", "probs", "valid"
                (cell_output, state) = cell(self.embedded[:, time_step, :], state)
                # n: number of steps skipped
                # p: corresponding probs of n
                # v: if is valid for computing reward
                # all of shape [batch_size]
                # predicted_logits_: [batch_size*n_samples*(max_skips+1)]
                (c, h, r, s, n, p, v, _, predicted_logits_) = state
                skips_remain.append(s)
                n_skips.append(n)
                probs.append(p)
                valid.append(v)
                outputs.append(cell_output)
                induced_n = tf.slice(self.induced_skips,
                                     begin=[0, time_step + 1],
                                     size=[-1, 1],
                                     name='induced_n' + str(time_step + 1))
                induced_n = tf.reshape(induced_n, shape=[-1])
                state = SkipLSTMStateTuple(c, h, r, s, n, p, v,
                                           induced_n, predicted_logits_)
                # predicted_logits_ = tf.reshape(predicted_logits_,
                #     shape=[self.batch_size*self.n_samples, self.args.maxSkip+1])
                predicted_logits.append(predicted_logits_)

        # [maxSteps, batch_size]
        skips_remain.insert(0, tf.zeros(shape=[self.batch_size * self.n_samples],
                                        dtype=tf.int32))
        skips_remain = skips_remain[0:-1]
        skips_remain = tf.stack(skips_remain)
        n_skips = tf.stack(n_skips)
        probs = tf.stack(probs)
        valid = tf.stack(valid)
        # [max_steps, batch_size*n_samples, max_skips+1]
        predicted_logits = tf.stack(predicted_logits)

        # [batch_size, maxSteps]
        skip_flag = tf.cast(tf.greater(tf.transpose(skips_remain, [1, 0]), 0),
                            tf.float32, name='skip_flag')
        # skip_flag = tf.Print(skip_flag, data=[skip_flag], summarize=100, message='skp_flag')
        n_skips = tf.transpose(n_skips, [1, 0], name='n_skips')
        probs = tf.transpose(probs, [1, 0], name='probs')
        valid = tf.transpose(valid, [1, 0], name='valid')
        # [batch_size*n_samples, max_steps, max_skips+1]
        predicted_logits = tf.transpose(predicted_logits, [1, 0, 2],
                                        name='predicted_logits')

        # [maxSteps, batchSize, hiddenSize]
        outputs = tf.stack(outputs)
        # [batchSize, maxSteps, hiddenSize]
        outputs = tf.transpose(outputs, [1, 0, 2], name='outputs')
        # [batchSize, maxSteps]
        last_relevant_mask = tf.one_hot(indices=self.length - 1,
                                        depth=self.args.maxSteps,
                                        name='last_relevant',
                                        dtype=tf.int32)
        # [batchSize, hiddenSize]
        # tf.boolean_mask requires a boolean mask, so cast the one-hot int mask
        last_relevant_outputs = tf.boolean_mask(outputs,
                                                tf.cast(last_relevant_mask, tf.bool),
                                                name='last_relevant_outputs')

    with tf.name_scope('output'):
        weights = tf.get_variable(name='weights',
                                  shape=[self.args.hiddenSize, self.args.numClasses],
                                  initializer=self.initializer)
        biases = tf.get_variable(name='biases',
                                 shape=[self.args.numClasses],
                                 initializer=self.initializer)
        # [batchSize, numClasses]
        logits = tf.nn.xw_plus_b(x=last_relevant_outputs, weights=weights, biases=biases)

    with tf.name_scope('rewards'):
        # [batch_size]
        self.predictions = tf.argmax(logits, axis=-1, name='predictions',
                                     output_type=tf.int32)
        # [batch_size]
        self.corrects = tf.equal(self.predictions, self.labels, name='corrects')
        self.wrongs = tf.logical_not(self.corrects, name='wrongs')
        # single number
        n_corrects = tf.reduce_sum(tf.cast(self.corrects, tf.int32), name='n_corrects')
        # [batch_size], with elements 1 or -1, 1 for corrects and -1 for wrongs
        rewards = tf.subtract(tf.cast(self.corrects, tf.float32),
                              tf.cast(self.wrongs, tf.float32), name='rewards')
        # rewards = tf.Print(rewards, data=[rewards], message='rewards')

    with tf.name_scope('ce_loss'):
        # [batch_size*n_samples]
        ce_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=self.labels, name='loss')

    with tf.name_scope('transfering_loss'):
        # mask out steps exceeding the length of each sample
        # [batch_size, n_samples]
        valid = tf.cast(valid, tf.float32)
        valid = tf.reshape(valid,
                           [self.batch_size, self.n_samples, self.args.maxSteps])
        length = tf.reshape(self.length,
                            shape=[self.batch_size, self.n_samples], name='length')
        # [batch_size, n_samples, maxSteps]
        # note that a jump decision made at the last word is not valid
        # and, in our current mechanism, a sentence with length <= min_read
        # does not have valid predictions
        valid_mask = tf.sequence_mask(lengths=length - 1,
                                      maxlen=self.args.maxSteps,
                                      dtype=tf.float32,
                                      name='valid_mask')
        valid = tf.multiply(valid, valid_mask, name='valid')
        valid = tf.reshape(valid, shape=[-1, self.args.maxSteps])
        # predicted_logits: [batch_size*n_samples, max_steps, max_skips+1]
        # induced_skips: [batch_size*n_samples, max_steps]
        # valid: [batch_size*n_samples, max_steps]
        # transfering_loss: [batch_size*n_samples]
        transfering_loss = tf.contrib.seq2seq.sequence_loss(
            logits=predicted_logits,
            targets=self.induced_skips,
            weights=valid,
            average_across_timesteps=True,
            average_across_batch=False)
        # [batch_size*n_samples, max_steps]
        self.predicted_inference_skips = tf.argmax(predicted_logits, axis=-1,
                                                   name='predicted_inference_skips',
                                                   output_type=tf.int32)
        self.correct_predicted_inference_skips = tf.cast(
            tf.equal(self.predicted_inference_skips, self.induced_skips), tf.float32)
        self.correct_predicted_inference_skips = tf.multiply(
            self.correct_predicted_inference_skips, valid,
            name='correct_predicted_inference_skips')

    self.v0 = skip_flag
    return (ce_loss, rewards, n_skips, probs, valid, n_corrects,
            skip_flag, transfering_loss)
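# The transfering_loss block masks jump predictions with
# tf.sequence_mask(length - 1): a jump decision emitted at the last word of a
# sentence has no following word, so it cannot be supervised. A toy TF1 sketch
# of that off-by-one mask:
import tensorflow as tf

lengths = tf.constant([4, 2])
valid_mask = tf.sequence_mask(lengths - 1, maxlen=5, dtype=tf.float32)
with tf.Session() as sess:
    print(sess.run(valid_mask))
    # [[1. 1. 1. 0. 0.]   <- length 4: only the first 3 decisions are valid
    #  [1. 0. 0. 0. 0.]]  <- length 2: only the first decision is valid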
def __init__(self, user_count, item_count, cate_count, cate_list):

    self.u = tf.placeholder(tf.int32, [None, ])       # [B]
    self.i = tf.placeholder(tf.int32, [None, ])       # [B], item feature list, dim: N*M
    self.j = tf.placeholder(tf.int32, [None, ])       # [B]
    self.y = tf.placeholder(tf.float32, [None, ])     # [B]
    self.hist_i = tf.placeholder(tf.int32, [None, None])  # [B, T]
    self.sl = tf.placeholder(tf.int32, [None, ])      # [B]
    self.lr = tf.placeholder(tf.float64, [])          # learning rate

    hidden_units = 128

    user_emb_w = tf.get_variable("user_emb_w", [user_count, hidden_units])
    item_emb_w = tf.get_variable("item_emb_w", [item_count, hidden_units // 2])
    item_b = tf.get_variable("item_b", [item_count],
                             initializer=tf.constant_initializer(0.0))
    cate_emb_w = tf.get_variable("cate_emb_w", [cate_count, hidden_units // 2])
    cate_list = tf.convert_to_tensor(cate_list, dtype=tf.int64)

    u_emb = tf.nn.embedding_lookup(user_emb_w, self.u)

    ic = tf.gather(cate_list, self.i)
    i_emb = tf.concat(values=[
        tf.nn.embedding_lookup(item_emb_w, self.i),
        tf.nn.embedding_lookup(cate_emb_w, ic),
    ], axis=1)
    i_b = tf.gather(item_b, self.i)

    jc = tf.gather(cate_list, self.j)
    j_emb = tf.concat([
        tf.nn.embedding_lookup(item_emb_w, self.j),
        tf.nn.embedding_lookup(cate_emb_w, jc),
    ], axis=1)
    j_b = tf.gather(item_b, self.j)

    hc = tf.gather(cate_list, self.hist_i)
    h_emb = tf.concat([
        tf.nn.embedding_lookup(item_emb_w, self.hist_i),
        tf.nn.embedding_lookup(cate_emb_w, hc),
    ], axis=2)

    # -- sum begin -------
    mask = tf.sequence_mask(self.sl, tf.shape(h_emb)[1], dtype=tf.float32)  # [B, T]
    mask = tf.expand_dims(mask, -1)                   # [B, T, 1]
    mask = tf.tile(mask, [1, 1, tf.shape(h_emb)[2]])  # [B, T, H]
    h_emb *= mask                                     # [B, T, H]
    hist = h_emb
    hist = tf.reduce_sum(hist, 1)
    # divide the summed history by the true sequence length (hidden size 128)
    hist = tf.div(hist,
                  tf.cast(tf.tile(tf.expand_dims(self.sl, 1), [1, 128]), tf.float32))
    print(h_emb.get_shape().as_list())
    # -- sum end ---------

    hist = tf.layers.batch_normalization(inputs=hist)
    hist = tf.reshape(hist, [-1, hidden_units])
    hist = tf.layers.dense(hist, hidden_units)

    u_emb = hist

    # -- fcn begin -------
    din_i = tf.concat([u_emb, i_emb], axis=-1)
    din_i = tf.layers.batch_normalization(inputs=din_i, name='b1')
    d_layer_1_i = tf.layers.dense(din_i, 80, activation=tf.nn.sigmoid, name='f1')
    d_layer_2_i = tf.layers.dense(d_layer_1_i, 40, activation=tf.nn.sigmoid, name='f2')
    d_layer_3_i = tf.layers.dense(d_layer_2_i, 1, activation=None, name='f3')

    din_j = tf.concat([u_emb, j_emb], axis=-1)
    din_j = tf.layers.batch_normalization(inputs=din_j, name='b1', reuse=True)
    d_layer_1_j = tf.layers.dense(din_j, 80, activation=tf.nn.sigmoid,
                                  name='f1', reuse=True)
    d_layer_2_j = tf.layers.dense(d_layer_1_j, 40, activation=tf.nn.sigmoid,
                                  name='f2', reuse=True)
    d_layer_3_j = tf.layers.dense(d_layer_2_j, 1, activation=None,
                                  name='f3', reuse=True)

    d_layer_3_i = tf.reshape(d_layer_3_i, [-1])
    d_layer_3_j = tf.reshape(d_layer_3_j, [-1])
    x = i_b - j_b + d_layer_3_i - d_layer_3_j  # [B]
    self.logits = i_b + d_layer_3_i

    u_emb_all = tf.expand_dims(u_emb, 1)
    u_emb_all = tf.tile(u_emb_all, [1, item_count, 1])

    # logits for all items:
    all_emb = tf.concat([
        item_emb_w,
        tf.nn.embedding_lookup(cate_emb_w, cate_list)
    ], axis=1)
    all_emb = tf.expand_dims(all_emb, 0)
    all_emb = tf.tile(all_emb, [512, 1, 1])  # hard-coded evaluation batch size of 512
    din_all = tf.concat([u_emb_all, all_emb], axis=-1)
    din_all = tf.layers.batch_normalization(inputs=din_all, name='b1', reuse=True)
    d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid,
                                    name='f1', reuse=True)
    d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid,
                                    name='f2', reuse=True)
    d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None,
                                    name='f3', reuse=True)
    d_layer_3_all = tf.reshape(d_layer_3_all, [-1, item_count])
    self.logits_all = tf.sigmoid(item_b + d_layer_3_all)
    # -- fcn end ---------

    self.mf_auc = tf.reduce_mean(tf.to_float(x > 0))
    self.score_i = tf.sigmoid(i_b + d_layer_3_i)
    self.score_j = tf.sigmoid(j_b + d_layer_3_j)
    self.score_i = tf.reshape(self.score_i, [-1, 1])
    self.score_j = tf.reshape(self.score_j, [-1, 1])
    self.p_and_n = tf.concat([self.score_i, self.score_j], axis=-1)
    print(self.p_and_n.get_shape().as_list())

    # Step variable
    self.global_step = tf.Variable(0, trainable=False, name='global_step')
    self.global_epoch_step = tf.Variable(0, trainable=False, name='global_epoch_step')
    self.global_epoch_step_op = tf.assign(self.global_epoch_step,
                                          self.global_epoch_step + 1)

    self.loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.y))

    trainable_params = tf.trainable_variables()
    self.opt = tf.train.GradientDescentOptimizer(learning_rate=self.lr)
    gradients = tf.gradients(self.loss, trainable_params)
    clip_gradients, _ = tf.clip_by_global_norm(gradients, 5)
    self.train_op = self.opt.apply_gradients(
        zip(clip_gradients, trainable_params), global_step=self.global_step)
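# The "sum begin/end" block above is masked average pooling: zero out padded
# history steps with tf.sequence_mask, sum over time, then divide by the true
# length. A toy TF1 sketch with a 2-step history and hidden size 3
# (hypothetical shapes; the model itself uses [B, T, 128]):
import tensorflow as tf

h_emb = tf.ones([2, 2, 3])                # [B, T, H]
sl = tf.constant([1, 2])                  # true history lengths
mask = tf.sequence_mask(sl, tf.shape(h_emb)[1], dtype=tf.float32)  # [B, T]
masked = h_emb * tf.expand_dims(mask, -1) # broadcasting replaces the explicit tile
hist = tf.reduce_sum(masked, 1) / tf.cast(tf.expand_dims(sl, 1), tf.float32)
with tf.Session() as sess:
    print(sess.run(hist))  # every row averages to 1.0 regardless of padding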
def __call__(self, decoder_inp, seq_len, encoder_hidden_states, seq_len_inp):
    # First prepare the decoder input - Embed the input and obtain the
    # relevant loop function
    params = self.params
    scope = "rnn_decoder" + ("" if self.scope is None else "_" + self.scope)
    with tf.variable_scope(scope):
        decoder_inputs, loop_function = self.prepare_decoder_input(decoder_inp)

    lm_cell = self.get_cell(hidden_size=params.lm_hidden_size)

    # TensorArray is used to do dynamic looping over decoder input
    inputs_ta = tf.TensorArray(size=params.max_output, dtype=tf.float32)
    inputs_ta = inputs_ta.unstack(decoder_inputs)

    batch_size = tf.shape(decoder_inputs)[1]
    attn_length = tf.shape(encoder_hidden_states)[1]
    emb_size = decoder_inputs.get_shape()[2].value
    attn_size = encoder_hidden_states.get_shape()[2].value

    # Attention variables
    attn_mask = tf.sequence_mask(tf.cast(seq_len_inp, tf.int32), dtype=tf.float32)
    batch_attn_size = tf.stack([batch_size, attn_size])
    attn = tf.zeros(batch_attn_size, dtype=tf.float32)
    batch_alpha_size = tf.stack([batch_size, attn_length, 1, 1])
    alpha = tf.zeros(batch_alpha_size, dtype=tf.float32)

    with tf.variable_scope(scope):
        # Calculate the W*h_enc component
        hidden = tf.expand_dims(encoder_hidden_states, 2)
        W_attn = tf.get_variable(
            "AttnW", [1, 1, attn_size, params.attention_vec_size])
        hidden_features = tf.nn.conv2d(hidden, W_attn, [1, 1, 1, 1], "SAME")
        v = tf.get_variable("AttnV", [params.attention_vec_size])

        def raw_loop_function(time, cell_output, state, loop_state):
            def attention(query, prev_alpha):
                """Put attention masks on hidden using hidden_features and query."""
                with tf.variable_scope("Attention"):
                    y = _linear(query, params.attention_vec_size, True)
                    y = tf.reshape(y, [-1, 1, 1, params.attention_vec_size])
                    s = tf.reduce_sum(v * tf.tanh(hidden_features + y), [2, 3])
                    # mask the softmax and renormalize over the real steps
                    alpha = tf.nn.softmax(s) * attn_mask
                    sum_vec = tf.reduce_sum(alpha, axis=[1], keepdims=True)
                    norm_term = tf.tile(sum_vec, tf.stack([1, tf.shape(alpha)[1]]))
                    alpha = alpha / norm_term
                    alpha = tf.expand_dims(alpha, 2)
                    alpha = tf.expand_dims(alpha, 3)
                    context_vec = tf.reduce_sum(alpha * hidden, [1, 2])
                return tuple([context_vec, alpha])

            # If loop_function is set, we use it instead of decoder_inputs.
            elements_finished = (time >= tf.cast(seq_len, tf.int32))
            finished = tf.reduce_all(elements_finished)

            if cell_output is None:
                next_state = self.cell.zero_state(batch_size, dtype=tf.float32)
                # This output is not used but is just used to tell the shape
                # without the batch dimension
                # Check here - https://www.tensorflow.org/api_docs/python/tf/nn/raw_rnn
                output = tf.zeros((self.params.vocab_size))
                lm_input = inputs_ta.read(time)
                attn_state = tuple([attn, alpha])
                lm_state = lm_cell.zero_state(batch_size, dtype=tf.float32)
            else:
                next_state = state
                # loop_state = attention(cell_output, loop_state[1])
                lm_state, attn_state = loop_state
                attn_state = attention(self.get_state(state), attn_state[1])
                with tf.variable_scope("AttnProjection"):
                    proj_output = _linear([self.get_state(state), attn_state[0]],
                                          self.params.hidden_size_dec, True)
                if params.ind_softmax:
                    # Don't share parameters with LM model
                    with tf.variable_scope("OutputProjection2"):
                        output = _linear([proj_output], self.params.vocab_size, True)
                else:
                    with tf.variable_scope("OutputProjection"):
                        output = _linear([proj_output], self.params.vocab_size, True)

                if not self.isTraining:
                    lm_input = loop_function(output)
                else:
                    if loop_function is not None:
                        random_prob = tf.random_uniform([])
                        lm_input = tf.cond(
                            finished,
                            lambda: tf.zeros([batch_size, emb_size], dtype=tf.float32),
                            lambda: tf.cond(
                                tf.less(random_prob, 1 - params.samp_prob),
                                lambda: inputs_ta.read(time),
                                lambda: loop_function(output)))
                    else:
                        lm_input = tf.cond(
                            finished,
                            lambda: tf.zeros([batch_size, emb_size], dtype=tf.float32),
                            lambda: inputs_ta.read(time))

            # Common calculations
            lm_output, next_lm_state = lm_cell(lm_input, lm_state)
            if params.lm_hidden_size != params.hidden_size_dec:
                with tf.variable_scope("SimpleProjection", reuse=tf.AUTO_REUSE):
                    lm_output = _linear([lm_output], params.hidden_size_dec, True)

            # Merge input and previous attentions into one vector of the right size.
            input_size = lm_input.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input")
            with tf.variable_scope("InputProjection", reuse=tf.AUTO_REUSE):
                next_input = _linear([lm_output, attn_state[0]], input_size, True)

            loop_state = tuple([next_lm_state, attn_state])
            return (elements_finished, next_input, next_state, output, loop_state)

        # outputs is a TensorArray with T=max(sequence_length) entries
        # of shape Bx|V|
        outputs, state, _ = tf.nn.raw_rnn(self.cell, raw_loop_function)
        # Concatenate the output across timesteps to get a tensor of TxBx|V| shape
        outputs = outputs.concat()

    return outputs
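# attention() above masks the softmax with attn_mask and then renormalizes so
# the weights over the real encoder steps sum to 1. A NumPy sketch of that
# mask-then-renormalize step (toy scores; seq_len_inp assumed to be 2 of 4):
import numpy as np

scores = np.array([1.0, 2.0, 3.0, 4.0])
mask = np.array([1.0, 1.0, 0.0, 0.0])          # tf.sequence_mask(2, 4)
alpha = np.exp(scores) / np.exp(scores).sum()  # plain softmax over all steps
alpha = alpha * mask                           # zero out padded positions
alpha = alpha / alpha.sum()                    # renormalize over real steps
print(alpha)  # [0.269, 0.731, 0, 0]: identical to a softmax over the first two scores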
def din_fcn_attention(query, rnn_output, keys_len, scope_name, stag='null',
                      mode='SUM', softmax_stag=1, time_major=False,
                      return_alphas=False, for_cnn=False):
    if isinstance(rnn_output, tuple):
        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
        rnn_output = tf.concat(rnn_output, 2)
    if len(rnn_output.get_shape().as_list()) == 2:
        rnn_output = tf.expand_dims(rnn_output, 1)
    if time_major:
        # (T,B,D) => (B,T,D)
        rnn_output = array_ops.transpose(rnn_output, [1, 0, 2])

    # Trainable parameters
    # mask = tf.equal(mask, tf.ones_like(mask))
    # query_size = query.get_shape().as_list()[-1]
    rnn_output_size = rnn_output.get_shape().as_list()[-1]  # D value - hidden size of the RNN layer
    query = tf.layers.dense(query, rnn_output_size, activation=None,
                            name=scope_name + '_f1' + stag)
    query = prelu(query, scope=scope_name)
    queries = tf.tile(query, [1, tf.shape(rnn_output)[1]])
    queries = tf.reshape(queries, tf.shape(rnn_output))
    din_all = tf.concat([queries, rnn_output, queries - rnn_output,
                         queries * rnn_output], axis=-1)
    d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid,
                                    name=scope_name + 'f1_att' + stag)
    d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid,
                                    name=scope_name + 'f2_att' + stag)
    d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None,
                                    name=scope_name + 'f3_att' + stag)
    d_layer_3_all = tf.reshape(d_layer_3_all, [-1, 1, tf.shape(rnn_output)[1]])
    scores = d_layer_3_all

    # Mask
    key_masks = tf.sequence_mask(keys_len, tf.shape(rnn_output)[1])  # [B, T]
    key_masks = tf.expand_dims(key_masks, 1)                         # [B, 1, T]
    paddings = tf.ones_like(scores) * (-2 ** 32 + 1)
    if not for_cnn:
        scores = tf.where(key_masks, scores, paddings)  # [B, 1, T]

    # Scale
    # scores = scores / (facts.get_shape().as_list()[-1] ** 0.5)

    # Activation
    if softmax_stag:
        scores = tf.nn.softmax(scores)  # [B, 1, T]

    # Weighted sum
    if mode == 'SUM':
        output = tf.matmul(scores, rnn_output)  # [B, 1, H]
        # output = tf.reshape(output, [-1, tf.shape(facts)[-1]])
    else:
        scores = tf.reshape(scores, [-1, tf.shape(rnn_output)[1]])
        output = rnn_output * tf.expand_dims(scores, -1)
        output = tf.reshape(output, tf.shape(rnn_output))
    if return_alphas:
        return output, scores
    return output
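# din_fcn_attention masks *before* the softmax by writing a large negative
# number into padded positions, so they receive (numerically) zero probability
# with no renormalization step needed. A NumPy sketch of that tf.where trick:
import numpy as np

scores = np.array([1.0, 2.0, 3.0, 4.0])
key_mask = np.array([True, True, False, False])  # tf.sequence_mask(2, 4)
paddings = np.full_like(scores, -2.0 ** 32 + 1)
masked = np.where(key_mask, scores, paddings)    # tf.where equivalent
alpha = np.exp(masked - masked.max())            # numerically stable softmax
alpha /= alpha.sum()
print(alpha)  # [0.269, 0.731, 0, 0]: padded keys get no attention weight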