Example #1
    def __init__(self,
                 vocab_size,
                 buckets_or_sentence_length,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 model_type,
                 use_lstm=True,
                 num_samples=512,
                 forward_only=False):
        """Create the model.  This constructor can be used to created an embedded or embedded-attention, bucketed or non-bucketed model made of single or multi-layer RNN cells. 

    Args:
      vocab_size: size of the vocabulary (shared between source and target).
      buckets_or_sentence_length:
        if using buckets:
          a list of pairs (I, O), where I specifies the maximum input length
          that will be processed in that bucket, and O specifies the maximum
          output length. Training instances whose inputs are longer than I or
          whose outputs are longer than O are pushed to the next bucket and
          padded accordingly. We assume the list is sorted, e.g.,
          [(2, 4), (8, 16)].
        else:
          the maximum number of words per sentence.
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to at most this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay the learning rate by this much when needed.
      model_type: 'embedding_attention' builds the attention model; anything
        else builds the plain embedding model.
      use_lstm: if true, use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the model.
    """
        # Need to determine if we're using buckets or not:
        if isinstance(buckets_or_sentence_length, list):
            self.buckets = buckets_or_sentence_length
        else:
            self.max_sentence_length = buckets_or_sentence_length

        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # Summary variables. NOTE: added these.
        # self.summary_op_learning_rate = tf.scalar_summary('learning rate', self.learning_rate)
        # self.summary_op_global_step = tf.scalar_summary('global step', self.global_step)

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary size.
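        # With sampled softmax, each training step contrasts the target word
        # against only num_samples randomly drawn negative classes, so the
        # per-step loss costs O(num_samples) rather than O(vocab_size).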
        if num_samples > 0 and num_samples < self.vocab_size:
            with tf.device("/cpu:0"):
                w = tf.get_variable("proj_w", [size, self.vocab_size])
                w_t = tf.transpose(w)
                b = tf.get_variable("proj_b", [self.vocab_size])
            output_projection = (w, b)

            def sampled_loss(inputs, labels):
                with tf.device("/cpu:0"):
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels,
                                                      num_samples,
                                                      self.vocab_size)

            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        single_cell = rnn_cell.GRUCell(size)
        if use_lstm:
            single_cell = rnn_cell.BasicLSTMCell(size)
        cell = single_cell
        if num_layers > 1:
            cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

        # The seq2seq function: we use embedding for the input and attention (if applicable).
        if model_type == 'embedding_attention':

            def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
                return seq2seq.embedding_attention_seq2seq(
                    encoder_inputs,
                    decoder_inputs,
                    cell,
                    vocab_size,
                    vocab_size,
                    output_projection=output_projection,
                    feed_previous=do_decode)
        else:  # fall back to the plain embedding model; unrecognized model types should arguably raise an error instead

            def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
                return seq2seq.embedding_rnn_seq2seq(
                    encoder_inputs,
                    decoder_inputs,
                    cell,
                    vocab_size,
                    vocab_size,
                    output_projection=output_projection,
                    feed_previous=do_decode)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []

        # NOTE: If the model is not bucketed, these try blocks throw an AttributeError and fall back to building a non-bucketed model.
        try:
            encoder_range = self.buckets[-1][0]
            decoder_range = self.buckets[-1][1]
        except AttributeError:
            encoder_range, decoder_range = self.max_sentence_length, self.max_sentence_length

        for i in xrange(encoder_range):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))
        for i in xrange(decoder_range + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight{0}".format(i)))

        # Our targets are decoder inputs shifted by one.
        targets = [
            self.decoder_inputs[i + 1]
            for i in xrange(len(self.decoder_inputs) - 1)
        ]
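        # For example, decoder_inputs = [GO, w1, w2, EOS] gives
        # targets = [w1, w2, EOS]: each position predicts the next symbol.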

        # Training outputs and losses.
        try:
            if forward_only:
                self.outputs, self.losses = seq2seq.model_with_buckets(
                    self.encoder_inputs,
                    self.decoder_inputs,
                    targets,
                    self.target_weights,
                    self.buckets,
                    self.vocab_size,
                    lambda x, y: seq2seq_f(x, y, True),
                    softmax_loss_function=softmax_loss_function)
                # If we use output projection, we need to project outputs for decoding.
                if output_projection is not None:
                    for b in xrange(len(self.buckets)):
                        self.outputs[b] = [
                            tf.nn.xw_plus_b(output, output_projection[0],
                                            output_projection[1])
                            for output in self.outputs[b]
                        ]
            else:
                self.outputs, self.losses = seq2seq.model_with_buckets(
                    self.encoder_inputs,
                    self.decoder_inputs,
                    targets,
                    self.target_weights,
                    self.buckets,
                    self.vocab_size,
                    lambda x, y: seq2seq_f(x, y, False),
                    softmax_loss_function=softmax_loss_function)

        except AttributeError:
            if forward_only:
                self.outputs, self.states = seq2seq_f(self.encoder_inputs,
                                                      self.decoder_inputs[:-1],
                                                      True)
                self.losses = seq2seq.sequence_loss(
                    self.outputs,
                    targets,
                    self.target_weights[:-1],
                    self.vocab_size,
                    softmax_loss_function=softmax_loss_function)
                # Project outputs for decoding
                if output_projection is not None:
                    self.outputs = [
                        tf.nn.xw_plus_b(output, output_projection[0],
                                        output_projection[1])
                        for output in self.outputs
                    ]
            else:
                self.outputs, self.states = seq2seq_f(self.encoder_inputs,
                                                      self.decoder_inputs[:-1],
                                                      False)
                self.losses = (seq2seq.sequence_loss(
                    self.outputs,
                    targets,
                    self.target_weights[:-1],
                    self.vocab_size,
                    softmax_loss_function=softmax_loss_function))

        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        self.params = params  # Hold onto this for Woz
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)

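            # clip_by_global_norm rescales the whole gradient list by
            # max_gradient_norm / global_norm whenever the combined L2 norm
            # exceeds max_gradient_norm, so the update direction is preserved.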
            try:
                for b in xrange(len(self.buckets)):
                    gradients = tf.gradients(self.losses[b], params)
                    clipped_gradients, norm = tf.clip_by_global_norm(
                        gradients, max_gradient_norm)
                    self.gradient_norms.append(norm)
                    self.updates.append(
                        opt.apply_gradients(zip(clipped_gradients, params),
                                            global_step=self.global_step))
            except AttributeError:
                gradients = tf.gradients(self.losses, params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms = norm
                self.updates = opt.apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step)

        self.saver = tf.train.Saver(tf.all_variables())
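
For reference, a minimal instantiation sketch for the bucketed variant. The enclosing class name Seq2SeqModel and every hyperparameter value here are assumptions for illustration; the listing above only shows __init__.

    # Hypothetical usage sketch; Seq2SeqModel is an assumed name for the
    # class that the __init__ above belongs to, and values are illustrative.
    import tensorflow as tf

    buckets = [(5, 10), (10, 15), (20, 25)]  # sorted (input, output) pairs
    with tf.Session() as sess:
        model = Seq2SeqModel(vocab_size=40000,
                             buckets_or_sentence_length=buckets,
                             size=256,
                             num_layers=2,
                             max_gradient_norm=5.0,
                             batch_size=64,
                             learning_rate=0.5,
                             learning_rate_decay_factor=0.99,
                             model_type='embedding_attention')
        sess.run(tf.initialize_all_variables())  # era-appropriate initializer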
Example #2
    def __init__(self,
                 vocab_size,
                 max_sentence_length,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 model_type,
                 num_samples=512,
                 forward_only=False):

        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.max_sentence_length = max_sentence_length
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # output projection for sampled softmax:
        output_projection = None
        softmax_loss_function = None

        if num_samples > 0 and num_samples < self.vocab_size:
            with tf.device("/cpu:0"):
                w = tf.get_variable("proj_w", [size, self.vocab_size])
                w_t = tf.transpose(w)
                b = tf.get_variable("proj_b", [self.vocab_size])
            output_projection = (w, b)

            def sampled_loss(inputs, labels):
                with tf.device("/cpu:0"):
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels,
                                                      num_samples,
                                                      self.vocab_size)

            softmax_loss_function = sampled_loss

        # single LSTM cell creation, use to build hidden layers
        single_cell = rnn_cell.BasicLSTMCell(size)
        cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)
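        # NOTE: sharing one cell object across layers was the accepted idiom
        # in early TensorFlow; later releases require a distinct cell per layer.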

        if model_type == 'embedding_attention':

            def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
                return seq2seq.embedding_attention_seq2seq(
                    encoder_inputs,
                    decoder_inputs,
                    cell,
                    vocab_size,
                    vocab_size,
                    output_projection=output_projection,
                    feed_previous=do_decode)
        else:

            def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
                return seq2seq.embedding_rnn_seq2seq(
                    encoder_inputs,
                    decoder_inputs,
                    cell,
                    vocab_size,
                    vocab_size,
                    output_projection=output_projection,
                    feed_previous=do_decode)

        # feeds for inputs are limited to max_sentence_length
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in xrange(max_sentence_length):
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))
        for i in xrange(max_sentence_length + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight{0}".format(i)))

        # targets are decoder inputs shifted by one
        targets = [
            self.decoder_inputs[i + 1]
            for i in xrange(len(self.decoder_inputs) - 1)
        ]

        # training outputs and losses
        if forward_only:
            self.outputs, self.states = seq2seq_f(self.encoder_inputs,
                                                  self.decoder_inputs[:-1],
                                                  True)
            self.losses = seq2seq.sequence_loss(
                self.outputs,
                targets,
                self.target_weights[:-1],
                self.vocab_size,
                softmax_loss_function=softmax_loss_function)
            # project outputs for decoding
            if output_projection is not None:
                self.outputs = [
                    tf.nn.xw_plus_b(output, output_projection[0],
                                    output_projection[1])
                    for output in self.outputs
                ]
        else:
            self.outputs, self.states = seq2seq_f(self.encoder_inputs,
                                                  self.decoder_inputs[:-1],
                                                  False)
            self.losses = (seq2seq.sequence_loss(
                self.outputs,
                targets,
                self.target_weights[:-1],
                self.vocab_size,
                softmax_loss_function=softmax_loss_function))

        # gradients and SGD update operation for training
        params = tf.trainable_variables()
        self.params = params
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)

            gradients = tf.gradients(self.losses, params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms = norm
            self.updates = opt.apply_gradients(zip(clipped_gradients, params),
                                               global_step=self.global_step)

        self.saver = tf.train.Saver(tf.all_variables())
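
Because the graph is unrolled into one placeholder per time step, a training step must feed each step separately. A hedged sketch of that feed, where sess, encoder_batch, decoder_batch, and weight_batch (one array of batch_size entries per time step, with weight 0.0 on PAD positions) are all assumptions:

    # Hypothetical single training step; the per-step padded batches are
    # assumed to be prepared by the caller's batching code.
    feed = {}
    for i in range(len(model.encoder_inputs)):
        feed[model.encoder_inputs[i].name] = encoder_batch[i]
    for i in range(len(model.decoder_inputs)):
        feed[model.decoder_inputs[i].name] = decoder_batch[i]
        feed[model.target_weights[i].name] = weight_batch[i]
    _, grad_norm, loss = sess.run(
        [model.updates, model.gradient_norms, model.losses], feed)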
Example #3
  def __init__(self, vocab_size, max_sentence_length, size, num_layers, max_gradient_norm, batch_size, learning_rate, learning_rate_decay_factor, model_type, num_samples=512, forward_only=False):

    self.vocab_size = vocab_size
    self.batch_size = batch_size
    self.max_sentence_length = max_sentence_length
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # output projection for sampled softmax:
    output_projection = None
    softmax_loss_function = None

    if num_samples > 0 and num_samples < self.vocab_size:
      with tf.device("/cpu:0"):
        w = tf.get_variable("proj_w", [size, self.vocab_size])
        w_t = tf.transpose(w)
        b = tf.get_variable("proj_b", [self.vocab_size])
      output_projection = (w, b)

      def sampled_loss(inputs, labels):
        with tf.device("/cpu:0"):
          labels = tf.reshape(labels, [-1, 1])
          return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples, self.vocab_size)
      softmax_loss_function = sampled_loss

    # single LSTM cell creation, use to build hidden layers
    single_cell = rnn_cell.BasicLSTMCell(size)
    cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)
    
    if model_type == 'embedding_attention':
      def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return seq2seq.embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell, vocab_size, vocab_size, output_projection=output_projection, feed_previous=do_decode)
    else:
      def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return seq2seq.embedding_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, vocab_size, vocab_size, output_projection=output_projection, feed_previous=do_decode)

    # feeds for inputs are limited to max_sentence_length
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in xrange(max_sentence_length):
      self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
    for i in xrange(max_sentence_length + 1):
      self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
      self.target_weights.append(tf.placeholder(tf.float32, shape=[None], name="weight{0}".format(i)))

    # targets are decoder inputs shifted by one
    targets = [self.decoder_inputs[i + 1]
               for i in xrange(len(self.decoder_inputs) - 1)]

    # training outputs and losses
    if forward_only:
      self.outputs, self.states = seq2seq_f(self.encoder_inputs, self.decoder_inputs[:-1], True)
      self.losses = seq2seq.sequence_loss(self.outputs, targets, self.target_weights[:-1], self.vocab_size, softmax_loss_function=softmax_loss_function)
      # project outputs for decoding
      if output_projection is not None:
        self.outputs = [tf.nn.xw_plus_b(output, output_projection[0], output_projection[1]) for output in self.outputs]
    else:
      self.outputs, self.states = seq2seq_f(self.encoder_inputs, self.decoder_inputs[:-1], False)
      self.losses = (seq2seq.sequence_loss(self.outputs, targets, self.target_weights[:-1], self.vocab_size, softmax_loss_function=softmax_loss_function))

    # gradients and SGD update operation for training
    params = tf.trainable_variables()
    self.params = params
    if not forward_only:
      self.gradient_norms = []
      self.updates = []
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)

      gradients = tf.gradients(self.losses, params)
      clipped_gradients, norm = tf.clip_by_global_norm(gradients, max_gradient_norm)
      self.gradient_norms = norm
      self.updates = opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)

    self.saver = tf.train.Saver(tf.all_variables())    
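
The learning_rate_decay_op defined above is meant to be run explicitly when training stalls; a minimal sketch in the style of the original TensorFlow translate tutorial, where previous_losses is an assumed list maintained by the training loop:

    # Hypothetical decay trigger: back off the learning rate when the loss
    # has not improved over the last three checkpoints (bookkeeping assumed).
    if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
        sess.run(model.learning_rate_decay_op)
    previous_losses.append(loss)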
Example #4
  def __init__(self, vocab_size, buckets_or_sentence_length, size,
               num_layers, max_gradient_norm, batch_size, learning_rate,
               learning_rate_decay_factor, model_type, use_lstm=True,
               num_samples=512, forward_only=False):
    """Create the model.  This constructor can be used to created an embedded or embedded-attention, bucketed or non-bucketed model made of single or multi-layer RNN cells. 

    Args:
      vocab_size: size of the vocabulary (shared between source and target).
      buckets_or_sentence_length:
        if using buckets:
          a list of pairs (I, O), where I specifies the maximum input length
          that will be processed in that bucket, and O specifies the maximum
          output length. Training instances whose inputs are longer than I or
          whose outputs are longer than O are pushed to the next bucket and
          padded accordingly. We assume the list is sorted, e.g.,
          [(2, 4), (8, 16)].
        else:
          the maximum number of words per sentence.
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to at most this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay the learning rate by this much when needed.
      model_type: 'embedding_attention' builds the attention model; anything
        else builds the plain embedding model.
      use_lstm: if true, use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the model.
    """
    # Need to determine if we're using buckets or not:
    if isinstance(buckets_or_sentence_length, list):
      self.buckets = buckets_or_sentence_length
    else:
      self.max_sentence_length = buckets_or_sentence_length
    
    self.vocab_size = vocab_size
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False) 

    # Summary variables. NOTE: added these.
    # self.summary_op_learning_rate = tf.scalar_summary('learning rate', self.learning_rate)
    # self.summary_op_global_step = tf.scalar_summary('global step', self.global_step)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if num_samples > 0 and num_samples < self.vocab_size:
      with tf.device("/cpu:0"):
        w = tf.get_variable("proj_w", [size, self.vocab_size])
        w_t = tf.transpose(w)
        b = tf.get_variable("proj_b", [self.vocab_size])
      output_projection = (w, b)

      def sampled_loss(inputs, labels):
        with tf.device("/cpu:0"):
          labels = tf.reshape(labels, [-1, 1])
          return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples,
                                            self.vocab_size)
      softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    single_cell = rnn_cell.GRUCell(size)
    if use_lstm:
      single_cell = rnn_cell.BasicLSTMCell(size)
    cell = single_cell
    if num_layers > 1:
      cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)

    # The seq2seq function: we use embedding for the input and attention (if applicable).
    if model_type == 'embedding_attention':
      def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return seq2seq.embedding_attention_seq2seq(encoder_inputs, decoder_inputs, cell, vocab_size, vocab_size, output_projection=output_projection, feed_previous=do_decode)
    else: # fall back to the plain embedding model; unrecognized model types should arguably raise an error instead
      def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return seq2seq.embedding_rnn_seq2seq(encoder_inputs, decoder_inputs, cell, vocab_size, vocab_size, output_projection=output_projection, feed_previous=do_decode)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []

    # NOTE: If the model is not bucketed, these try blocks throw an AttributeError and fall back to building a non-bucketed model.
    try:
      encoder_range = self.buckets[-1][0]
      decoder_range = self.buckets[-1][1]
    except AttributeError:
      encoder_range, decoder_range = self.max_sentence_length, self.max_sentence_length
    
    for i in xrange(encoder_range):  # Last bucket is the biggest one.
      self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                name="encoder{0}".format(i)))
    for i in xrange(decoder_range + 1):
      self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                name="decoder{0}".format(i)))
      self.target_weights.append(tf.placeholder(tf.float32, shape=[None],
                                                name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = [self.decoder_inputs[i + 1]
               for i in xrange(len(self.decoder_inputs) - 1)]

    # Training outputs and losses.
    try:
      if forward_only:
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets,
            self.target_weights, self.buckets, self.vocab_size,
            lambda x, y: seq2seq_f(x, y, True),
            softmax_loss_function=softmax_loss_function)
        # If we use output projection, we need to project outputs for decoding.
        if output_projection is not None:
          for b in xrange(len(self.buckets)):
            self.outputs[b] = [tf.nn.xw_plus_b(output, output_projection[0],
                                               output_projection[1])
                               for output in self.outputs[b]]
      else:
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs, self.decoder_inputs, targets,
            self.target_weights, self.buckets, self.vocab_size,
            lambda x, y: seq2seq_f(x, y, False),
            softmax_loss_function=softmax_loss_function)

    except AttributeError:
      if forward_only:
        self.outputs, self.states = seq2seq_f(self.encoder_inputs, self.decoder_inputs[:-1], True)
        self.losses = seq2seq.sequence_loss(self.outputs, targets, self.target_weights[:-1], self.vocab_size, softmax_loss_function=softmax_loss_function)
        # Project outputs for decoding
        if output_projection is not None:
          self.outputs = [tf.nn.xw_plus_b(output, output_projection[0], output_projection[1]) for output in self.outputs]
      else:
        self.outputs, self.states = seq2seq_f(self.encoder_inputs, self.decoder_inputs[:-1], False)
        self.losses = (seq2seq.sequence_loss(self.outputs, targets, self.target_weights[:-1], self.vocab_size, softmax_loss_function=softmax_loss_function))


    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    self.params = params # Hold onto this for Woz
    if not forward_only:
      self.gradient_norms = []
      self.updates = []
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
      
      try:
        for b in xrange(len(self.buckets)):
          gradients = tf.gradients(self.losses[b], params)
          clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                           max_gradient_norm)
          self.gradient_norms.append(norm)
          self.updates.append(opt.apply_gradients(
              zip(clipped_gradients, params), global_step=self.global_step))
      except AttributeError:
        gradients = tf.gradients(self.losses, params)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients, max_gradient_norm)
        self.gradient_norms = norm
        self.updates = opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)

    self.saver = tf.train.Saver(tf.all_variables())
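
When forward_only is set, the outputs above are already projected back to vocabulary-sized logits, so greedy decoding reduces to an argmax per step. A sketch assuming batch_size 1 and an EOS_ID constant (both assumptions, not defined in the listing):

    # Hypothetical greedy decoding; output_logits is the list of
    # [batch_size, vocab_size] arrays from running model.outputs on one
    # sentence, with batch_size == 1 assumed.
    import numpy as np

    token_ids = [int(np.argmax(logits, axis=1)) for logits in output_logits]
    if EOS_ID in token_ids:
        token_ids = token_ids[:token_ids.index(EOS_ID)]  # cut at end marker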