def __init__(self, source_vocab_size, target_vocab_size, buckets, 
          text_hidden_size, speech_hidden_size, parse_hidden_size,
          text_num_layers, speech_num_layers, parse_num_layers,
          filter_sizes, num_filters, feat_dim, fixed_word_length, 
          embedding_size, max_gradient_norm, batch_size, 
          attn_vec_size, spscale,  
          learning_rate, learning_rate_decay_factor, 
          optimizer, use_lstm=True, output_keep_prob=0.8,
          num_samples=512, forward_only=False):
    """Create the model.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.spscale = spscale
    self.epoch = 0
    self.feat_dim = feat_dim
    self.fixed_word_length = fixed_word_length

    self.filter_sizes = filter_sizes
    self.num_filters = num_filters

    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if num_samples > 0 and num_samples < self.target_vocab_size:
      # The projection maps decoder (parse-cell) outputs to the vocabulary.
      w = tf.get_variable("proj_w", [parse_hidden_size, self.target_vocab_size])
      w_t = tf.transpose(w)
      b = tf.get_variable("proj_b", [self.target_vocab_size])
      output_projection = (w, b)

      def sampled_loss(inputs, labels):
        labels = tf.reshape(labels, [-1, 1])
        return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples,
                self.target_vocab_size)
      softmax_loss_function = sampled_loss
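      # Sampled softmax estimates the full softmax from num_samples negative
      # classes per step, so the training cost scales with num_samples rather
      # than target_vocab_size; full logits are only materialized at decode
      # time via the (w, b) projection below.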

    # Create the internal multi-layer cell for our RNN.
    def create_cell(hidden_size, num_layers):
        single_cell = rnn_cell.GRUCell(hidden_size)
        if use_lstm:
            print("Using LSTM")
            single_cell = rnn_cell.BasicLSTMCell(hidden_size, state_is_tuple=True)
            #single_cell = rnn_cell.BasicLSTMCell(hidden_size)
        if not forward_only:
            # Dropout is always built in during training; pass
            # output_keep_prob=1 to effectively disable it.
            print("Training mode; dropout used!")
            single_cell = rnn_cell.DropoutWrapper(single_cell, 
                    output_keep_prob=output_keep_prob)
        cell = single_cell
        if num_layers > 1:
            cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers, state_is_tuple=True)
            #cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers)
        return cell
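    # For instance, create_cell(256, 2) builds a 2-layer MultiRNNCell of
    # dropout-wrapped LSTM cells in training mode; with forward_only=True the
    # DropoutWrapper is skipped so decoding is deterministic.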

    text_cell = create_cell(text_hidden_size, text_num_layers)
    speech_cell = create_cell(speech_hidden_size, speech_num_layers)
    parse_cell = create_cell(parse_hidden_size, parse_num_layers)

    # The seq2seq function: we use embedding for the input and attention.
    def seq2seq_f(encoder_inputs_list, decoder_inputs, text_len, speech_len, do_decode, attn_vec_size):
      return many2one_seq2seq.many2one_attention_seq2seq(
          encoder_inputs_list, decoder_inputs, 
          text_len, speech_len, feat_dim,  
          text_cell, speech_cell, parse_cell,
          num_encoder_symbols=source_vocab_size,
          num_decoder_symbols=target_vocab_size,
          embedding_size=embedding_size,
          attention_vec_size=attn_vec_size,
          fixed_word_length=fixed_word_length,
          filter_sizes=filter_sizes, 
          num_filters=num_filters,
          output_projection=output_projection,
          feed_previous=do_decode)
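    # The lambdas handed to many2one_model_with_buckets below bind do_decode
    # and attn_vec_size, leaving only the per-bucket inputs (text, decoder,
    # and the two length tensors) as free arguments.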
      
    # Feeds for inputs.
    #self.encoder_inputs = []
    self.text_encoder_inputs = []
    self.speech_encoder_inputs = []
    self.speech_partitions = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
      self.text_encoder_inputs.append(tf.placeholder(tf.int32, 
          shape=[None],name="text_encoder{0}".format(i)))
    for i in xrange(buckets[-1][0]*self.spscale):
      self.speech_encoder_inputs.append(tf.placeholder(tf.float32, 
          shape=[None, fixed_word_length, feat_dim],name="speech_encoder{0}".format(i)))
    for i in xrange(buckets[-1][1]+1):
      self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                name="decoder{0}".format(i)))
      self.target_weights.append(tf.placeholder(tf.float32, shape=[None],
                                                name="weight{0}".format(i)))
    self.encoder_inputs_list = [self.text_encoder_inputs, self.speech_encoder_inputs]
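    # Each placeholder holds one time step for the whole batch; for a bucket
    # with input length I, only text_encoder0..text_encoder{I-1} (and the
    # corresponding I*spscale speech frames) are fed.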

    # Sequence-length tensors.
    _batch_size = tf.shape(self.text_encoder_inputs[0])[0]
    # The constant 2 is a dummy length; it only fixes the shape [batch_size].
    self.text_seq_len = tf.fill(tf.expand_dims(_batch_size, 0), tf.constant(2, dtype=tf.int64))
    self.speech_seq_len = tf.fill(tf.expand_dims(_batch_size, 0), tf.constant(2, dtype=tf.int64))

    # Our targets are decoder inputs shifted by one.
    targets = [self.decoder_inputs[i+1] for i in xrange(len(self.decoder_inputs)-1)]
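    # E.g. decoder_inputs = [GO, w1, w2, EOS] yields targets = [w1, w2, EOS]:
    # at every step the model is trained to emit the next decoder symbol.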

    # Training outputs and losses.
    if forward_only:
      self.outputs, self.losses = many2one_seq2seq.many2one_model_with_buckets(
          self.encoder_inputs_list, self.decoder_inputs, targets,
          self.target_weights, self.text_seq_len, self.speech_seq_len, buckets, 
          lambda x, y, z, w: seq2seq_f(x, y, z, w, True, attn_vec_size),
          softmax_loss_function=softmax_loss_function, spscale=self.spscale)
      # If we use output projection, we need to project outputs for decoding.
      if output_projection is not None:
        for b in xrange(len(buckets)):
          self.outputs[b] = [
              tf.matmul(output, output_projection[0]) + output_projection[1]
              for output in self.outputs[b]
          ]
    else:
      self.outputs, self.losses = many2one_seq2seq.many2one_model_with_buckets(
          self.encoder_inputs_list, self.decoder_inputs, targets,
          self.target_weights, self.text_seq_len, self.speech_seq_len, buckets,
          lambda x, y, z, w: seq2seq_f(x, y, z, w, False, attn_vec_size),
          softmax_loss_function=softmax_loss_function, spscale=self.spscale)

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    if not forward_only:
      self.gradient_norms = []
      self.updates = []
      # The optimizer is selected by the `optimizer` hyperparameter.
      if optimizer == "momentum":
        opt = tf.train.MomentumOptimizer(self.learning_rate, 0.9)
      elif optimizer == "grad_descent":
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
      elif optimizer == "adagrad":
        print("Using adagrad optimizer")
        opt = tf.train.AdagradOptimizer(self.learning_rate)
      else:
        print("Using Adam optimizer")
        opt = tf.train.AdamOptimizer(self.learning_rate)

      for b in xrange(len(buckets)):
        # Aggregate gradients incrementally to reduce peak memory use.
        gradients = tf.gradients(self.losses[b], params,
            aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                         max_gradient_norm)
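        # clip_by_global_norm rescales all gradients jointly by
        # max_gradient_norm / max(global_norm, max_gradient_norm), preserving
        # their relative magnitudes while bounding the overall step size.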
        self.gradient_norms.append(norm)
        self.updates.append(opt.apply_gradients(
            zip(clipped_gradients, params), global_step=self.global_step))

    #self.saver = tf.train.Saver(tf.all_variables())
    self.saver = tf.train.Saver(tf.global_variables())
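
The per-bucket update above is the standard clipped-gradient pattern; a minimal
self-contained sketch of the same idea (TF 1.x graph mode, with a hypothetical
scalar loss standing in for self.losses[b]):

import tensorflow as tf

x = tf.Variable([3.0, 4.0])
loss = tf.reduce_sum(tf.square(x))                    # hypothetical loss
grads = tf.gradients(loss, [x])
clipped, global_norm = tf.clip_by_global_norm(grads, 1.0)
train_op = tf.train.AdamOptimizer(0.01).apply_gradients(zip(clipped, [x]))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    _, n = sess.run([train_op, global_norm])
    print(n)  # 10.0: the raw gradient norm, before clipping to 1.0

Here tf.clip_by_global_norm returns the clipped list plus the pre-clipping
global norm, which the model records in self.gradient_norms for monitoring.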
Example #2
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 hidden_size,
                 num_layers,
                 embedding_size,
                 attn_vec_size,
                 spscale,
                 mfcc_num,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 use_lstm=False,
                 output_keep_prob=0.8,
                 num_samples=512,
                 forward_only=False,
                 dropout=True):
        """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      hidden_size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      attn_vec_size: size of the attention vectors.
      spscale: number of speech frames per text token.
      mfcc_num: dimensionality of each MFCC feature frame.
      forward_only: if set, we do not construct the backward pass in the model.
    """
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if num_samples > 0 and num_samples < self.target_vocab_size:
            w = tf.get_variable("proj_w",
                                [hidden_size, self.target_vocab_size])
            w_t = tf.transpose(w)
            b = tf.get_variable("proj_b", [self.target_vocab_size])
            output_projection = (w, b)

            def sampled_loss(inputs, labels):
                labels = tf.reshape(labels, [-1, 1])
                return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels,
                                                  num_samples,
                                                  self.target_vocab_size)

            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        single_cell = tf.nn.rnn_cell.GRUCell(hidden_size)
        if use_lstm:
            single_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
        if dropout and not forward_only:
            print("Training mode; dropout used!")
            single_cell = tf.nn.rnn_cell.DropoutWrapper(
                single_cell, output_keep_prob=output_keep_prob)
        cell = single_cell
        if num_layers > 1:
            cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers)

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs_list, decoder_inputs, text_len, do_decode,
                      attn_vec_size):
            return many2one_seq2seq.many2one_attention_seq2seq(
                encoder_inputs_list,
                decoder_inputs,
                text_len,
                cell,
                num_encoder_symbols=source_vocab_size,
                num_decoder_symbols=target_vocab_size,
                embedding_size=embedding_size,
                output_projection=output_projection,
                feed_previous=do_decode,
                attention_vec_size=attn_vec_size)

        # Feeds for inputs.
        #self.encoder_inputs = []
        self.text_encoder_inputs = []
        self.speech_encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
            self.text_encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="text_encoder{0}".format(i)))
        for i in xrange(buckets[-1][0] * spscale):
            self.speech_encoder_inputs.append(
                tf.placeholder(tf.float32,
                               shape=[None, mfcc_num],
                               name="speech_encoder{0}".format(i)))
        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight{0}".format(i)))
        self.encoder_inputs_list = [
            self.text_encoder_inputs, self.speech_encoder_inputs
        ]

        # Sequence-length tensor; the constant 2 is a dummy value, as in the
        # first example.
        _batch_size = tf.shape(self.text_encoder_inputs[0])[0]
        self.seq_len = tf.fill(tf.expand_dims(_batch_size, 0),
                               tf.constant(2, dtype=tf.int64))

        # Our targets are decoder inputs shifted by one.
        targets = [
            self.decoder_inputs[i + 1]
            for i in xrange(len(self.decoder_inputs) - 1)
        ]

        # Training outputs and losses.
        if forward_only:
            self.outputs, self.losses = many2one_seq2seq.many2one_model_with_buckets(
                self.encoder_inputs_list,
                self.decoder_inputs,
                targets,
                self.target_weights,
                self.seq_len,
                buckets,
                lambda x, y, z: seq2seq_f(x, y, z, True, attn_vec_size),
                softmax_loss_function=softmax_loss_function,
                spscale=spscale)
            # If we use output projection, we need to project outputs for decoding.
            if output_projection is not None:
                for b in xrange(len(buckets)):
                    self.outputs[b] = [
                        tf.matmul(output, output_projection[0]) +
                        output_projection[1] for output in self.outputs[b]
                    ]
        else:
            self.outputs, self.losses = many2one_seq2seq.many2one_model_with_buckets(
                self.encoder_inputs_list,
                self.decoder_inputs,
                targets,
                self.target_weights,
                self.seq_len,
                buckets,
                lambda x, y, z: seq2seq_f(x, y, z, False, attn_vec_size),
                softmax_loss_function=softmax_loss_function,
                spscale=spscale)

        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            #opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            opt = tf.train.AdagradOptimizer(self.learning_rate)
            for b in xrange(len(buckets)):
                gradients = tf.gradients(self.losses[b], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

        self.saver = tf.train.Saver(tf.all_variables())
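
Both examples share the same externally-triggered learning-rate decay: the rate
lives in a non-trainable variable, and learning_rate_decay_op multiplies it in
place when the caller decides the loss has plateaued. A minimal self-contained
sketch of that mechanism (TF 1.x graph mode, illustrative numbers):

import tensorflow as tf

learning_rate = tf.Variable(0.5, trainable=False)
decay_op = learning_rate.assign(learning_rate * 0.99)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(decay_op)
    print(sess.run(learning_rate))  # 0.495

Because the optimizers above are constructed from this variable, running
decay_op immediately lowers the step size of all subsequent updates.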