Example #1
    def _create_loss(self):
        print('Creating loss... \nIt might take a couple of minutes depending on how many buckets you have.')
        start = time.time()
        def _seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return seq2seq.embedding_attention_seq2seq(
                    encoder_inputs, decoder_inputs, self.cell,
                    num_encoder_symbols=config.ENC_VOCAB,
                    num_decoder_symbols=config.DEC_VOCAB,
                    embedding_size=config.HIDDEN_SIZE,
                    output_projection=self.output_projection,
                    feed_previous=do_decode)

        if self.fw_only:
            self.outputs, self.losses = seq2seq.model_with_buckets(
                                        self.encoder_inputs, 
                                        self.decoder_inputs, 
                                        self.targets,
                                        self.decoder_masks, 
                                        config.BUCKETS, 
                                        lambda x, y: _seq2seq_f(x, y, True),
                                        softmax_loss_function=self.softmax_loss_function)
            # If we use output projection, we need to project outputs for decoding.
            if self.output_projection:
                for bucket in xrange(len(config.BUCKETS)):
                    self.outputs[bucket] = [tf.matmul(output, 
                                            self.output_projection[0]) + self.output_projection[1]
                                            for output in self.outputs[bucket]]
        else:
            self.outputs, self.losses = seq2seq.model_with_buckets(
                                        self.encoder_inputs, 
                                        self.decoder_inputs, 
                                        self.targets,
                                        self.decoder_masks,
                                        config.BUCKETS,
                                        lambda x, y: _seq2seq_f(x, y, False),
                                        softmax_loss_function=self.softmax_loss_function)
        print('Time:', time.time() - start)
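In forward-only mode the example multiplies each decoder output by the output projection to recover full-vocabulary logits, since with sampled softmax the decoder itself emits vectors of size HIDDEN_SIZE rather than DEC_VOCAB. A minimal NumPy sketch of that projection step, with placeholder sizes standing in for config.HIDDEN_SIZE and config.DEC_VOCAB:

import numpy as np

hidden_size, vocab_size = 256, 20000                 # placeholder sizes
decoder_output = np.random.randn(1, hidden_size)     # one decoder time step, batch of 1
proj_w = np.random.randn(hidden_size, vocab_size)    # plays the role of output_projection[0]
proj_b = np.random.randn(vocab_size)                 # plays the role of output_projection[1]
logits = decoder_output.dot(proj_w) + proj_b         # shape (1, vocab_size)
predicted_token = logits.argmax(axis=-1)             # greedy decoding picks the max logit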
Example #2
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 wordEmbedding=None,
                 num_samples=-1,
                 embedding_size=100,
                 forward_only=False,
                 beam_search=False,
                 beam_size=10,
                 category=6,
                 use_emb=False,
                 use_imemory=False,
                 use_ememory=False,
                 emotion_size=100,
                 imemory_size=256,
                 dtype=tf.float32):

        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=dtype)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if num_samples > 0 and num_samples < self.target_vocab_size:
            w_t = tf.get_variable("proj_w", [self.target_vocab_size, size],
                                  dtype=dtype)
            w = tf.transpose(w_t)
            b = tf.get_variable("proj_b", [self.target_vocab_size],
                                dtype=dtype)
            output_projection = (w, b)

            def sampled_loss(inputs, labels):
                labels = tf.reshape(labels, [-1, 1])
                # We need to compute the sampled_softmax_loss using 32bit floats to
                # avoid numerical instabilities.
                local_w_t = tf.cast(w_t, tf.float32)
                local_b = tf.cast(b, tf.float32)
                local_inputs = tf.cast(inputs, tf.float32)
                return tf.cast(
                    tf.nn.sampled_softmax_loss(local_w_t, local_b,
                                               local_inputs, labels,
                                               num_samples,
                                               self.target_vocab_size), dtype)

            softmax_loss_function = sampled_loss
        else:
            w_t = tf.get_variable("proj_w", [self.target_vocab_size, size],
                                  dtype=dtype)
            w = tf.transpose(w_t)
            b = tf.get_variable("proj_b", [self.target_vocab_size],
                                dtype=dtype)
            output_projection = (w, b)

        # Create the internal multi-layer cell for our RNN.
        def create_rnn_cell():
            gr = tf.contrib.rnn.GRUCell(size)
            return gr

        gru = tf.contrib.rnn.GRUCell(size)
        encoder_cell = gru
        if num_layers > 1:
            encoder_cell = tf.contrib.rnn.MultiRNNCell(
                [create_rnn_cell() for _ in range(num_layers)], )
        # Create the internal multi-layer cell for our RNN.
        decoder_cell = encoder_cell
        print('===ok=====')

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs, decoder_emotions,
                      do_decode):
            return seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                decoder_emotions,
                decoder_cell,
                num_encoder_symbols=source_vocab_size,
                num_decoder_symbols=target_vocab_size,
                embedding_size=embedding_size,
                emotion_category=category,
                emotion_size=emotion_size,
                imemory_size=imemory_size,
                wordEmbedding=wordEmbedding,
                use_emb=use_emb,
                use_imemory=use_imemory,
                use_ememory=use_ememory,
                output_projection=output_projection,
                initial_state_attention=True,
                feed_previous=do_decode,
                dtype=dtype,
                beam_search=beam_search,
                beam_size=beam_size)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        self.target_weights1 = []
        for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))
        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(dtype, shape=[None],
                               name="weight{0}".format(i)))
            self.target_weights1.append(
                tf.placeholder(dtype,
                               shape=[None],
                               name="weight1{0}".format(i)))
        # Our targets are decoder inputs shifted by one.
        targets = [
            self.decoder_inputs[i + 1]
            for i in xrange(len(self.decoder_inputs) - 1)
        ]

        self.decoder_emotions = tf.placeholder(tf.int32,
                                               shape=[None],
                                               name="decoder_emotion")

        # Training outputs and losses.
        if forward_only:
            if beam_search:
                self.outputs, self.beam_results, self.beam_symbols, self.beam_parents = seq2seq.decode_model_with_buckets(
                    self.encoder_inputs,
                    self.decoder_inputs,
                    targets,
                    self.target_weights,
                    self.decoder_emotions,
                    buckets,
                    lambda x, y, z: seq2seq_f(x, y, z, True),
                    softmax_loss_function=softmax_loss_function)
            else:
                self.outputs, self.losses, self.ppxes = seq2seq.model_with_buckets(
                    self.encoder_inputs,
                    self.decoder_inputs,
                    targets,
                    self.target_weights,
                    self.target_weights1,
                    self.decoder_emotions,
                    buckets,
                    lambda x, y, z: seq2seq_f(x, y, z, True),
                    softmax_loss_function=softmax_loss_function,
                    use_imemory=use_imemory,
                    use_ememory=use_ememory)
        else:
            self.outputs, self.losses, self.ppxes = seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                self.target_weights1,
                self.decoder_emotions,
                buckets,
                lambda x, y, z: seq2seq_f(x, y, z, False),
                softmax_loss_function=softmax_loss_function,
                use_imemory=use_imemory,
                use_ememory=use_ememory)

        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            for b in xrange(len(buckets)):
                gradients = tf.gradients(self.losses[b], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

        self.pretrain_var = []
        self.initial_var = []
        for i in tf.trainable_variables():
            if 'Emotion' not in i.name and 'emotion' not in i.name and 'memory' not in i.name and 'Memory' not in i.name:
                self.pretrain_var.append(i)
        for i in tf.all_variables():
            if i not in self.pretrain_var:
                self.initial_var.append(i)
        self.pretrain_saver = tf.train.Saver(
            self.pretrain_var, write_version=tf.train.SaverDef.V2)
        self.saver = tf.train.Saver(tf.all_variables(),
                                    write_version=tf.train.SaverDef.V2,
                                    max_to_keep=200)
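The two savers at the end let the emotion-aware model warm-start from a plain seq2seq checkpoint: pretrain_saver covers only variables whose names contain no emotion or memory terms, and everything else is collected in initial_var for fresh initialization. A standalone sketch of that name filter (the variable names below are made up for illustration):

def is_pretrain_variable(name):
    # Mirrors the filter above: exclude emotion/memory-specific parameters.
    blocked = ('Emotion', 'emotion', 'Memory', 'memory')
    return not any(token in name for token in blocked)

names = ['seq2seq/embedding', 'decoder/emotion_embedding', 'decoder/imemory_gate']
print([n for n in names if is_pretrain_variable(n)])  # -> ['seq2seq/embedding']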
Example #3
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 dummy_set,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 fixed_rate,
                 weibo_rate,
                 qa_rate,
                 use_lstm=False,
                 num_samples=512,
                 forward_only=False,
                 scope_name='seq2seq',
                 dtype=tf.float32):

        self.scope_name = scope_name
        with tf.variable_scope(self.scope_name):
            self.source_vocab_size = source_vocab_size
            self.target_vocab_size = target_vocab_size
            self.buckets = buckets
            self.batch_size = batch_size
            self.fixed_rate = fixed_rate
            self.weibo_rate = weibo_rate
            self.qa_rate = qa_rate
            self.learning_rate = tf.Variable(float(learning_rate),
                                             trainable=False,
                                             dtype=dtype)
            self.learning_rate_decay_op = self.learning_rate.assign(
                self.learning_rate * learning_rate_decay_factor)
            self.global_step = tf.Variable(0, trainable=False)
            self.dummy_dialogs = dummy_set

            # If we use sampled softmax, we need an output projection.
            output_projection = None
            softmax_loss_function = None
            # Sampled softmax only makes sense if we sample less than vocabulary size.
            if num_samples > 0 and num_samples < self.target_vocab_size:
                w_t = tf.get_variable("proj_w", [self.target_vocab_size, size],
                                      dtype=dtype)
                w = tf.transpose(w_t)
                b = tf.get_variable("proj_b", [self.target_vocab_size],
                                    dtype=dtype)
                output_projection = (w, b)

                def sampled_loss(inputs, labels):
                    labels = tf.reshape(labels, [-1, 1])
                    # We need to compute the sampled_softmax_loss using 32bit floats to
                    # avoid numerical instabilities.
                    local_w_t = tf.cast(w_t, tf.float32)
                    local_b = tf.cast(b, tf.float32)
                    local_inputs = tf.cast(inputs, tf.float32)
                    return tf.cast(
                        tf.nn.sampled_softmax_loss(local_w_t, local_b,
                                                   local_inputs, labels,
                                                   num_samples,
                                                   self.target_vocab_size),
                        dtype)

                softmax_loss_function = sampled_loss

            # Create the internal multi-layer cell for our RNN.
            single_cell = tf.nn.rnn_cell.GRUCell(size)
            if use_lstm:
                single_cell = tf.nn.rnn_cell.BasicLSTMCell(size)
            cell = single_cell
            if num_layers > 1:
                cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers)

            # The seq2seq function: we use embedding for the input and attention.
            def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
                return rl_seq2seq.embedding_attention_seq2seq(
                    encoder_inputs,
                    decoder_inputs,
                    cell,
                    num_encoder_symbols=source_vocab_size,
                    num_decoder_symbols=target_vocab_size,
                    embedding_size=size,
                    output_projection=output_projection,
                    feed_previous=do_decode,
                    dtype=dtype)

            # Feeds for inputs.
            self.encoder_inputs = []
            self.decoder_inputs = []
            self.target_weights = []
            for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
                self.encoder_inputs.append(
                    tf.placeholder(tf.int32,
                                   shape=[None],
                                   name="encoder{0}".format(i)))
            for i in xrange(buckets[-1][1] + 1):
                self.decoder_inputs.append(
                    tf.placeholder(tf.int32,
                                   shape=[None],
                                   name="decoder{0}".format(i)))
                self.target_weights.append(
                    tf.placeholder(dtype,
                                   shape=[None],
                                   name="weight{0}".format(i)))

            # Our targets are decoder inputs shifted by one.
            targets = [
                self.decoder_inputs[i + 1]
                for i in xrange(len(self.decoder_inputs) - 1)
            ]

            # for reinforcement learning
            self.force_dec_input = tf.placeholder(tf.bool,
                                                  name="force_dec_input")
            self.en_output_proj = tf.placeholder(tf.bool,
                                                 name="en_output_proj")
            # Training outputs and losses.
            #if forward_only:
            self.outputs, self.losses, self.encoder_state = rl_seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y: seq2seq_f(
                    x, y, tf.select(self.force_dec_input, False, True)),
                softmax_loss_function=softmax_loss_function)
            # If we use output projection, we need to project outputs for decoding.
            #if output_projection is not None:
            for b in xrange(len(buckets)):
                self.outputs[b] = [
                    control_flow_ops.cond(
                        self.en_output_proj,
                        lambda: tf.matmul(output, output_projection[
                            0]) + output_projection[1], lambda: output)
                    for output in self.outputs[b]
                ]

            # Gradients and SGD update operation for training the model.
            self.tvars = tf.trainable_variables()
            #if not forward_only:
            self.gradient_norms = []
            self.updates = []
            self.advantage = [
                tf.placeholder(tf.float32, name="advantage_%i" % i)
                for i in range(len(buckets))
            ]
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            for b in xrange(len(buckets)):
                adjusted_losses = tf.sub(self.losses[b], self.advantage[b])
                gradients = tf.gradients(adjusted_losses, self.tvars)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, self.tvars),
                                        global_step=self.global_step))

            # self.saver = tf.train.Saver(tf.all_variables())
            all_variables = [
                k for k in tf.global_variables()
                if k.name.startswith(self.scope_name)
            ]
            self.saver = tf.train.Saver(all_variables)
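Before the gradient step this variant subtracts a per-bucket advantage placeholder from the loss, which is what turns the plain SGD update into a policy-gradient style update for reinforcement learning. A toy sketch of the arithmetic, where reading the advantage as reward minus a baseline is an assumption rather than something shown in the example:

seq_loss = 2.3        # cross-entropy of a sampled reply (illustrative)
reward = 1.5          # dialogue reward for that reply (assumed)
baseline = 0.5        # running-average baseline (assumed)
advantage = reward - baseline
adjusted_loss = seq_loss - advantage   # the quantity tf.sub(self.losses[b], self.advantage[b]) computes
print(adjusted_loss)                   # 1.3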
Example #4
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 embedding_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 use_lstm=False,
                 num_samples=2048,
                 forward_only=False,
                 dtype=tf.float32):
        """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the model.
    """
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if num_samples > 0 and num_samples < self.target_vocab_size:
            w = tf.get_variable("proj_w", [size, self.target_vocab_size])
            w_t = tf.transpose(w)
            b = tf.get_variable("proj_b", [self.target_vocab_size])
            output_projection = (w, b)

            def sampled_loss(inputs, labels):
                labels = tf.reshape(labels, [-1, 1])
                return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels,
                                                  num_samples,
                                                  self.target_vocab_size)

            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        single_cell = tf.nn.rnn_cell.GRUCell(size)
        if use_lstm:
            single_cell = tf.nn.rnn_cell.BasicLSTMCell(size,
                                                       state_is_tuple=True)
        cell = single_cell
        if num_layers > 1:
            cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers,
                                               state_is_tuple=True)

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            #print(do_decode[0].dtype)
            return seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                do_decode,
                num_encoder_symbols=source_vocab_size,
                num_decoder_symbols=target_vocab_size,
                embedding_size=embedding_size,
                output_projection=output_projection,
                dtype=dtype)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        self.decode = []
        for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder_{0}".format(i)))
        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder_{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight_{0}".format(i)))
            self.decode.append(
                tf.placeholder(tf.bool, name='decode_{0}'.format(i)))
        #self.iteration = tf.placeholder(tf.float32)
        #self.eps = exp_decay(self.iteration)
        #self.decode = sampling(self.eps, self.iteration, buckets[-1][1]+1)
        #self.decode = tf.placeholder(tf.bool, shape=[buckets[-1][1]+1], name='decode')
        # Our targets are decoder inputs shifted by one.
        targets = [
            self.decoder_inputs[i + 1]
            for i in xrange(len(self.decoder_inputs) - 1)
        ]

        # Training outputs and losses.
        if forward_only:
            self.states, self.outputs, self.losses = seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y: seq2seq_f(x, y, self.decode),
                softmax_loss_function=softmax_loss_function)
            # If we use output projection, we need to project outputs for decoding.
            if output_projection is not None:
                for b in xrange(len(buckets)):
                    self.outputs[b] = [
                        tf.matmul(output, output_projection[0]) +
                        output_projection[1] for output in self.outputs[b]
                    ]
            for b in xrange(len(buckets)):
                self.outputs[b] = [
                    tf.nn.log_softmax(output) for output in self.outputs[b]
                ]
        else:
            self.states, self.outputs, self.losses = seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y: seq2seq_f(x, y, self.decode),
                softmax_loss_function=softmax_loss_function)

        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            for b in xrange(len(buckets)):
                gradients = tf.gradients(self.losses[b], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

        self.saver = tf.train.Saver(tf.all_variables())
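The docstring's bucketing scheme is easy to illustrate with a short helper: a (source, target) pair goes into the first bucket whose limits it fits, and pairs longer than the last bucket have to be truncated or dropped. A minimal sketch using the docstring's own example buckets:

def pick_bucket(buckets, source_len, target_len):
    # Return the index of the smallest bucket that fits the pair, or None.
    for i, (encoder_size, decoder_size) in enumerate(buckets):
        if source_len <= encoder_size and target_len <= decoder_size:
            return i
    return None

buckets = [(2, 4), (8, 16)]
print(pick_bucket(buckets, 1, 3))    # -> 0
print(pick_bucket(buckets, 5, 10))   # -> 1
print(pick_bucket(buckets, 20, 3))   # -> None (too long for every bucket)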
Example #5
    def __init__(self, encoder_masks, encoder_inputs_tensor, decoder_inputs,
                 target_weights, target_vocab_size, buckets,
                 target_embedding_size, attn_num_layers, attn_num_hidden,
                 forward_only, use_gru):
        """Create the model.

        Args:
          source_vocab_size: size of the source vocabulary.
          target_vocab_size: size of the target vocabulary.
          buckets: a list of pairs (I, O), where I specifies maximum input length
            that will be processed in that bucket, and O specifies maximum output
            length. Training instances that have inputs longer than I or outputs
            longer than O will be pushed to the next bucket and padded accordingly.
            We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
          size: number of units in each layer of the model.
          num_layers: number of layers in the model.
          max_gradient_norm: gradients will be clipped to maximally this norm.
          learning_rate: learning rate to start with.
          learning_rate_decay_factor: decay learning rate by this much when needed.
          use_lstm: if true, we use LSTM cells instead of GRU cells.
          num_samples: number of samples for sampled softmax.
          forward_only: if set, we do not construct the backward pass in the model.
        """
        self.encoder_inputs_tensor = encoder_inputs_tensor
        self.decoder_inputs = decoder_inputs
        self.target_weights = target_weights
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.encoder_masks = encoder_masks

        # Create the internal multi-layer cell for our RNN.
        single_cell = tf.contrib.rnn.BasicLSTMCell(attn_num_hidden,
                                                   forget_bias=0.0,
                                                   state_is_tuple=False)
        if use_gru:
            print("using GRU CELL in decoder")
            single_cell = tf.contrib.rnn.GRUCell(attn_num_hidden)
        cell = single_cell

        if attn_num_layers > 1:
            cell = tf.contrib.rnn.MultiRNNCell([single_cell] * attn_num_layers,
                                               state_is_tuple=False)

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(lstm_inputs, decoder_inputs, seq_length, do_decode):

            num_hidden = attn_num_layers * attn_num_hidden
            lstm_fw_cell = tf.contrib.rnn.BasicLSTMCell(num_hidden,
                                                        forget_bias=0.0,
                                                        state_is_tuple=False)
            # Backward direction cell
            lstm_bw_cell = tf.contrib.rnn.BasicLSTMCell(num_hidden,
                                                        forget_bias=0.0,
                                                        state_is_tuple=False)

            pre_encoder_inputs, output_state_fw, output_state_bw = tf.contrib.rnn.static_bidirectional_rnn(
                lstm_fw_cell,
                lstm_bw_cell,
                lstm_inputs,
                initial_state_fw=None,
                initial_state_bw=None,
                dtype=tf.float32,
                sequence_length=None,
                scope=None)

            encoder_inputs = [
                e * f
                for e, f in zip(pre_encoder_inputs, encoder_masks[:seq_length])
            ]
            top_states = [
                array_ops.reshape(e, [-1, 1, num_hidden * 2])
                for e in encoder_inputs
            ]
            attention_states = array_ops.concat(top_states, 1)
            initial_state = tf.concat(
                axis=1, values=[output_state_fw, output_state_bw])
            outputs, _, attention_weights_history = embedding_attention_decoder(
                decoder_inputs,
                initial_state,
                attention_states,
                cell,
                num_symbols=target_vocab_size,
                embedding_size=target_embedding_size,
                num_heads=1,
                output_size=target_vocab_size,
                output_projection=None,
                feed_previous=do_decode,
                initial_state_attention=False,
                attn_num_hidden=attn_num_hidden)
            return outputs, attention_weights_history

        # Our targets are decoder inputs shifted by one.
        targets = [
            decoder_inputs[i + 1] for i in xrange(len(decoder_inputs) - 1)
        ]

        softmax_loss_function = None  # default to tf.nn.sparse_softmax_cross_entropy_with_logits

        # Training outputs and losses.
        if forward_only:
            self.output, self.loss, self.attention_weights_history = model_with_buckets(
                encoder_inputs_tensor,
                decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y, z: seq2seq_f(x, y, z, True),
                softmax_loss_function=softmax_loss_function)
        else:
            self.output, self.loss, self.attention_weights_history = model_with_buckets(
                encoder_inputs_tensor,
                decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y, z: seq2seq_f(x, y, z, False),
                softmax_loss_function=softmax_loss_function)
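Here the encoder is a bidirectional LSTM, so every time step yields a vector of size 2 * num_hidden; the reshape and concat above simply stack those per-step vectors into an attention_states tensor of shape [batch, seq_length, 2 * num_hidden] for the attention decoder. A small NumPy sketch of that packing, with illustrative sizes:

import numpy as np

batch, seq_length, num_hidden = 4, 7, 128
# One [batch, 2 * num_hidden] output per encoder time step, as the bidirectional RNN returns.
encoder_outputs = [np.zeros((batch, 2 * num_hidden)) for _ in range(seq_length)]
top_states = [e.reshape(-1, 1, 2 * num_hidden) for e in encoder_outputs]
attention_states = np.concatenate(top_states, axis=1)
print(attention_states.shape)   # (4, 7, 256)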
Example #6
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 use_lstm=False,
                 num_samples=512,
                 forward_only=False,
                 config=None,
                 corrective_tokens_mask=None):
        """Create the model.

        Args:
          source_vocab_size: size of the source vocabulary.
          target_vocab_size: size of the target vocabulary.
          buckets: a list of pairs (I, O), where I specifies maximum input
            length that will be processed in that bucket, and O specifies
            maximum output length. Training instances that have inputs longer
            than I or outputs longer than O will be pushed to the next bucket and
            padded accordingly. We assume that the list is sorted, e.g., [(2,
            4), (8, 16)].
          size: number of units in each layer of the model.
          num_layers: number of layers in the model.
          max_gradient_norm: gradients will be clipped to maximally this norm.
          batch_size: the size of the batches used during training;
            the model construction is independent of batch_size, so it can be
            changed after initialization if this is convenient, e.g.,
            for decoding.
          learning_rate: learning rate to start with.
          learning_rate_decay_factor: decay learning rate by this much when
            needed.
          use_lstm: if true, we use LSTM cells instead of GRU cells.
          num_samples: number of samples for sampled softmax.
          forward_only: if set, we do not construct the backward pass in the
            model.
        """
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)
        self.config = config

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in range(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))
        for i in range(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight{0}".format(i)))

        # One hot encoding of corrective tokens.
        corrective_tokens_tensor = tf.constant(
            corrective_tokens_mask
            if corrective_tokens_mask else np.zeros(self.target_vocab_size),
            shape=[self.target_vocab_size],
            dtype=tf.float32)
        batched_corrective_tokens = tf.stack([corrective_tokens_tensor] *
                                             self.batch_size)
        self.batch_corrective_tokens_mask = batch_corrective_tokens_mask = \
            tf.placeholder(tf.float32,
                           shape=[None, None],
                           name="corrective_tokens")

        # Our targets are decoder inputs shifted by one.
        targets = [
            self.decoder_inputs[i + 1]
            for i in range(len(self.decoder_inputs) - 1)
        ]
        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary
        # size.
        if num_samples > 0 and num_samples < self.target_vocab_size:
            w = tf.get_variable("proj_w", [size, self.target_vocab_size])
            w_t = tf.transpose(w)
            b = tf.get_variable("proj_b", [self.target_vocab_size])

            output_projection = (w, b)

            def sampled_loss(labels, inputs):
                labels = tf.reshape(labels, [-1, 1])
                return tf.nn.sampled_softmax_loss(w_t, b, labels, inputs,
                                                  num_samples,
                                                  self.target_vocab_size)

            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        single_cell = core_rnn_cell_impl.GRUCell(size)
        if use_lstm:
            single_cell = core_rnn_cell_impl.BasicLSTMCell(size)
        cell = single_cell
        if num_layers > 1:
            cell = core_rnn_cell_impl.MultiRNNCell([single_cell] * num_layers)

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            """

            :param encoder_inputs: list of length equal to the input bucket
            length of 1-D tensors (of length equal to the batch size) whose
            elements consist of the token index of each sample in the batch
            at a given index in the input.
            :param decoder_inputs:
            :param do_decode:
            :return:
            """

            if do_decode:
                # Modify bias here to bias the model towards selecting words
                # present in the input sentence.
                input_bias = self.build_input_bias(
                    encoder_inputs, batch_corrective_tokens_mask)

                # Redefined seq2seq to allow for the injection of a special
                # decoding function that applies the input bias when choosing
                # the next decoder input.
                return seq2seq.embedding_attention_seq2seq(
                    encoder_inputs,
                    decoder_inputs,
                    cell,
                    num_encoder_symbols=source_vocab_size,
                    num_decoder_symbols=target_vocab_size,
                    embedding_size=size,
                    output_projection=output_projection,
                    feed_previous=do_decode,
                    loop_fn_factory=
                    apply_input_bias_and_extract_argmax_fn_factory(input_bias))
            else:
                return seq2seq.embedding_attention_seq2seq(
                    encoder_inputs,
                    decoder_inputs,
                    cell,
                    num_encoder_symbols=source_vocab_size,
                    num_decoder_symbols=target_vocab_size,
                    embedding_size=size,
                    output_projection=output_projection,
                    feed_previous=do_decode)

        # Training outputs and losses.
        if forward_only:
            self.outputs, self.losses = seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y: seq2seq_f(x, y, True),
                softmax_loss_function=softmax_loss_function)

            if output_projection is not None:
                for b in range(len(buckets)):
                    # We need to apply the same input bias used during model
                    # evaluation when decoding.
                    input_bias = self.build_input_bias(
                        self.encoder_inputs[:buckets[b][0]],
                        batch_corrective_tokens_mask)
                    self.outputs[b] = [
                        project_and_apply_input_bias(output, output_projection,
                                                     input_bias)
                        for output in self.outputs[b]
                    ]
        else:
            self.outputs, self.losses = seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y: seq2seq_f(x, y, False),
                softmax_loss_function=softmax_loss_function)

        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.RMSPropOptimizer(0.001) if self.config.use_rms_prop \
                else tf.train.GradientDescentOptimizer(self.learning_rate)
            # opt = tf.train.AdamOptimizer()

            for b in range(len(buckets)):
                gradients = tf.gradients(self.losses[b], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

        self.saver = tf.train.Saver(tf.global_variables())
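corrective_tokens_mask here is a vocabulary-sized indicator vector marking tokens the biased decoder is allowed to emit in addition to words taken from the input sentence (the exact use lives in build_input_bias, which is not shown); when it is absent the code falls back to an all-zero mask. A minimal sketch of building such a mask, with hypothetical token ids:

import numpy as np

target_vocab_size = 100
corrective_token_ids = [5, 17, 42]   # hypothetical ids of tokens to favor
corrective_tokens_mask = np.zeros(target_vocab_size, dtype=np.float32)
corrective_tokens_mask[corrective_token_ids] = 1.0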
Example #7
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 use_lstm=False,
                 num_samples=512,
                 forward_only=False,
                 dtype=tf.float32):
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=dtype)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        output_projection = None
        softmax_loss_function = None
        if num_samples > 0 and num_samples < self.target_vocab_size:
            w_t = tf.get_variable("proj_w", [self.target_vocab_size, size],
                                  dtype=dtype)
            w = tf.transpose(w_t)
            b = tf.get_variable("proj_b", [self.target_vocab_size],
                                dtype=dtype)
            output_projection = (w, b)

            def sampled_loss(labels, logits):
                labels = tf.reshape(labels, [-1, 1])
                local_w_t = tf.cast(w_t, tf.float32)
                local_b = tf.cast(b, tf.float32)
                local_inputs = tf.cast(logits, tf.float32)
                return tf.cast(
                    tf.nn.sampled_softmax_loss(
                        weights=local_w_t,
                        biases=local_b,
                        labels=labels,
                        inputs=local_inputs,
                        num_sampled=num_samples,
                        num_classes=self.target_vocab_size), dtype)

            softmax_loss_function = sampled_loss

        def single_cell():
            return tf.contrib.rnn.GRUCell(size)

        if use_lstm:

            def single_cell():
                return tf.contrib.rnn.BasicLSTMCell(size)

        cell = single_cell()
        encoder_cell = single_cell()
        if num_layers > 1:
            cell = tf.contrib.rnn.MultiRNNCell(
                [single_cell() for _ in range(num_layers)])
            encoder_cell = tf.contrib.rnn.MultiRNNCell(
                [single_cell() for _ in range(num_layers)])

        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                encoder_cell,
                cell,
                num_encoder_symbols=source_vocab_size,
                num_decoder_symbols=target_vocab_size,
                embedding_size=size,
                output_projection=output_projection,
                feed_previous=do_decode,
                dtype=dtype)

        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))
        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(dtype, shape=[None],
                               name="weight{0}".format(i)))

        targets = [
            self.decoder_inputs[i + 1]
            for i in xrange(len(self.decoder_inputs) - 1)
        ]

        if forward_only:
            self.outputs, self.losses = seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y: seq2seq_f(x, y, True),
                softmax_loss_function=softmax_loss_function)
            # If we use output projection, we need to project outputs for decoding.
            if output_projection is not None:
                for b in xrange(len(buckets)):
                    self.outputs[b] = [
                        tf.matmul(output, output_projection[0]) +
                        output_projection[1] for output in self.outputs[b]
                    ]
        else:
            self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y: seq2seq_f(x, y, False),
                softmax_loss_function=softmax_loss_function)

        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            for b in xrange(len(buckets)):
                gradients = tf.gradients(self.losses[b], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

        self.saver = tf.train.Saver(tf.global_variables())
Example #8
    def __init__(self,
                 source_target_vocab_size,
                 buckets,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 scheduling_rate,
                 scheduling_rate_decay_factor,
                 num_samples=4096,
                 forward_only=False):
        """Create the model.

    Args:
      source_target_vocab_size: size of the source/target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      scheduling_rate:
      scheduling_rate_decay_factor:
      forward_only: if set, we do not construct the backward pass in the model.
    """
        self.source_target_vocab_size = source_target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)

        self.scheduling_rate = tf.Variable(float(scheduling_rate),
                                           trainable=False)
        self.scheduling_rate_decay_op = self.scheduling_rate.assign(
            self.scheduling_rate * scheduling_rate_decay_factor)

        self.global_step = tf.Variable(0, trainable=False)

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if num_samples > 0 and num_samples < self.source_target_vocab_size:
            with tf.device("/cpu:0"):
                w = tf.get_variable("proj_w",
                                    [size, self.source_target_vocab_size])
                w_t = tf.transpose(w)
                b = tf.get_variable("proj_b", [self.source_target_vocab_size])
            output_projection = (w, b)

            def sampled_loss(inputs, labels):
                with tf.device("/cpu:0"):
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(
                        w_t, b, inputs, labels, num_samples,
                        self.source_target_vocab_size)

            softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        single_cell = tf.nn.rnn_cell.BasicLSTMCell(size)
        cell = single_cell
        if num_layers > 1:
            cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers)

        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                num_encoder_symbols=source_target_vocab_size,
                num_decoder_symbols=source_target_vocab_size,
                embedding_size=size,
                output_projection=output_projection,
                feed_previous=do_decode,
                scheduling_rate=self.scheduling_rate)

        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []

        for i in xrange(buckets[-1][0]):
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))

        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight{0}".format(i)))

        targets = [
            self.decoder_inputs[i + 1]
            for i in xrange(len(self.decoder_inputs) - 1)
        ]

        if forward_only:
            self.outputs, self.losses = seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y: seq2seq_f(x, y, True),
                softmax_loss_function=softmax_loss_function)
            print("i'm in here")
            if output_projection is not None:
                for b in xrange(len(buckets)):
                    self.outputs[b] = [
                        tf.matmul(output, output_projection[0]) +
                        output_projection[1] for output in self.outputs[b]
                    ]
        else:
            self.outputs, self.losses = seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y: seq2seq_f(x, y, False),
                softmax_loss_function=softmax_loss_function)

        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            for b in xrange(len(buckets)):
                gradients = tf.gradients(self.losses[b], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

        self.saver = tf.train.Saver(tf.all_variables())
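The scheduling_rate variable and its decay op point to scheduled sampling: during training the decoder is fed the ground-truth previous token with a probability tied to scheduling_rate, and its own previous prediction otherwise, with the probability lowered over time. The modified embedding_attention_seq2seq that consumes scheduling_rate is not shown, so the following is only a toy sketch of that coin flip:

import random

scheduling_rate = 0.9   # start mostly teacher-forced; decayed over training (assumption)

def next_decoder_input(ground_truth_token, predicted_token):
    # Scheduled sampling: occasionally feed back the model's own output.
    if random.random() < scheduling_rate:
        return ground_truth_token    # teacher forcing
    return predicted_token           # model's previous prediction

print(next_decoder_input('cat', 'dog'))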
Example #9
  def __init__(self,
               vocab_size,
               buckets,
               size,
               num_layers,
               batch_size,
               mode):
    
    self.vocab_size = vocab_size
    self.buckets = buckets
    # units of rnn cell
    self.size = size
    # dimension of words
    self.num_layers = num_layers
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(0.5, trainable=False)
    self.mode = mode
    self.dummy_reply = ["what ?", "yeah .", "you are welcome ! ! ! !"]

    # learning rate decay
    self.learning_rate_decay = self.learning_rate.assign(self.learning_rate * 0.99) 

    # input for Reinforcement part
    self.loop_or_not = tf.placeholder(tf.bool)
    self.reward = tf.placeholder(tf.float32, [None])
    batch_reward = tf.stop_gradient(self.reward)
    self.RL_index = [None for _ in self.buckets]

    # projection function
    w_t = tf.get_variable('proj_w', [self.vocab_size, self.size])
    w = tf.transpose(w_t)
    b = tf.get_variable('proj_b', [self.vocab_size])
    output_projection = (w, b)

    def sample_loss(labels, inputs):
      labels = tf.reshape(labels, [-1, 1])
      local_w_t = tf.cast(w_t, tf.float32)
      local_b = tf.cast(b, tf.float32)
      local_inputs = tf.cast(inputs, tf.float32)
      return tf.cast(tf.nn.sampled_softmax_loss(weights = local_w_t,
                                                biases = local_b,
                                                inputs = local_inputs,
                                                labels = labels,
                                                num_sampled = 512,
                                                num_classes = self.vocab_size),
                                                dtype = tf.float32)
    softmax_loss_function = sample_loss

    #FIXME add RL function
    def seq2seq_multi(encoder_inputs, decoder_inputs, mode):
      embedding = tf.get_variable("embedding", [self.vocab_size, self.size])
      loop_function_RL = None
      if mode == 'MLE':
        feed_previous = False
      elif mode == 'TEST':
        feed_previous = True
      # need loop_function
      elif mode == 'RL':
        feed_previous = True

        def loop_function_RL(prev, i):
          prev = tf.matmul(prev, output_projection[0]) + output_projection[1]
          prev_index = tf.multinomial(tf.log(tf.nn.softmax(prev)), 1)
          
          if i == 1:
            for index, RL in enumerate(self.RL_index):
              if RL is None:
                self.RL_index[index] = prev_index
                self.index = index
                break
          else:
            self.RL_index[self.index] = tf.concat([self.RL_index[self.index], prev_index], axis = 1)
          prev_index = tf.reshape(prev_index, [-1])
          # decide which to be the next time step input
          sample = tf.nn.embedding_lookup(embedding, prev_index)
          from_decoder = tf.nn.embedding_lookup(embedding, decoder_inputs[i])

          return tf.where(self.loop_or_not, sample, from_decoder)

      return seq2seq.embedding_attention_seq2seq(
             encoder_inputs,
             decoder_inputs,
             cell,
             num_encoder_symbols = self.vocab_size,
             num_decoder_symbols = self.vocab_size,
             embedding_size = self.size,
             output_projection = output_projection,
             feed_previous = feed_previous,
             dtype = tf.float32,
             embedding = embedding,
             loop = loop_function_RL)
    
    # inputs
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []

    for i in xrange(buckets[-1][0]):
      self.encoder_inputs.append(tf.placeholder(tf.int32, shape = [None],
                                                name = 'encoder{0}'.format(i)))
    for i in xrange(buckets[-1][1] + 1):
      self.decoder_inputs.append(tf.placeholder(tf.int32, shape = [None],
                                                name = 'decoder{0}'.format(i)))
      self.target_weights.append(tf.placeholder(tf.float32, shape = [None],
                                                name = 'weight{0}'.format(i)))
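    # Our targets are the decoder inputs shifted by one.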
    targets = [self.decoder_inputs[i + 1] for i in xrange(len(self.decoder_inputs) - 1)]

    def single_cell():
      return tf.contrib.rnn.GRUCell(self.size)
    cell = single_cell()
    if self.num_layers > 1:
      cell = tf.contrib.rnn.MultiRNNCell([single_cell() for _ in range(self.num_layers)])

    if self.mode == 'MLE':
      self.outputs, self.losses = seq2seq.model_with_buckets(
           self.encoder_inputs, self.decoder_inputs, targets,
           self.target_weights, self.buckets, lambda x, y: seq2seq_multi(x, y, self.mode),
           softmax_loss_function = softmax_loss_function)
      
      for b in xrange(len(self.buckets)):
        self.outputs[b] = [tf.matmul(output, output_projection[0]) + output_projection[1]
                           for output in self.outputs[b]]

      self.update = []
      optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
      for b in xrange(len(self.buckets)):
        gradients = tf.gradients(self.losses[b], tf.trainable_variables()) 
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
        self.update.append(optimizer.apply_gradients(zip(clipped_gradients, tf.trainable_variables())))

    elif self.mode == 'TEST':

      self.outputs, self.losses = seq2seq.model_with_buckets(
           self.encoder_inputs, self.decoder_inputs, targets,
           self.target_weights, self.buckets, lambda x, y: seq2seq_multi(x, y, self.mode),
           softmax_loss_function = softmax_loss_function)
    
      for b in xrange(len(self.buckets)):
        self.outputs[b] = [tf.matmul(output, output_projection[0]) + output_projection[1]
                           for output in self.outputs[b]]

    elif self.mode == 'RL':

      self.outputs, self.losses = seq2seq.model_with_buckets(
           self.encoder_inputs, self.decoder_inputs, targets,
           self.target_weights, self.buckets, lambda x, y: seq2seq_multi(x, y, self.mode),
           softmax_loss_function = softmax_loss_function, per_example_loss = True)
    
      for b in xrange(len(self.buckets)):
        self.outputs[b] = [tf.matmul(output, output_projection[0]) + output_projection[1]
                           for output in self.outputs[b]]

      for i, b in enumerate(self.outputs):
        prev_index = tf.multinomial(tf.log(tf.nn.softmax(b[self.buckets[i][1] - 1])), 1)
        self.RL_index[i] = tf.concat([self.RL_index[i], prev_index], axis = 1)

      self.update = []
      optimizer = tf.train.GradientDescentOptimizer(0.01)
      #optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
      for b in xrange(len(self.buckets)):
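        # REINFORCE-style update: scale each example's sequence loss by its
        # (stop-gradient) reward before averaging over the batch.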
        scaled_loss = tf.multiply(self.losses[b], batch_reward)
        self.losses[b] = tf.reduce_mean(scaled_loss)
        gradients = tf.gradients(self.losses[b], tf.trainable_variables())
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
        self.update.append(optimizer.apply_gradients(zip(clipped_gradients, tf.trainable_variables())))

    # specify saver
    self.saver = tf.train.Saver(max_to_keep = 2)
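
A rough sketch of how the RL branch above might be exercised, assuming an already-initialized sess, a feed dict that already holds the encoder/decoder placeholders for the chosen bucket_id, and a hypothetical compute_reward function; only reward, loop_or_not, RL_index, update and losses come from the class itself. Note that the two sess.run calls each re-sample the decoder, so a real training loop would have to account for that.

# Hedged sketch only; compute_reward, feed and bucket_id are application-specific.
feed[model.loop_or_not] = True                      # feed back sampled tokens, not ground truth
sampled_ids = sess.run(model.RL_index[bucket_id], feed_dict=feed)
feed[model.reward] = compute_reward(sampled_ids)    # e.g. penalize dull replies
_, rl_loss = sess.run([model.update[bucket_id], model.losses[bucket_id]],
                      feed_dict=feed)
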
예제 #10
0
  def __init__(self, source_vocab_size, target_vocab_size, buckets, size, state_size,
               num_layers, max_gradient_norm, batch_size, learning_rate,
               learning_rate_decay_factor, keep_prob=1.0, forward_only=False):
    """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      state_size: size of environment representation.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      keep_prob: probability of keeping a unit (i.e., 1 - dropout rate; 1.0 disables dropout).
      forward_only: if set, we do not construct the backward pass in the model.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.state_size = state_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # Create the internal multi-layer cell for our RNN.
    cell = rnn_cell.BasicLSTMCell(size)
    if keep_prob < 1.0 and (not forward_only):
      cell = rnn_cell.DropoutWrapper(cell, output_keep_prob=keep_prob)
    if num_layers > 1:
      cell = rnn_cell.MultiRNNCell([cell] * num_layers)

    # The seq2seq function: we use embedding for the input and attention.
    # define the seq2seq model
    def seq2seq_f(encoder_inputs, decoder_inputs, decoder_inputs_positions, 
      decoder_inputs_maps, do_decode):
      return seq2seq.embedding_attention_seq2seq(
          encoder_inputs, decoder_inputs, cell, source_vocab_size, 
          target_vocab_size, batch_size, self.state_size,
          decoder_inputs_positions=decoder_inputs_positions,
          decoder_inputs_maps=decoder_inputs_maps, feed_previous=do_decode)


    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    self.decoder_inputs_positions = []
    for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
      self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[self.batch_size],
                                                name="encoder{0}".format(i)))
    for i in xrange(buckets[-1][1] + 1):
      self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[self.batch_size],
                                                name="decoder{0}".format(i)))
      self.target_weights.append(tf.placeholder(tf.float32, shape=[self.batch_size],
                                                name="weight{0}".format(i)))
      self.decoder_inputs_positions.append(tf.placeholder(tf.int32, shape=[self.batch_size, 3],
                                                name="position{0}".format(i)))
    
    self.decoder_inputs_maps = tf.placeholder(tf.int32, shape=[self.batch_size], name="mapNo")

    # Our targets are decoder inputs shifted by one.
    targets = [self.decoder_inputs[i + 1]
               for i in xrange(len(self.decoder_inputs) - 1)]


    # Training outputs and losses.
    if forward_only:
      self.outputs, self.losses, self.attentions, self.environments, self.positions = seq2seq.model_with_buckets(
          self.encoder_inputs, self.decoder_inputs, targets,
          self.target_weights, buckets, self.target_vocab_size,
          lambda x, y, p, m: seq2seq_f(x, y, p, m, True),
          decoder_inputs_positions=self.decoder_inputs_positions, decoder_inputs_maps=self.decoder_inputs_maps)
    else:
      self.positions = None
      self.outputs, self.losses, self.attentions, self.environments, _ = seq2seq.model_with_buckets(
          self.encoder_inputs, self.decoder_inputs, targets,
          self.target_weights, buckets, self.target_vocab_size,
          lambda x, y, p, m: seq2seq_f(x, y, p, m, False),
          decoder_inputs_positions=self.decoder_inputs_positions, decoder_inputs_maps=self.decoder_inputs_maps)

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()
    if not forward_only:
      self.gradient_norms = []
      self.updates = []
      # opt = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate)
      opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
      for b in xrange(len(buckets)):
        gradients = tf.gradients(self.losses[b], params)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                         max_gradient_norm)
        self.gradient_norms.append(norm)
        self.updates.append(opt.apply_gradients(
            zip(clipped_gradients, params), global_step=self.global_step))

    self.saver = tf.train.Saver(tf.all_variables())
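
The bucketing scheme described in the docstring above ("a list of pairs (I, O) ... pushed to the next bucket and padded accordingly") can be illustrated with a small stand-alone sketch. PAD_ID and the reversal of encoder inputs are the usual conventions in this family of models, not values taken from the snippet itself:

# Hedged illustration of bucket selection and padding (PAD_ID is hypothetical).
PAD_ID = 0
buckets = [(5, 10), (10, 15), (20, 25)]

def assign_and_pad(source_ids, target_ids):
    for bucket_id, (enc_size, dec_size) in enumerate(buckets):
        if len(source_ids) <= enc_size and len(target_ids) < dec_size:
            # encoder inputs are left-padded (and usually reversed),
            # decoder targets are right-padded
            enc = [PAD_ID] * (enc_size - len(source_ids)) + list(reversed(source_ids))
            dec = list(target_ids) + [PAD_ID] * (dec_size - len(target_ids))
            return bucket_id, enc, dec
    raise ValueError("sequence longer than the largest bucket")

print(assign_and_pad([4, 7, 9], [12, 3]))   # -> bucket 0, padded encoder/decoder lists
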
예제 #11
0
    def __init__(self,
                 src_vocab_size,
                 trg_vocab_size,
                 buckets,
                 size,
                 num_layers,
                 batch_size,
                 mode,
                 input_keep_prob,
                 output_keep_prob,
                 state_keep_prob,
                 beam_search,
                 beam_size,
                 schedule_sampling='linear',
                 sampling_decay_rate=0.99,
                 sampling_global_step=150000,
                 sampling_decay_steps=500,
                 pretrain_vec=None,
                 pretrain_trainable=False,
                 length_penalty=None,
                 length_penalty_factor=0.6,
                 feed_previous=False):

        self.feed_previous = feed_previous
        self.decoder_max_len = tf.placeholder(tf.int32, [None])
        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.buckets = buckets
        # units per RNN cell (also used as the word-embedding dimension)
        self.size = size
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(0.5, trainable=False)
        self.mode = mode
        self.dummy_reply = ["what ?", "yeah .", "you are welcome ! ! ! !"]

        # learning rate decay
        self.learning_rate_decay = self.learning_rate.assign(
            self.learning_rate * 0.99)

        # input for Reinforcement part
        self.loop_or_not = tf.placeholder(tf.bool)
        self.reward = tf.placeholder(tf.float32, [None])
        batch_reward = tf.stop_gradient(self.reward)
        self.RL_index = [None for _ in self.buckets]

        # dropout
        self.input_keep_prob = input_keep_prob
        self.output_keep_prob = output_keep_prob
        self.state_keep_prob = state_keep_prob

        # beam search
        self.beam_search = beam_search
        self.beam_size = beam_size
        self.length_penalty = length_penalty
        self.length_penalty_factor = length_penalty_factor

        # if load pretrain word vector
        self.pretrain_vec = pretrain_vec
        self.pretrain_trainable = pretrain_trainable

        # schedule sampling
        self.sampling_probability_clip = None
        self.schedule_sampling = schedule_sampling
        if self.schedule_sampling == 'False': self.schedule_sampling = False
        self.init_sampling_probability = 1.0
        self.sampling_global_step = sampling_global_step
        self.sampling_decay_steps = sampling_decay_steps
        self.sampling_decay_rate = sampling_decay_rate

        if self.schedule_sampling == 'linear':
            self.decay_fixed = self.init_sampling_probability * (
                self.sampling_decay_steps / self.sampling_global_step)
            with tf.variable_scope('sampling_prob', reuse=tf.AUTO_REUSE):
                self.sampling_probability = tf.get_variable(
                    name=self.schedule_sampling,
                    initializer=tf.constant(self.init_sampling_probability),
                    trainable=False)
            self.sampling_probability_decay = tf.assign_sub(
                self.sampling_probability, self.decay_fixed)
            self.sampling_probability_clip = tf.clip_by_value(
                self.sampling_probability, 0.0, 1.0)
            #self.sampling_probability = tf.maximum(self.sampling_probability,tf.constant(0.0))
        elif self.schedule_sampling == 'exp':
            with tf.variable_scope('sampling_prob', reuse=tf.AUTO_REUSE):
                self.sampling_probability = tf.get_variable(
                    name=self.schedule_sampling,
                    initializer=tf.constant(self.init_sampling_probability),
                    trainable=False)
            #self.sampling_probability = tf.train.exponential_decay(
            self.sampling_probability_decay = tf.assign(
                self.sampling_probability,
                tf.train.natural_exp_decay(self.sampling_probability,
                                           self.sampling_global_step,
                                           self.sampling_decay_steps,
                                           self.sampling_decay_rate,
                                           staircase=True))
            self.sampling_probability_clip = tf.clip_by_value(
                self.sampling_probability, 0.0, 1.0)
        elif self.schedule_sampling == 'inverse_sigmoid':
            with tf.variable_scope('sampling_prob', reuse=tf.AUTO_REUSE):
                self.sampling_probability = tf.get_variable(
                    name=self.schedule_sampling,
                    initializer=tf.constant(self.init_sampling_probability),
                    trainable=False)
            self.sampling_probability_decay = tf.assign(
                self.sampling_probability,
                #tf.train.cosine_decay(
                tf.train.linear_cosine_decay(
                    self.sampling_probability,
                    self.sampling_decay_steps,
                    self.sampling_global_step,
                ))
            self.sampling_probability_clip = tf.clip_by_value(
                self.sampling_probability, 0.0, 1.0)
        elif not self.schedule_sampling:
            pass
        else:
            raise ValueError(
                "schedule_sampling must be one of the following: [linear|exp|inverse_sigmoid|False]"
            )

        w_t = tf.get_variable('proj_w', [self.trg_vocab_size, self.size])
        w = tf.transpose(w_t)
        b = tf.get_variable('proj_b', [self.trg_vocab_size])
        output_projection = (w, b)

        def sample_loss(labels, inputs):
            labels = tf.reshape(labels, [-1, 1])
            local_w_t = tf.cast(w_t, tf.float32)
            local_b = tf.cast(b, tf.float32)
            local_inputs = tf.cast(inputs, tf.float32)
            return tf.cast(tf.nn.sampled_softmax_loss(
                weights=local_w_t,
                biases=local_b,
                inputs=local_inputs,
                labels=labels,
                num_sampled=512,
                num_classes=self.trg_vocab_size),
                           dtype=tf.float32)

        softmax_loss_function = sample_loss

        #FIXME add RL function
        def seq2seq_multi(encoder_inputs,
                          decoder_inputs,
                          mode,
                          pretrain_vec=None):
            if pretrain_vec is not None:
                pad_num = self.src_vocab_size - pretrain_vec.shape[0]
                pretrain_vec = np.pad(pretrain_vec, [(0, pad_num), (0, 0)],
                                      mode='constant')
                tag_vec = pretrain_vec[:data_utils.SPECIAL_TAGS_COUNT]
                pretrain_vec = pretrain_vec[data_utils.SPECIAL_TAGS_COUNT:]
                special_tags = tf.get_variable(name="special_tags",
                                               initializer=tag_vec,
                                               trainable=True)
                embedding = tf.get_variable(name="embedding",
                                            initializer=pretrain_vec,
                                            trainable=self.pretrain_trainable)
                embedding = tf.concat([special_tags, embedding], 0)
            else:
                embedding = tf.get_variable("embedding",
                                            [self.src_vocab_size, self.size])
            loop_function_RL = None
            self.loop_function_RL = loop_function_RL

            return seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                num_encoder_symbols=self.src_vocab_size,
                num_decoder_symbols=self.trg_vocab_size,
                embedding_size=self.size,
                output_projection=output_projection,
                feed_previous=self.feed_previous,
                dtype=tf.float32,
                embedding=embedding,
                beam_search=self.beam_search,
                beam_size=self.beam_size,
                loop=loop_function_RL,
                schedule_sampling=self.schedule_sampling,
                sampling_probability=self.sampling_probability_clip,
                length_penalty=self.length_penalty,
                length_penalty_factor=self.length_penalty_factor)

        # inputs
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []

        for i in range(buckets[-1][0]):
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name='encoder{0}'.format(i)))
        for i in range(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name='decoder{0}'.format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name='weight{0}'.format(i)))
        targets = [
            self.decoder_inputs[i + 1]
            for i in range(len(self.decoder_inputs) - 1)
        ]

        def single_cell():
            return tf.contrib.rnn.GRUCell(self.size)
            #return tf.contrib.rnn.BasicLSTMCell(self.size)

        cell = single_cell()
        if self.num_layers > 1:
            cell = tf.contrib.rnn.MultiRNNCell(
                [single_cell() for _ in range(self.num_layers)])
            cell = rnn.DropoutWrapper(cell,
                                      input_keep_prob=self.input_keep_prob,
                                      output_keep_prob=self.output_keep_prob,
                                      state_keep_prob=self.state_keep_prob)

        #self.buckets = [(10, self.decoder_max_len), (15, self.decoder_max_len), (25, self.decoder_max_len), (50, self.decoder_max_len)]
        self.buckets = [(10, 50), (15, 50), (25, 50), (50, 50)]

        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            targets,
            self.target_weights,
            self.buckets,
            lambda x, y: seq2seq_multi(x, y, self.mode, self.pretrain_vec),
            softmax_loss_function=softmax_loss_function)

        for b in range(len(self.buckets)):
            #print('self.outputs[b]: ',self.outputs[b])
            self.outputs[b] = [
                tf.nn.log_softmax(
                    tf.matmul(output, output_projection[0]) +
                    output_projection[1]) for output in self.outputs[b]
            ]

        self.saver = tf.train.Saver(max_to_keep=2)
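
For the 'linear' schedule above, sampling_probability starts at 1.0 and sampling_probability_decay subtracts a fixed amount each time it is run. A small sketch of driving it from a training loop, assuming schedule_sampling='linear' and hypothetical sess, feed, train_op and total_steps (the class itself builds no training op):

# Hedged sketch: decay the scheduled-sampling probability periodically.
for step in range(1, total_steps + 1):
    sess.run(train_op, feed_dict=feed)          # train_op built elsewhere from model.losses
    if step % model.sampling_decay_steps == 0:
        sess.run(model.sampling_probability_decay)
        print('sampling probability:',
              sess.run(model.sampling_probability_clip))
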
예제 #12
0
    def __init__(self, source_vocab_size, target_vocab_size, buckets, hidden_edim, hidden_units,
                 num_layers, keep_prob, max_gradient_norm, batch_size, learning_rate, learning_rate_decay_factor,
                 beam_size, forward_only=False):
        """Create the model.

        Args:
          source_vocab_size: size of the source vocabulary.
          target_vocab_size: size of the target vocabulary.
          buckets: a list of pairs (I, O), where I specifies maximum input length
            that will be processed in that bucket, and O specifies maximum output
            length. Training instances that have inputs longer than I or outputs
            longer than O will be pushed to the next bucket and padded accordingly.
            We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
          hidden_edim: number of dimensions for word embedding
          hidden_units: number of hidden units for each layer
          num_layers: number of layers in the model.
          keep_prob: keep probability used for dropout.
          max_gradient_norm: gradients will be clipped to maximally this norm.
          batch_size: the size of the batches used during training;
            the model construction is independent of batch_size, so it can be
            changed after initialization if this is convenient, e.g., for decoding.
          learning_rate: learning rate to start with.
          learning_rate_decay_factor: decay learning rate by this much when needed.
          beam_size: the beam size for beam search decoding
          forward_only: if set, we do not construct the backward pass in the model.
        """
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
                self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        w = tf.get_variable("proj_w", [hidden_units // 2, self.target_vocab_size],
                            initializer=tf.random_normal_initializer(0, 0.01, seed=123))
        b = tf.get_variable("proj_b", [self.target_vocab_size],
                            initializer=tf.constant_initializer(0.0), trainable=False)
        output_projection = (w, b)  # before softmax, there is an output projection
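        # Unlike the sampled-softmax losses in the earlier examples, the loss
        # below projects each decoder output to full-vocabulary logits and
        # applies sparse softmax cross-entropy; note the legacy positional
        # argument order (logits first, then targets).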

        def softmax_loss_function(logit, target):  # loss function of seq2seq model
            logit = nn_ops.xw_plus_b(logit, output_projection[0], output_projection[1])
            target = array_ops.reshape(target, [-1])
            return nn_ops.sparse_softmax_cross_entropy_with_logits(
                    logit, target)

        single_cell = rnn_cell.GRUCell(hidden_units)
        cell = single_cell
        if num_layers > 1:
            cell = rnn_cell.MultiRNNCell([single_cell] * num_layers)
        if not forward_only:
            cell = rnn_cell.DropoutWrapper(cell, output_keep_prob=float(keep_prob), seed=123)

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, encoder_mask, decoder_inputs, do_decode):
            return seq2seq.embedding_attention_seq2seq(
                    encoder_inputs, encoder_mask, decoder_inputs, cell,
                    num_encoder_symbols=source_vocab_size,
                    num_decoder_symbols=target_vocab_size,
                    embedding_size=hidden_edim,
                    beam_size=beam_size,
                    output_projection=output_projection,
                    num_layers=num_layers,
                    feed_previous=do_decode)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                      name="encoder{0}".format(i)))

        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
                                                      name="decoder{0}".format(i)))
            self.target_weights.append(tf.placeholder(tf.float32, shape=[None],
                                                      name="weight{0}".format(i)))
        self.encoder_mask = tf.placeholder(tf.int32, shape=[None, None],
                                           name="encoder_mask")

        # Our targets are decoder inputs shifted by one.
        targets = [self.decoder_inputs[i + 1]
                   for i in xrange(len(self.decoder_inputs) - 1)]

        # Training outputs and losses.
        if forward_only:
            self.outputs, self.losses, self.symbols = seq2seq.model_with_buckets(
                    self.encoder_inputs, self.encoder_mask, self.decoder_inputs, targets,
                    self.target_weights, buckets, lambda x, y, z: seq2seq_f(x, y, z, True),
                    softmax_loss_function=softmax_loss_function)
        else:
            self.outputs, self.losses, self.symbols = seq2seq.model_with_buckets(
                    self.encoder_inputs, self.encoder_mask, self.decoder_inputs, targets,
                    self.target_weights, buckets,
                    lambda x, y, z: seq2seq_f(x, y, z, False),
                    softmax_loss_function=softmax_loss_function)

        # Gradients and SGD update operation for training the model.
        params_to_update = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.gradient_norms_print = []
            self.updates = []
            opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
            for b in xrange(len(buckets)):
                gradients = tf.gradients(self.losses[b], params_to_update,
                                         aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)
                clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                                 max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(opt.apply_gradients(
                        zip(clipped_gradients, params_to_update), global_step=self.global_step))

        self.saver = tf.train.Saver(tf.all_variables(), max_to_keep=1000,  # keep all checkpoints
                                    keep_checkpoint_every_n_hours=6)
예제 #13
0
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 buckets,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 use_lstm=False,
                 num_samples=512,
                 forward_only=False,
                 train_mode=True,
                 name='Seq2SeqModel'):
        """Create the model.

    Args:
      vocab_size: size of the (shared) vocabulary.
      embedding_dim: dimension of the word embeddings.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the model.
    """
        with tf.variable_scope(name) as vs:
            self.vocab_size = vocab_size
            #self.target_vocab_size = target_vocab_size
            #print(type(target_vocab_size))
            self.buckets = buckets
            self.batch_size = batch_size
            self.learning_rate = tf.Variable(float(learning_rate),
                                             trainable=False)
            self.learning_rate_decay_op = self.learning_rate.assign(
                self.learning_rate * learning_rate_decay_factor)
            self.global_step = tf.Variable(0, trainable=False)
            # If we use sampled softmax, we need an output projection.
            output_projection = None
            softmax_loss_function = None
            self.embeddings = tf.get_variable(
                name='embeddings',
                shape=[self.vocab_size, embedding_dim],
                initializer=tf.random_uniform_initializer())
            # Sampled softmax only makes sense if we sample less than vocabulary size.
            if num_samples > 0 and num_samples < self.vocab_size:
                w = tf.get_variable("proj_w", [size, self.vocab_size])
                w_t = tf.transpose(w)
                b = tf.get_variable("proj_b", [self.vocab_size])
                output_projection = (w, b)
                # hidden_size = 128
                # output_size = 1

                # def weighted_sampled_loss(labels,inputs):#bug fixed
                #   labels = tf.reshape(labels, [-1, 1])
                #   inputs_ = tf.matmul(inputs,w) + b
                #   with tf.variable_scope('mlp_weight_loss') as vs:
                #
                #     weight = tf.nn.relu(tf.matmul(inputs,w_i)+b_i,name='input_relu')
                #     weight = tf.nn.relu(tf.matmul(weight,w_h)+b_h,name='hidden_relu')
                #     weight = tf.nn.relu(tf.matmul(weight,w_o),name='output_relu')
                #     weight = tf.reshape(weight,shape=[-1])
                #    #labels_ = tf.one_hot(labels,self.target_vocab_size,1,0)
                #   losses_ = tf.nn.sampled_softmax_loss(w_t, b,labels,inputs, num_samples, self.target_vocab_size)
                #   #losses_ = tf.nn.softmax_cross_entropy_with_logits(labels=labels_, logits=inputs_)
                #   #print('losses_shape:',losses_.get_shape())
                #   weight = tf.nn.softmax(losses_)
                #   return tf.multiply(weight,losses_)
                wi_cell = tf.contrib.rnn.GRUCell(10)
                wo_cell = tf.contrib.rnn.GRUCell(10)

                def weight(inputs, outputs):
                    q, _ = tf.contrib.rnn.static_rnn(wi_cell,
                                                     inputs,
                                                     dtype=tf.float32)
                    a, _ = tf.contrib.rnn.static_rnn(wo_cell,
                                                     outputs,
                                                     dtype=tf.float32)
                    return tf.reduce_mean(q[-1], axis=-1) - tf.reduce_mean(
                        a[-1], axis=-1)
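                # `weight` scores a (question, answer) pair with two small GRUs,
                # apparently for the commented-out weighted loss above; it is not
                # used anywhere in the code shown below.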

                def sampled_loss(labels, inputs):
                    labels = tf.reshape(labels, [-1, 1])
                    return tf.nn.sampled_softmax_loss(w_t, b, labels, inputs,
                                                      num_samples,
                                                      self.vocab_size)

                # if train_mode:
                #   softmax_loss_function = weighted_sampled_loss
                # else:
                softmax_loss_function = sampled_loss

            # Create the internal multi-layer cell for our RNN.
            # The seq2seq function: we use embedding for the input and attention.
            def seq2seq_f(encoder_inputs=None,
                          decoder_inputs=None,
                          do_decode=False):
                return seq2seq.embedding_attention_seq2seq(
                    encoder_inputs,
                    decoder_inputs,
                    tf.contrib.rnn.MultiRNNCell([
                        tf.contrib.rnn.GRUCell(size) for i in range(num_layers)
                    ]),
                    # num_encoder_symbols=source_vocab_size,
                    # num_decoder_symbols=target_vocab_size,
                    # embedding_size=size,
                    num_symbols=self.vocab_size,
                    embeddings=self.embeddings,
                    output_projection=output_projection,
                    feed_previous=do_decode)

            # Feeds for inputs.
            self.encoder_inputs = []
            self.decoder_inputs = []
            self.target_weights = []
            for i in range(buckets[-1][0]):  # Last bucket is the biggest one.
                self.encoder_inputs.append(
                    tf.placeholder(tf.int32,
                                   shape=[None],
                                   name="encoder{0}".format(i)))
            for i in range(buckets[-1][1] + 1):
                self.decoder_inputs.append(
                    tf.placeholder(tf.int32,
                                   shape=[None],
                                   name="decoder{0}".format(i)))
                self.target_weights.append(
                    tf.placeholder(tf.float32,
                                   shape=[None],
                                   name="weight{0}".format(i)))

            # Our targets are decoder inputs shifted by one.
            targets = [
                self.decoder_inputs[i + 1]
                for i in range(len(self.decoder_inputs) - 1)
            ]
            emb_encoder_inputs = [
                tf.nn.embedding_lookup(self.embeddings, ele)
                for ele in self.encoder_inputs
            ]
            emb_decoder_inputs = [
                tf.nn.embedding_lookup(self.embeddings, ele)
                for ele in self.decoder_inputs
            ]
            # Training outputs and losses.
            if forward_only:
                self.outputs, self.losses = seq2seq.model_with_buckets(
                    emb_encoder_inputs,
                    emb_decoder_inputs,
                    targets,
                    self.target_weights,
                    buckets,
                    lambda x, y: seq2seq_f(x, y, True),
                    softmax_loss_function=softmax_loss_function)
                # If we use output projection, we need to project outputs for decoding.
                if output_projection is not None:
                    for b in range(len(buckets)):
                        self.outputs[b] = [[
                            tf.matmul(output_, output_projection[0]) +
                            output_projection[1] for output_ in output
                        ] for output in self.outputs[b]]
            else:
                self.outputs, self.losses = seq2seq.model_with_buckets(
                    emb_encoder_inputs,
                    emb_decoder_inputs,
                    targets,
                    self.target_weights,
                    buckets,
                    lambda x, y: seq2seq_f(x, y, False),
                    softmax_loss_function=softmax_loss_function)

            # Gradients and SGD update operation for training the model.
            params = []
            for ele in tf.trainable_variables():
                if ele.name.startswith(name):
                    params.append(ele)
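            # Only variables whose names start with this model's variable-scope
            # prefix are collected, so several independently saved models can
            # coexist in one graph (see self.saver and self.variables below).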
            if not forward_only:
                self.gradient_norms = []
                self.updates = []
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
                for b in range(len(buckets)):
                    gradients = tf.gradients(self.losses[b], params)
                    clipped_gradients, norm = tf.clip_by_global_norm(
                        gradients, max_gradient_norm)
                    self.gradient_norms.append(norm)
                    self.updates.append(
                        opt.apply_gradients(zip(clipped_gradients, params),
                                            global_step=self.global_step))

            self.saver = tf.train.Saver(params)
            self.variables = []
            for ele in tf.global_variables():
                if ele.name.startswith(name):
                    self.variables.append(ele)
예제 #14
0
    def __init__(self, mode, length_penalty=None, length_penalty_factor=0.6):

        self.src_vocab_size = FLAGS.src_vocab_size
        self.trg_vocab_size = FLAGS.trg_vocab_size
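        # `buckets` is not a constructor argument here; like FLAGS it is
        # assumed to be defined at module level.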
        self.buckets = buckets
        # units per RNN cell (also used as the word-embedding dimension)
        self.size = FLAGS.hidden_size
        self.num_layers = FLAGS.num_layers
        self.batch_size = FLAGS.batch_size if mode == 'RL' or mode == 'MLE' else 1
        self.learning_rate = tf.Variable(0.5, trainable=False)
        self.mode = mode
        self.dummy_reply = ["哈哈 , 是啊 。", "怎麼 了 ?", "你 在 哪 ?"]

        self.r1 = FLAGS.r1
        self.r2 = FLAGS.r2
        self.r3 = FLAGS.r3

        # learning rate decay
        self.learning_rate_decay = self.learning_rate.assign(
            self.learning_rate * 0.99)

        # input for Reinforcement part
        self.loop_or_not = tf.placeholder(tf.bool)
        self.reward = tf.placeholder(tf.float32, [None])
        batch_reward = tf.stop_gradient(self.reward)
        self.RL_index = [None for _ in self.buckets]

        # dropout
        self.input_keep_prob = FLAGS.input_keep_prob
        self.output_keep_prob = FLAGS.output_keep_prob
        self.state_keep_prob = FLAGS.state_keep_prob

        # beam search
        self.beam_search = FLAGS.beam_search
        self.beam_size = FLAGS.beam_size
        self.length_penalty = length_penalty
        self.length_penalty_factor = length_penalty_factor

        # if load pretrain word vector
        self.pretrain_vec = FLAGS.pretrain_vec
        self.pretrain_trainable = FLAGS.pretrain_trainable

        # schedule sampling
        self.sampling_probability_clip = None
        self.schedule_sampling = FLAGS.schedule_sampling
        if self.schedule_sampling == 'False': self.schedule_sampling = False
        self.init_sampling_probability = 1.0
        self.sampling_global_step = FLAGS.sampling_global_step
        self.sampling_decay_steps = FLAGS.sampling_decay_steps
        self.sampling_decay_rate = FLAGS.sampling_decay_rate

        if self.schedule_sampling == 'linear':
            self.decay_fixed = self.init_sampling_probability * (
                self.sampling_decay_steps / self.sampling_global_step)
            with tf.variable_scope('sampling_prob', reuse=tf.AUTO_REUSE):
                self.sampling_probability = tf.get_variable(
                    name=self.schedule_sampling,
                    initializer=tf.constant(self.init_sampling_probability),
                    trainable=False)
            self.sampling_probability_decay = tf.assign_sub(
                self.sampling_probability, self.decay_fixed)
            self.sampling_probability_clip = tf.clip_by_value(
                self.sampling_probability, 0.0, 1.0)
            #self.sampling_probability = tf.maximum(self.sampling_probability,tf.constant(0.0))
        elif self.schedule_sampling == 'exp':
            with tf.variable_scope('sampling_prob', reuse=tf.AUTO_REUSE):
                self.sampling_probability = tf.get_variable(
                    name=self.schedule_sampling,
                    initializer=tf.constant(self.init_sampling_probability),
                    trainable=False)
            #self.sampling_probability = tf.train.exponential_decay(
            self.sampling_probability_decay = tf.assign(
                self.sampling_probability,
                tf.train.natural_exp_decay(self.sampling_probability,
                                           self.sampling_global_step,
                                           self.sampling_decay_steps,
                                           self.sampling_decay_rate,
                                           staircase=True))
            self.sampling_probability_clip = tf.clip_by_value(
                self.sampling_probability, 0.0, 1.0)
        elif self.schedule_sampling == 'inverse_sigmoid':
            with tf.variable_scope('sampling_prob', reuse=tf.AUTO_REUSE):
                self.sampling_probability = tf.get_variable(
                    name=self.schedule_sampling,
                    initializer=tf.constant(self.init_sampling_probability),
                    trainable=False)
            self.sampling_probability_decay = tf.assign(
                self.sampling_probability,
                #tf.train.cosine_decay(
                tf.train.linear_cosine_decay(
                    self.sampling_probability,
                    self.sampling_decay_steps,
                    self.sampling_global_step,
                ))
            self.sampling_probability_clip = tf.clip_by_value(
                self.sampling_probability, 0.0, 1.0)
        elif not self.schedule_sampling:
            pass
        else:
            raise ValueError(
                "schedule_sampling must be one of the following: [linear|exp|inverse_sigmoid|False]"
            )

        w_t = tf.get_variable('proj_w', [self.trg_vocab_size, self.size])
        w = tf.transpose(w_t)
        b = tf.get_variable('proj_b', [self.trg_vocab_size])
        output_projection = (w, b)

        def sample_loss(labels, inputs):
            labels = tf.reshape(labels, [-1, 1])
            local_w_t = tf.cast(w_t, tf.float32)
            local_b = tf.cast(b, tf.float32)
            local_inputs = tf.cast(inputs, tf.float32)
            return tf.cast(tf.nn.sampled_softmax_loss(
                weights=local_w_t,
                biases=local_b,
                inputs=local_inputs,
                labels=labels,
                num_sampled=512,
                num_classes=self.trg_vocab_size),
                           dtype=tf.float32)

        softmax_loss_function = sample_loss

        #FIXME add RL function
        def seq2seq_multi(encoder_inputs,
                          decoder_inputs,
                          mode,
                          pretrain_vec=None):
            if pretrain_vec is not None:
                pad_num = self.src_vocab_size - pretrain_vec.shape[0]
                pretrain_vec = np.pad(pretrain_vec, [(0, pad_num), (0, 0)],
                                      mode='constant')
                tag_vec = pretrain_vec[:data_utils.SPECIAL_TAGS_COUNT]
                pretrain_vec = pretrain_vec[data_utils.SPECIAL_TAGS_COUNT:]
                special_tags = tf.get_variable(name="special_tags",
                                               initializer=tag_vec,
                                               trainable=True)
                embedding = tf.get_variable(name="embedding",
                                            initializer=pretrain_vec,
                                            trainable=self.pretrain_trainable)
                embedding = tf.concat([special_tags, embedding], 0)
            else:
                embedding = tf.get_variable("embedding",
                                            [self.src_vocab_size, self.size])
            loop_function_RL = None
            if mode == 'MLE':
                feed_previous = False
            elif mode == 'TEST':
                feed_previous = True

            # need loop_function
            elif mode == 'RL':
                feed_previous = True

                def loop_function_RL(prev, i):
                    prev = tf.matmul(
                        prev, output_projection[0]) + output_projection[1]
                    prev_index = tf.multinomial(tf.log(tf.nn.softmax(prev)), 1)

                    if i == 1:
                        for index, RL in enumerate(self.RL_index):
                            if RL is None:
                                self.RL_index[index] = prev_index
                                self.index = index
                                break
                    else:
                        self.RL_index[self.index] = tf.concat(
                            [self.RL_index[self.index], prev_index], axis=1)
                    # self.RL_index: [(?,9),(?,14),(?,24),(?,49)]
                    # RL_index holds the index of every token sampled so far
                    prev_index = tf.reshape(prev_index, [-1])
                    #prev_index: (?,)
                    # decide which to be the next time step input
                    sample = tf.nn.embedding_lookup(embedding, prev_index)
                    #sample: (?,256)
                    from_decoder = tf.nn.embedding_lookup(
                        embedding, decoder_inputs[i])
                    #from_decoder: (?,256)
                    return tf.where(self.loop_or_not, sample, from_decoder)
            self.loop_function_RL = loop_function_RL

            return seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                num_encoder_symbols=self.src_vocab_size,
                num_decoder_symbols=self.trg_vocab_size,
                embedding_size=self.size,
                output_projection=output_projection,
                feed_previous=feed_previous,
                dtype=tf.float32,
                embedding=embedding,
                beam_search=self.beam_search,
                beam_size=self.beam_size,
                loop=loop_function_RL,
                schedule_sampling=self.schedule_sampling,
                sampling_probability=self.sampling_probability_clip,
                length_penalty=self.length_penalty,
                length_penalty_factor=self.length_penalty_factor)

        # inputs
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []

        for i in range(buckets[-1][0]):
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name='encoder{0}'.format(i)))
        for i in range(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name='decoder{0}'.format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name='weight{0}'.format(i)))
        targets = [
            self.decoder_inputs[i + 1]
            for i in range(len(self.decoder_inputs) - 1)
        ]

        def single_cell():
            return tf.contrib.rnn.GRUCell(self.size)
            #return tf.contrib.rnn.BasicLSTMCell(self.size)

        cell = single_cell()
        if self.num_layers > 1:
            cell = tf.contrib.rnn.MultiRNNCell(
                [single_cell() for _ in range(self.num_layers)])
            cell = rnn.DropoutWrapper(cell,
                                      input_keep_prob=self.input_keep_prob,
                                      output_keep_prob=self.output_keep_prob,
                                      state_keep_prob=self.state_keep_prob)

        if self.mode == 'MLE':
            self.outputs, self.losses = seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                self.buckets,
                lambda x, y: seq2seq_multi(x, y, self.mode, self.pretrain_vec),
                softmax_loss_function=softmax_loss_function)

            for b in range(len(self.buckets)):
                self.outputs[b] = [
                    tf.matmul(output, output_projection[0]) +
                    output_projection[1] for output in self.outputs[b]
                ]

            self.update = []
            optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
            for b in range(len(self.buckets)):
                gradients = tf.gradients(self.losses[b],
                                         tf.trainable_variables())
                clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
                self.update.append(
                    optimizer.apply_gradients(
                        zip(clipped_gradients, tf.trainable_variables())))

        elif self.mode == 'TEST':
            #self.buckets = [(10, 50), (15, 50), (25, 50), (50, 50)]

            self.outputs, self.losses = seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                self.buckets,
                lambda x, y: seq2seq_multi(x, y, self.mode, self.pretrain_vec),
                softmax_loss_function=softmax_loss_function)

            for b in range(len(self.buckets)):
                #print('self.outputs[b]: ',self.outputs[b])
                self.outputs[b] = [
                    tf.matmul(output, output_projection[0]) +
                    output_projection[1] for output in self.outputs[b]
                ]
                #print('self.outputs[b]: ',self.outputs[b])

        elif self.mode == 'RL':

            self.outputs, self.losses = seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                self.buckets,
                lambda x, y: seq2seq_multi(x, y, self.mode, self.pretrain_vec),
                softmax_loss_function=softmax_loss_function,
                per_example_loss=True)

            #print('self.buckets: ',len(self.buckets))
            for b in range(len(self.buckets)):
                self.outputs[b] = [
                    tf.matmul(output, output_projection[0]) +
                    output_projection[1] for output in self.outputs[b]
                ]

            #print('self.RL_index: ',self.RL_index)
            #print('self.outputs: ',len(self.outputs[0]),len(self.outputs[1]),len(self.outputs[2]),len(self.outputs[3]))
            #print('self.RL_index: ',len(self.RL_index))
            #print('self.outputs: ',len(self.outputs))
            for i, b in enumerate(self.outputs):
                prev_index = tf.multinomial(
                    tf.log(tf.nn.softmax(b[self.buckets[i][1] - 1])), 1)
                # The line below fills in the last decoder output: RL_index only
                # grows when loop_function is called inside the decoder, and the
                # output produced for the final input is never fed back as prev,
                # so it has to be completed from the last element of self.outputs.
                self.RL_index[i] = tf.concat([self.RL_index[i], prev_index],
                                             axis=1)
                #print(i,len(b))
                #print('self.buckets: ',self.buckets)
                #print('self.buckets[i][1]: ',self.buckets[i][1])
                #print('self.buckets[i][1] - 1: ',self.buckets[i][1] - 1)
                #print('b[self.buckets[i][1] - 1]: ', b[self.buckets[i][1] - 1])
                #print('prev_index: ',prev_index)
                #print('self.RL_index[i]: ',self.RL_index[i])
                #print('----------------')
            #self.outputs: list of 4 buckets, each (?,6258)
            #print('self.RL_index: ',self.RL_index)

            self.update = []
            optimizer = tf.train.GradientDescentOptimizer(0.01)
            #optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
            for b in range(len(self.buckets)):

                scaled_loss = tf.multiply(self.losses[b], batch_reward)
                self.losses[b] = tf.reduce_mean(scaled_loss)

                gradients = tf.gradients(self.losses[b],
                                         tf.trainable_variables())
                clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
                self.update.append(
                    optimizer.apply_gradients(
                        zip(clipped_gradients, tf.trainable_variables())))

        # specify saver
        self.saver = tf.train.Saver(max_to_keep=10)
예제 #15
0
  def __init__(self,
               source_vocab_size,
               target_vocab_size,
               buckets,
               size,
               num_layers,
               max_gradient_norm,
               batch_size,
               learning_rate,
               learning_rate_decay_factor,
               use_lstm=False,
               num_samples=512,
               forward_only=False,
               dtype=tf.float32):
    """Create the model.
    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the model.
      dtype: the data type to use to store internal variables.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(
        float(learning_rate), trainable=False, dtype=dtype)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)
    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if num_samples > 0 and num_samples < self.target_vocab_size:
      w_t = tf.get_variable("proj_w", [self.target_vocab_size, size], dtype=dtype)
      w = tf.transpose(w_t)
      b = tf.get_variable("proj_b", [self.target_vocab_size], dtype=dtype)
      output_projection = (w, b)

      def sampled_loss(labels, logits):
        labels = tf.reshape(labels, [-1, 1])
        # We need to compute the sampled_softmax_loss using 32bit floats to
        # avoid numerical instabilities.
        local_w_t = tf.cast(w_t, tf.float32)
        local_b = tf.cast(b, tf.float32)
        local_inputs = tf.cast(logits, tf.float32)
        return tf.cast(
            tf.nn.sampled_softmax_loss(
                weights=local_w_t,
                biases=local_b,
                labels=labels,
                inputs=local_inputs,
                num_sampled=num_samples,
                num_classes=self.target_vocab_size),
            dtype)
      softmax_loss_function = sampled_loss


    def get_lstm(): # MK add this function
       cell = tf.contrib.rnn.BasicLSTMCell(size, state_is_tuple=True,reuse=tf.get_variable_scope().reuse)
       return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=0.8)


    # Create the internal multi-layer cell for our RNN.
    def single_cell():
      #return tf.contrib.rnn.GRUCell(size) #MK add dropout
      cell = tf.contrib.rnn.GRUCell(size) 
      #cell = GRUCell(size) #MK
      return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=0.8)
    if use_lstm:
      def single_cell():
        return tf.contrib.rnn.BasicLSTMCell(size)
    cell = single_cell()
    if num_layers > 1: #MK change for testing
      cell = tf.contrib.rnn.MultiRNNCell([single_cell() for _ in range(num_layers)])
      #cell = tf.contrib.rnn.MultiRNNCell([get_lstm() for _ in range(num_layers)],state_is_tuple=True)  # MK change

    # The seq2seq function: we use embedding for the input
    # (embedding_rnn_seq2seq, i.e. no attention in this variant).
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
      return seq2seq.embedding_rnn_seq2seq(
          encoder_inputs,
          decoder_inputs,
          cell,
          num_encoder_symbols=source_vocab_size,
          num_decoder_symbols=target_vocab_size,
          embedding_size=size,
          output_projection=output_projection,
          feed_previous=do_decode,
          dtype=dtype)

    # Feeds for inputs.
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
      self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], #MK
                                                name="encoder{0}".format(i)))

    for i in xrange(buckets[-1][1] + 1):
      self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None], #MK
                                                name="decoder{0}".format(i)))
      self.target_weights.append(tf.placeholder(dtype, shape=[None], #MK
                                                name="weight{0}".format(i)))

    # Our targets are decoder inputs shifted by one.
    targets = [self.decoder_inputs[i + 1]
               for i in xrange(len(self.decoder_inputs) - 1)]
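    # E.g. decoder_inputs = [GO, w1, w2, EOS, PAD] gives targets = [w1, w2, EOS, PAD]:
    # at every position the decoder predicts the symbol it will be fed next.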

    # Training outputs and losses.
    if forward_only:
      # MK change: this custom seq2seq.model_with_buckets also returns the decoder
      # states and encoder outputs (tf.contrib.legacy_seq2seq returns only two values).
      self.outputs, self.losses, self.states, self.enc_outputs = seq2seq.model_with_buckets(
          self.encoder_inputs, self.decoder_inputs, targets,
          self.target_weights, buckets, lambda x, y: seq2seq_f(x, y, True),
          softmax_loss_function=softmax_loss_function)
      # If we use output projection, we need to project outputs for decoding.
      if output_projection is not None:
        for b in xrange(len(buckets)):
          self.outputs[b] = [
              tf.matmul(output, output_projection[0]) + output_projection[1]
              for output in self.outputs[b]
          ]
    else:
      # MK change: same custom model_with_buckets call for the training graph.
      self.outputs, self.losses, self.states, self.enc_outputs = seq2seq.model_with_buckets(
          self.encoder_inputs, self.decoder_inputs, targets,
          self.target_weights, buckets,
          lambda x, y: seq2seq_f(x, y, False),
          softmax_loss_function=softmax_loss_function)

    # Gradients and SGD update operation for training the model.
    params = tf.trainable_variables()

    # MK: originally `if not forward_only:`; the training ops are now always built.
    if True:
      self.gradient_norms = []
      self.updates = []
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
      for b in xrange(len(buckets)):
        gradients = tf.gradients(self.losses[b], params)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                         max_gradient_norm)
        self.gradient_norms.append(norm)
        self.updates.append(opt.apply_gradients(
            zip(clipped_gradients, params), global_step=self.global_step))

   
    self.saver = tf.train.Saver(tf.global_variables())
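
The bucketing and target shifting that the docstring above describes can be hard to picture from the graph-construction code alone. Below is a minimal, standalone sketch of that data-side behaviour; PAD_ID and GO_ID are illustrative placeholders here, not the example's actual data_utils constants, and the real pipeline additionally reverses the encoder inputs.

# Illustrative sketch only: pick the smallest bucket that fits a (source, target)
# pair and pad to that bucket's lengths. PAD_ID / GO_ID are assumed placeholders.
PAD_ID = 0
GO_ID = 1

def assign_to_bucket(source_ids, target_ids, buckets):
    """Return (bucket_id, padded encoder inputs, padded decoder inputs), or None."""
    for bucket_id, (source_size, target_size) in enumerate(buckets):
        if len(source_ids) <= source_size and len(target_ids) < target_size:
            encoder_inputs = source_ids + [PAD_ID] * (source_size - len(source_ids))
            # Decoder inputs start with GO; the targets are these shifted by one.
            decoder_inputs = [GO_ID] + target_ids
            decoder_inputs += [PAD_ID] * (target_size + 1 - len(decoder_inputs))
            return bucket_id, encoder_inputs, decoder_inputs
    return None  # Longer than the largest bucket.

bucket_id, enc, dec = assign_to_bucket([4, 7, 9], [5, 6], [(2, 4), (8, 16)])
print(bucket_id)  # 1, i.e. the (8, 16) bucket
print(enc)        # [4, 7, 9, 0, 0, 0, 0, 0]
print(dec[:4])    # [1, 5, 6, 0]; the corresponding targets would be dec[1:]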
Example #16
  def __init__(self, source_target_vocab_size, buckets, size,
               num_layers, max_gradient_norm, batch_size, learning_rate,
               learning_rate_decay_factor,
               scheduling_rate, scheduling_rate_decay_factor,
               num_samples = 4096, forward_only=False):
    """Create the model.

    Args:
      source_target_vocab_size: size of the source/target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      scheduling_rate: initial value of the scheduling rate variable (passed
        through to the seq2seq function).
      scheduling_rate_decay_factor: decay the scheduling rate by this much
        when needed.
      forward_only: if set, we do not construct the backward pass in the model.
    """
    self.source_target_vocab_size = source_target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)

    self.scheduling_rate = tf.Variable(float(scheduling_rate), trainable=False)
    self.scheduling_rate_decay_op = self.scheduling_rate.assign(
        self.scheduling_rate * scheduling_rate_decay_factor)

    self.global_step = tf.Variable(0, trainable=False)

    # If we use sampled softmax, we need an output projection.
    output_projection = None
    softmax_loss_function = None
    # Sampled softmax only makes sense if we sample less than vocabulary size.
    if num_samples > 0 and num_samples < self.source_target_vocab_size:
      with tf.device("/cpu:0"):
        w = tf.get_variable("proj_w", [size, self.source_target_vocab_size])
        w_t = tf.transpose(w)
        b = tf.get_variable("proj_b", [self.source_target_vocab_size])
      output_projection = (w, b)

      def sampled_loss(inputs, labels):
        with tf.device("/cpu:0"):
          labels = tf.reshape(labels, [-1, 1])
          return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples,
                                            self.source_target_vocab_size)
      softmax_loss_function = sampled_loss

    # Create the internal multi-layer cell for our RNN.
    single_cell = tf.nn.rnn_cell.BasicLSTMCell(size)
    cell = single_cell
    if num_layers > 1:
      cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers)

    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return seq2seq.embedding_attention_seq2seq(
            encoder_inputs, decoder_inputs, cell,
            num_encoder_symbols=source_target_vocab_size,
            num_decoder_symbols=source_target_vocab_size,
            embedding_size=size,
            output_projection=output_projection,
            feed_previous=do_decode,
            scheduling_rate=self.scheduling_rate)

    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []

    for i in xrange(buckets[-1][0]):
      self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
    

    for i in xrange(buckets[-1][1] + 1):
      self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
      self.target_weights.append(tf.placeholder(tf.float32, shape=[None], name="weight{0}".format(i)))

    targets = [self.decoder_inputs[i + 1]
               for i in xrange(len(self.decoder_inputs) - 1)]

    if forward_only:
      self.outputs, self.losses = seq2seq.model_with_buckets(
          self.encoder_inputs, self.decoder_inputs, targets,
          self.target_weights, buckets, lambda x, y: seq2seq_f(x, y, True),
          softmax_loss_function=softmax_loss_function)
      print("i'm in here")
      if output_projection is not None:
        for b in xrange(len(buckets)):
          self.outputs[b] = [
              tf.matmul(output, output_projection[0]) + output_projection[1]
              for output in self.outputs[b]
          ]
    else:
      self.outputs, self.losses = seq2seq.model_with_buckets(
          self.encoder_inputs, self.decoder_inputs, targets,
          self.target_weights, buckets, lambda x, y: seq2seq_f(x, y, False),
          softmax_loss_function=softmax_loss_function)

    params = tf.trainable_variables()
    if not forward_only:
      self.gradient_norms = []
      self.updates = []
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
      for b in xrange(len(buckets)):
        gradients = tf.gradients(self.losses[b], params)
        clipped_gradients, norm = tf.clip_by_global_norm(gradients, max_gradient_norm)
        self.gradient_norms.append(norm)
        self.updates.append(opt.apply_gradients(
            zip(clipped_gradients, params), global_step=self.global_step))

    self.saver = tf.train.Saver(tf.all_variables())
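
As a side note on how the decay ops above are meant to be used: they are plain assign ops on non-trainable variables and only change anything when explicitly run. A minimal, self-contained sketch follows (TF 1.x assumed; the 0.5 / 0.99 values are illustrative, not taken from the example).

import tensorflow as tf

# Sketch of the decay pattern used above: the rate lives in a non-trainable
# Variable and "decaying" it means running an assign op in a session.
learning_rate = tf.Variable(0.5, trainable=False)
learning_rate_decay_op = learning_rate.assign(learning_rate * 0.99)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(3):  # e.g. triggered when the eval loss stops improving
        sess.run(learning_rate_decay_op)
    print(sess.run(learning_rate))  # ~0.485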
Example #17
    def __init__(self,
                 buckets,
                 source_vocab_sizes,
                 target_vocab_sizes,
                 size,
                 source_embedding_sizes,
                 target_embedding_sizes,
                 target_data_types,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 decoder_type,
                 use_lstm=True,
                 average_loss_across_timesteps=True,
                 forward_only=False,
                 feed_previous=False,
                 predict_span_end_pointers=False,
                 use_adam=False,
                 restrict_decoder_structure=False,
                 transition_vocab_sets=None,
                 transition_state_map=None,
                 encoder_decoder_vocab_map=None,
                 use_bidirectional_encoder=False,
                 pretrained_word_embeddings=None,
                 word_embeddings=None,
                 dtype=tf.float32):
        """Create the model.

        Args:
          buckets: a list of pairs (I, O), where I specifies maximum input length
            that will be processed in that bucket, and O specifies maximum output
            length. Training instances that have inputs longer than I or outputs
            longer than O will be pushed to the next bucket and padded accordingly.
            We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
          source_vocab_sizes: dictionary of source vocabulary sizes, keyed by
            input type.
          target_vocab_sizes: dictionary of target vocabulary sizes, keyed by
            output type.
          size: number of units in each layer of the model.
          max_gradient_norm: gradients will be clipped to maximally this norm.
          batch_size: the size of the batches used during training;
            the model construction is independent of batch_size, so it can be
            changed after initialization if this is convenient, e.g., for decoding.
          learning_rate: learning rate to start with.
          learning_rate_decay_factor: decay learning rate by this much when needed.
          use_lstm: if true, we use LSTM cells instead of GRU cells.
          forward_only: if set, we do not construct the backward pass in the model.
          dtype: the data type to use to store internal variables.
        """
        self.buckets = buckets
        self.batch_size = batch_size
        self.decoder_type = decoder_type

        self.transition_vocab_sets = transition_vocab_sets
        if transition_state_map is None:
            self.transition_state_map = None
        else:
            self.transition_state_map = tf.constant(transition_state_map)
        self.encoder_decoder_vocab_map = tf.constant(encoder_decoder_vocab_map)
        self.use_stack_decoder = decoder_type == data_utils.STACK_DECODER_STATE
        self.average_loss_across_timesteps = average_loss_across_timesteps
        self.input_keep_prob = tf.placeholder(tf.float32,
                                              name="input_keep_probability")
        self.output_keep_prob = tf.placeholder(tf.float32,
                                               name="output_keep_probability")

        if not use_adam:
            self.learning_rate = tf.Variable(float(learning_rate),
                                             trainable=False,
                                             dtype=dtype)
            self.learning_rate_decay_op = self.learning_rate.assign(
                self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        self.embedding_weights = {}
        for source_type in source_embedding_sizes.iterkeys():
            self.embedding_weights[source_type] = tf.Variable(
                tf.constant(0.0,
                            shape=[
                                source_vocab_sizes[source_type],
                                source_embedding_sizes[source_type]
                            ]),
                trainable=(source_type != 'em'),
                name=source_type + "_encoder_embeddings")
            if source_type == 'en':
                assert word_embeddings is not None
                assert source_embedding_sizes['en'] == word_embeddings.shape[1]
                self.embedding_weights['en'].assign(word_embeddings)
            elif source_type == 'em':
                assert pretrained_word_embeddings is not None
                assert source_embedding_sizes[
                    'em'] == pretrained_word_embeddings.shape[1]
                self.embedding_weights['em'].assign(pretrained_word_embeddings)
            else:
                init_vectors = np.random.uniform(
                    -np.sqrt(3), np.sqrt(3),
                    (source_vocab_sizes[source_type],
                     source_embedding_sizes[source_type]))
                self.embedding_weights[source_type].assign(init_vectors)
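        # Note (TF 1.x graph mode): the .assign(...) calls above only create assign
        # ops; the pretrained / random values take effect only if those ops are
        # actually fetched in a session after variable initialization.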

        output_projections = {}
        for target_type in target_vocab_sizes.iterkeys():
            vocab_size = target_vocab_sizes[target_type]
            w = tf.get_variable(
                target_type + "_proj_w", [size, vocab_size],
                initializer=tf.uniform_unit_scaling_initializer(),
                dtype=dtype)
            w_t = tf.transpose(w)
            b = tf.get_variable(target_type + "_proj_b", [vocab_size],
                                dtype=dtype)
            output_projections[target_type] = (w, b)

        def full_loss(logits, labels):
            labels = tf.reshape(labels, [-1])
            return tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits, labels)

        def full_output_loss(inputs, labels):
            logits = tf.nn.xw_plus_b(inputs, w, b)
            labels = tf.reshape(labels, [-1])
            return tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits, labels)
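        # Note: full_output_loss closes over the `w` and `b` left over from the
        # projection loop above (the last target type's projection) and is not
        # used below; softmax_loss_function is set to full_loss instead.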

        softmax_loss_function = full_loss

        def create_cell(use_dropout=True):
            # Create the internal cell for our RNN.
            if use_lstm:
                cell = tf.nn.rnn_cell.LSTMCell(
                    size,
                    use_peepholes=False,
                    state_is_tuple=True,
                    initializer=tf.uniform_unit_scaling_initializer())
            else:
                cell = tf.nn.rnn_cell.GRUCell(size)
            if use_dropout:
                cell = tf.nn.rnn_cell.DropoutWrapper(cell,
                                                     self.input_keep_prob,
                                                     self.output_keep_prob)
            return cell

        with tf.variable_scope("encoder_fw"):
            fw_cell = create_cell()
        with tf.variable_scope("encoder_bw"):
            bw_cell = create_cell()

        with tf.variable_scope("decoder_main"):
            dec_cell = create_cell()
        with tf.variable_scope("decoder_aux"):
            dec_aux_cell = create_cell(False)
        if self.decoder_type == data_utils.MEMORY_STACK_DECODER_STATE:
            with tf.variable_scope("decoder_lin_mem"):
                dec_mem_cell = create_cell()
        else:
            dec_mem_cell = None

        self.decoder_restrictions = []
        num_decoder_restrictions = 0
        if restrict_decoder_structure:
            num_decoder_restrictions = data_utils.NUM_TR_STATES
        for i in xrange(num_decoder_restrictions):
            self.decoder_restrictions.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="restrictions{0}".format(i)))

        if self.transition_vocab_sets is None:
            self.decoder_transition_map = None
        else:
            self.decoder_transition_map = data_utils.construct_transition_map(
                self.transition_vocab_sets, False)

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            return seq2seq.embedding_attention_seq2seq(
                self.decoder_type,
                encoder_inputs,
                decoder_inputs,
                fw_cell,
                bw_cell,
                dec_cell,
                dec_aux_cell,
                dec_mem_cell,
                source_vocab_sizes,
                target_vocab_sizes,
                source_embedding_sizes,
                target_embedding_sizes,
                predict_span_end_pointers=predict_span_end_pointers,
                decoder_restrictions=self.decoder_restrictions,
                output_projections=output_projections,
                word_vectors=self.embedding_weights,
                transition_state_map=self.transition_state_map,
                encoder_decoder_vocab_map=self.encoder_decoder_vocab_map,
                use_bidirectional_encoder=use_bidirectional_encoder,
                feed_previous=do_decode,
                dtype=dtype)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []

        # For now assume that we only have embedding inputs, and single sequence
        # of target weights.

        for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append({})
            for key in source_vocab_sizes.iterkeys():
                self.encoder_inputs[-1][key] = tf.placeholder(
                    tf.int32,
                    shape=[None],
                    name="encoder_{0}_{1}".format(key, i))

        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append({})
            for key in target_data_types:
                self.decoder_inputs[-1][key] = tf.placeholder(
                    tf.int32,
                    shape=[None],
                    name="decoder_{0}_{1}".format(key, i))

        for i in xrange(buckets[-1][1] + 1):
            self.target_weights.append({})
            for key in target_data_types:
                if key == "parse" or key == "predicate" or key == "ind":
                    self.target_weights[-1][key] = tf.placeholder(
                        dtype,
                        shape=[None],
                        name="weight_{0}_{1}".format(key, i))

        # Our targets are decoder inputs shifted by one.
        targets = [
            self.decoder_inputs[i + 1]
            for i in xrange(len(self.decoder_inputs) - 1)
        ]

        # Training outputs and losses.
        self.outputs, self.losses = seq2seq.model_with_buckets(
            self.encoder_inputs,
            self.decoder_inputs,
            targets,
            self.target_weights,
            buckets,
            lambda x, y: seq2seq_f(x, y, feed_previous),
            forward_only,
            softmax_loss_function=softmax_loss_function,
            average_across_timesteps=self.average_loss_across_timesteps)

        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            if use_adam:
                opt = tf.train.AdamOptimizer(learning_rate, epsilon=1e-02)
            else:
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            for b in xrange(len(buckets)):
                gradients = tf.gradients(self.losses[b], params)
                if max_gradient_norm > 0:
                    clipped_gradients, norm = tf.clip_by_global_norm(
                        gradients, max_gradient_norm)
                    self.gradient_norms.append(norm)
                    self.updates.append(
                        opt.apply_gradients(zip(clipped_gradients, params),
                                            global_step=self.global_step))
                else:
                    self.gradient_norms.append(tf.zeros([1]))
                    self.updates.append(
                        opt.apply_gradients(zip(gradients, params),
                                            global_step=self.global_step))

        self.saver = tf.train.Saver(tf.all_variables())
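
All three examples end with the same per-bucket training update: gradients of the bucket loss are clipped by global norm and applied with the optimizer while global_step is incremented. Below is a minimal, self-contained sketch of just that pattern (TF 1.x assumed; the toy variable, loss, and rates are illustrative).

import tensorflow as tf

# Sketch of the shared update pattern: compute gradients, clip by global norm,
# apply them, and advance global_step.
x = tf.Variable([1.0, 2.0])
loss = tf.reduce_sum(tf.square(x))
global_step = tf.Variable(0, trainable=False)

params = tf.trainable_variables()
opt = tf.train.GradientDescentOptimizer(0.1)
gradients = tf.gradients(loss, params)
clipped_gradients, norm = tf.clip_by_global_norm(gradients, 5.0)
update = opt.apply_gradients(zip(clipped_gradients, params),
                             global_step=global_step)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    _, gradient_norm = sess.run([update, norm])
    print(gradient_norm)          # ~4.47, the global norm of the raw gradients
    print(sess.run(global_step))  # 1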