示例#1
0
    def _make_input(self, embed):
        """Create lookup tables, input placeholders and embedded inputs.

        Args:
            embed: initial value for the shared embedding matrix, or ``None``
                to create a ``[vocab_size, embed_size]`` variable with the
                default initializer.

        Sets (among others) ``symbol2index``/``index2symbol``, the
        ``*_string``/``*_len`` placeholders, ``emb_enc``/``emb_dec``, the
        embedded encoder inputs, ``dec_inp``, ``keep_prob`` and ``use_prior``.
        """
        # token -> id and id -> token tables; both are saved with checkpoints.
        self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=UNK_ID,
                                             shared_name="in_table",
                                             name="in_table",
                                             checkpoint=True)
        self.index2symbol = MutableHashTable(key_dtype=tf.int64,
                                             value_dtype=tf.string,
                                             default_value=_UNK,
                                             shared_name="out_table",
                                             name="out_table",
                                             checkpoint=True)
        with tf.variable_scope("input"):
            # 2-D string placeholders; presumably [batch, time] - TODO confirm.
            self.post_string = tf.placeholder(tf.string, (None, None),
                                              'post_string')
            self.ref_string = tf.placeholder(tf.string, (None, None),
                                             'ref_string')
            self.response_string = tf.placeholder(tf.string, (None, None),
                                                  'response_string')

            # Token strings are mapped to ids; unknown tokens get UNK_ID.
            self.post = self.symbol2index.lookup(self.post_string)
            self.post_len = tf.placeholder(tf.int32, (None, ), 'post_len')
            self.ref = self.symbol2index.lookup(self.ref_string)
            self.ref_len = tf.placeholder(tf.int32, (None, ), 'ref_len')
            self.response = self.symbol2index.lookup(self.response_string)
            self.response_len = tf.placeholder(tf.int32, (None, ),
                                               'response_len')

            with tf.variable_scope("embedding"):
                if embed is None:
                    # initialize the embedding randomly
                    shared_emb = tf.get_variable(
                        "emb_share", [self.vocab_size, self.embed_size],
                        dtype=tf.float32)
                else:
                    # initialize the embedding by pre-trained word vectors
                    # print() with one argument behaves the same on
                    # Python 2 and Python 3.
                    print("share pre-trained embed")
                    shared_emb = tf.get_variable(
                        'emb_share', dtype=tf.float32, initializer=embed)
                # Encoder and decoder share one embedding matrix.
                self.emb_enc = self.emb_dec = shared_emb

            self.enc_post = tf.nn.embedding_lookup(self.emb_enc, self.post)
            self.enc_ref = tf.nn.embedding_lookup(self.emb_enc, self.ref)
            self.enc_response = tf.nn.embedding_lookup(self.emb_enc,
                                                       self.response)

            self.batch_len = tf.shape(self.response)[1]
            self.batch_size = tf.shape(self.response)[0]
            # Decoder input = response shifted right by one step: a GO_ID
            # column is prepended and the last column is dropped.
            self.response_input = tf.concat([
                tf.ones((self.batch_size, 1), dtype=tf.int64) * GO_ID,
                tf.split(self.response, [self.batch_len - 1, 1], axis=1)[0]
            ], 1)
            self.dec_inp = tf.nn.embedding_lookup(self.emb_dec,
                                                  self.response_input)

            # Defaults to 1.0 when not fed.
            self.keep_prob = tf.placeholder_with_default(1.0, ())
            self.use_prior = tf.placeholder(dtype=tf.bool, name="use_prior")
示例#2
0
 def _init_vocabs(self):
     """Create the word and entity lookup tables (both directions).

     Every table is a checkpointed ``MutableHashTable`` whose ``name`` and
     ``shared_name`` are the same string.
     """

     def make_table(key_dtype, value_dtype, default_value, table_name):
         # One factory for all four tables so their common options
         # (shared_name == name, checkpoint=True) are stated once.
         return MutableHashTable(key_dtype=key_dtype,
                                 value_dtype=value_dtype,
                                 default_value=default_value,
                                 shared_name=table_name,
                                 name=table_name,
                                 checkpoint=True)

     # Word vocabulary: string -> id (UNK_ID when missing) and id -> string.
     self.symbol2index = make_table(tf.string, tf.int64, UNK_ID, "in_table")
     self.index2symbol = make_table(tf.int64, tf.string, '_UNK', "out_table")

     # Entity vocabulary: string -> id (NONE_ID when missing) and back.
     self.entity2index = make_table(tf.string, tf.int64, NONE_ID,
                                    "entity_in_table")
     self.index2entity = make_table(tf.int64, tf.string, '_NONE',
                                    "entity_out_table")
示例#3
0
文件: model.py 项目: juvu/seq2seq_cn
    def __init__(
            self,
            embed,  # initial value of the word-embedding matrix
            vocabulary,
            vocabulary_count,
            num_layers,  # number of layers in the encoder and the decoder
            num_units,  # hidden-state size of the encoder and the decoder
            learning_rate,
            max_gradient_norm,
            max_len):
        """Build the seq2seq training and inference graph.

        Args:
            embed: initial value for the trainable embedding variable.
            vocabulary: list of vocabulary strings, stored as a string
                constant.
            vocabulary_count: passed to ``get_project_funtion``; presumably
                the vocabulary size - TODO confirm.
            num_layers: number of stacked LSTM layers per RNN.
            num_units: LSTM hidden size.
            learning_rate: learning rate for the Adam optimizer.
            max_gradient_norm: global-norm threshold for gradient clipping.
            max_len: passed to the inference ``dynamic_decoder`` call.
        """
        self.post_string = tf.placeholder(
            dtype=tf.string, shape=(None, None),
            name="post_string")  # post tokens, batch_size * length
        self.response_string = tf.placeholder(
            dtype=tf.string, shape=(None, None),
            name="response_string")  # response tokens, batch_size * length
        self.label_string = tf.placeholder(dtype=tf.string,
                                           shape=(None, None),
                                           name="label_string")
        self.post_len = tf.placeholder(dtype=tf.int32,
                                       shape=(None, ),
                                       name="post_len")  # post lengths
        # The graph name "reponse_len" (sic) is kept unchanged because
        # callers may address the tensor by name.
        self.response_len = tf.placeholder(dtype=tf.int32,
                                           shape=(None, ),
                                           name="reponse_len")
        # Word embeddings, held in a variable so they are trained.
        self.embed = tf.get_variable(dtype=tf.float32,
                                     initializer=embed,
                                     name="embed")
        self.vocabulary = tf.constant(vocabulary, dtype=tf.string)

        self.batch_size = tf.shape(self.post_string)[0]
        self.encoder_len = tf.shape(self.post_string)[1]
        self.decoder_len = tf.shape(self.response_string)[1]

        # one_hot marks position response_len - 1; the reversed cumulative
        # sum turns that into 1.0 for every step < response_len, 0.0 after.
        self.mask = tf.cumsum(tf.one_hot(self.response_len - 1,
                                         self.decoder_len),
                              axis=1,
                              reverse=True)

        # Table mapping token strings to ids (id 1 for unknown tokens).
        self.string_to_id = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=1,
                                             shared_name="string_to_id",
                                             name="string_to_id",
                                             checkpoint=True)
        # Table mapping ids back to token strings.
        self.id_to_string = MutableHashTable(key_dtype=tf.int64,
                                             value_dtype=tf.string,
                                             default_value="_NDW",
                                             shared_name="id_to_string",
                                             name="id_to_string",
                                             checkpoint=True)

        # Convert post, response and label tokens to ids.
        self.post_id = self.string_to_id.lookup(
            self.post_string)  # batch_size * length
        self.response_id = self.string_to_id.lookup(
            self.response_string)  # batch_size * length
        self.label_id = self.string_to_id.lookup(self.label_string)

        # Embed post and response through the *variable* self.embed.  The
        # previous code looked up the raw `embed` argument, so the training
        # path never updated the embeddings while the inference decoder
        # below used self.embed.
        self.post_embed = tf.nn.embedding_lookup(
            self.embed, self.post_id)  # batch_size * length * embed_size
        self.response_embed = tf.nn.embedding_lookup(
            self.embed, self.response_id)  # batch_size * length * embed_size

        # Stacked LSTM cells for the encoder and the decoder.
        encoder_cell = MultiRNNCell(
            [LSTMCell(num_units) for _ in range(num_layers)])
        decoder_cell = MultiRNNCell(
            [LSTMCell(num_units) for _ in range(num_layers)])

        projection_fn, loss_fn, inference_fn = get_project_funtion(
            vocabulary_count)

        with tf.variable_scope("encoder"):
            self.encoder_output, self.encoder_state = tf.nn.dynamic_rnn(
                encoder_cell, self.post_embed, self.post_len, dtype=tf.float32)

        # Training decoder: fed with the embedded response.
        with tf.variable_scope("decoder"):
            (self.decoder_output, self.decoder_state,
             self.loop_state) = dynamic_decoder(
                 decoder_cell,
                 encoder_state=self.encoder_state,
                 input=self.response_embed,
                 response_len=self.response_len)
            self.loss, self.avg_loss = loss_fn(self.decoder_output,
                                               self.label_id, self.mask)

        # Inference decoder: reuses the "decoder" variables and is driven by
        # the projection function and the embedding matrix instead of a
        # ground-truth response.
        with tf.variable_scope("decoder", reuse=True):
            (self.inference_output, self.inference_state,
             self.inference_loop_state) = dynamic_decoder(
                 decoder_cell,
                 encoder_state=self.encoder_state,
                 projection_function=projection_fn,
                 embed=self.embed,
                 max_len=max_len)
            self.inference_maximum_likelihood_id = inference_fn(
                self.inference_output)
            self.inference_string = self.id_to_string.lookup(
                self.inference_maximum_likelihood_id)

        self.global_step = tf.Variable(0, trainable=False, name="global_step")
        # NOTE(review): global_variables() also contains the non-trainable
        # global_step; tf.trainable_variables() is probably what is meant.
        # Left unchanged because self.params is a public attribute.
        self.params = tf.global_variables()
        opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
        gradients = tf.gradients(self.loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)

        self.saver = tf.train.Saver(max_to_keep=3)
示例#4
0
class CVAE(object):
    """Conditional VAE seq2seq model (TensorFlow 1.x graph).

    The constructor builds the complete graph: input pipeline, three
    encoders (post, reference, response), recognition and prior networks,
    the latent sample, and three decoders (teacher-forced training, greedy
    inference, beam search) that share the "decode" variable scope.
    """

    def __init__(self, tfFLAGS, embed=None):
        """Read hyper-parameters from ``tfFLAGS`` and build the graph.

        Args:
            tfFLAGS: object exposing the hyper-parameters read below as
                attributes (``vocab_size``, ``embed_size``, ``num_units``,
                ``num_layers``, ``beam_width``, ``use_lstm``, ``attn_mode``,
                ``keep_prob``, ``max_decode_len``, ``bi_encode``,
                ``recog_hidden_units``, ``prior_hidden_units``, ``z_dim``,
                ``full_kl_step``, ``opt``, ``learning_rate`` and, depending
                on ``opt``, ``learning_rate_decay_factor`` / ``momentum``).
            embed: optional initial value for the shared embedding matrix;
                forwarded to ``_make_input``.
        """
        self.vocab_size = tfFLAGS.vocab_size
        self.embed_size = tfFLAGS.embed_size
        self.num_units = tfFLAGS.num_units
        self.num_layers = tfFLAGS.num_layers
        self.beam_width = tfFLAGS.beam_width
        self.use_lstm = tfFLAGS.use_lstm
        self.attn_mode = tfFLAGS.attn_mode
        self.train_keep_prob = tfFLAGS.keep_prob
        self.max_decode_len = tfFLAGS.max_decode_len
        self.bi_encode = tfFLAGS.bi_encode
        self.recog_hidden_units = tfFLAGS.recog_hidden_units
        self.prior_hidden_units = tfFLAGS.prior_hidden_units
        self.z_dim = tfFLAGS.z_dim
        self.full_kl_step = tfFLAGS.full_kl_step

        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        self.max_gradient_norm = 5.0
        if tfFLAGS.opt == 'SGD':
            # Only SGD keeps the learning rate in a variable so that it can
            # be decayed by running learning_rate_decay_op.
            self.learning_rate = tf.Variable(float(tfFLAGS.learning_rate),
                                             trainable=False,
                                             dtype=tf.float32)
            self.learning_rate_decay_op = self.learning_rate.assign(
                self.learning_rate * tfFLAGS.learning_rate_decay_factor)
            self.opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        elif tfFLAGS.opt == 'Momentum':
            # Previously this branch left self.learning_rate undefined.
            self.learning_rate = tfFLAGS.learning_rate
            self.opt = tf.train.MomentumOptimizer(
                learning_rate=self.learning_rate, momentum=tfFLAGS.momentum)
        else:
            self.learning_rate = tfFLAGS.learning_rate
            # NOTE(review): Adam is built with its default learning rate;
            # tfFLAGS.learning_rate is stored but not used here. Left as-is
            # because the flag default may be tuned for SGD - confirm.
            self.opt = tf.train.AdamOptimizer()

        self._make_input(embed)

        with tf.variable_scope("output_layer"):
            # Projection from decoder output to vocabulary logits, shared by
            # all three decoders.
            self.output_layer = Dense(
                self.vocab_size,
                kernel_initializer=tf.truncated_normal_initializer(stddev=0.1))

        with tf.variable_scope("encoders",
                               initializer=tf.orthogonal_initializer()):
            self.enc_post_outputs, self.enc_post_state = self._build_encoder(
                scope='post_encoder',
                inputs=self.enc_post,
                sequence_length=self.post_len)
            self.enc_ref_outputs, self.enc_ref_state = self._build_encoder(
                scope='ref_encoder',
                inputs=self.enc_ref,
                sequence_length=self.ref_len)
            (self.enc_response_outputs,
             self.enc_response_state) = self._build_encoder(
                 scope='resp_encoder',
                 inputs=self.enc_response,
                 sequence_length=self.response_len)

            self.post_state = self._get_representation_from_enc_state(
                self.enc_post_state)
            self.ref_state = self._get_representation_from_enc_state(
                self.enc_ref_state)
            self.response_state = self._get_representation_from_enc_state(
                self.enc_response_state)
            # Condition vector = post representation + reference
            # representation.
            self.cond_embed = tf.concat([self.post_state, self.ref_state],
                                        axis=-1)

        with tf.variable_scope("RecognitionNetwork"):
            # The recognition network sees the condition AND the response.
            recog_input = tf.concat([self.cond_embed, self.response_state],
                                    axis=-1)
            recog_hidden = tf.layers.dense(inputs=recog_input,
                                           units=self.recog_hidden_units,
                                           activation=tf.nn.tanh)
            recog_mulogvar = tf.layers.dense(inputs=recog_hidden,
                                             units=self.z_dim * 2,
                                             activation=None)
            # First half of the output is mu, second half is logvar.
            recog_mu, recog_logvar = tf.split(recog_mulogvar, 2, axis=-1)

        with tf.variable_scope("PriorNetwork"):
            # The prior network sees the condition only.
            prior_input = self.cond_embed
            prior_hidden = tf.layers.dense(inputs=prior_input,
                                           units=self.prior_hidden_units,
                                           activation=tf.nn.tanh)
            prior_mulogvar = tf.layers.dense(inputs=prior_hidden,
                                             units=self.z_dim * 2,
                                             activation=None)
            prior_mu, prior_logvar = tf.split(prior_mulogvar, 2, axis=-1)

        with tf.variable_scope("GenerationNetwork"):
            # use_prior selects which distribution the latent is drawn from.
            latent_sample = tf.cond(
                self.use_prior,
                lambda: sample_gaussian(prior_mu, prior_logvar),
                lambda: sample_gaussian(recog_mu, recog_logvar),
                name='latent_sample')

            gen_input = tf.concat([self.cond_embed, latent_sample], axis=-1)
            # One linear projection of gen_input per decoder layer (two per
            # layer for LSTM: c then h). The creation order of the dense
            # layers determines their variable names, so it must not change.
            if self.use_lstm:
                self.dec_init_state = tuple([
                    tf.contrib.rnn.LSTMStateTuple(
                        c=tf.layers.dense(inputs=gen_input,
                                          units=self.num_units,
                                          activation=None),
                        h=tf.layers.dense(inputs=gen_input,
                                          units=self.num_units,
                                          activation=None))
                    for _ in range(self.num_layers)
                ])
                print(self.dec_init_state)
            else:
                self.dec_init_state = tuple([
                    tf.layers.dense(inputs=gen_input,
                                    units=self.num_units,
                                    activation=None)
                    for _ in range(self.num_layers)
                ])

            kld = gaussian_kld(recog_mu, recog_logvar, prior_mu, prior_logvar)
            self.avg_kld = tf.reduce_mean(kld)
            # KL weight grows linearly with global_step and is capped at 1.0
            # once global_step reaches full_kl_step.
            self.kl_weights = tf.minimum(
                tf.to_float(self.global_step) / self.full_kl_step, 1.0)
            self.kl_loss = self.kl_weights * self.avg_kld

        self._build_decoder()
        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=1,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)
        for var in tf.trainable_variables():
            print(var)

    def _make_input(self, embed):
        """Create lookup tables, input placeholders and embedded inputs.

        Args:
            embed: initial value for the shared embedding matrix, or ``None``
                to create a ``[vocab_size, embed_size]`` variable with the
                default initializer.
        """
        # token -> id and id -> token tables; both are saved with checkpoints.
        self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=UNK_ID,
                                             shared_name="in_table",
                                             name="in_table",
                                             checkpoint=True)
        self.index2symbol = MutableHashTable(key_dtype=tf.int64,
                                             value_dtype=tf.string,
                                             default_value=_UNK,
                                             shared_name="out_table",
                                             name="out_table",
                                             checkpoint=True)
        with tf.variable_scope("input"):
            # 2-D string placeholders; presumably [batch, time] - TODO confirm.
            self.post_string = tf.placeholder(tf.string, (None, None),
                                              'post_string')
            self.ref_string = tf.placeholder(tf.string, (None, None),
                                             'ref_string')
            self.response_string = tf.placeholder(tf.string, (None, None),
                                                  'response_string')

            # Token strings are mapped to ids; unknown tokens get UNK_ID.
            self.post = self.symbol2index.lookup(self.post_string)
            self.post_len = tf.placeholder(tf.int32, (None, ), 'post_len')
            self.ref = self.symbol2index.lookup(self.ref_string)
            self.ref_len = tf.placeholder(tf.int32, (None, ), 'ref_len')
            self.response = self.symbol2index.lookup(self.response_string)
            self.response_len = tf.placeholder(tf.int32, (None, ),
                                               'response_len')

            with tf.variable_scope("embedding"):
                if embed is None:
                    # initialize the embedding randomly
                    shared_emb = tf.get_variable(
                        "emb_share", [self.vocab_size, self.embed_size],
                        dtype=tf.float32)
                else:
                    # initialize the embedding by pre-trained word vectors
                    print("share pre-trained embed")
                    shared_emb = tf.get_variable(
                        'emb_share', dtype=tf.float32, initializer=embed)
                # Encoder and decoder share one embedding matrix.
                self.emb_enc = self.emb_dec = shared_emb

            self.enc_post = tf.nn.embedding_lookup(self.emb_enc, self.post)
            self.enc_ref = tf.nn.embedding_lookup(self.emb_enc, self.ref)
            self.enc_response = tf.nn.embedding_lookup(self.emb_enc,
                                                       self.response)

            self.batch_len = tf.shape(self.response)[1]
            self.batch_size = tf.shape(self.response)[0]
            # Decoder input = response shifted right by one step: a GO_ID
            # column is prepended and the last column is dropped.
            self.response_input = tf.concat([
                tf.ones((self.batch_size, 1), dtype=tf.int64) * GO_ID,
                tf.split(self.response, [self.batch_len - 1, 1], axis=1)[0]
            ], 1)
            self.dec_inp = tf.nn.embedding_lookup(self.emb_dec,
                                                  self.response_input)

            # Defaults to 1.0 when not fed; step() feeds train_keep_prob
            # while training.
            self.keep_prob = tf.placeholder_with_default(1.0, ())
            self.use_prior = tf.placeholder(dtype=tf.bool, name="use_prior")

    def _build_encoder(self, scope, inputs, sequence_length):
        """Run a (bi)directional RNN encoder inside variable scope ``scope``.

        Args:
            scope: variable-scope name for this encoder.
            inputs: embedded input sequence.
            sequence_length: per-example lengths passed to the dynamic RNN.

        Returns:
            ``(outputs, state)``. With ``bi_encode`` the forward/backward
            outputs are concatenated on the last axis and ``state`` is a
            tuple with one concatenated fw+bw entry per layer; otherwise the
            result of ``tf.nn.dynamic_rnn`` is returned unchanged.
        """
        with tf.variable_scope(scope):
            if not self.bi_encode:
                enc_cell = self._build_encoder_cell()
                return tf.nn.dynamic_rnn(cell=enc_cell,
                                         inputs=inputs,
                                         sequence_length=sequence_length,
                                         dtype=tf.float32)

            cell_fw, cell_bw = self._build_biencoder_cell()
            outputs, (fw_states, bw_states) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=cell_fw,
                cell_bw=cell_bw,
                inputs=inputs,
                sequence_length=sequence_length,
                dtype=tf.float32)
            enc_outputs = tf.concat(outputs, axis=-1)

            # Merge the forward and backward state of each layer.
            enc_state = []
            for fw, bw in zip(fw_states, bw_states):
                if self.use_lstm:
                    merged_c = tf.concat([fw.c, bw.c], axis=-1)
                    merged_h = tf.concat([fw.h, bw.h], axis=-1)
                    enc_state.append(
                        tf.contrib.rnn.LSTMStateTuple(c=merged_c, h=merged_h))
                else:
                    enc_state.append(tf.concat([fw, bw], axis=-1))
            return enc_outputs, tuple(enc_state)

    def _get_representation_from_enc_state(self, enc_state):
        """Flatten a multi-layer encoder state into one vector per example.

        For LSTM only the ``h`` part of every layer is used; for GRU the
        layer states themselves. Layers are concatenated on the last axis.
        """
        if self.use_lstm:
            return tf.concat([state.h for state in enc_state], axis=-1)
        return tf.concat(enc_state, axis=-1)

    def _build_decoder(self):
        """Build the training, greedy and beam-search decoders and the loss.

        All three decoders live in variable scope "decode"; the second and
        third reuse the variables created by the first. Sets ``sen_loss``,
        ``ppl_loss``, ``elbo``, ``train_op``, ``train_out``, ``inference``
        and ``beam_out``.
        """
        with tf.variable_scope("decode",
                               initializer=tf.orthogonal_initializer()):
            dec_cell, init_state = self._build_decoder_cell(
                self.enc_post_outputs, self.post_len, self.dec_init_state)

            # Teacher forcing: the decoder is fed the shifted response.
            train_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=self.dec_inp, sequence_length=self.response_len)
            train_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=dec_cell,
                helper=train_helper,
                initial_state=init_state,
                output_layer=self.output_layer)
            train_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=train_decoder,
                maximum_iterations=self.max_decode_len,
            )
            logits = train_output.rnn_output

            mask = tf.sequence_mask(self.response_len,
                                    self.batch_len,
                                    dtype=tf.float32)

            crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.response, logits=logits)
            crossent = tf.reduce_sum(crossent * mask)
            # Summed token loss averaged over the batch (per sentence).
            self.sen_loss = crossent / tf.to_float(self.batch_size)

            # Loss averaged over all unmasked tokens; the original author
            # notes this equals tf.contrib.seq2seq.sequence_loss(
            # train_output.rnn_output, self.response, mask).
            self.ppl_loss = crossent / tf.reduce_sum(mask)

            # Reconstruction loss plus annealed KL term.
            self.elbo = self.sen_loss + self.kl_loss

            # Calculate and clip gradients
            params = tf.trainable_variables()
            gradients = tf.gradients(self.elbo, params)
            clipped_gradients, _ = tf.clip_by_global_norm(
                gradients, self.max_gradient_norm)
            self.train_op = self.opt.apply_gradients(
                zip(clipped_gradients, params), global_step=self.global_step)

            self.train_out = self.index2symbol.lookup(
                tf.cast(train_output.sample_id, tf.int64), name='train_out')

        with tf.variable_scope("decode", reuse=True):
            dec_cell, init_state = self._build_decoder_cell(
                self.enc_post_outputs, self.post_len, self.dec_init_state)

            start_tokens = tf.tile(tf.constant([GO_ID], dtype=tf.int32),
                                   [self.batch_size])
            end_token = EOS_ID
            infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                self.emb_dec, start_tokens, end_token)
            infer_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=dec_cell,
                helper=infer_helper,
                initial_state=init_state,
                output_layer=self.output_layer)
            infer_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=infer_decoder,
                maximum_iterations=self.max_decode_len,
            )

            self.inference = self.index2symbol.lookup(
                tf.cast(infer_output.sample_id, tf.int64), name='inference')

        with tf.variable_scope("decode", reuse=True):
            # Beam search needs every batch-major input repeated beam_width
            # times.
            dec_init_state = tf.contrib.seq2seq.tile_batch(
                self.dec_init_state, self.beam_width)
            enc_outputs = tf.contrib.seq2seq.tile_batch(
                self.enc_post_outputs, self.beam_width)
            post_len = tf.contrib.seq2seq.tile_batch(self.post_len,
                                                     self.beam_width)

            dec_cell, init_state = self._build_decoder_cell(
                enc_outputs,
                post_len,
                dec_init_state,
                beam_width=self.beam_width)

            beam_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                cell=dec_cell,
                embedding=self.emb_dec,
                start_tokens=tf.ones_like(self.post_len) * GO_ID,
                end_token=EOS_ID,
                initial_state=init_state,
                beam_width=self.beam_width,
                output_layer=self.output_layer)
            beam_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=beam_decoder,
                maximum_iterations=self.max_decode_len,
            )

            self.beam_out = self.index2symbol.lookup(
                tf.cast(beam_output.predicted_ids, tf.int64), name='beam_out')

    def _build_rnn_stack(self, num_units):
        """Return ``num_layers`` dropout-wrapped LSTM or GRU cells, stacked."""
        if self.use_lstm:
            cell_cls = tf.contrib.rnn.LSTMCell
        else:
            cell_cls = tf.contrib.rnn.GRUCell
        return tf.contrib.rnn.MultiRNNCell([
            tf.contrib.rnn.DropoutWrapper(cell_cls(num_units), self.keep_prob)
            for _ in range(self.num_layers)
        ])

    def _build_encoder_cell(self):
        """Return the unidirectional encoder cell (``num_units`` wide)."""
        return self._build_rnn_stack(self.num_units)

    def _build_biencoder_cell(self):
        """Return ``(cell_fw, cell_bw)``, each ``num_units // 2`` wide.

        Floor division keeps the cell size an integer on Python 3 as well;
        with ``/`` it silently became a float there.
        """
        half_units = self.num_units // 2
        cell_fw = self._build_rnn_stack(half_units)
        cell_bw = self._build_rnn_stack(half_units)
        return cell_fw, cell_bw

    def _build_decoder_cell(self,
                            memory,
                            memory_len,
                            encode_state,
                            beam_width=1):
        """Return ``(cell, initial_state)`` for a decoder.

        Args:
            memory: encoder outputs to attend over.
            memory_len: lengths of the memory sequences.
            encode_state: state the decoder RNN starts from.
            beam_width: multiplier applied to ``batch_size`` when sizing the
                attention wrapper's zero state.

        Returns:
            For ``attn_mode`` 'Luong' or 'Bahdanau': an ``AttentionWrapper``
            and its zero state with ``cell_state`` replaced by
            ``encode_state``. For any other ``attn_mode``: the plain RNN
            stack and ``encode_state`` unchanged.
        """
        cell = self._build_rnn_stack(self.num_units)

        if self.attn_mode == 'Luong':
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(
                num_units=self.num_units,
                memory=memory,
                memory_sequence_length=memory_len,
                scale=True)
        elif self.attn_mode == 'Bahdanau':
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                num_units=self.num_units,
                memory=memory,
                memory_sequence_length=memory_len,
                scale=True)
        else:
            # No attention requested.
            return cell, encode_state

        attn_cell = tf.contrib.seq2seq.AttentionWrapper(
            cell=cell,
            attention_mechanism=attention_mechanism,
            attention_layer_size=self.num_units,
        )
        init_state = attn_cell.zero_state(
            self.batch_size * beam_width,
            tf.float32).clone(cell_state=encode_state)
        return attn_cell, init_state

    def initialize(self, sess, vocab):
        """Initialise all variables and fill both vocabulary tables.

        Args:
            sess: session used to run the initialisation ops.
            vocab: sequence of token strings; the token at position ``i``
                gets id ``i``.
        """
        # list(...) so that a Python 3 range object is not handed to
        # constant(); on Python 2 range() already returns a list.
        indices = list(range(len(vocab)))
        op_in = self.symbol2index.insert(
            constant_op.constant(vocab),
            constant_op.constant(indices, dtype=tf.int64))
        op_out = self.index2symbol.insert(
            constant_op.constant(indices, dtype=tf.int64),
            constant_op.constant(vocab))
        sess.run(tf.global_variables_initializer())
        sess.run([op_in, op_out])

    def step(self, sess, data, is_train=False):
        """Run one batch through the model.

        Args:
            sess: session to run in.
            data: dict with keys 'post', 'post_len', 'ref', 'ref_len',
                'response' and 'response_len'.
            is_train: when true, also runs ``train_op`` and feeds
                ``train_keep_prob`` as the dropout keep probability.

        Returns:
            The fetched values as a list:
            ``[ppl_loss, elbo, sen_loss, kl_loss, avg_kld, kl_weights]``,
            preceded by the ``train_op`` result when ``is_train`` is true.
        """
        input_feed = {
            self.post_string: data['post'],
            self.post_len: data['post_len'],
            self.ref_string: data['ref'],
            self.ref_len: data['ref_len'],
            self.response_string: data['response'],
            self.response_len: data['response_len'],
            # Training must sample the latent from the recognition network
            # so the reconstruction loss reaches it; the prior is used
            # otherwise. This flag was previously fed `is_train`, which
            # trained on prior samples.
            self.use_prior: not is_train,
        }
        metrics = [
            self.ppl_loss,
            self.elbo,
            self.sen_loss,
            self.kl_loss,
            self.avg_kld,
            self.kl_weights,
        ]
        if is_train:
            output_feed = [self.train_op] + metrics
            input_feed[self.keep_prob] = self.train_keep_prob
        else:
            output_feed = metrics
        return sess.run(output_feed, input_feed)
示例#5
0
    def __init__(
            self,
            num_symbols,  # vocabulary size
            num_embed_units,  # word-embedding size
            num_units,  # number of units per RNN layer
            num_layers,  # number of RNN layers
            embed,  # word embeddings
            entity_embed=None,  # embeddings of entities + relations
            num_entities=0,  # total number of entities + relations
            num_trans_units=100,  # entity-embedding dimension
            memory_units=100,
            learning_rate=0.0001,  # learning rate
            learning_rate_decay_factor=0.95,  # learning-rate decay; not actually applied
            max_gradient_norm=5.0,  #
            num_samples=500,  # number of samples, sampled softmax
            max_length=60,
            mem_use=True,
            output_alignments=True,
            use_lstm=False):
        """Build the whole graph: inputs, tables, encoders, decoders, training.

        The constructor creates, in order: the string placeholders, four
        checkpointed hash tables (word/entity <-> id), the word and entity
        embeddings, a graph-attention summary of the knowledge triples, a GRU
        encoder for posts and one for responses, two memory-network read-outs
        over the graph summary, a training decoder and an inference decoder
        that share the 'decoder' variable scope, and finally the Adam update
        op, summaries and savers.

        Args:
            num_symbols: vocabulary size; shape of 'word_embed' when ``embed``
                is None, and passed to the output projection / inference
                decoder.
            num_embed_units: word-embedding width.
            num_units: units per GRU layer.
            num_layers: number of GRU layers in each MultiRNNCell.
            embed: initializer for 'word_embed', or None to let
                ``tf.get_variable`` create it from the shape alone.
            entity_embed: initializer for the non-trainable 'entity_embed'
                variable, or None to create it from
                ``[num_entities, num_trans_units]``.
            num_entities: number of rows of 'entity_embed' when
                ``entity_embed`` is None.
            num_trans_units: entity-embedding width.
            memory_units: width of the memory-network projections.
            learning_rate: initial value of ``self.learning_rate`` and the
                value handed to the Adam optimizer.
            learning_rate_decay_factor: multiplier used by
                ``self.learning_rate_decay_op``.
            max_gradient_norm: threshold for ``tf.clip_by_global_norm``.
            num_samples: passed to ``output_projection_layer``.
            max_length: passed to the inference decoder function.
            mem_use: combined with ``output_alignments`` when preparing the
                attention and the training decoder function.
            output_alignments: selects ``total_loss`` (True) or
                ``sequence_loss`` (False) for ``self.decoder_loss``.
            use_lstm: not referenced anywhere in this constructor.
        """

        self.posts = tf.placeholder(tf.string, (None, None),
                                    'enc_inps')  # [batch_size, encoder_len]
        # NOTE(review): `(None)` is just `None`, so no static shape is declared
        # for the two length placeholders; `(None, )` may have been intended.
        self.posts_length = tf.placeholder(tf.int32, (None),
                                           'enc_lens')  # [batch_size]
        self.responses = tf.placeholder(
            tf.string, (None, None), 'dec_inps')  # [batch_size, decoder_len]
        self.responses_length = tf.placeholder(tf.int32, (None),
                                               'dec_lens')  # [batch_size]
        self.entities = tf.placeholder(
            tf.string, (None, None, None),
            'entities')  # [batch_size, triple_num, triple_len]
        self.entity_masks = tf.placeholder(tf.string, (None, None),
                                           'entity_masks')  # unused
        self.triples = tf.placeholder(
            tf.string, (None, None, None, 3),
            'triples')  # [batch_size, triple_num, triple_len, 3]
        self.posts_triple = tf.placeholder(
            tf.int32, (None, None, 1),
            'enc_triples')  # [batch_size, encoder_len, 1]
        self.responses_triple = tf.placeholder(
            tf.string, (None, None, 3),
            'dec_triples')  # [batch_size, decoder_len, 3]
        self.match_triples = tf.placeholder(
            tf.int32, (None, None, None),
            'match_triples')  # [batch_size, decoder_len, triple_num]

        # Encoder batch_size and encoder_len
        encoder_batch_size, encoder_len = tf.unstack(tf.shape(self.posts))
        triple_num = tf.shape(self.triples)[1]  # number of knowledge graphs
        triple_len = tf.shape(self.triples)[2]  # number of triples per graph

        # Knowledge triples that are used
        one_hot_triples = tf.one_hot(
            self.match_triples,
            triple_len)  # [batch_size, decoder_len, triple_num, triple_len]
        # Marks with 1 the time steps whose response token used a knowledge triple
        use_triples = tf.reduce_sum(one_hot_triples,
                                    axis=[2, 3])  # [batch_size, decoder_len]

        # Hash table mapping vocabulary words to indices
        self.symbol2index = MutableHashTable(
            key_dtype=tf.string,  # dtype of the key tensor
            value_dtype=tf.int64,  # dtype of the value tensor
            default_value=UNK_ID,  # value returned for a missing key
            shared_name=
            "in_table",  # If non-empty, this table will be shared under the given name across multiple sessions
            name="in_table",  # op name
            checkpoint=True
        )  # if True, the contents of the table are saved to and restored from checkpoints. If shared_name is empty for a checkpointed table, it is shared using the table node name.

        # Hash table mapping indices to vocabulary words
        self.index2symbol = MutableHashTable(key_dtype=tf.int64,
                                             value_dtype=tf.string,
                                             default_value='_UNK',
                                             shared_name="out_table",
                                             name="out_table",
                                             checkpoint=True)

        # Hash table mapping entities to indices
        self.entity2index = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=NONE_ID,
                                             shared_name="entity_in_table",
                                             name="entity_in_table",
                                             checkpoint=True)

        # Hash table mapping indices to entities
        self.index2entity = MutableHashTable(key_dtype=tf.int64,
                                             value_dtype=tf.string,
                                             default_value='_NONE',
                                             shared_name="entity_out_table",
                                             name="entity_out_table",
                                             checkpoint=True)

        self.posts_word_id = self.symbol2index.lookup(
            self.posts)  # [batch_size, encoder_len]
        self.posts_entity_id = self.entity2index.lookup(
            self.posts)  # [batch_size, encoder_len]

        self.responses_target = self.symbol2index.lookup(
            self.responses)  # [batch_size, decoder_len]
        # Decoder batch_size and decoder_len
        batch_size, decoder_len = tf.shape(self.responses)[0], tf.shape(
            self.responses)[1]
        # Drop the last column of responses_target and prepend a GO_ID column
        self.responses_word_id = tf.concat([
            tf.ones([batch_size, 1], dtype=tf.int64) * GO_ID,
            tf.split(self.responses_target, [decoder_len - 1, 1], 1)[0]
        ], 1)  # [batch_size, decoder_len]

        # Mask of the response: reverse cumsum of a one-hot at position
        # length-1 gives 1 up to and including that position, 0 afterwards
        self.decoder_mask = tf.reshape(
            tf.cumsum(tf.one_hot(self.responses_length - 1, decoder_len),
                      reverse=True,
                      axis=1), [-1, decoder_len])  # [batch_size, decoder_len]

        # Create the word and entity embeddings: use the supplied values as
        # initializer when given, otherwise create them from the shape alone
        if embed is None:
            self.embed = tf.get_variable('word_embed',
                                         [num_symbols, num_embed_units],
                                         tf.float32)
        else:
            self.embed = tf.get_variable('word_embed',
                                         dtype=tf.float32,
                                         initializer=embed)
        if entity_embed is None:  # entity embedding is created with trainable=False
            self.entity_trans = tf.get_variable(
                'entity_embed', [num_entities, num_trans_units],
                tf.float32,
                trainable=False)
        else:
            self.entity_trans = tf.get_variable('entity_embed',
                                                dtype=tf.float32,
                                                initializer=entity_embed,
                                                trainable=False)

        # Pass the entity embeddings through a fully connected layer
        self.entity_trans_transformed = tf.layers.dense(
            self.entity_trans,
            num_trans_units,
            activation=tf.tanh,
            name='trans_transformation')
        # Add embeddings for these 7 symbols:
        # ['_NONE', '_PAD_H', '_PAD_R', '_PAD_T', '_NAF_H', '_NAF_R', '_NAF_T']
        padding_entity = tf.get_variable('entity_padding_embed',
                                         [7, num_trans_units],
                                         dtype=tf.float32,
                                         initializer=tf.zeros_initializer())
        self.entity_embed = tf.concat(
            [padding_entity, self.entity_trans_transformed], axis=0)

        # triples_embedding: [batch_size, triple_num, triple_len, 3*num_trans_units]
        # embeddings of the knowledge-graph triples
        triples_embedding = tf.reshape(
            tf.nn.embedding_lookup(self.entity_embed,
                                   self.entity2index.lookup(self.triples)),
            [encoder_batch_size, triple_num, -1, 3 * num_trans_units])
        # entities_word_embedding: [batch_size, triple_num*triple_len, num_embed_units]
        # word embeddings of every entity that appears in the knowledge graphs
        entities_word_embedding = tf.reshape(
            tf.nn.embedding_lookup(self.embed,
                                   self.symbol2index.lookup(self.entities)),
            [encoder_batch_size, -1, num_embed_units])
        # Split the triples into head, relation and tail
        # [batch_size, triple_num, triple_len, num_trans_units]
        head, relation, tail = tf.split(triples_embedding,
                                        [num_trans_units] * 3,
                                        axis=3)

        # Static graph attention
        with tf.variable_scope('graph_attention'):
            # Concatenate head and tail [batch_size, triple_num, triple_len, 2*num_trans_units]
            head_tail = tf.concat([head, tail], axis=3)
            # Feed head+tail into a dense layer [batch_size, triple_num, triple_len, num_trans_units]
            head_tail_transformed = tf.layers.dense(head_tail,
                                                    num_trans_units,
                                                    activation=tf.tanh,
                                                    name='head_tail_transform')
            # Feed the relation into a dense layer [batch_size, triple_num, triple_len, num_trans_units]
            relation_transformed = tf.layers.dense(relation,
                                                   num_trans_units,
                                                   name='relation_transform')
            # Inner product of the head-tail and relation vectors gives the
            # attention score of each triple
            e_weight = tf.reduce_sum(
                relation_transformed * head_tail_transformed,
                axis=3)  # [batch_size, triple_num, triple_len]
            alpha_weight = tf.nn.softmax(
                e_weight)  # [batch_size, triple_num, triple_len]
            # tf.expand_dims adds one axis to alpha_weight [batch_size, triple_num, triple_len, 1]
            # Summing over axis 2 produces the vector representation of each static graph
            graph_embed = tf.reduce_sum(
                tf.expand_dims(alpha_weight, 3) * head_tail,
                axis=2)  # [batch_size, triple_num, 2*num_trans_units]
        # The bare string below is the original (Chinese) explanation of the
        # commented-out `graph_embed_input` computation that follows it. In
        # English:
        #   1. The 1-D range [0, 1, ..., encoder_batch_size-1] is reshaped to
        #      [encoder_batch_size, 1, 1].
        #   2. tf.tile repeats it encoder_len times along axis 1, giving
        #      [encoder_batch_size, encoder_len, 1].
        #   3. It is concatenated with posts_triple [batch_size, encoder_len, 1]
        #      along axis 2, giving indices of shape [batch_size, encoder_len, 2].
        #   4. tf.gather_nd looks those indices up in graph_embed; an index
        #      [b, g] selects the static-graph vector of graph g for batch item
        #      b, so the result is
        #      [encoder_batch_size, encoder_len, 2*num_trans_units]: the static
        #      graph vector used at each time step.
        """graph_embed_input
        1、首先一维的range列表[0, 1, 2... encoder_batch_size个]转化成三维的[encoder_batch_size, 1, 1]的矩阵
        [[[0]], [[1]], [[2]],...]
        2、然后tf.tile将矩阵的第1维复制encoder_len遍,变成[encoder_batch_size, encoder_len, 1]
        [[[0],[0]...]],...]
        3、与posts_triple: [batch_size, encoder_len, 1]在第2维上进行拼接,形成一个indices: [batch_size, encoder_len, 2]矩阵,
        indices矩阵:
        [
         [[0 0], [0 0], [0 0], [0 0], [0 1], [0 0], [0 2], [0 0],...encoder_len],
         [[1 0], [1 0], [1 0], [1 0], [1 1], [1 0], [1 2], [1 0],...encoder_len],
         [[2 0], [2 0], [2 0], [2 0], [2 1], [2 0], [2 2], [2 0],...encoder_len]
         ,...batch_size
        ]
        4、tf.gather_nd根据索引检索graph_embed: [batch_size, triple_num, 2*num_trans_units]再回填至indices矩阵
        indices矩阵最后一个维度是2,例如有[0, 2],表示这个时间步第1个batch用了第2个图,
        则找到这个知识图的静态图向量填入到indices矩阵的[0, 2]位置最后得到结果维度
        [encoder_batch_size, encoder_len, 2*num_trans_units]表示每个时间步用的静态图向量
        """
        # graph_embed_input = tf.gather_nd(graph_embed, tf.concat(
        #     [tf.tile(tf.reshape(tf.range(encoder_batch_size, dtype=tf.int32), [-1, 1, 1]), [1, encoder_len, 1]),
        #      self.posts_triple],
        #     axis=2))

        # Convert responses_triple into entity embeddings [batch_size, decoder_len, 300],
        # identifying which triple embedding each response time step used
        # triple_embed_input = tf.reshape(
        #     tf.nn.embedding_lookup(self.entity_embed, self.entity2index.lookup(self.responses_triple)),
        #     [batch_size, decoder_len, 3 * num_trans_units])

        post_word_input = tf.nn.embedding_lookup(
            self.embed,
            self.posts_word_id)  # [batch_size, encoder_len, num_embed_units]
        response_word_input = tf.nn.embedding_lookup(
            self.embed, self.responses_word_id
        )  # [batch_size, decoder_len, num_embed_units]

        # post_word_input concatenated with graph_embed_input would form the encoder input
        # [batch_size, encoder_len, num_embed_units+2*num_trans_units]
        # self.encoder_input = tf.concat([post_word_input, graph_embed_input], axis=2)
        # response_word_input concatenated with triple_embed_input would form the decoder input
        # [batch_size, decoder_len, num_embed_units+3*num_trans_units]
        # self.decoder_input = tf.concat([response_word_input, triple_embed_input], axis=2)

        encoder_cell = MultiRNNCell(
            [GRUCell(num_units) for _ in range(num_layers)])
        decoder_cell = MultiRNNCell(
            [GRUCell(num_units) for _ in range(num_layers)])

        # rnn encoder
        # encoder_state: final encoder state; [num_layers, 2, batch_size, num_units]
        # for LSTM, [num_layers, batch_size, num_units] for GRU
        encoder_output, encoder_state = tf.nn.dynamic_rnn(encoder_cell,
                                                          post_word_input,
                                                          self.posts_length,
                                                          dtype=tf.float32,
                                                          scope="encoder")

        # self.encoder_state_shape = tf.shape(encoder_state)

        # ######## Memory network ########
        response_encoder_cell = MultiRNNCell(
            [GRUCell(num_units) for _ in range(num_layers)])
        response_encoder_output, response_encoder_state = tf.nn.dynamic_rnn(
            response_encoder_cell,
            response_word_input,
            self.responses_length,
            dtype=tf.float32,
            scope="response_encoder")

        # graph_embed: [batch_size, triple_num, 2*num_trans_units] static graph vectors
        # encoder_state: [num_layers, batch_size, num_units]
        with tf.variable_scope("post_memory_network"):
            # Project the static knowledge graphs into input vectors m
            post_input = tf.layers.dense(graph_embed,
                                         memory_units,
                                         use_bias=False,
                                         name="post_weight_a")
            post_input = tf.tile(
                tf.reshape(post_input,
                           (1, encoder_batch_size, triple_num, memory_units)),
                multiples=(
                    num_layers, 1, 1,
                    1))  # [num_layers, batch_size, triple_num, memory_units]
            # Project the static knowledge base into output vectors c
            post_output = tf.layers.dense(graph_embed,
                                          memory_units,
                                          use_bias=False,
                                          name="post_weight_c")
            post_output = tf.tile(
                tf.reshape(post_output,
                           (1, encoder_batch_size, triple_num, memory_units)),
                multiples=(
                    num_layers, 1, 1,
                    1))  # [num_layers, batch_size, triple_num, memory_units]
            # Project the question (post encoder state) into state vectors u
            encoder_hidden_state = tf.reshape(
                tf.concat(encoder_state,
                          axis=0), (num_layers, encoder_batch_size, num_units))
            post_state = tf.layers.dense(encoder_hidden_state,
                                         memory_units,
                                         use_bias=False,
                                         name="post_weight_b")
            post_state = tf.tile(
                tf.reshape(post_state,
                           (num_layers, encoder_batch_size, 1, memory_units)),
                multiples=(
                    1, 1, triple_num,
                    1))  # [num_layers, batch_size, triple_num, memory_units]
            # Probability p
            post_p = tf.reshape(
                tf.nn.softmax(tf.reduce_sum(post_state * post_input, axis=3)),
                (num_layers, encoder_batch_size, triple_num,
                 1))  # [num_layers, batch_size, triple_num, 1]
            # Output o
            post_o = tf.reduce_sum(
                post_output * post_p,
                axis=2)  # [num_layers, batch_size, memory_units]
            post_xstar = tf.concat(
                [
                    tf.layers.dense(post_o,
                                    memory_units,
                                    use_bias=False,
                                    name="post_weight_r"), encoder_state
                ],
                axis=2)  # [num_layers, batch_size, num_units+memory_units]

        with tf.variable_scope("response_memory_network"):
            # Project the static knowledge graphs into input vectors m
            response_input = tf.layers.dense(graph_embed,
                                             memory_units,
                                             use_bias=False,
                                             name="response_weight_a")
            response_input = tf.tile(
                tf.reshape(response_input,
                           (1, batch_size, triple_num, memory_units)),
                multiples=(
                    num_layers, 1, 1,
                    1))  # [num_layers, batch_size, triple_num, memory_units]
            # Project the static knowledge base into output vectors c
            response_output = tf.layers.dense(graph_embed,
                                              memory_units,
                                              use_bias=False,
                                              name="response_weight_c")
            response_output = tf.tile(
                tf.reshape(response_output,
                           (1, batch_size, triple_num, memory_units)),
                multiples=(
                    num_layers, 1, 1,
                    1))  # [num_layers, batch_size, triple_num, memory_units]
            # Project the response encoder state into state vectors u
            response_hidden_state = tf.reshape(
                tf.concat(response_encoder_state, axis=0),
                (num_layers, batch_size, num_units))
            response_state = tf.layers.dense(response_hidden_state,
                                             memory_units,
                                             use_bias=False,
                                             name="response_weight_b")
            response_state = tf.tile(
                tf.reshape(response_state,
                           (num_layers, batch_size, 1, memory_units)),
                multiples=(
                    1, 1, triple_num,
                    1))  # [num_layers, batch_size, triple_num, memory_units]
            # Probability p
            response_p = tf.reshape(
                tf.nn.softmax(
                    tf.reduce_sum(response_state * response_input, axis=3)),
                (num_layers, batch_size, triple_num,
                 1))  # [num_layers, batch_size, triple_num, 1]
            # Output o
            response_o = tf.reduce_sum(
                response_output * response_p,
                axis=2)  # [num_layers, batch_size, memory_units]
            response_ystar = tf.concat(
                [
                    tf.layers.dense(response_o,
                                    memory_units,
                                    use_bias=False,
                                    name="response_weight_r"),
                    response_encoder_state
                ],
                axis=2)  # [num_layers, batch_size, num_units+memory_units]

        with tf.variable_scope("memory_network"):
            # Fuse the post and response read-outs into the decoder's initial state
            memory_hidden_state = tf.layers.dense(tf.concat(
                [post_xstar, response_ystar], axis=2),
                                                  num_units,
                                                  use_bias=False,
                                                  activation=tf.tanh,
                                                  name="output_weight")
            memory_hidden_state = tf.reshape(
                memory_hidden_state, (num_layers * batch_size, num_units))
            # Split back into a tuple of num_layers tensors of [batch_size, num_units]
            memory_hidden_state = tuple(
                tf.split(memory_hidden_state, [batch_size] * num_layers,
                         axis=0))
            # self.memory_hidden_state_shape = tf.shape(memory_hidden_state)
        # ######## end of memory network ########

        output_fn, selector_fn, sequence_loss, sampled_sequence_loss, total_loss =\
            output_projection_layer(num_units, num_symbols, num_samples)

        # ######## Decoder used for training ########
        with tf.variable_scope('decoder'):
            attention_keys_init, attention_values_init, attention_score_fn_init, attention_construct_fn_init \
                    = prepare_attention(encoder_output,
                                        'bahdanau',
                                        num_units,
                                        imem=(graph_embed, triples_embedding),
                                        output_alignments=output_alignments and mem_use)

            # Function that handles each time step's output and the next
            # time step's input during training
            decoder_fn_train = attention_decoder_fn_train(
                memory_hidden_state,
                attention_keys_init,
                attention_values_init,
                attention_score_fn_init,
                attention_construct_fn_init,
                output_alignments=output_alignments and mem_use,
                max_length=tf.reduce_max(self.responses_length))

            self.decoder_output, _, alignments_ta = dynamic_rnn_decoder(
                decoder_cell,
                decoder_fn_train,
                response_word_input,
                self.responses_length,
                scope="decoder_rnn")

            # NOTE(review): alignments are requested above with
            # `output_alignments and mem_use`, but this branch tests
            # `output_alignments` alone - confirm `alignments_ta` is usable
            # when mem_use is False.
            if output_alignments:
                self.alignments = tf.transpose(alignments_ta.stack(),
                                               perm=[1, 0, 2, 3])
                self.decoder_loss, self.ppx_loss, self.sentence_ppx = total_loss(
                    self.decoder_output, self.responses_target,
                    self.decoder_mask, self.alignments, triples_embedding,
                    use_triples, one_hot_triples)
                self.sentence_ppx = tf.identity(self.sentence_ppx,
                                                name='ppx_loss')
            else:
                self.decoder_loss = sequence_loss(self.decoder_output,
                                                  self.responses_target,
                                                  self.decoder_mask)
        # ######## end of training decoder ########
        # ######## Decoder used for inference ########
        with tf.variable_scope('decoder', reuse=True):
            attention_keys, attention_values, attention_score_fn, attention_construct_fn \
                    = prepare_attention(encoder_output,
                                        'bahdanau',
                                        num_units,
                                        reuse=True,
                                        imem=(graph_embed, triples_embedding),
                                        output_alignments=output_alignments and mem_use)

            decoder_fn_inference = \
                attention_decoder_fn_inference(output_fn,
                                               memory_hidden_state,
                                               attention_keys,
                                               attention_values,
                                               attention_score_fn,
                                               attention_construct_fn,
                                               self.embed,
                                               GO_ID,
                                               EOS_ID,
                                               max_length,
                                               num_symbols,
                                               imem=(entities_word_embedding,  # imem: ([batch_size,triple_num*triple_len,num_embed_units],
                                                     tf.reshape(triples_embedding, [encoder_batch_size, -1, 3*num_trans_units])),  # [encoder_batch_size, triple_num*triple_len, 3*num_trans_units]) tuple of entity word embeddings and triple embeddings
                                               selector_fn=selector_fn)
            # decoder_distribution: [batch_size, decoder_len, num_symbols]
            # output_ids_ta: tensorarray: decoder_len [batch_size]
            self.decoder_distribution, _, output_ids_ta = dynamic_rnn_decoder(
                decoder_cell, decoder_fn_inference, scope="decoder_rnn")

            output_len = tf.shape(self.decoder_distribution)[1]  # decoder_len
            output_ids = tf.transpose(
                output_ids_ta.gather(
                    tf.range(output_len)))  # [batch_size, decoder_len]

            # Clip the range of the output ids, because negative values mean
            # an entity word was used
            word_ids = tf.cast(tf.clip_by_value(output_ids, 0, num_symbols),
                               tf.int64)  # [batch_size, decoder_len]

            # Actual position of each entity word inside `entities` [batch_size, decoder_len]
            # 1. tf.shape(entities_word_embedding)[1] = triple_num*triple_len
            # 2. tf.range(encoder_batch_size): [batch_size]
            # 3. tf.reshape(tf.range(encoder_batch_size) * tf.shape(entities_word_embedding)[1], [-1, 1]): [batch_size, 1] base offset of each batch item in `entities`
            # 4. tf.clip_by_value(-output_ids, 0, num_symbols): [batch_size, decoder_len] offset of the entity word within the batch item
            # 5. entity_ids: flattened [batch_size*decoder_len] positions of the entity words in `entities`
            entity_ids = tf.reshape(
                tf.clip_by_value(-output_ids, 0, num_symbols) + tf.reshape(
                    tf.range(encoder_batch_size) *
                    tf.shape(entities_word_embedding)[1], [-1, 1]), [-1])

            # The entity words that were used [batch_size, decoder_len]
            # 1. entities: [batch_size, triple_num, triple_len]
            # 2. tf.reshape(self.entities, [-1]): [batch_size*triple_num*triple_len]
            # 3. tf.gather: [batch_size*decoder_len]
            # 4. entities: [batch_size, decoder_len]
            entities = tf.reshape(
                tf.gather(tf.reshape(self.entities, [-1]), entity_ids),
                [-1, output_len])

            words = self.index2symbol.lookup(word_ids)  # map ids to actual words
            # `output_ids > 0` is a boolean tensor: where True take the word
            # from `words`, elsewhere take the entity from `entities`
            self.generation = tf.where(output_ids > 0, words, entities)
            self.generation = tf.identity(
                self.generation,
                name='generation')  # [batch_size, decoder_len]
        # ######## end of inference decoder ########

        # Set up the training procedure
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=tf.float32)

        # The decayed learning rate is not actually used
        # NOTE(review): the optimizer below is built from the Python float
        # `learning_rate`, not from `self.learning_rate`, so running this op
        # does not change the value passed to Adam.
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)

        # Number of parameter updates performed
        self.global_step = tf.Variable(0, trainable=False)

        # Parameters to train
        # NOTE(review): tf.global_variables() also returns variables created
        # with trainable=False (e.g. 'entity_embed' above) - confirm that
        # updating them is intended, or whether tf.trainable_variables() was
        # meant.
        self.params = tf.global_variables()

        # Choose the optimization algorithm
        opt = tf.train.AdamOptimizer(learning_rate=learning_rate)

        self.lr = opt._lr

        # Compute gradients of decoder_loss with respect to params
        gradients = tf.gradients(self.decoder_loss, self.params)
        # Gradient clipping
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)

        # Record the loss
        tf.summary.scalar('decoder_loss', self.decoder_loss)
        for each in tf.trainable_variables():
            tf.summary.histogram(each.name, each)  # record how each variable evolves during training
        self.merged_summary_op = tf.summary.merge_all()

        # Rolling saver (keeps 3, plus one every hour) and a long-history saver
        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=3,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)
        self.saver_epoch = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                          max_to_keep=1000,
                                          pad_step_number=True)
示例#6
0
    def __init__(self,
                 num_lstm_units,
                 embed,
                 neg_num=4,
                 gradient_clip_threshold=5.0):
        """Build the query/document similarity graph and its training op.

        A query and ``neg_num + 1`` documents are each encoded by an LSTM;
        the cosine similarity between the query state and every document
        state is scaled by a learned ``gamma`` and soft-maxed across the
        documents. The loss is the mean negative log-probability of the
        document at index 0.

        Args:
            num_lstm_units: size passed to both ``SimpleLSTMCell`` instances.
            embed: initializer for the 'embed' embedding variable.
            neg_num: the document placeholders hold ``neg_num + 1`` documents
                per query.
            gradient_clip_threshold: global-norm clipping threshold.
        """
        num_docs = neg_num + 1

        # String inputs and their lengths.
        self.queries = tf.placeholder(dtype=tf.string, shape=[None, None])  # [batch, len]
        self.queries_length = tf.placeholder(dtype=tf.int32, shape=[None])  # [batch]
        self.docs = tf.placeholder(dtype=tf.string, shape=[num_docs, None, None])  # [neg_num + 1, batch, len]
        self.docs_length = tf.placeholder(dtype=tf.int32, shape=[num_docs, None])  # [neg_num + 1, batch]

        # Checkpointed word -> id table; unknown words map to UNK_ID.
        self.word2index = MutableHashTable(key_dtype=tf.string,
                                           value_dtype=tf.int64,
                                           default_value=UNK_ID,
                                           shared_name="in_table",
                                           name="in_table",
                                           checkpoint=True)

        # Non-trainable training-state variables.
        self.learning_rate = tf.Variable(0.001, trainable=False, dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)
        self.momentum = tf.Variable(0.9, trainable=False, dtype=tf.float32)

        # Words -> ids; the documents are unstacked along their first axis.
        self.index_queries = self.word2index.lookup(self.queries)  # [batch, len]
        self.index_docs = []
        for doc_words in tf.unstack(self.docs):
            self.index_docs.append(self.word2index.lookup(doc_words))

        # Ids -> embedding vectors.
        self.embed = tf.get_variable('embed', dtype=tf.float32, initializer=embed)
        self.embed_queries = tf.nn.embedding_lookup(self.embed, self.index_queries)
        self.embed_docs = [
            tf.nn.embedding_lookup(self.embed, doc_ids)
            for doc_ids in self.index_docs
        ]

        with tf.variable_scope('query_lstm'):
            self.cell_q = SimpleLSTMCell(num_lstm_units)
        with tf.variable_scope('doc_lstm'):
            self.cell_d = SimpleLSTMCell(num_lstm_units)

        # Element [1][1] of each dynamic_rnn result is kept as the encoding.
        query_rnn = dynamic_rnn(self.cell_q,
                                self.embed_queries,
                                self.queries_length,
                                dtype=tf.float32,
                                scope="simple_lstm_cell_query")
        self.states_q = query_rnn[1][1]  # [batch, num_units]
        self.states_d = []  # neg_num + 1 tensors of [batch, num_units]
        for idx, doc_embedding in enumerate(self.embed_docs):
            doc_rnn = dynamic_rnn(self.cell_d,
                                  doc_embedding,
                                  self.docs_length[idx],
                                  dtype=tf.float32,
                                  scope="simple_lstm_cell_doc")
            self.states_d.append(doc_rnn[1][1])

        def _row_norm(states):
            # Euclidean norm of every row.
            return tf.sqrt(tf.reduce_sum(tf.square(states), axis=1))

        # Cosine similarity between the query and each document.
        self.queries_norm = _row_norm(self.states_q)
        self.docs_norm = [_row_norm(doc_state) for doc_state in self.states_d]
        self.prods = [
            tf.reduce_sum(tf.multiply(self.states_q, doc_state), axis=1)
            for doc_state in self.states_d
        ]
        cosine_per_doc = [
            dot / (self.queries_norm * doc_norm)
            for dot, doc_norm in zip(self.prods, self.docs_norm)
        ]
        self.sims = tf.convert_to_tensor(cosine_per_doc)  # [neg_num + 1, batch]

        # Scaling factor according to the paper.
        self.gamma = tf.Variable(initial_value=1.0, expected_shape=[], dtype=tf.float32)
        self.origin_sims = self.sims  # unscaled similarities
        self.sims = self.sims * self.gamma
        self.prob = tf.nn.softmax(self.sims, dim=0)  # [neg_num + 1, batch]
        self.hit_prob = tf.transpose(self.prob[0])

        self.loss = -tf.reduce_mean(tf.log(self.hit_prob))

        # Nesterov momentum (according to the paper) on clipped gradients.
        self.params = tf.trainable_variables()
        opt = tf.train.MomentumOptimizer(learning_rate=self.learning_rate,
                                         momentum=self.momentum,
                                         use_nesterov=True)
        raw_gradients = tf.gradients(self.loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            raw_gradients, gradient_clip_threshold)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)
        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=3,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)
示例#7
0
    def __init__(self,
            num_symbols,
            num_embed_units,
            num_units,
            num_layers,
            num_labels,
            embed,
            learning_rate=0.005,
            max_gradient_norm=5.0,
            param_da=150,
            param_r=10,
            model_choose='lstm'):
        """Build the graph of a bidirectional RNN classifier with self-attention.

        Args:
            num_symbols: vocabulary size; only used when ``embed`` is None.
            num_embed_units: embedding width; only used when ``embed`` is None.
            num_units: hidden size of every RNN layer (per direction).
            num_layers: number of stacked RNN layers per direction.
            num_labels: number of output classes.
            embed: initial embedding matrix, or None for random initialisation.
            learning_rate: initial value of the (non-trainable) learning-rate
                variable used by plain gradient descent.
            max_gradient_norm: global-norm threshold for gradient clipping.
            param_da: width of the hidden attention projection (``Ws1``).
            param_r: number of attention hops (``Ws2`` output width).
            model_choose: one of 'rnn', 'lstm', 'gru'; any other value falls
                back to 'lstm'.
        """
        self.texts = tf.placeholder(tf.string, (None, None), 'texts')  # shape: [batch, length]
        self.texts_length = tf.placeholder(tf.int32, (None, ), 'texts_length')  # shape: [batch]
        self.labels = tf.placeholder(tf.int64, (None, ), 'labels')  # shape: [batch]

        # vocab table (string -> index); unknown tokens map to UNK_ID
        self.symbol2index = MutableHashTable(
                key_dtype=tf.string,
                value_dtype=tf.int64,
                default_value=UNK_ID,
                shared_name="in_table",
                name="in_table",
                checkpoint=True)

        batch_size = tf.shape(self.texts)[0]

        # training state (creation order kept: these variables are auto-named)
        self.learning_rate = tf.Variable(float(learning_rate),
                trainable=False, dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)

        self.index_input = self.symbol2index.lookup(self.texts)   # shape: [batch, length]

        # embedding table (index -> vector)
        if embed is None:
            # initialize the embedding randomly
            self.embed = tf.get_variable('embed', [num_symbols, num_embed_units], tf.float32)
        else:
            # initialize the embedding by pre-trained word vectors
            self.embed = tf.get_variable('embed', dtype=tf.float32, initializer=embed)

        self.embed_input = tf.nn.embedding_lookup(self.embed, self.index_input)  # shape: [batch, length, num_embed_units]

        # multi-layer forward / backward cells of the requested type
        if model_choose not in ['rnn', 'lstm', 'gru']:
            model_choose = 'lstm'
        cell_type = {'rnn': BasicRNNCell, 'lstm': BasicLSTMCell, 'gru': GRUCell}[model_choose]
        cell_fw = MultiRNNCell([cell_type(num_units) for _ in range(num_layers)])
        cell_bw = MultiRNNCell([cell_type(num_units) for _ in range(num_layers)])

        outputs, states = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, self.embed_input, self.texts_length, dtype=tf.float32, scope="rnn")
        H = tf.concat(outputs, 2)  # shape: [batch, length, 2*num_units]

        with tf.variable_scope('logits'):
            Ws1 = tf.get_variable("Ws1", [2 * num_units, param_da])
            Ws2 = tf.get_variable("Ws2", [param_da, param_r])
            hidden = tf.nn.tanh(tf.einsum("ijk,kl->ijl", H, Ws1))  # shape: [batch, length, param_da]
            scores = tf.einsum("ijk,kl->ijl", hidden, Ws2)  # shape: [batch, length, param_r]
            # Put the sequence axis last so the softmax normalises each
            # attention hop over the positions of the sentence.  The previous
            # code normalised over the param_r hops instead, so a hop was not
            # a distribution over tokens.
            scores = tf.transpose(scores, [0, 2, 1])  # shape: [batch, param_r, length]
            # Padded positions must not receive attention mass: push their
            # scores to a large negative value before the softmax.
            pad_mask = tf.sequence_mask(self.texts_length, tf.shape(H)[1], dtype=tf.float32)  # shape: [batch, length]
            scores += (1.0 - tf.expand_dims(pad_mask, 1)) * -1e9
            A = tf.nn.softmax(scores)  # shape: [batch, param_r, length]
            M = tf.matmul(A, H)  # shape: [batch, param_r, 2*num_units]
            flatten_M = tf.reshape(M, shape=[batch_size, param_r*2*num_units])  # shape: [batch, param_r*2*num_units]

            logits = tf.layers.dense(flatten_M, num_labels, activation=None, name='projection')  # shape: [batch, num_labels]

        # redundancy penalty: norm of (A A^T - I), taken over the whole batch tensor
        identity = tf.reshape(tf.tile(tf.diag(tf.ones([param_r])), [batch_size, 1]), [batch_size, param_r, param_r])
        self.penalized_term = tf.norm(tf.matmul(A, A, transpose_b=True) - identity)

        self.loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels, logits=logits), name='loss') + 0.0001*self.penalized_term
        predict_labels = tf.argmax(logits, 1, 'predict_labels')
        # number of correct predictions in the batch (a count, not a ratio)
        self.accuracy = tf.reduce_sum(tf.cast(tf.equal(self.labels, predict_labels), tf.int32), name='accuracy')

        self.params = tf.trainable_variables()

        # clipped gradient-descent update
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        gradients = tf.gradients(self.loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(gradients,
                max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                global_step=self.global_step)

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                max_to_keep=5, pad_step_number=True)
示例#8
0
    def __init__(self,
            num_symbols,        # 18430, vocabulary size.
            num_embed_units,    # 300, Size of word embedding.
            num_units,          # 512, Size of each model layer.
            num_layers,         # 1, Number of layers in the model.
            num_labels,         # 5, Number of labels.
            embed,              # (18430, 300), word2vector list.
            learning_rate=0.5,
            max_gradient_norm=5.0):
        """Build the graph of a GRU text classifier with a small MLP head.

        Args:
            num_symbols: vocabulary size; only used when ``embed`` is None.
            num_embed_units: embedding width; only used when ``embed`` is None.
            num_units: hidden size of every GRU layer.
            num_layers: number of stacked GRU layers (must be >= 1).
            num_labels: number of output classes.
            embed: initial embedding matrix, or None for random initialisation.
            learning_rate: initial value of the (non-trainable) learning-rate
                variable used by plain gradient descent.
            max_gradient_norm: global-norm threshold for gradient clipping.

        Raises:
            ValueError: if ``num_layers`` is smaller than 1.
        """
        # Previously `cell` was only defined for num_layers == 1 and any other
        # value died with an UnboundLocalError further down.
        if num_layers < 1:
            raise ValueError('num_layers must be at least 1, got %r' % (num_layers,))

        self.texts = tf.placeholder(dtype=tf.string, shape=[None, None], name='texts')  # shape: batch*len
        self.texts_length = tf.placeholder(dtype=tf.int64, shape=[None], name='texts_length')  # shape: batch
        self.labels = tf.placeholder(dtype=tf.int64, shape=[None], name='labels')  # shape: batch

        # vocab table (string -> index); unknown tokens map to UNK_ID
        self.symbol2index = MutableHashTable(
                key_dtype=tf.string,
                value_dtype=tf.int64,
                default_value=UNK_ID,
                shared_name="in_table",
                name="in_table",
                checkpoint=True)

        # training state (creation order kept: these variables are auto-named)
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)

        self.index_input = self.symbol2index.lookup(self.texts)   # batch*len

        # embedding table (index -> vector)
        if embed is None:
            # initialize the embedding randomly
            self.embed = tf.get_variable('embed', [num_symbols, num_embed_units], tf.float32)
        else:
            # initialize the embedding by pre-trained word vectors
            self.embed = tf.get_variable('embed', dtype=tf.float32, initializer=embed)

        self.embed_input = tf.nn.embedding_lookup(self.embed, self.index_input)  # batch*len*embed_unit

        # Dropout keep probability.  Defaults to the former hard-coded 0.95 so
        # existing callers behave exactly as before; feed 1.0 to disable
        # dropout when evaluating.
        self.keep_prob = tf.placeholder_with_default(0.95, shape=[], name='keep_prob')
        keep_prob = self.keep_prob

        dropped_input = tf.nn.dropout(self.embed_input, keep_prob=keep_prob)

        # Stack the GRU layers by chaining one dynamic_rnn per layer.  Layer 0
        # keeps the original "rnn" scope, so single-layer graphs and their
        # checkpoints keep the same variable names.
        layer_input = dropped_input
        states = None
        for layer in range(num_layers):
            layer_scope = "rnn" if layer == 0 else "rnn_layer_%d" % layer
            layer_input, states = dynamic_rnn(GRUCell(num_units), layer_input, self.texts_length,
                                              dtype=tf.float32, scope=layer_scope)

        # classifier head on the final state of the top layer
        l1 = tf.nn.dropout(states, keep_prob=keep_prob)
        inner_layer = tf.layers.dense(inputs=l1, units=256, activation=tf.nn.relu)
        l2 = tf.nn.dropout(inner_layer, keep_prob=keep_prob)
        logits = tf.layers.dense(inputs=l2, units=num_labels)

        # self.loss is the summed loss of the batch; gradients use the mean
        self.loss = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels, logits=logits), name='loss')
        mean_loss = self.loss / tf.cast(tf.shape(self.labels)[0], dtype=tf.float32)
        predict_labels = tf.argmax(logits, 1, 'predict_labels')
        # number of correct predictions in the batch (a count, not a ratio)
        self.accuracy = tf.reduce_sum(tf.cast(tf.equal(self.labels, predict_labels), tf.int32), name='accuracy')

        self.params = tf.trainable_variables()

        # clipped gradient-descent update
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        gradients = tf.gradients(mean_loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params), global_step=self.global_step)

        tf.summary.scalar('loss/step', self.loss)
        for each in tf.trainable_variables():
            tf.summary.histogram(each.name, each)

        self.merged_summary_op = tf.summary.merge_all()

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                max_to_keep=3, pad_step_number=True, keep_checkpoint_every_n_hours=1.0)
示例#9
0
文件: model.py 项目: juvu/seq2seq_cn
    def __init__(
            self,
            num_symbols,  # vocabulary size
            num_embed_units,  # word embedding size
            num_units,  # units per RNN layer
            num_layers,  # number of RNN layers
            embed,  # word embedding matrix (or None)
            entity_embed=None,  # entity embedding matrix (or None)
            num_entities=0,  # number of entities, used when entity_embed is None
            num_trans_units=100,  # entity embedding size
            learning_rate=0.0001,
            learning_rate_decay_factor=0.95,  # multiplier applied by learning_rate_decay_op
            max_gradient_norm=5.0,  # global-norm threshold for gradient clipping
            num_samples=500,  # number of samples, forwarded to output_projection_layer
            max_length=60,
            mem_use=True,
            output_alignments=True,
            use_lstm=False):
        """Build the training and inference graph of the knowledge-grounded seq2seq model.

        The encoder consumes word embeddings concatenated with a graph vector
        produced by attention over the knowledge triples of every post; the
        attention decoder is built twice under the shared 'decoder' scope,
        once for training and once (with reuse) for inference.

        Args:
            num_symbols: vocabulary size.
            num_embed_units: word embedding width (used when ``embed`` is None).
            num_units: hidden size of every GRU layer.
            num_layers: number of GRU layers in encoder and decoder.
            embed: initial word embedding matrix, or None for random init.
            entity_embed: initial entity embedding matrix, or None.
            num_entities: number of entities (used when ``entity_embed`` is None).
            num_trans_units: entity embedding width.
            learning_rate: initial learning rate.
            learning_rate_decay_factor: factor applied by ``learning_rate_decay_op``.
            max_gradient_norm: global-norm threshold for gradient clipping.
            num_samples: forwarded to ``output_projection_layer``.
            max_length: forwarded to the inference decoder function.
            mem_use: together with ``output_alignments`` decides whether the
                attention functions emit alignments.
            output_alignments: if True the loss comes from ``total_loss`` and
                uses the decoder alignments, otherwise from ``sequence_loss``.
            use_lstm: not referenced in this method.
        """

        self.posts = tf.placeholder(tf.string, (None, None),
                                    'enc_inps')  # batch_size * encoder_len
        self.posts_length = tf.placeholder(tf.int32, (None),
                                           'enc_lens')  # batch_size
        self.responses = tf.placeholder(tf.string, (None, None),
                                        'dec_inps')  # batch_size * decoder_len
        self.responses_length = tf.placeholder(tf.int32, (None),
                                               'dec_lens')  # batch_size
        self.entities = tf.placeholder(
            tf.string, (None, None, None),
            'entities')  # batch_size * triple_num * triple_len
        self.entity_masks = tf.placeholder(tf.string, (None, None),
                                           'entity_masks')  # not used
        self.triples = tf.placeholder(
            tf.string, (None, None, None, 3),
            'triples')  # batch_size * triple_num * triple_len * 3
        self.posts_triple = tf.placeholder(
            tf.int32, (None, None, 1),
            'enc_triples')  # batch_size * encoder_len * 1
        self.responses_triple = tf.placeholder(
            tf.string, (None, None, 3),
            'dec_triples')  # batch_size * decoder_len * 3
        self.match_triples = tf.placeholder(
            tf.int32, (None, None, None),
            'match_triples')  # batch_size * decoder_len * triple_num

        # encoder batch size and encoder length
        encoder_batch_size, encoder_len = tf.unstack(tf.shape(self.posts))
        # triple_num: number of knowledge graphs per post (after padding)
        triple_num = tf.shape(self.triples)[1]
        # triple_len: number of triples per knowledge graph (after padding)
        triple_len = tf.shape(self.triples)[2]

        # one-hot encoding of the triple used at every decoding step
        one_hot_triples = tf.one_hot(
            self.match_triples,
            triple_len)  # batch_size * decoder_len * triple_num * triple_len
        # 1 marks the time steps whose response token uses a knowledge triple
        use_triples = tf.reduce_sum(one_hot_triples,
                                    axis=[2, 3])  # batch_size * decoder_len

        # hash table: word -> index
        self.symbol2index = MutableHashTable(
            key_dtype=tf.string,  # dtype of the key tensors
            value_dtype=tf.int64,  # dtype of the value tensors
            default_value=UNK_ID,  # value returned for missing keys
            shared_name=
            "in_table",  # If non-empty, this table will be shared under the given name across multiple sessions
            name="in_table",  # op name
            checkpoint=True
        )  # if True, the contents of the table are saved to and restored from checkpoints. If shared_name is empty for a checkpointed table, it is shared using the table node name.

        # hash table: index -> word
        self.index2symbol = MutableHashTable(key_dtype=tf.int64,
                                             value_dtype=tf.string,
                                             default_value='_UNK',
                                             shared_name="out_table",
                                             name="out_table",
                                             checkpoint=True)

        # hash table: entity -> index
        self.entity2index = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=NONE_ID,
                                             shared_name="entity_in_table",
                                             name="entity_in_table",
                                             checkpoint=True)

        # hash table: index -> entity
        self.index2entity = MutableHashTable(key_dtype=tf.int64,
                                             value_dtype=tf.string,
                                             default_value='_NONE',
                                             shared_name="entity_out_table",
                                             name="entity_out_table",
                                             checkpoint=True)

        # post strings -> word ids
        self.posts_word_id = self.symbol2index.lookup(
            self.posts)  # batch_size * encoder_len
        # post strings -> entity ids
        self.posts_entity_id = self.entity2index.lookup(
            self.posts)  # batch_size * encoder_len

        # response strings -> word ids
        self.responses_target = self.symbol2index.lookup(
            self.responses)  # batch_size * decoder_len
        # decoder batch size and decoder length
        batch_size, decoder_len = tf.shape(self.responses)[0], tf.shape(
            self.responses)[1]
        # decoder input ids: drop the last column of responses_target and
        # prepend a GO_ID column
        self.responses_word_id = tf.concat([
            tf.ones([batch_size, 1], dtype=tf.int64) * GO_ID,
            tf.split(self.responses_target, [decoder_len - 1, 1], 1)[0]
        ], 1)  # batch_size * decoder_len

        # response mask: one-hot encode (length - 1), then take a reversed
        # cumulative sum along the time axis, giving 1 for positions inside
        # the response and 0 after it (the final reshape is probably redundant)
        self.decoder_mask = tf.reshape(
            tf.cumsum(tf.one_hot(self.responses_length - 1, decoder_len),
                      reverse=True,
                      axis=1), [-1, decoder_len])  # batch_size * decoder_len

        # word / entity embeddings: use the given matrices when provided,
        # otherwise create variables with the default initialiser
        if embed is None:
            self.embed = tf.get_variable('word_embed',
                                         [num_symbols, num_embed_units],
                                         tf.float32)
        else:
            self.embed = tf.get_variable('word_embed',
                                         dtype=tf.float32,
                                         initializer=embed)
        if entity_embed is None:
            self.entity_trans = tf.get_variable(
                'entity_embed', [num_entities, num_trans_units],
                tf.float32,
                trainable=False)
        else:
            self.entity_trans = tf.get_variable('entity_embed',
                                                dtype=tf.float32,
                                                initializer=entity_embed,
                                                trainable=False)

        # dense tanh layer of width num_trans_units on top of the entity
        # embeddings
        self.entity_trans_transformed = tf.layers.dense(
            self.entity_trans,
            num_trans_units,
            activation=tf.tanh,
            name='trans_transformation')
        # zero-initialised [7, num_trans_units] variable
        padding_entity = tf.get_variable('entity_padding_embed',
                                         [7, num_trans_units],
                                         dtype=tf.float32,
                                         initializer=tf.zeros_initializer())

        # the 7 padding rows are placed in front of the transformed entity
        # embeddings (presumably for special entity ids -- TODO confirm)
        self.entity_embed = tf.concat(
            [padding_entity, self.entity_trans_transformed], axis=0)

        # embedding_lookup appends an embedding axis; the reshape merges the
        # (3, num_trans_units) axes into one of size 3 * num_trans_units
        triples_embedding = tf.reshape(
            tf.nn.embedding_lookup(self.entity_embed,
                                   self.entity2index.lookup(self.triples)),
            [encoder_batch_size, triple_num, -1, 3 * num_trans_units])
        entities_word_embedding = tf.reshape(
            tf.nn.embedding_lookup(self.embed,
                                   self.symbol2index.lookup(self.entities)),
            [encoder_batch_size, -1, num_embed_units
             ])  # [batch_size, triple_num*triple_len, num_embed_units]

        # split into head, relation and tail
        head, relation, tail = tf.split(triples_embedding,
                                        [num_trans_units] * 3,
                                        axis=3)

        # static graph attention
        with tf.variable_scope('graph_attention'):
            # concatenate head and tail
            head_tail = tf.concat(
                [head, tail],
                axis=3)  # batch_size * triple_num * triple_len * (2*num_trans_units)

            # tanh(dot(W, head_tail))
            head_tail_transformed = tf.layers.dense(
                head_tail,
                num_trans_units,
                activation=tf.tanh,
                name='head_tail_transform'
            )  # batch_size * triple_num * triple_len * num_trans_units

            # dot(W, relation)
            relation_transformed = tf.layers.dense(
                relation, num_trans_units, name='relation_transform'
            )  # batch_size * triple_num * triple_len * num_trans_units

            # element-wise product followed by a sum = inner product:
            # dot(transpose(dot(W, relation)), tanh(dot(W, head_tail)))
            e_weight = tf.reduce_sum(
                relation_transformed * head_tail_transformed,
                axis=3)  # batch_size * triple_num * triple_len

            # alpha weight of every triple inside its graph (softmax over the
            # last axis, triple_len)
            alpha_weight = tf.nn.softmax(
                e_weight)  # batch_size * triple_num * triple_len

            # expand alpha_weight to batch_size * triple_num * triple_len * 1,
            # weight head_tail and sum over the triple axis to obtain one
            # vector per graph
            graph_embed = tf.reduce_sum(
                tf.expand_dims(alpha_weight, 3) * head_tail,
                axis=2)  # batch_size * triple_num * (2*num_trans_units)

        # Gather, for every encoder position, the vector of the graph that
        # posts_triple points to:
        #   * range(encoder_batch_size) is reshaped to [batch, 1, 1] and tiled
        #     along axis 1 to [batch, encoder_len, 1];
        #   * it is concatenated with posts_triple on axis 2, which yields
        #     (batch index, graph index) pairs of shape [batch, encoder_len, 2];
        #   * tf.gather_nd picks graph_embed[batch index, graph index] for
        #     every pair, giving batch_size * encoder_len * graph-vector-size.
        graph_embed_input = tf.gather_nd(
            graph_embed,
            tf.concat([
                tf.tile(
                    tf.reshape(tf.range(encoder_batch_size, dtype=tf.int32),
                               [-1, 1, 1]), [1, encoder_len, 1]),
                self.posts_triple
            ],
                      axis=2))

        # responses_triple -> entity embeddings,
        # batch_size * decoder_len * (3*num_trans_units)
        triple_embed_input = tf.reshape(
            tf.nn.embedding_lookup(
                self.entity_embed,
                self.entity2index.lookup(self.responses_triple)),
            [batch_size, decoder_len, 3 * num_trans_units])

        # posts_word_id -> word embeddings
        post_word_input = tf.nn.embedding_lookup(
            self.embed, self.posts_word_id)  # batch_size * encoder_len * embed size

        # responses_word_id -> word embeddings
        response_word_input = tf.nn.embedding_lookup(
            self.embed,
            self.responses_word_id)  # batch_size * decoder_len * embed size

        # encoder input: word embedding ++ graph vector (last axis)
        self.encoder_input = tf.concat(
            [post_word_input, graph_embed_input],
            axis=2)  # batch_size * encoder_len * (embed size + graph vector size)
        # decoder input: word embedding ++ triple embedding (last axis)
        self.decoder_input = tf.concat(
            [response_word_input, triple_embed_input],
            axis=2)  # batch_size * decoder_len * (embed size + 3*num_trans_units)

        # stacked GRU cells
        encoder_cell = MultiRNNCell(
            [GRUCell(num_units) for _ in range(num_layers)])
        decoder_cell = MultiRNNCell(
            [GRUCell(num_units) for _ in range(num_layers)])

        # rnn encoder
        encoder_output, encoder_state = dynamic_rnn(encoder_cell,
                                                    self.encoder_input,
                                                    self.posts_length,
                                                    dtype=tf.float32,
                                                    scope="encoder")

        # output projection helpers: output function, selector function,
        # sequence loss, sampled sequence loss and total loss
        output_fn, selector_fn, sequence_loss, sampled_sequence_loss, total_loss = output_projection_layer(
            num_units, num_symbols, num_samples)

        # decoder used for training
        with tf.variable_scope('decoder'):
            # attention keys / values and the functions computing the
            # attention score and the final attention construction
            attention_keys_init, attention_values_init, attention_score_fn_init, attention_construct_fn_init \
                    = prepare_attention(encoder_output, 'bahdanau', num_units, imem=(graph_embed, triples_embedding), output_alignments=output_alignments and mem_use)#'luong', num_units)

            # per-time-step decoder function used during training
            decoder_fn_train = attention_decoder_fn_train(
                encoder_state,
                attention_keys_init,
                attention_values_init,
                attention_score_fn_init,
                attention_construct_fn_init,
                output_alignments=output_alignments and mem_use,
                max_length=tf.reduce_max(self.responses_length))

            # outputs, final state and the TensorArray of alignments
            self.decoder_output, _, alignments_ta = dynamic_rnn_decoder(
                decoder_cell,
                decoder_fn_train,
                self.decoder_input,
                self.responses_length,
                scope="decoder_rnn")

            if output_alignments:
                # self.alignments was read below without ever being assigned,
                # which raised AttributeError.  Stack the per-step alignments
                # and move the batch axis first.
                # NOTE(review): assumes each step's alignment is
                # [batch, triple_num, triple_len], so the result lines up with
                # one_hot_triples -- confirm against the decoder function.
                self.alignments = tf.transpose(alignments_ta.stack(),
                                               perm=[1, 0, 2, 3])

                self.decoder_loss, self.ppx_loss, self.sentence_ppx = total_loss(
                    self.decoder_output, self.responses_target,
                    self.decoder_mask, self.alignments, triples_embedding,
                    use_triples, one_hot_triples)
                self.sentence_ppx = tf.identity(
                    self.sentence_ppx,
                    name='ppx_loss')  # expose sentence_ppx as a named op
            else:
                self.decoder_loss = sequence_loss(self.decoder_output,
                                                  self.responses_target,
                                                  self.decoder_mask)

        # decoder used for inference (shares variables with the one above)
        with tf.variable_scope('decoder', reuse=True):
            # attention functions
            attention_keys, attention_values, attention_score_fn, attention_construct_fn \
                    = prepare_attention(encoder_output, 'bahdanau', num_units, reuse=True, imem=(graph_embed, triples_embedding), output_alignments=output_alignments and mem_use)#'luong', num_units)
            # imem below is the pair (entity word embeddings
            # [batch_size, triple_num*triple_len, num_embed_units],
            # triple embeddings
            # [encoder_batch_size, triple_num*triple_len, 3*num_trans_units])
            decoder_fn_inference = attention_decoder_fn_inference(
                output_fn,
                encoder_state,
                attention_keys,
                attention_values,
                attention_score_fn,
                attention_construct_fn,
                self.embed,
                GO_ID,
                EOS_ID,
                max_length,
                num_symbols,
                imem=(entities_word_embedding,
                      tf.reshape(
                          triples_embedding,
                          [encoder_batch_size, -1, 3 * num_trans_units])),
                selector_fn=selector_fn)

            self.decoder_distribution, _, output_ids_ta = dynamic_rnn_decoder(
                decoder_cell, decoder_fn_inference, scope="decoder_rnn")

            output_len = tf.shape(self.decoder_distribution)[1]  # decoder_len
            output_ids = tf.transpose(
                output_ids_ta.gather(
                    tf.range(output_len)))  # [batch_size, decoder_len]

            # clip the output ids to the range [0, num_symbols]
            word_ids = tf.cast(tf.clip_by_value(output_ids, 0, num_symbols),
                               tf.int64)  # [batch_size, decoder_len]

            # position of the chosen entity word inside the flattened entities:
            # 1. tf.shape(entities_word_embedding)[1] = triple_num*triple_len
            # 2. tf.range(encoder_batch_size): [batch_size]
            # 3. range * (triple_num*triple_len), reshaped to [batch_size, 1]:
            #    offset of every sample inside the flattened entities
            # 4. tf.clip_by_value(-output_ids, 0, num_symbols):
            #    [batch_size, decoder_len], position relative to the sample
            # 5. entity_ids: [batch_size * decoder_len], offset + relative
            #    position
            entity_ids = tf.reshape(
                tf.clip_by_value(-output_ids, 0, num_symbols) + tf.reshape(
                    tf.range(encoder_batch_size) *
                    tf.shape(entities_word_embedding)[1], [-1, 1]), [-1])

            # the entity words that were used:
            # 1. self.entities: [batch_size, triple_num, triple_len]
            # 2. tf.reshape(self.entities, [-1]):
            #    [batch_size * triple_num * triple_len]
            # 3. tf.gather: [batch_size * decoder_len]
            # 4. entities: [batch_size, output_len]
            entities = tf.reshape(
                tf.gather(tf.reshape(self.entities, [-1]), entity_ids),
                [-1, output_len])

            words = self.index2symbol.lookup(word_ids)  # ids -> words
            # where output_ids > 0 take the vocabulary word, elsewhere the
            # entity word
            self.generation = tf.where(output_ids > 0, words, entities)
            self.generation = tf.identity(self.generation, name='generation')

        # training state
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=tf.float32)

        # op that multiplies the learning-rate variable by the decay factor
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)

        # number of parameter updates so far
        self.global_step = tf.Variable(0, trainable=False)

        # NOTE(review): this collects *all* global variables, including the
        # ones created with trainable=False -- confirm this is intended.
        self.params = tf.global_variables()

        # NOTE(review): the optimizer is given the Python float, not
        # self.learning_rate, so running learning_rate_decay_op does not
        # change the rate Adam uses -- confirm this is intended.
        opt = tf.train.AdamOptimizer(learning_rate=learning_rate)

        self.lr = opt._lr

        # gradients of decoder_loss with respect to params
        gradients = tf.gradients(self.decoder_loss, self.params)
        # gradient clipping
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)

        tf.summary.scalar('decoder_loss', self.decoder_loss)
        for each in tf.trainable_variables():
            tf.summary.histogram(each.name, each)

        self.merged_summary_op = tf.summary.merge_all()

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=3,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)
        self.saver_epoch = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                          max_to_keep=1000,
                                          pad_step_number=True)
示例#10
0
文件: model.py 项目: ivanium/ann-hw4
    def __init__(self,
                 num_symbols,
                 num_embed_units,
                 num_units,
                 num_layers,
                 num_labels,
                 embed,
                 learning_rate=0.5,
                 max_gradient_norm=5.0):
        """Build the graph of an RNN text classifier.

        With ``num_layers == 1`` a single RNN reads ``texts``.  For any other
        value a second RNN additionally reads the ``reverse_texts``
        placeholder and the final states of both RNNs are summed.

        Args:
            num_symbols: vocabulary size; only used when ``embed`` is None.
            num_embed_units: embedding width; only used when ``embed`` is None.
            num_units: hidden size of the RNN cell(s).
            num_layers: 1 selects the single-RNN graph, anything else the
                two-RNN graph described above.
            num_labels: number of output classes.
            embed: initial embedding matrix, or None for random initialisation.
            learning_rate: initial value of the (non-trainable) learning-rate
                variable used by plain gradient descent.
            max_gradient_norm: global-norm threshold for gradient clipping.
        """
        self.texts = tf.placeholder(tf.string, [None, None],
                                    name="texts")  # shape: batch*len
        self.texts_length = tf.placeholder(tf.int64, [None],
                                           name="texts_length")  # shape: batch
        self.labels = tf.placeholder(tf.int64, [None],
                                     name="labels")  # shape: batch

        # vocab table (string -> index); unknown tokens map to UNK_ID
        self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=UNK_ID,
                                             shared_name="in_table",
                                             name="in_table",
                                             checkpoint=True)

        # training state (creation order kept: these variables are auto-named)
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=tf.float32)
        learning_rate_decay_factor = 0.9
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)

        self.global_step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)

        self.index_input = self.symbol2index.lookup(self.texts)  # batch*len

        # embedding table (index -> vector)
        if embed is None:
            # initialize the embedding randomly
            self.embed = tf.get_variable('embed',
                                         [num_symbols, num_embed_units],
                                         tf.float32)
        else:
            # initialize the embedding by pre-trained word vectors
            self.embed = tf.get_variable('embed',
                                         dtype=tf.float32,
                                         initializer=embed)

        self.embed_input = tf.nn.embedding_lookup(
            self.embed, self.index_input)  # batch*len*embed_unit

        # cell type is fixed here; change `model` to 'rnn' or 'gru' to switch
        model = 'lstm'
        cell_class = {
            'rnn': BasicRNNCell,
            'gru': GRUCell,
            'lstm': BasicLSTMCell,
        }[model]

        def final_state(inputs, scope):
            """Run one dropout-wrapped RNN over `inputs`; return its final state."""
            cell = tf.nn.rnn_cell.DropoutWrapper(
                cell_class(num_units),
                input_keep_prob=1.0,
                output_keep_prob=FLAGS.keep_prob)
            _, state = dynamic_rnn(cell,
                                   inputs,
                                   self.texts_length,
                                   dtype=tf.float32,
                                   scope=scope)
            # The LSTM state is a pair; element 0 is used, as in the original.
            # NOTE(review): element 0 of an LSTM state tuple is normally the
            # cell state c rather than the hidden state h -- confirm intent.
            return state[0] if model == 'lstm' else state

        if num_layers == 1:
            states = final_state(self.embed_input, "rnn")
        else:
            self.reverse_texts = tf.placeholder(
                tf.string, [None, None],
                name="reverse_texts")  # shape: batch*len
            self.index_reverse_input = self.symbol2index.lookup(
                self.reverse_texts)
            self.embed_reverse_input = tf.nn.embedding_lookup(
                self.embed, self.index_reverse_input)  # batch*len*embed_unit

            # The two RNNs own separate cells and therefore need separate
            # variable scopes; sharing "rnn" made the second dynamic_rnn fail
            # because its variables already existed.
            forward_state = final_state(self.embed_input, "rnn")
            reverse_state = final_state(self.embed_reverse_input,
                                        "rnn_reverse")
            states = forward_state + reverse_state

        # size the output layer by num_labels (it used to be hard-coded to 5)
        logits = tf.layers.dense(inputs=states,
                                 units=num_labels,
                                 activation=None)

        # self.loss is the summed loss of the batch; gradients use the mean
        self.loss = tf.reduce_sum(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels,
                                                           logits=logits),
            name='loss')
        mean_loss = self.loss / tf.cast(tf.shape(self.labels)[0],
                                        dtype=tf.float32)
        predict_labels = tf.argmax(logits, 1, 'predict_labels')
        # number of correct predictions in the batch (a count, not a ratio)
        self.accuracy = tf.reduce_sum(tf.cast(
            tf.equal(self.labels, predict_labels), tf.int32),
                                      name='accuracy')

        self.params = tf.trainable_variables()

        # clipped gradient-descent update
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)

        gradients = tf.gradients(mean_loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)

        tf.summary.scalar('loss/step', self.loss)
        for each in tf.trainable_variables():
            tf.summary.histogram(each.name, each)

        self.merged_summary_op = tf.summary.merge_all()

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=3,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)
示例#11
0
文件: model.py 项目: isrugeek/sst
class RNN(object):
    """Multi-layer LSTM text classifier (TensorFlow 1.x graph mode).

    Constructing an instance adds the whole model to the default graph:
    string tokens are mapped to ids through ``symbol2index``, embedded, fed
    to ``num_layers`` stacked ``BasicLSTMCell`` layers, and the final state
    of the top layer is projected to ``num_labels`` logits.
    """

    def __init__(self,
                 num_symbols,
                 num_embed_units,
                 num_units,
                 num_layers,
                 num_labels,
                 embed,
                 learning_rate=0.005,
                 max_gradient_norm=5.0):
        """Build placeholders, the network, the loss and the training op.

        Args:
            num_symbols: vocabulary size; only used to shape the embedding
                table when ``embed`` is None.
            num_embed_units: embedding width; only used when ``embed`` is
                None.
            num_units: hidden size of every LSTM layer.
            num_layers: number of stacked LSTM layers.
            num_labels: number of output classes.
            embed: initial value of the embedding table, or None to create
                the table without an explicit initializer.
            learning_rate: initial value of the non-trainable
                ``learning_rate`` variable handed to the SGD optimizer.
            max_gradient_norm: global-norm threshold for gradient clipping.
        """
        # Inputs.  The 1-D shapes are spelled ``(None,)``: a bare ``(None)``
        # is just ``None`` and would leave the placeholder rank unknown.
        self.texts = tf.placeholder(
            tf.string, (None, None), 'texts')  # batch*len
        self.texts_length = tf.placeholder(
            tf.int32, (None,), 'texts_length')  # batch
        self.labels = tf.placeholder(tf.int64, (None,), 'labels')  # batch

        # build the vocab table (string to index); unknown tokens map to
        # UNK_ID
        self.symbol2index = MutableHashTable(
            key_dtype=tf.string,
            value_dtype=tf.int64,
            default_value=UNK_ID,
            shared_name="in_table",
            name="in_table",
            checkpoint=True)

        # initialize the training process
        self.learning_rate = tf.Variable(
            float(learning_rate), trainable=False, dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)

        self.index_input = self.symbol2index.lookup(self.texts)  # batch*len

        # build the embedding table (index to vector)
        if embed is None:
            # no pre-trained vectors: create a fresh table
            self.embed = tf.get_variable(
                'embed', [num_symbols, num_embed_units], tf.float32)
        else:
            # initialize the embedding by pre-trained word vectors
            self.embed = tf.get_variable(
                'embed', dtype=tf.float32, initializer=embed)

        self.embed_input = tf.nn.embedding_lookup(
            self.embed, self.index_input)  # batch*len*embed_unit

        cell = MultiRNNCell(
            [BasicLSTMCell(num_units) for _ in range(num_layers)])

        outputs, states = dynamic_rnn(
            cell, self.embed_input, self.texts_length,
            dtype=tf.float32, scope="rnn")

        # Last element of the top layer's final state; presumably the LSTM
        # hidden output ``h`` of a (c, h) state tuple -- confirm against the
        # BasicLSTMCell defaults in use.
        vectors = states[-1][-1]

        with tf.variable_scope('logits'):
            weight = tf.get_variable("weights", [num_units, num_labels])
            bias = tf.get_variable("biases", [num_labels])
            logits = tf.matmul(vectors, weight) + bias

        # ``loss`` is the cross entropy summed (not averaged) over the batch
        # and ``accuracy`` is the count of correct predictions in the batch.
        self.loss = tf.reduce_sum(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.labels, logits=logits),
            name='loss')
        predict_labels = tf.argmax(logits, 1, 'predict_labels')
        self.accuracy = tf.reduce_sum(
            tf.cast(tf.equal(self.labels, predict_labels), tf.int32),
            name='accuracy')

        self.params = tf.trainable_variables()

        # calculate the gradient of parameters (of the summed loss), clip
        # them by global norm and apply plain SGD
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        gradients = tf.gradients(self.loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(
            zip(clipped_gradients, self.params),
            global_step=self.global_step)

        self.saver = tf.train.Saver(
            write_version=tf.train.SaverDef.V2,
            max_to_keep=5,
            pad_step_number=True)

    def print_parameters(self):
        """Print the name and static shape of every trainable parameter."""
        for item in self.params:
            print('%s: %s' % (item.name, item.get_shape()))

    def train_step(self, session, data):
        """Run one optimisation step.

        Args:
            session: object whose ``run(fetches, feed_dict)`` executes the
                graph.
            data: mapping with the keys ``'texts'``, ``'texts_length'`` and
                ``'labels'`` holding the values fed to the placeholders.

        Returns:
            Whatever ``session.run`` returns for the fetch list
            ``[loss, accuracy, gradient_norm, update]``.
        """
        input_feed = {
            self.texts: data['texts'],
            self.texts_length: data['texts_length'],
            self.labels: data['labels'],
        }
        output_feed = [
            self.loss, self.accuracy, self.gradient_norm, self.update
        ]
        return session.run(output_feed, input_feed)
示例#12
0
    def __init__(self,
                 num_lstm_units,
                 num_labels,
                 embed,
                 max_gradient_norm=5.0,
                 learning_rate=0.01):
        """Build the matching graph for a pair of texts (TF 1.x graph mode).

        Args:
            num_lstm_units: width of the recurrent states and of the input
                to the final fully connected layer.
            num_labels: number of output classes.
            embed: initial value of the embedding table.
            max_gradient_norm: global-norm threshold for gradient clipping.
            learning_rate: initial value of the non-trainable
                ``learning_rate`` variable handed to the SGD optimizer
                (defaults to the previously hard-coded 0.01).
        """
        self.num_lstm_units = num_lstm_units

        # Inputs: two token matrices, their true lengths and one label per
        # pair.  PAD THE TWO TEXTS TO SAME LENGTH before feeding them.
        self.texts1 = tf.placeholder(
            tf.string, [None, None], name='texts1')  # batch_size*max_len
        self.texts2 = tf.placeholder(
            tf.string, [None, None], name='texts2')  # batch_size*max_len
        self.texts_length1 = tf.placeholder(
            tf.int32, [None], name='texts_length1')  # shape: batch
        self.texts_length2 = tf.placeholder(
            tf.int32, [None], name='texts_length2')  # shape: batch
        self.labels = tf.placeholder(
            tf.int64, [None], name='labels')  # shape: batch

        # string -> id table; unknown tokens map to UNK_ID
        self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=UNK_ID,
                                             shared_name="in_table",
                                             name="in_table",
                                             checkpoint=True)

        # training bookkeeping
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)

        # token ids and their embeddings
        self.index_input1 = self.symbol2index.lookup(
            self.texts1)  # batch*max_len
        self.index_input2 = self.symbol2index.lookup(
            self.texts2)  # batch*max_len
        self.embed = tf.get_variable('embed',
                                     dtype=tf.float32,
                                     initializer=embed)
        self.embed_input1 = tf.nn.embedding_lookup(
            self.embed, self.index_input1)  # batch*max_len*embed_unit
        self.embed_input2 = tf.nn.embedding_lookup(
            self.embed, self.index_input2)  # batch*max_len*embed_unit

        # zero padding: 1.0 on real tokens, 0.0 on padding.  The
        # ``*_extended`` masks get one extra leading zero column.
        self._batch_size = tf.shape(self.texts_length1)[0]
        self._max_length = tf.shape(self.texts1)[1]
        self.mask1 = tf.sequence_mask(self.texts_length1,
                                      maxlen=self._max_length,
                                      dtype=tf.float32)  # batch*max_len
        self.mask1_extended = tf.concat(
            [tf.zeros([self._batch_size, 1], tf.float32), self.mask1], 1)
        self.mask2 = tf.sequence_mask(self.texts_length2,
                                      maxlen=self._max_length,
                                      dtype=tf.float32)  # batch*max_len
        self.mask2_extended = tf.concat(
            [tf.zeros([self._batch_size, 1], tf.float32), self.mask2], 1)
        # debug
        print("mask1 size: " + str(self.mask1.shape))

        # Zero the embeddings of padded positions, then go time-major.
        # The feature axis is moved first so the [batch, max_len] mask
        # broadcasts over it.
        self.embed_input1 = tf.transpose(
            self.embed_input1,
            [2, 0, 1]) * self.mask1  # shape: embed_unit*batch*max_len
        self.embed_input1 = tf.transpose(
            self.embed_input1, [2, 1, 0])  # shape: max_len*batch*embed_units
        self.embed_input2 = tf.transpose(
            self.embed_input2,
            [2, 0, 1]) * self.mask2  # shape: embed_unit*batch*max_len
        self.embed_input2 = tf.transpose(
            self.embed_input2, [2, 1, 0])  # shape: max_len*batch*embed_units

        zero_state = tf.zeros(shape=[self._batch_size, self.num_lstm_units],
                              dtype=tf.float32)

        def new_state_array():
            # Growable per-time-step state store; reads must not clear
            # entries because each one is read more than once.
            return tf.TensorArray(dtype=tf.float32,
                                  size=0,
                                  dynamic_size=True,
                                  clear_after_read=False)

        # Hidden (h) and cell (c) histories for text 1, text 2 and the
        # matching LSTM; index 0 of every array holds the zero state.
        h_s1 = new_state_array()
        c_s1 = new_state_array()
        h_s1 = h_s1.write(0, zero_state)
        c_s1 = c_s1.write(0, zero_state)

        h_s2 = new_state_array()
        c_s2 = new_state_array()
        h_s2 = h_s2.write(0, zero_state)
        c_s2 = c_s2.write(0, zero_state)

        h_r = new_state_array()
        c_r = new_state_array()
        h_r = h_r.write(0, zero_state)
        c_r = c_r.write(0, zero_state)

        self._initializer = tf.truncated_normal_initializer(stddev=0.1)

        # Time index starts at 1 because slot 0 is the zero state; the loop
        # runs for t = 1 .. max_len.  TO DO: check this
        t = tf.constant(1, dtype=tf.int32)

        def keep_going(x, hs1, cs1, hs2, cs2, hr, cr):
            return tf.less(x, self._max_length + 1)

        def step(x, hs1, cs1, hs2, cs2, hr, cr):
            return self._match_step(x, hs1, cs1, hs2, cs2, hr, cr)

        (t, self.h_s1, self.c_s1, self.h_s2, self.c_s2, self.h_r,
         self.c_r) = tf.while_loop(
             cond=keep_going,
             body=step,
             loop_vars=(t, h_s1, c_s1, h_s2, c_s2, h_r, c_r))

        # batch-major history of the matching LSTM; presumably
        # [batch_size, max_len + 1, num_lstm_units] counting the zero state
        # (depends on what ``_match_step`` writes -- confirm).
        self.h_r = tf.transpose(self.h_r.stack(), [1, 0, 2])
        # get final states. don't need to subtract seqlen by 1 because we
        # take zero states also in count
        last_step = tf.maximum(self.texts_length1, self.texts_length2)
        self.final_h_r = tf.gather_nd(
            self.h_r,
            tf.stack([tf.range(self._batch_size), last_step],
                     axis=1))  # shape: [batch_size, num_lstm_units]

        with tf.variable_scope('fully_connect'):
            self.w_fc = tf.get_variable(shape=[num_lstm_units, num_labels],
                                        initializer=self._initializer,
                                        name='w_fc')
            self.b_fc = tf.get_variable(shape=[num_labels],
                                        initializer=self._initializer,
                                        name='b_fc')
        self.logits = tf.matmul(self.final_h_r, self.w_fc) + self.b_fc

        # ``loss`` is summed over the batch; the optimizer uses its mean.
        # ``accuracy`` is the count of correct predictions in the batch.
        self.loss = tf.reduce_sum(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.labels, logits=self.logits),
            name='loss')
        mean_loss = self.loss / tf.cast(tf.shape(self.labels)[0],
                                        dtype=tf.float32)
        predict_labels = tf.argmax(self.logits, 1, 'predict_labels')
        self.accuracy = tf.reduce_sum(
            tf.cast(tf.equal(self.labels, predict_labels), tf.int64),
            name='accuracy')

        # SGD on the mean loss with global-norm gradient clipping
        self.params = tf.trainable_variables()
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        gradients = tf.gradients(mean_loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)
        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=3,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)
示例#13
0
    def __init__(self,
                 num_symbols,
                 num_embed_units,
                 num_units,
                 num_layers,
                 num_labels,
                 embed,
                 learning_rate=0.5,
                 max_gradient_norm=5.0,
                 keep_prob=1.,
                 weight_decay=1e-10,
                 RNN_type="BasicRNN"):
        """Build a bidirectional RNN text classifier (TF 1.x graph mode).

        Args:
            num_symbols: vocabulary size; only used to shape the embedding
                table when ``embed`` is None.
            num_embed_units: embedding width; only used when ``embed`` is
                None.
            num_units: hidden size of the forward and backward cells.
            num_layers: not used by this constructor.
            num_labels: number of output classes.
            embed: initial value of the embedding table, or None to create
                the table without an explicit initializer.
            learning_rate: initial value of the ``learning_rate`` variable.
            max_gradient_norm: global-norm threshold for gradient clipping.
            keep_prob: initial value of the dropout keep-probability
                variable.
            weight_decay: initial value of the L2 penalty coefficient.
            RNN_type: ``"LSTM"`` selects ``BasicLSTMCell``; any other value
                (including the default ``"BasicRNN"``) selects ``GRUCell``.
        """
        # Inputs.
        self.texts = tf.placeholder(
            dtype=tf.string, shape=[None, None])  # shape: batch*len
        self.texts_length = tf.placeholder(
            dtype=tf.int32, shape=[None])  # shape: batch
        self.labels = tf.placeholder(
            dtype=tf.int64, shape=[None])  # shape: batch

        # build the vocab table (string to index); unknown tokens map to
        # UNK_ID
        self.symbol2index = MutableHashTable(
            key_dtype=tf.string,
            value_dtype=tf.int64,
            default_value=UNK_ID,
            shared_name="in_table",
            name="in_table",
            checkpoint=True)

        # initialize the training process: hyper-parameters and counters are
        # kept in non-trainable variables
        self.learning_rate = tf.Variable(
            float(learning_rate), trainable=False, dtype=tf.float32)
        self.weight_decay = tf.Variable(
            float(weight_decay), trainable=False, dtype=tf.float32)
        self.keep_prob = tf.Variable(
            float(keep_prob), trainable=False, dtype=tf.float32)

        self.global_step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)

        self.index_input = self.symbol2index.lookup(self.texts)  # batch*len

        # build the embedding table (index to vector)
        if embed is None:
            # no pre-trained vectors: create a fresh table
            self.embed = tf.get_variable(
                'embed', [num_symbols, num_embed_units], tf.float32)
        else:
            # initialize the embedding by pre-trained word vectors
            self.embed = tf.get_variable(
                'embed', dtype=tf.float32, initializer=embed)

        self.embed_input = tf.nn.embedding_lookup(
            self.embed, self.index_input)  # batch*len*embed_unit

        # bidirectional encoder: one cell per direction
        use_lstm = RNN_type == "LSTM"
        with tf.variable_scope("foward_cell"):
            if use_lstm:
                fw_cell = BasicLSTMCell(num_units)
            else:
                fw_cell = GRUCell(num_units)
        with tf.variable_scope("barkward_cell"):
            if use_lstm:
                bw_cell = BasicLSTMCell(num_units)
            else:
                bw_cell = GRUCell(num_units)

        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            fw_cell, bw_cell, self.embed_input, self.texts_length,
            dtype=tf.float32, scope="bi_lstm")

        # Sum the final forward and backward states into one sentence
        # vector.  An LSTM state is a (c, h) pair, so index 1 picks h; a GRU
        # state is already the [batch, num_units] tensor, and indexing it
        # with [1] would select batch row 1 instead.
        fw_state, bw_state = states
        if use_lstm:
            self.y0 = fw_state[1] + bw_state[1]
        else:
            self.y0 = fw_state + bw_state
        self.y0_dp = tf.nn.dropout(self.y0, keep_prob=self.keep_prob)

        # NOTE(review): y1 is not used by the logits below (y2 reads y0_dp
        # directly); confirm whether the hidden layer was meant to feed y2.
        # It is kept so the graph and its variables stay unchanged.
        self.y1 = tf.layers.dense(
            inputs=self.y0_dp, units=128, activation=tf.nn.sigmoid)
        self.y2 = tf.layers.dense(inputs=self.y0_dp, units=num_labels)
        logits = self.y2

        # L2 penalty over every trainable variable, scaled by weight_decay
        with tf.name_scope("l2_loss"):
            trainable = tf.trainable_variables()
            self.lossL2 = tf.add_n(
                [tf.nn.l2_loss(v) for v in trainable]) * self.weight_decay

        # ``loss`` is the batch-summed cross entropy plus the L2 penalty;
        # ``accuracy`` is the count of correct predictions in the batch.
        self.loss = tf.reduce_sum(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.labels, logits=logits),
            name='loss') + self.lossL2
        mean_loss = self.loss / tf.cast(
            tf.shape(self.labels)[0], dtype=tf.float32)
        predict_labels = tf.argmax(logits, 1, 'predict_labels')
        self.accuracy = tf.reduce_sum(
            tf.cast(tf.equal(self.labels, predict_labels), tf.int32),
            name='accuracy')

        self.params = tf.trainable_variables()

        # calculate the gradient of parameters, clip by global norm and
        # apply plain SGD
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        gradients = tf.gradients(mean_loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.train_op = opt.apply_gradients(
            zip(clipped_gradients, self.params),
            global_step=self.global_step)

        # summaries: the loss scalar plus a histogram per trainable variable
        tf.summary.scalar('loss/step', self.loss)
        for each in tf.trainable_variables():
            tf.summary.histogram(each.name, each)

        self.merged_summary_op = tf.summary.merge_all()

        self.saver = tf.train.Saver(
            write_version=tf.train.SaverDef.V2,
            max_to_keep=3,
            pad_step_number=True,
            keep_checkpoint_every_n_hours=1.0)
示例#14
0
class InteractiveMatchLSTM(object):
    """Pair-of-texts classifier with two sentence LSTMs and a matching LSTM.

    At every time step both texts are advanced by a weight-shared LSTM
    (``lstm_s``) that also sees the previous matching state; each text then
    attends over its own hidden history, and the two attention vectors drive
    the matching LSTM (``lstm_r``).  The matching state at the last real
    time step is projected to ``num_labels`` logits.  The graph is built in
    TensorFlow 1.x graph mode.
    """

    def __init__(self,
                 num_lstm_units,
                 num_labels,
                 embed,
                 max_gradient_norm=5.0,
                 learning_rate=0.01):
        """Build placeholders, the recurrent loop, the loss and the train op.

        Args:
            num_lstm_units: hidden size shared by all three LSTMs.
            num_labels: number of output classes.
            embed: initial value of the embedding table.
            max_gradient_norm: global-norm threshold for gradient clipping.
            learning_rate: initial value of the non-trainable
                ``learning_rate`` variable handed to the SGD optimizer
                (defaults to the previously hard-coded 0.01).
        """
        self.num_lstm_units = num_lstm_units

        # Inputs: two token matrices, their true lengths and one label per
        # pair.  PAD THE TWO TEXTS TO SAME LENGTH before feeding them.
        self.texts1 = tf.placeholder(
            tf.string, [None, None], name='texts1')  # batch_size*max_len
        self.texts2 = tf.placeholder(
            tf.string, [None, None], name='texts2')  # batch_size*max_len
        self.texts_length1 = tf.placeholder(
            tf.int32, [None], name='texts_length1')  # shape: batch
        self.texts_length2 = tf.placeholder(
            tf.int32, [None], name='texts_length2')  # shape: batch
        self.labels = tf.placeholder(
            tf.int64, [None], name='labels')  # shape: batch

        # string -> id table; unknown tokens map to UNK_ID
        self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=UNK_ID,
                                             shared_name="in_table",
                                             name="in_table",
                                             checkpoint=True)

        # training bookkeeping
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)

        # token ids and their embeddings
        self.index_input1 = self.symbol2index.lookup(
            self.texts1)  # batch*max_len
        self.index_input2 = self.symbol2index.lookup(
            self.texts2)  # batch*max_len
        self.embed = tf.get_variable('embed',
                                     dtype=tf.float32,
                                     initializer=embed)
        self.embed_input1 = tf.nn.embedding_lookup(
            self.embed, self.index_input1)  # batch*max_len*embed_unit
        self.embed_input2 = tf.nn.embedding_lookup(
            self.embed, self.index_input2)  # batch*max_len*embed_unit

        # zero padding: 1.0 on real tokens, 0.0 on padding.  The
        # ``*_extended`` masks get one extra leading zero column so that
        # attention ignores the zero state stored at history index 0.
        self._batch_size = tf.shape(self.texts_length1)[0]
        self._max_length = tf.shape(self.texts1)[1]
        self.mask1 = tf.sequence_mask(self.texts_length1,
                                      maxlen=self._max_length,
                                      dtype=tf.float32)  # batch*max_len
        self.mask1_extended = tf.concat(
            [tf.zeros([self._batch_size, 1], tf.float32), self.mask1], 1)
        self.mask2 = tf.sequence_mask(self.texts_length2,
                                      maxlen=self._max_length,
                                      dtype=tf.float32)  # batch*max_len
        self.mask2_extended = tf.concat(
            [tf.zeros([self._batch_size, 1], tf.float32), self.mask2], 1)
        # debug
        print("mask1 size: " + str(self.mask1.shape))

        # Zero the embeddings of padded positions, then go time-major.
        # The feature axis is moved first so the [batch, max_len] mask
        # broadcasts over it.
        self.embed_input1 = tf.transpose(
            self.embed_input1,
            [2, 0, 1]) * self.mask1  # shape: embed_unit*batch*max_len
        self.embed_input1 = tf.transpose(
            self.embed_input1, [2, 1, 0])  # shape: max_len*batch*embed_units
        self.embed_input2 = tf.transpose(
            self.embed_input2,
            [2, 0, 1]) * self.mask2  # shape: embed_unit*batch*max_len
        self.embed_input2 = tf.transpose(
            self.embed_input2, [2, 1, 0])  # shape: max_len*batch*embed_units

        zero_state = tf.zeros(shape=[self._batch_size, self.num_lstm_units],
                              dtype=tf.float32)

        def new_state_array():
            # Growable per-time-step state store; reads must not clear
            # entries because each one is read more than once.
            return tf.TensorArray(dtype=tf.float32,
                                  size=0,
                                  dynamic_size=True,
                                  clear_after_read=False)

        # Hidden (h) and cell (c) histories for text 1, text 2 and the
        # matching LSTM; index 0 of every array holds the zero state.
        h_s1 = new_state_array()
        c_s1 = new_state_array()
        h_s1 = h_s1.write(0, zero_state)
        c_s1 = c_s1.write(0, zero_state)

        h_s2 = new_state_array()
        c_s2 = new_state_array()
        h_s2 = h_s2.write(0, zero_state)
        c_s2 = c_s2.write(0, zero_state)

        h_r = new_state_array()
        c_r = new_state_array()
        h_r = h_r.write(0, zero_state)
        c_r = c_r.write(0, zero_state)

        self._initializer = tf.truncated_normal_initializer(stddev=0.1)

        # Time index starts at 1 because slot 0 is the zero state; the loop
        # runs for t = 1 .. max_len.  TO DO: check this
        t = tf.constant(1, dtype=tf.int32)

        def keep_going(x, hs1, cs1, hs2, cs2, hr, cr):
            return tf.less(x, self._max_length + 1)

        def step(x, hs1, cs1, hs2, cs2, hr, cr):
            return self._match_step(x, hs1, cs1, hs2, cs2, hr, cr)

        (t, self.h_s1, self.c_s1, self.h_s2, self.c_s2, self.h_r,
         self.c_r) = tf.while_loop(
             cond=keep_going,
             body=step,
             loop_vars=(t, h_s1, c_s1, h_s2, c_s2, h_r, c_r))

        # shape: [batch_size, max_len + 1, num_lstm_units] (zero state at
        # index 0 followed by one state per time step)
        self.h_r = tf.transpose(self.h_r.stack(), [1, 0, 2])
        # get final states. don't need to subtract seqlen by 1 because we
        # take zero states also in count
        last_step = tf.maximum(self.texts_length1, self.texts_length2)
        self.final_h_r = tf.gather_nd(
            self.h_r,
            tf.stack([tf.range(self._batch_size), last_step],
                     axis=1))  # shape: [batch_size, num_lstm_units]

        with tf.variable_scope('fully_connect'):
            self.w_fc = tf.get_variable(shape=[num_lstm_units, num_labels],
                                        initializer=self._initializer,
                                        name='w_fc')
            self.b_fc = tf.get_variable(shape=[num_labels],
                                        initializer=self._initializer,
                                        name='b_fc')
        self.logits = tf.matmul(self.final_h_r, self.w_fc) + self.b_fc

        # ``loss`` is summed over the batch; the optimizer uses its mean.
        # ``accuracy`` is the count of correct predictions in the batch.
        self.loss = tf.reduce_sum(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.labels, logits=self.logits),
            name='loss')
        mean_loss = self.loss / tf.cast(tf.shape(self.labels)[0],
                                        dtype=tf.float32)
        predict_labels = tf.argmax(self.logits, 1, 'predict_labels')
        self.accuracy = tf.reduce_sum(
            tf.cast(tf.equal(self.labels, predict_labels), tf.int64),
            name='accuracy')

        # SGD on the mean loss with global-norm gradient clipping
        self.params = tf.trainable_variables()
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        gradients = tf.gradients(mean_loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)
        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=3,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)

    def _match_step(self, t, h_s1, c_s1, h_s2, c_s2, h_r, c_r):
        """Advance all three LSTMs by one time step (``tf.while_loop`` body).

        Reads ``self.embed_input1`` / ``self.embed_input2`` (time-major,
        [max_length, batch_size, embed_units]) at position ``t - 1``.

        :param t: time index (starts from 1; slot 0 holds the zero state)
        :param h_s1: TensorArray, hidden states of text1 up to step t - 1,
            each of shape [batch_size, num_lstm_units]
        :param c_s1: TensorArray, cell states of text1, same layout
        :param h_s2: TensorArray, hidden states of text2, same layout
        :param c_s2: TensorArray, cell states of text2, same layout
        :param h_r: TensorArray, hidden states of the matching LSTM
        :param c_r: TensorArray, cell states of the matching LSTM
        :return: ``(t + 1, h_s1, c_s1, h_s2, c_s2, h_r, c_r)`` with slot
            ``t`` of every array filled in
        """
        # The arrays keep entries after reading, so one read of the previous
        # matching state can be shared by every consumer in this step.
        prev_h_r = h_r.read(t - 1)

        # lstms calculate first: each sentence LSTM sees its current token
        # embedding concatenated with the previous matching state
        inputs_s1 = tf.concat(
            [self.embed_input1[t - 1, :, :], prev_h_r],
            axis=1)  # shape: [batch_size, embed_units + num_lstm_units]
        inputs_s2 = tf.concat(
            [self.embed_input2[t - 1, :, :], prev_h_r], axis=1)
        with tf.variable_scope('lstm_s'):
            newc_s1, newh_s1 = self._lstm(inputs=inputs_s1,
                                          states=(c_s1.read(t - 1),
                                                  h_s1.read(t - 1)))
        # second text reuses the weights created just above
        with tf.variable_scope('lstm_s', reuse=True):
            newc_s2, newh_s2 = self._lstm(inputs=inputs_s2,
                                          states=(c_s2.read(t - 1),
                                                  h_s2.read(t - 1)))
        c_s1 = c_s1.write(t, newc_s1)
        h_s1 = h_s1.write(t, newh_s1)
        c_s2 = c_s2.write(t, newc_s2)
        h_s2 = h_s2.write(t, newh_s2)

        # calculate attention of each text over its own history (slots
        # 0..t), with weights shared between the two directions
        with tf.variable_scope('attention'):
            at1 = self._attention(t, h_s1, h_s2, h_r,
                                  self.mask1_extended[:, :t + 1])
        with tf.variable_scope('attention', reuse=True):
            at2 = self._attention(t, h_s2, h_s1, h_r,
                                  self.mask2_extended[:, :t + 1])

        # lstmr update
        inputs_r = tf.concat([at1, at2],
                             axis=1)  # shape: [batch_size, num_lstm_units * 2]
        with tf.variable_scope('lstm_r'):
            newc_r, newh_r = self._lstm(inputs=inputs_r,
                                        states=(c_r.read(t - 1), prev_h_r))
        c_r = c_r.write(t, newc_r)
        h_r = h_r.write(t, newh_r)

        t = tf.add(t, 1)
        return t, h_s1, c_s1, h_s2, c_s2, h_r, c_r

    def _attention(self, t, h_self, h_other, h_r, mask_self):
        """Attend over the history of ``h_self`` given the other text.

        :param t: time index (starts from 1)
        :param h_self: TensorArray with t + 1 hidden states of 'self'
            (slots 0..t), each [batch_size, num_lstm_units]
        :param h_other: TensorArray of the other text, same layout
        :param h_r: TensorArray of matching-LSTM hidden states; slot t - 1
            is read
        :param mask_self: [batch_size, t + 1] mask, 0.0 on the zero-state
            slot and on padding
        :return: attention-weighted sum of the 'self' states,
            shape: [batch_size, num_lstm_units]
        """
        We = tf.get_variable(shape=[self.num_lstm_units, 1],
                             initializer=self._initializer,
                             name='W_e')
        Wo = tf.get_variable(shape=[self.num_lstm_units, self.num_lstm_units],
                             initializer=self._initializer,
                             name='W_other')
        Ws = tf.get_variable(shape=[self.num_lstm_units, self.num_lstm_units],
                             initializer=self._initializer,
                             name='W_self')
        Wa = tf.get_variable(shape=[self.num_lstm_units, self.num_lstm_units],
                             initializer=self._initializer,
                             name='W_attention')

        # stacked once and reused for both the scores and the weighted sum
        self_states = h_self.stack()  # [t + 1, batch_size, num_lstm_units]

        # the two [batch_size, num_lstm_units] terms broadcast over time
        etj = tf.einsum('ijk,kl->ijl', self_states, Ws) + tf.matmul(
            h_other.read(t), Wo) + tf.matmul(h_r.read(t - 1), Wa)
        etj = tf.transpose(
            etj, [1, 0, 2])  # shape: [batch_size, t + 1, num_lstm_units]
        etj = tf.squeeze(tf.einsum('ijk,kl->ijl', tf.tanh(etj), We),
                         axis=2)  # shape: [batch_size, t + 1]
        # masked softmax over time.  NOTE: a row whose mask is all zeros
        # makes the sum below zero, so the division is undefined there.
        etj = tf.exp(etj) * mask_self
        etj_sums = tf.reduce_sum(etj, axis=1)
        # the double transpose lets the [batch_size] sums broadcast
        atj = tf.transpose(tf.transpose(etj) / etj_sums)
        at = tf.transpose(
            tf.transpose(self_states, [2, 1, 0]) * atj, [1, 2, 0])
        at = tf.reduce_sum(at, axis=1)  # shape: [batch_size, num_lstm_units]
        return at

    def _lstm(self, inputs, states):
        """Apply one LSTM cell step using variables of the current scope.

        :param inputs: [batch_size, input_dim] input for this step
        :param states: ``(c, h)`` pair of previous cell and hidden state
        :return: ``(new_c, new_h)``
        """
        c, h = states
        # every gate consumes [inputs, h], so the weight shape is computed
        # once here and the concatenation is built once below
        weight_shape = [
            inputs.get_shape()[-1] + h.get_shape()[-1], self.num_lstm_units
        ]

        def gate_params(weight_name, bias_name, bias_value):
            # orthogonal weight matrix plus constant bias for one gate
            weight = tf.get_variable(weight_name,
                                     dtype=tf.float32,
                                     shape=weight_shape,
                                     initializer=tf.orthogonal_initializer())
            bias = tf.get_variable(
                bias_name,
                dtype=tf.float32,
                shape=[self.num_lstm_units],
                initializer=tf.constant_initializer(bias_value))
            return weight, bias

        _wi, _bi = gate_params('lstm_cell_wi', 'lstm_cell_bi', 0.0)
        _wo, _bo = gate_params('lstm_cell_wo', 'lstm_cell_bo', 0.0)
        # the forget-gate bias is initialised to 1.0, unlike the others
        _wf, _bf = gate_params('lstm_cell_wf', 'lstm_cell_bf', 1.0)
        _wc, _bc = gate_params('lstm_cell_wc', 'lstm_cell_bc', 0.0)

        gate_input = tf.concat([inputs, h], 1)
        i = tf.nn.sigmoid(tf.matmul(gate_input, _wi) + _bi)  # input gate
        o = tf.nn.sigmoid(tf.matmul(gate_input, _wo) + _bo)  # output gate
        f = tf.nn.sigmoid(tf.matmul(gate_input, _wf) + _bf)  # forget gate
        _c = tf.tanh(tf.matmul(gate_input, _wc) + _bc)  # candidate cell
        new_c = f * c + i * _c
        new_h = o * tf.tanh(new_c)
        return new_c, new_h

    def print_parameters(self):
        """Print the name and static shape of every trainable parameter."""
        for item in self.params:
            print('%s: %s' % (item.name, item.get_shape()))

    def train_step(self, session, data):
        """Run one optimisation step.

        :param session: object whose ``run(fetches, feed_dict)`` executes
            the graph
        :param data: mapping with the keys ``'texts1'``, ``'texts2'``,
            ``'texts_length1'``, ``'texts_length2'`` and ``'labels'``
        :return: whatever ``session.run`` returns for the fetch list
            ``[loss, accuracy, update, final_h_r, logits]``
        """
        input_feed = {
            self.texts1: data['texts1'],
            self.texts2: data['texts2'],
            self.texts_length1: data['texts_length1'],
            self.texts_length2: data['texts_length2'],
            self.labels: data['labels']
        }
        output_feed = [
            self.loss, self.accuracy, self.update, self.final_h_r, self.logits
        ]
        return session.run(output_feed, input_feed)
示例#15
0
class RNN(object):
    """Text classifier: vocabulary lookup -> embedding -> stacked RNN -> logits.

    The complete TensorFlow graph (placeholders, vocabulary hash table,
    embedding table, recurrent layers, loss, accuracy, clipped SGD update and
    saver) is built in ``__init__``; ``train_step`` runs one update.
    """

    def __init__(self,
                 num_symbols,
                 num_embed_units,
                 num_units,
                 num_layers,
                 num_labels,
                 embed,
                 learning_rate=0.005,
                 max_gradient_norm=5.0,
                 prob=1):
        """Build the graph.

        Args:
            num_symbols: number of rows of the embedding table when ``embed``
                is None.
            num_embed_units: number of columns of the embedding table when
                ``embed`` is None.
            num_units: size of every ``BasicRNNCell``.
            num_layers: number of stacked cells.
            num_labels: width of the logits.
            embed: initial value of the embedding variable, or None to let
                ``tf.get_variable`` initialise it.
            learning_rate: initial value of the non-trainable learning-rate
                variable.
            max_gradient_norm: threshold passed to ``tf.clip_by_global_norm``.
            prob: output keep probability; a ``DropoutWrapper`` is added only
                when it is below 1.
        """
        self.texts = tf.placeholder(tf.string, (None, None),
                                    'texts')  # shape: [batch, length]

        # Sequence lengths are counts and are passed to dynamic_rnn as
        # `sequence_length`, so they are declared as integers (the original
        # float32 placeholder was inconsistent with that use).
        self.texts_length = tf.placeholder(tf.int32, None,
                                           'texts_length')  # shape: [batch]
        self.labels = tf.placeholder(tf.int64, None,
                                     'labels')  # shape: [batch]

        # Vocabulary table (string -> index); unknown words map to UNK_ID.
        self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=UNK_ID,
                                             shared_name="in_table",
                                             name="in_table",
                                             checkpoint=True)

        # Training state.
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)

        # NOTE: this variable only stores the keep probability; the dropout
        # wrapper below is configured from the Python value `prob`.
        self.prob = tf.Variable(float(prob), trainable=False, dtype=tf.float32)

        self.index_input = self.symbol2index.lookup(
            self.texts)  # shape: [batch, length]

        # Embedding table (index -> vector).
        if embed is None:
            # initialize the embedding randomly
            self.embed = tf.get_variable('embed',
                                         [num_symbols, num_embed_units],
                                         tf.float32)
        else:
            # initialize the embedding by pre-trained word vectors
            self.embed = tf.get_variable('embed',
                                         dtype=tf.float32,
                                         initializer=embed)

        self.embed_input = tf.nn.embedding_lookup(
            self.embed,
            self.index_input)  # shape: [batch, length, num_embed_units]

        # To obtain the GRU / LSTM variants of this model, replace
        # BasicRNNCell below with GRUCell or BasicLSTMCell respectively.
        cell = MultiRNNCell(
            [BasicRNNCell(num_units) for _ in range(num_layers)])
        if prob < 1:
            cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=prob)

        outputs, states = dynamic_rnn(cell,
                                      self.embed_input,
                                      self.texts_length,
                                      dtype=tf.float32,
                                      scope="rnn")

        # `states` holds one final state per layer; take the top layer.
        # For an LSTM cell the state is a tuple, so use states[-1][1] instead.
        vectors = states[-1]

        with tf.variable_scope('logits'):
            weight = tf.get_variable("weights", [num_units, num_labels])
            bias = tf.get_variable("biases", [num_labels])
            # Linear map: [batch, num_units] -> [batch, num_labels].
            logits = tf.matmul(vectors, weight) + bias

        # Loss and accuracy are sums over the batch, not means.
        self.loss = tf.reduce_sum(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels,
                                                           logits=logits),
            name='loss')
        predict_labels = tf.argmax(logits, 1, 'predict_labels')
        self.accuracy = tf.reduce_sum(tf.cast(
            tf.equal(self.labels, predict_labels), tf.int32),
                                      name='accuracy')

        self.params = tf.trainable_variables()

        # Plain SGD on globally clipped gradients.
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        gradients = tf.gradients(self.loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=5,
                                    pad_step_number=True)

    def print_parameters(self):
        """Print ``name: shape`` for every trainable variable in ``self.params``."""
        for item in self.params:
            print('%s: %s' % (item.name, item.get_shape()))

    def train_step(self, session, data):
        """Run one update step.

        Args:
            session: session used to run the graph.
            data: mapping with the keys 'texts', 'texts_length' and 'labels'.

        Returns:
            The result of ``session.run`` for
            [loss, accuracy, gradient_norm, update].
        """
        input_feed = {
            self.texts: data['texts'],
            self.texts_length: data['texts_length'],
            self.labels: data['labels']
        }
        output_feed = [
            self.loss, self.accuracy, self.gradient_norm, self.update
        ]
        return session.run(output_feed, input_feed)
示例#16
0
    def __init__(self,
                 num_symbols,
                 num_embed_units,
                 num_units,
                 num_layers,
                 num_labels,
                 embed,
                 learning_rate=0.001,
                 max_gradient_norm=5.0,
                 learning_rate_decay_factor=0.1):
        """Build a bidirectional RNN text-classification graph.

        With ``num_layers == 1`` a bidirectional LSTM is used; otherwise a
        bidirectional stack of ``num_layers`` GRU cells. In both cases the
        final forward and backward states are summed and fed to a linear
        layer producing the logits.

        Args:
            num_symbols: number of rows of the embedding table when ``embed``
                is None.
            num_embed_units: number of columns of the embedding table when
                ``embed`` is None.
            num_units: size of every recurrent cell.
            num_layers: number of stacked layers per direction.
            num_labels: width of the logits.
            embed: initial value of the embedding variable, or None to let
                ``tf.get_variable`` initialise it.
            learning_rate: initial value of the learning-rate variable.
            max_gradient_norm: threshold passed to ``tf.clip_by_global_norm``.
            learning_rate_decay_factor: factor the learning rate is multiplied
                by each time ``learning_rate_update_op`` is run.
        """
        # PROBLEMS REMAIN (note kept from the original author)
        self.texts = tf.placeholder(dtype=tf.string,
                                    shape=[None, None])  # shape: batch*len
        self.texts_length = tf.placeholder(dtype=tf.int32,
                                           shape=[None])  # shape: batch
        self.labels = tf.placeholder(dtype=tf.int32,
                                     shape=[None])  # shape: batch
        # Scalar keep probability for the dropout wrappers, fed at run time.
        self.output_keep_prob = tf.placeholder(dtype=tf.float32, shape=[])

        # Vocabulary table (string -> index); unknown words map to UNK_ID.
        self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=UNK_ID,
                                             shared_name="in_table",
                                             name="in_table",
                                             checkpoint=True)

        # Training state.
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=tf.float32)
        self.learning_rate_update_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)

        self.index_input = self.symbol2index.lookup(self.texts)  # batch*len

        # Embedding table (index -> vector).
        if embed is None:
            # initialize the embedding randomly
            self.embed = tf.get_variable('embed',
                                         [num_symbols, num_embed_units],
                                         tf.float32)
        else:
            # initialize the embedding by pre-trained word vectors
            self.embed = tf.get_variable('embed',
                                         dtype=tf.float32,
                                         initializer=embed)

        self.embed_input = tf.nn.embedding_lookup(
            self.embed, self.index_input)  # batch*len*embed_unit

        if num_layers == 1:
            # Final single-layer model: bidirectional LSTM with output
            # dropout. (Earlier experiments used a unidirectional
            # BasicRNNCell, GRUCell or BasicLSTMCell with dynamic_rnn.)
            cell = tf.nn.rnn_cell.DropoutWrapper(
                BasicLSTMCell(num_units),
                output_keep_prob=self.output_keep_prob)
            cell_bw = tf.nn.rnn_cell.DropoutWrapper(
                BasicLSTMCell(num_units),
                output_keep_prob=self.output_keep_prob)
            outputs, states = bidirectional_dynamic_rnn(cell,
                                                        cell_bw,
                                                        self.embed_input,
                                                        self.texts_length,
                                                        dtype=tf.float32,
                                                        scope="rnn")
            # Sum element 1 of the forward and backward LSTM state tuples.
            states = states[0][1] + states[1][1]
        else:
            cells = []
            cells_bw = []
            for _ in range(num_layers):
                # The keep probability lives on the instance; the bare name
                # `output_keep_prob` used originally is undefined here and
                # raised NameError for every multi-layer model.
                cell = tf.nn.rnn_cell.DropoutWrapper(
                    GRUCell(num_units),
                    output_keep_prob=self.output_keep_prob)
                cells.append(cell)
                cell_bw = tf.nn.rnn_cell.DropoutWrapper(
                    GRUCell(num_units),
                    output_keep_prob=self.output_keep_prob)
                cells_bw.append(cell_bw)
            cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
            cell_bw = tf.contrib.rnn.MultiRNNCell(cells_bw,
                                                  state_is_tuple=True)
            outputs, states = bidirectional_dynamic_rnn(cell,
                                                        cell_bw,
                                                        self.embed_input,
                                                        self.texts_length,
                                                        dtype=tf.float32,
                                                        scope="stacked_rnn")
            # Sum the top-layer final states of both directions.
            states = states[0][num_layers - 1] + states[1][num_layers - 1]

        # Output layer: [batch, num_units] -> [batch, num_labels].
        self.w1 = tf.Variable(
            tf.random_normal(shape=[num_units, num_labels],
                             stddev=tf.sqrt(2.0 / (num_units + num_labels))))
        self.b1 = tf.Variable(tf.constant(0.0, shape=[num_labels]))
        logits = tf.matmul(states, self.w1) + self.b1

        # `self.loss` is the batch sum; gradients use the batch mean.
        self.loss = tf.reduce_sum(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels,
                                                           logits=logits),
            name='loss')
        mean_loss = self.loss / tf.cast(tf.shape(self.labels)[0],
                                        dtype=tf.float32)
        self.predict_labels = tf.argmax(logits,
                                        1,
                                        'predict_labels',
                                        output_type=tf.int32)
        # Number of correct predictions in the batch.
        self.accuracy = tf.reduce_sum(tf.cast(
            tf.equal(self.labels, self.predict_labels), tf.int32),
                                      name='accuracy')

        self.params = tf.trainable_variables()

        # Plain SGD on globally clipped gradients.
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        gradients = tf.gradients(mean_loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)

        # Summaries: summed loss plus a histogram per trainable variable.
        tf.summary.scalar('loss/step', self.loss)
        for each in tf.trainable_variables():
            tf.summary.histogram(each.name, each)

        self.merged_summary_op = tf.summary.merge_all()

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=3,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)
示例#17
0
class RNN(object):
    """Sentence-pair classifier with a shared LSTM encoder and attention.

    Both input texts are encoded by the same LSTM (``lstm_s``). A second LSTM
    (``lstm_r``) is then stepped ``max_length`` times inside a
    ``tf.while_loop``; at each step ``attention`` builds its input from
    attention-weighted sums over both encodings. The final hidden state of
    ``lstm_r`` is mapped to the logits by a fully connected layer.
    """

    def __init__(self,
                 num_symbols,
                 num_embed_units,
                 num_units,
                 num_labels,
                 batch_size,
                 embed,
                 learning_rate=0.001,
                 max_gradient_norm=5.0,
                 learning_rate_decay_factor=0.9):
        """Build the graph.

        Args:
            num_symbols: number of rows of the embedding table when ``embed``
                is None.
            num_embed_units: number of columns of the embedding table when
                ``embed`` is None.
            num_units: size of both LSTM cells and of the attention matrices.
            num_labels: width of the logits.
            batch_size: fixed batch dimension of all placeholders.
            embed: initial value of the embedding variable, or None to let
                ``tf.get_variable`` initialise it.
            learning_rate: initial value of the learning-rate variable.
            max_gradient_norm: threshold passed to ``tf.clip_by_global_norm``.
            learning_rate_decay_factor: factor the learning rate is multiplied
                by each time ``learning_rate_decay_op`` is run.
        """
        self.texts1 = tf.placeholder(tf.string, [batch_size, None],
                                     name='texts1')
        self.texts2 = tf.placeholder(tf.string, [batch_size, None],
                                     name='texts2')  # shape: batch*len
        self.texts_length1 = tf.placeholder(
            tf.int32, [batch_size], name='texts_length1')  # shape: batch
        self.texts_length2 = tf.placeholder(tf.int32, [batch_size],
                                            name='texts_length2')
        # Number of attention steps run by the while loop below.
        self.max_length = tf.placeholder(tf.int32, name='max_length')
        self.labels = tf.placeholder(tf.int64, [batch_size],
                                     name='labels')  # shape: batch
        # NOTE: fed by train_step but not used by any op built here.
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        self.embed_units = num_embed_units
        self.num_units = num_units
        self.batch_size = batch_size
        self._initializer = tf.truncated_normal_initializer(stddev=0.1)
        # Vocabulary table (string -> index); unknown words map to UNK_ID.
        self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=UNK_ID,
                                             shared_name="in_table",
                                             name="in_table",
                                             checkpoint=True)
        # Training state.
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.index_input1 = self.symbol2index.lookup(self.texts1)  # batch*len
        self.index_input2 = self.symbol2index.lookup(self.texts2)
        # Per-example longer length of the two texts; drives the attention mask.
        self.long_length = tf.maximum(self.texts_length1, self.texts_length2)
        print(self.long_length.get_shape())
        self.mask_table = tf.sequence_mask(self.long_length, dtype=tf.float32)
        # Embedding table (index -> vector).
        if embed is None:
            # initialize the embedding randomly
            self.embed = tf.get_variable('embed',
                                         [num_symbols, num_embed_units],
                                         tf.float32)
        else:
            # initialize the embedding by pre-trained word vectors
            self.embed = tf.get_variable('embed',
                                         dtype=tf.float32,
                                         initializer=embed)

        self.embed_input1 = tf.nn.embedding_lookup(
            self.embed, self.index_input1)  # batch*len*embed_unit
        self.embed_input2 = tf.nn.embedding_lookup(self.embed,
                                                   self.index_input2)

        with tf.variable_scope('lstm_s'):
            self.lstm_s = tf.contrib.rnn.LSTMCell(
                num_units=num_units,
                initializer=tf.orthogonal_initializer,
                forget_bias=0)

        with tf.variable_scope('lstm_r'):
            self.lstm_r = tf.contrib.rnn.LSTMCell(
                num_units=num_units,
                initializer=tf.orthogonal_initializer,
                forget_bias=0)

        # The same cell object encodes both texts.
        out_s1, state_s1 = dynamic_rnn(self.lstm_s,
                                       self.embed_input1,
                                       self.texts_length1,
                                       dtype=tf.float32,
                                       scope='rnn')
        out_s2, state_s2 = dynamic_rnn(self.lstm_s,
                                       self.embed_input2,
                                       self.texts_length2,
                                       dtype=tf.float32,
                                       scope='rnn')

        self.h_s1 = out_s1
        self.h_s2 = out_s2

        # Project every encoder output with W_s once, outside the loop.
        reshaped_s1 = tf.reshape(self.h_s1, [-1, self.num_units])
        reshaped_s2 = tf.reshape(self.h_s2, [-1, self.num_units])
        with tf.variable_scope('Attn_'):
            W_s = tf.get_variable(shape=[self.num_units, self.num_units],
                                  initializer=self._initializer,
                                  name='W_s')
        self.s_1 = tf.matmul(reshaped_s1, W_s)
        self.s_2 = tf.matmul(reshaped_s2, W_s)
        # Rearranged to (len, num_units, batch) for use in `attention`.
        self.s_1 = tf.transpose(
            tf.reshape(self.s_1, [self.batch_size, -1, self.num_units]),
            [1, 2, 0])
        self.s_2 = tf.transpose(
            tf.reshape(self.s_2, [self.batch_size, -1, self.num_units]),
            [1, 2, 0])
        i = tf.constant(0)

        state_r = self.lstm_r.zero_state(batch_size=batch_size,
                                         dtype=tf.float32)

        def c(t, sr):
            # Continue while the step counter is below max_length.
            return tf.less(t, self.max_length)

        def b(t, sr):
            # One attention step; returns the advanced counter and new state.
            return self.attention(t, sr)

        i, state_r = tf.while_loop(cond=c, body=b, loop_vars=(i, state_r))

        with tf.variable_scope('fully_connect'):
            w_fc = tf.get_variable(shape=[self.num_units, num_labels],
                                   initializer=self._initializer,
                                   name='w_fc')
            b_fc = tf.get_variable(shape=[num_labels],
                                   initializer=self._initializer,
                                   name='b_fc')
        logits = tf.matmul(state_r.h, w_fc) + b_fc

        # `self.loss` is the batch sum; gradients use the batch mean.
        self.loss = tf.reduce_sum(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels,
                                                           logits=logits),
            name='loss')
        mean_loss = self.loss / \
            tf.cast(tf.shape(self.labels)[0], dtype=tf.float32)
        predict_labels = tf.argmax(logits, 1, 'predict_labels')
        # Number of correct predictions in the batch.
        self.accuracy = tf.reduce_sum(tf.cast(
            tf.equal(self.labels, predict_labels), tf.int64),
                                      name='accuracy')

        self.params = tf.trainable_variables()
        # Debug output: list every global variable created so far.
        for item in tf.global_variables():
            print(item)
        # Plain SGD on globally clipped gradients.
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        gradients = tf.gradients(mean_loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)

        # Summaries: summed loss plus a histogram per trainable variable.
        tf.summary.scalar('loss/step', self.loss)
        for each in tf.trainable_variables():
            tf.summary.histogram(each.name, each)

        self.merged_summary_op = tf.summary.merge_all()

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=3,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)

    def attention(self, t, hr):
        """Run one attention step of the ``lstm_r`` loop.

        Args:
            t: scalar int tensor, the current step index.
            hr: current state of ``lstm_r`` (its ``.h`` field is read).

        Returns:
            ``(t + 1, new_state)`` where ``new_state`` is the state returned
            by ``lstm_r`` for the attention-derived input.
        """
        with tf.variable_scope('Attn_'):
            W_o = tf.get_variable(shape=[self.num_units, self.num_units],
                                  initializer=self._initializer,
                                  name='W_o')
            W_e = tf.get_variable(shape=[self.num_units, 1],
                                  initializer=self._initializer,
                                  name='W_e')
            W_a = tf.get_variable(shape=[self.num_units, self.num_units],
                                  initializer=self._initializer,
                                  name='W_a')
        # Scores for text 1 are conditioned on step t of text 2 and vice versa.
        e1_tj = tf.tanh(self.s_1 + tf.transpose(
            tf.matmul(self.h_s2[:, t, :], W_o) + tf.matmul(hr.h, W_a)))
        e2_tj = tf.tanh(self.s_2 + tf.transpose(
            tf.matmul(self.h_s1[:, t, :], W_o) + tf.matmul(hr.h, W_a)))
        print(e1_tj.get_shape())
        # (max_len, num_units, batch_size)
        e1_tj = tf.matmul(
            tf.reshape(tf.transpose(e1_tj, [2, 0, 1]), [-1, self.num_units]),
            W_e)
        e2_tj = tf.matmul(
            tf.reshape(tf.transpose(e2_tj, [2, 0, 1]), [-1, self.num_units]),
            W_e)
        # (max_len*batch_size, 1)
        print(e1_tj.get_shape())
        e1_tj = tf.reshape(e1_tj, [self.batch_size, -1])
        e2_tj = tf.reshape(e2_tj, [self.batch_size, -1])
        # (batch_size, max_len)
        print(e1_tj.get_shape())

        # Masked softmax: zero out padded positions, then normalise per row.
        alpha1_tj = tf.exp(e1_tj) * self.mask_table
        alpha2_tj = tf.exp(e2_tj) * self.mask_table
        alpha1_tj = tf.transpose(alpha1_tj) / tf.reduce_sum(alpha1_tj, 1)
        alpha2_tj = tf.transpose(alpha2_tj) / tf.reduce_sum(alpha2_tj, 1)
        print(alpha1_tj.get_shape())
        # (max_len, batch_size)
        a1tj = alpha1_tj * tf.transpose(self.h_s1, [2, 1, 0])
        a2tj = alpha2_tj * tf.transpose(self.h_s2, [2, 1, 0])
        print(a1tj.get_shape())
        # (num_units, max_len, batch_size)
        a1tj = tf.reduce_sum(a1tj, 1)
        a2tj = tf.reduce_sum(a2tj, 1)
        print(a1tj.get_shape())
        # (num_units, batch_size)
        r_t = tf.transpose(tf.concat([a1tj, a2tj], 0))
        print(r_t.get_shape())
        # (batch_size, 2*num_units)
        with tf.variable_scope('lstm_r'):
            out_r, hr = self.lstm_r(inputs=r_t, state=hr)
        t = tf.add(t, 1)
        return t, hr

    def print_parameters(self):
        """Print ``name: shape`` for every trainable variable in ``self.params``."""
        for item in self.params:
            print('%s: %s' % (item.name, item.get_shape()))

    def train_step(self, session, data, summary=False):
        """Run one update step.

        Args:
            session: session used to run the graph.
            data: mapping with the keys 'texts1', 'texts2', 'texts_length1',
                'texts_length2', 'max_length', 'labels' and 'keep_prob'.
            summary: when true, the merged summary op is fetched as well.

        Returns:
            The result of ``session.run`` for
            [loss, accuracy, gradient_norm, update] plus, if requested, the
            merged summary.
        """
        input_feed = {
            self.texts1: data['texts1'],
            self.texts2: data['texts2'],
            self.texts_length1: data['texts_length1'],
            self.texts_length2: data['texts_length2'],
            self.max_length: data['max_length'],
            self.labels: data['labels'],
            self.keep_prob: data['keep_prob']
        }
        output_feed = [
            self.loss,
            self.accuracy,
            self.gradient_norm,
            self.update
        ]
        if summary:
            output_feed.append(self.merged_summary_op)
        return session.run(output_feed, input_feed)
示例#18
0
    def __init__(
            self,
            num_symbol,  # vocabulary size
            num_units,  # hidden layer dimension
            num_layers,  # number of encoder/decoder layers
            embed,  # word embeddings
            max_length=60,
            learning_rate=0.0001,
            max_gradient_norm=5.0,
            output_alignments=False):  # whether to keep the attention weights
        """Build an attention-based encoder/decoder graph.

        Args:
            num_symbol: vocabulary size; passed to the output projection and
                the inference decoder function.
            num_units: size of every LSTM cell and of the attention layer.
            num_layers: number of stacked LSTM cells in encoder and decoder.
            embed: initial value of the word-embedding variable; also used
                directly for the training-time embedding lookups.
            max_length: passed to the inference decoder function.
            learning_rate: learning rate of the Adam optimizer.
            max_gradient_norm: threshold passed to ``tf.clip_by_global_norm``.
            output_alignments: forwarded to the training decoder function.
        """
        # Hash table mapping words to indices.
        self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=UNK_ID,
                                             shared_name="in_table",
                                             name="in_table",
                                             checkpoint=True)
        # Hash table mapping indices back to words.
        self.index2symbol = MutableHashTable(key_dtype=tf.int64,
                                             value_dtype=tf.string,
                                             default_value='_UNK',
                                             shared_name="out_table",
                                             name="out_table",
                                             checkpoint=True)

        # Model inputs and the embedding variable.
        self.posts_string = tf.placeholder(
            name="posts_string", shape=(None, None),
            dtype=tf.string)  # [batch_size, encoder_len]
        self.posts_len = tf.placeholder(name="posts_len",
                                        shape=(None),
                                        dtype=tf.int32)  # [batch_size]
        self.responses_string = tf.placeholder(
            name="responses_string", shape=(None, None),
            dtype=tf.string)  # [batch_size, decoder_len]
        self.responses_len = tf.placeholder(name="responses_len",
                                            shape=(None),
                                            dtype=tf.int32)  # [batch_size]
        self.embed = tf.get_variable("word_embed",
                                     dtype=tf.float32,
                                     initializer=embed)

        batch_size, encoder_len = tf.unstack(tf.shape(self.posts_string))
        decoder_len = tf.shape(self.responses_string)[1]

        # Index-sequence representation of posts and responses.
        self.posts_index = self.symbol2index.lookup(
            self.posts_string)  # [batch_size, encoder_len]
        self.responses_index = self.symbol2index.lookup(
            self.responses_string)  # [batch_size, decoder_len]

        # Decoder input: the responses shifted right by one step, i.e. a GO_ID
        # column prepended and the last column dropped.
        self.responses_input_index = tf.concat([
            tf.ones((batch_size, 1), dtype=tf.int64) * GO_ID,
            tf.split(self.responses_index, [decoder_len - 1, 1], axis=1)[0]
        ],
                                               axis=1)

        # Encoder and decoder inputs.
        # NOTE(review): these lookups read the raw `embed` argument, not the
        # `self.embed` variable, so they do not depend on that variable --
        # confirm this is intended.
        self.encoder_input = tf.nn.embedding_lookup(
            embed,
            self.posts_index)  # [batch_size, encoder_len, embedding_size]
        self.decoder_input = tf.nn.embedding_lookup(
            embed, self.responses_input_index
        )  # [batch_size, decoder_len, embedding_size]

        # Reverse cumulative sum of a one-hot at position len-1 yields ones
        # for positions 0..len-1 and zeros afterwards.
        self.decoder_mask = tf.reshape(
            tf.cumsum(tf.one_hot(self.responses_len - 1, decoder_len),
                      reverse=True,
                      axis=1), [-1, decoder_len])  # [batch_size, decoder_len]

        encoder_cell = MultiRNNCell(
            [LSTMCell(num_units) for _ in range(num_layers)])
        decoder_cell = MultiRNNCell(
            [LSTMCell(num_units) for _ in range(num_layers)])

        encoder_output, encoder_state = tf.nn.dynamic_rnn(encoder_cell,
                                                          self.encoder_input,
                                                          self.posts_len,
                                                          dtype=tf.float32,
                                                          scope="encoder")

        output_fn, sequence_loss = output_projection_layer(
            num_units, num_symbol)

        # Training decoder.
        with tf.variable_scope("decoder"):
            keys, values, attention_score_fn, attention_construct_fn = \
                prepare_attention(encoder_output, num_units, reuse=False)
            decoder_fn_train = attention_decoder_fn_train(
                encoder_state,
                keys,
                values,
                attention_score_fn,
                attention_construct_fn,
                output_alignments=output_alignments,
                decoder_len=decoder_len)
            self.decoder_output, _, alignments_ta = dynamic_rnn_decoder(
                decoder_cell,
                decoder_fn_train,
                inputs=self.decoder_input,
                sequence_length=self.responses_len,
                scope="decoder_rnn")
            self.total_loss, self.loss = sequence_loss(self.decoder_output,
                                                       self.responses_index,
                                                       self.decoder_mask)

        # Inference decoder, reusing the variables of the training decoder.
        with tf.variable_scope("decoder", reuse=True):
            # Obtain the attention functions.
            keys, values, attention_score_fn, attention_construct_fn = \
                prepare_attention(encoder_output, num_units, reuse=True)
            decoder_fn_inference = attention_decoder_fn_inference(
                output_fn, encoder_state, keys, values, attention_score_fn,
                attention_construct_fn, self.embed, GO_ID, EOS_ID, max_length,
                num_symbol)
            # decoder_distribution: [batch_size, decoder_len, num_symbol],
            #   the predicted distribution before softmax
            # output_ids_ta: decoder_len entries of [batch_size]
            self.decoder_distribution, _, output_ids_ta = dynamic_rnn_decoder(
                decoder_cell, decoder_fn_inference, scope="decoder_rnn")

            output_len = tf.shape(self.decoder_distribution)[1]  # decoder_len
            self.output_ids = tf.transpose(
                output_ids_ta.gather(
                    tf.range(output_len)))  # [batch_size, decoder_len]

            # Clip the output ids to the range [0, num_symbol].
            self.word_ids = tf.cast(
                tf.clip_by_value(self.output_ids, 0, num_symbol),
                tf.int64)  # [batch_size, decoder_len]
            self.words = self.index2symbol.lookup(self.word_ids)

        self.global_step = tf.Variable(0, trainable=False, name="global_step")
        self.params = tf.global_variables()
        opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
        gradients = tf.gradients(self.loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        # Apply the clipped gradients; the original passed the raw
        # `gradients`, which left max_gradient_norm without any effect.
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)

        self.saver = tf.train.Saver(max_to_keep=3)
示例#19
0
    def __init__(self,
                 num_symbols,
                 num_embed_units,
                 num_units,
                 num_labels,
                 batch_size,
                 embed,
                 learning_rate=0.001,
                 max_gradient_norm=5.0,
                 learning_rate_decay_factor=0.9):
        """Build the graph of a two-text classifier driven by an attention loop.

        Both texts are encoded by one shared LSTM (`lstm_s`); a second LSTM
        (`lstm_r`) is stepped inside a `tf.while_loop` through
        `self.attention`, and its final hidden state is projected to
        `num_labels` logits.

        Args:
            num_symbols: number of rows of the embedding table; only used
                when `embed` is None.
            num_embed_units: embedding width.
            num_units: hidden size of both LSTM cells.
            num_labels: number of output classes.
            batch_size: static batch size used by the placeholders, the
                reshapes and the initial state of `lstm_r`.
            embed: initial value of the embedding table, or None to create a
                [num_symbols, num_embed_units] table with the default
                initializer.
            learning_rate: initial value of the learning-rate variable.
            max_gradient_norm: threshold passed to `tf.clip_by_global_norm`.
            learning_rate_decay_factor: factor applied each time
                `learning_rate_decay_op` is run.
        """
        # Inputs: two batches of token strings and their lengths.
        self.texts1 = tf.placeholder(tf.string, [batch_size, None],
                                     name='texts1')
        self.texts2 = tf.placeholder(tf.string, [batch_size, None],
                                     name='texts2')  # shape: batch*len
        self.texts_length1 = tf.placeholder(
            tf.int32, [batch_size], name='texts_length1')  # shape: batch
        self.texts_length2 = tf.placeholder(tf.int32, [batch_size],
                                            name='texts_length2')
        # Number of iterations of the attention loop below.
        self.max_length = tf.placeholder(tf.int32, name='max_length')
        self.labels = tf.placeholder(tf.int64, [batch_size],
                                     name='labels')  # shape: batch
        # Created here but not consumed anywhere in this constructor.
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        self.embed_units = num_embed_units
        self.num_units = num_units
        self.batch_size = batch_size
        self._initializer = tf.truncated_normal_initializer(stddev=0.1)
        # Vocabulary table (string -> index); unknown tokens map to UNK_ID.
        self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=UNK_ID,
                                             shared_name="in_table",
                                             name="in_table",
                                             checkpoint=True)
        # Training bookkeeping: learning rate, step and epoch counters.
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.index_input1 = self.symbol2index.lookup(self.texts1)  # batch*len
        self.index_input2 = self.symbol2index.lookup(self.texts2)
        # Element-wise longer of the two lengths, and the matching 0/1 mask.
        self.long_length = tf.maximum(self.texts_length1, self.texts_length2)
        # Debug output. The parenthesised single-argument form prints the
        # same text under Python 2 and is also valid Python 3 syntax.
        print(self.long_length.get_shape())
        self.mask_table = tf.sequence_mask(self.long_length, dtype=tf.float32)
        # Embedding table (index -> vector).
        if embed is None:
            # No pre-trained vectors: let TensorFlow initialize the table.
            self.embed = tf.get_variable('embed',
                                         [num_symbols, num_embed_units],
                                         tf.float32)
        else:
            # Start from the supplied pre-trained word vectors.
            self.embed = tf.get_variable('embed',
                                         dtype=tf.float32,
                                         initializer=embed)

        self.embed_input1 = tf.nn.embedding_lookup(
            self.embed, self.index_input1)  # batch*len*embed_unit
        self.embed_input2 = tf.nn.embedding_lookup(self.embed,
                                                   self.index_input2)

        with tf.variable_scope('lstm_s'):
            self.lstm_s = tf.contrib.rnn.LSTMCell(
                num_units=num_units,
                initializer=tf.orthogonal_initializer,
                forget_bias=0)

        with tf.variable_scope('lstm_r'):
            self.lstm_r = tf.contrib.rnn.LSTMCell(
                num_units=num_units,
                initializer=tf.orthogonal_initializer,
                forget_bias=0)

        # The same cell object and the same scope name are used for both
        # texts, so the two encoders are meant to share one set of weights.
        out_s1, state_s1 = dynamic_rnn(self.lstm_s,
                                       self.embed_input1,
                                       self.texts_length1,
                                       dtype=tf.float32,
                                       scope='rnn')
        out_s2, state_s2 = dynamic_rnn(self.lstm_s,
                                       self.embed_input2,
                                       self.texts_length2,
                                       dtype=tf.float32,
                                       scope='rnn')

        self.h_s1 = out_s1
        self.h_s2 = out_s2

        # Project every encoder output through W_s: flatten to 2-D for the
        # matmul, then restore the batch axis and move it last, giving
        # [len, num_units, batch].
        reshaped_s1 = tf.reshape(self.h_s1, [-1, self.num_units])
        reshaped_s2 = tf.reshape(self.h_s2, [-1, self.num_units])
        with tf.variable_scope('Attn_'):
            W_s = tf.get_variable(shape=[self.num_units, self.num_units],
                                  initializer=self._initializer,
                                  name='W_s')
        self.s_1 = tf.matmul(reshaped_s1, W_s)
        self.s_2 = tf.matmul(reshaped_s2, W_s)
        self.s_1 = tf.transpose(
            tf.reshape(self.s_1, [self.batch_size, -1, self.num_units]),
            [1, 2, 0])
        self.s_2 = tf.transpose(
            tf.reshape(self.s_2, [self.batch_size, -1, self.num_units]),
            [1, 2, 0])
        i = tf.constant(0)

        state_r = self.lstm_r.zero_state(batch_size=batch_size,
                                         dtype=tf.float32)

        def c(t, sr):
            # Keep looping while the step counter is below max_length.
            return tf.less(t, self.max_length)

        def b(t, sr):
            # self.attention is defined outside this block; it must return
            # the next (step, state) pair for tf.while_loop to accept it.
            return self.attention(t, sr)

        i, state_r = tf.while_loop(cond=c, body=b, loop_vars=(i, state_r))

        # Classifier on the hidden state left by the attention loop.
        with tf.variable_scope('fully_connect'):
            w_fc = tf.get_variable(shape=[self.num_units, num_labels],
                                   initializer=self._initializer,
                                   name='w_fc')
            b_fc = tf.get_variable(shape=[num_labels],
                                   initializer=self._initializer,
                                   name='b_fc')
        logits = tf.matmul(state_r.h, w_fc) + b_fc

        # self.loss is the summed cross-entropy over the batch; the
        # per-example mean is what the gradients are taken from.
        self.loss = tf.reduce_sum(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels,
                                                           logits=logits),
            name='loss')
        mean_loss = self.loss / \
            tf.cast(tf.shape(self.labels)[0], dtype=tf.float32)
        predict_labels = tf.argmax(logits, 1, 'predict_labels')
        # Number of correct predictions in the batch (a count, not a ratio).
        self.accuracy = tf.reduce_sum(tf.cast(
            tf.equal(self.labels, predict_labels), tf.int64),
                                      name='accuracy')

        self.params = tf.trainable_variables()
        # Debug output: list every variable created so far.
        for item in tf.global_variables():
            print(item)
        # Plain SGD on globally clipped gradients.
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        gradients = tf.gradients(mean_loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)

        # Summaries: the summed loss and a histogram per trainable variable.
        tf.summary.scalar('loss/step', self.loss)
        for each in tf.trainable_variables():
            tf.summary.histogram(each.name, each)

        self.merged_summary_op = tf.summary.merge_all()

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=3,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)
示例#20
0
    def __init__(self,
                 num_symbols,
                 num_embed_units,
                 num_units,
                 num_labels,
                 batch_size,
                 embed,
                 learning_rate=0.001,
                 max_gradient_norm=5.0
                 ):
        """Build the graph of a two-text classifier stepped by `self.attention`.

        Three LSTM states (two for `lstm_s`, one for `lstm_r`) are carried
        through a `tf.while_loop`; the final hidden state of `lstm_r` is
        projected to `num_labels` logits and trained with Adam.

        Args:
            num_symbols: number of rows of the embedding table; only used
                when `embed` is None.
            num_embed_units: embedding width, also used as the hidden size of
                both LSTM cells.
            num_units: accepted for interface compatibility; not used by the
                code in this constructor.
            num_labels: number of output classes.
            batch_size: static batch size for the text placeholders and the
                initial LSTM states.
            embed: initial value of the embedding table, or None to create a
                [num_symbols, num_embed_units] table with the default
                initializer.
            learning_rate: initial value of the learning-rate variable.
            max_gradient_norm: accepted for interface compatibility; no
                gradient clipping is performed here.
        """
        # Inputs: two batches of token strings, lengths and labels.
        self.texts1 = tf.placeholder(tf.string, [batch_size, None], name='texts1')
        self.texts2 = tf.placeholder(tf.string, [batch_size, None], name='texts2')  # shape: batch*len
        self.texts_length = tf.placeholder(tf.int32, [None], name='texts_length')  # shape: batch
        self.len = tf.constant(1.0, shape=[batch_size])
        self.labels = tf.placeholder(
            tf.int64, [None], name='labels')  # shape: batch
        # Created here but not consumed anywhere in this constructor.
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        self.embed_units = num_embed_units
        self.batch_size = batch_size
        self._initializer = tf.truncated_normal_initializer(stddev=0.1)
        # Vocabulary table (string -> index); unknown tokens map to UNK_ID.
        self.symbol2index = MutableHashTable(
            key_dtype=tf.string,
            value_dtype=tf.int64,
            default_value=UNK_ID,
            shared_name="in_table",
            name="in_table",
            checkpoint=True)
        # Training bookkeeping.
        self.learning_rate = tf.Variable(
            float(learning_rate), trainable=False, dtype=tf.float32)
        # NOTE(review): this variable is replaced by a second global_step
        # further down and is never incremented; it is kept so that the
        # auto-generated variable names stay the same.
        self.global_step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)

        self.index_input1 = self.symbol2index.lookup(self.texts1)   # batch*len
        self.index_input2 = self.symbol2index.lookup(self.texts2)

        # Python-side history lists, each seeded below with one zero state.
        self.h_s1 = []
        self.h_s2 = []
        self.h_r = []
        self.a1 = []
        self.a2 = []
        # Embedding table (index -> vector).
        if embed is None:
            # No pre-trained vectors: let TensorFlow initialize the table.
            self.embed = tf.get_variable(
                'embed', [num_symbols, num_embed_units], tf.float32)
        else:
            # Start from the supplied pre-trained word vectors.
            self.embed = tf.get_variable(
                'embed', dtype=tf.float32, initializer=embed)

        self.embed_input1 = tf.nn.embedding_lookup(
            self.embed, self.index_input1)  # batch*len*embed_unit
        self.embed_input2 = tf.nn.embedding_lookup(
            self.embed, self.index_input2)
        with tf.variable_scope('lstm_s'):
            self.lstm_s = rnn_cell.BasicLSTMCell(num_units=num_embed_units, forget_bias=0)
        with tf.variable_scope('lstm_r'):
            self.lstm_r = rnn_cell.BasicLSTMCell(num_units=num_embed_units, forget_bias=0)

        self.h_s1.append(self.lstm_s.zero_state(batch_size=batch_size, dtype=tf.float32))
        self.h_s2.append(self.lstm_s.zero_state(batch_size=batch_size, dtype=tf.float32))
        # NOTE(review): h_r is seeded from lstm_s rather than lstm_r; both
        # cells are built with the same num_units, so the zero state has the
        # same shape either way.
        self.h_r.append(self.lstm_s.zero_state(batch_size=batch_size, dtype=tf.float32))
        self.a1.append(self.lstm_r.zero_state(batch_size=batch_size, dtype=tf.float32))
        self.a2.append(self.lstm_r.zero_state(batch_size=batch_size, dtype=tf.float32))

        # Output projection applied to the final hidden state of lstm_r.
        W = tf.Variable(self._initializer(shape=[num_embed_units, num_labels],dtype=tf.float32))
        bias = tf.Variable(tf.constant(0.0, shape=[num_labels]), dtype=tf.float32)

        i = tf.constant(1, dtype=tf.int64)
        # Debug output. The parenthesised single-argument form prints the
        # same text under Python 2 and is also valid Python 3 syntax.
        print(self.index_input1[1].get_shape())
        # self._length is defined outside this block. The loop bound is
        # derived from row 1 of the first input only - TODO confirm that
        # this is intended.
        length = self._length(self.index_input1[1])
        self.ind = 1
        state_s1 = self.lstm_s.zero_state(batch_size=batch_size, dtype=tf.float32)
        state_s2 = self.lstm_s.zero_state(batch_size=batch_size, dtype=tf.float32)
        state_r = self.lstm_r.zero_state(batch_size=batch_size, dtype=tf.float32)

        def c(t, s1, s2, sr):
            # The counter starts at 1, so the loop runs while t <= length.
            return tf.less(t, length+1)

        def b(t, s1, s2, sr):
            # self.attention is defined outside this block; it must return
            # the next (t, s1, s2, sr) tuple for tf.while_loop to accept it.
            return self.attention(t, s1, s2, sr)

        i, state_s1, state_s2, state_r = tf.while_loop(cond=c, body=b, loop_vars=(i, state_s1, state_s2, state_r))

        logits = tf.matmul(state_r.h, W) + bias

        # self.loss is the summed cross-entropy over the batch; the
        # per-example mean is what the optimizer minimizes.
        self.loss = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.labels, logits=logits), name='loss')
        mean_loss = self.loss / \
            tf.cast(tf.shape(self.labels)[0], dtype=tf.float32)
        predict_labels = tf.argmax(logits, 1, 'predict_labels')
        # Number of correct predictions in the batch (a count, not a ratio).
        self.accuracy = tf.reduce_sum(
            tf.cast(tf.equal(self.labels, predict_labels), tf.int32), name='accuracy')

        self.params = tf.trainable_variables()
        # This second global_step is the one Adam actually increments.
        self.global_step = tf.Variable(0, trainable=False)
        self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(mean_loss, global_step=self.global_step,
                                                                            var_list=self.params)
        # Summaries: the summed loss and a histogram per trainable variable.
        tf.summary.scalar('loss/step', self.loss)
        for each in tf.trainable_variables():
            tf.summary.histogram(each.name, each)

        self.merged_summary_op = tf.summary.merge_all()

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=3, pad_step_number=True, keep_checkpoint_every_n_hours=1.0)
示例#21
0
    def __init__(
        self,
        embed,  # word embeddings [VOCABULARY_COUNT * 200]
        vocabulary,  # vocabulary [1 * VOCABULARY_COUNT]
        vocabulary_count,  # number of words in the vocabulary
        num_layers,  # number of encoder and decoder layers
        num_units,  # hidden-state size of the encoder and decoder
        learning_rate,
        max_gradient_norm,
        max_len,  # maximum decoding length
    ):
        """Build the graph of an LSTM encoder-decoder with a training and an
        inference decoder that share variables.

        Args:
            embed: initial value of the word-embedding variable.
            vocabulary: list of vocabulary strings, stored as a constant.
            vocabulary_count: vocabulary size passed to
                `get_project_funtion`.
            num_layers: number of stacked LSTM layers in each of the encoder
                and the decoder.
            num_units: hidden size of every LSTM layer.
            learning_rate: learning rate given to the Adam optimizer.
            max_gradient_norm: threshold passed to `tf.clip_by_global_norm`.
            max_len: maximum length handed to the inference decoder.
        """
        # Inputs fed through feed_dict for every batch.
        self.post_string = tf.placeholder(
            dtype=tf.string, shape=(None, None),
            name="post_string")  # padded posts, batch_size * encoder_len
        self.response_string = tf.placeholder(
            dtype=tf.string, shape=(None, None), name="response_string"
        )  # padded responses, batch_size * decoder_len
        self.label_string = tf.placeholder(
            dtype=tf.string, shape=(None, None),
            name="label_string")  # batch_size * decoder_len
        self.post_len = tf.placeholder(
            dtype=tf.int32, shape=(None, ),
            name="post_len")  # length of each post before padding, batch_size
        # NOTE: the op name "reponse_len" (sic) is kept as is, because
        # external code may look the tensor up by that name.
        self.response_len = tf.placeholder(
            dtype=tf.int32, shape=(None, ),
            name="reponse_len")  # length of each response before padding

        # Word embeddings, trained as a variable, VOCABULARY_COUNT * 200.
        self.embed = tf.get_variable(
            dtype=tf.float32, initializer=embed,
            name="embed")
        self.vocabulary = tf.constant(vocabulary,
                                      dtype=tf.string)  # VOCABULARY_COUNT

        self.batch_size = tf.shape(self.post_string)[0]
        self.encoder_len = tf.shape(self.post_string)[1]
        self.decoder_len = tf.shape(self.response_string)[1]

        # The mask is a 0/1 matrix of shape [batch_size, decoder_len] in
        # which 1 marks real tokens and 0 marks padding. It is built by
        # one-hot encoding index response_len - 1 and then taking a reversed
        # cumulative sum along the time axis, which fills every position up
        # to and including that index with 1. Example with
        # response_len = [3, 4, 5] and decoder_len = 5:
        #   one_hot = [[0. 0. 1. 0. 0.]
        #              [0. 0. 0. 1. 0.]
        #              [0. 0. 0. 0. 1.]]
        #   cumsum  = [[1. 1. 1. 0. 0.]
        #              [1. 1. 1. 1. 0.]
        #              [1. 1. 1. 1. 1.]]
        self.mask = tf.cumsum(tf.one_hot(self.response_len - 1,
                                         self.decoder_len),
                              axis=1,
                              reverse=True)

        # Table mapping token strings (key) to ids (value); default id is 1.
        self.string_to_id = MutableHashTable(
            key_dtype=tf.string,  # key type
            value_dtype=tf.int64,  # value type
            default_value=1,  # value returned for unknown keys
            shared_name="string_to_id",  # shared-resource name of the table
            name="string_to_id",  # op name
            checkpoint=True)  # save and restore the table with checkpoints

        # Table mapping ids back to token strings; default is "_NDW".
        self.id_to_string = MutableHashTable(key_dtype=tf.int64,
                                             value_dtype=tf.string,
                                             default_value="_NDW",
                                             shared_name="id_to_string",
                                             name="id_to_string",
                                             checkpoint=True)

        # Convert posts, responses and labels to ids.
        self.post_id = self.string_to_id.lookup(
            self.post_string)  # batch_size * encoder_len
        self.response_id = self.string_to_id.lookup(
            self.response_string)  # batch_size * decoder_len
        self.label_id = self.string_to_id.lookup(
            self.label_string)  # batch_size * decoder_len

        # Convert ids to embedding vectors. The lookups go through the
        # self.embed variable, not the raw `embed` argument: looking up the
        # raw initial value bypasses the variable, so the embedding would
        # receive no gradient from the loss while the inference decoder
        # below reads self.embed.
        self.post_embed = tf.nn.embedding_lookup(
            self.embed, self.post_id)  # batch_size * encoder_len * embed_size
        self.response_embed = tf.nn.embedding_lookup(
            self.embed,
            self.response_id)  # batch_size * decoder_len * embed_size

        # Encoder and decoder are each a stack of num_layers LSTM cells.
        encoder_cell = MultiRNNCell(
            [LSTMCell(num_units) for _ in range(num_layers)])
        decoder_cell = MultiRNNCell(
            [LSTMCell(num_units) for _ in range(num_layers)])

        # Helpers defined outside this block: output projection, loss and
        # inference-time id selection.
        projection_fn, loss_fn, inference_fn = get_project_funtion(
            vocabulary_count)

        # Encoder; its variables live under the "encoder/" scope.
        with tf.variable_scope("encoder"):
            self.encoder_output, self.encoder_state = tf.nn.dynamic_rnn(
                encoder_cell,  # RNN cell
                self.post_embed,  # batch_size * encoder_len * embed_size
                self.post_len,  # valid length of each post, batch_size
                dtype=tf.float32)
            # encoder_output: [batch_size, encoder_len, num_units], one
            # output per sample and time step.
            # encoder_state: one LSTMStateTuple(c, h) per layer, each of c
            # and h being [batch_size, num_units].

        # Training-time decoder.
        with tf.variable_scope("decoder"):
            self.decoder_output, self.decoder_state, self.loop_state = dynamic_decoder(
                decoder_cell,
                encoder_state=self.encoder_state,
                input=self.response_embed,
                response_len=self.response_len)

            self.loss, self.avg_loss = loss_fn(self.decoder_output,
                                               self.label_id, self.mask)

        # Inference-time decoder. reuse=True makes every tf.get_variable
        # call in this scope return the variables created by the training
        # decoder above instead of creating new ones.
        with tf.variable_scope("decoder", reuse=True):
            self.inference_output, self.inference_state, self.inference_loop_state = dynamic_decoder(
                decoder_cell,
                encoder_state=self.encoder_state,
                projection_function=projection_fn,
                embed=self.embed,
                max_len=max_len)

            self.inference_maximum_likelihood_id = inference_fn(
                self.inference_output)  # [batch_size decoder_len]
            self.inference_string = self.id_to_string.lookup(
                self.inference_maximum_likelihood_id
            )  # [batch_size decoder_len]

        self.global_step = tf.Variable(0, trainable=False, name="global_step")
        # NOTE(review): this takes every global variable, so it includes the
        # non-trainable global_step; variables for which tf.gradients
        # returns None are expected to be skipped by apply_gradients -
        # confirm against the TensorFlow version in use.
        self.params = tf.global_variables()
        # Adam optimizer.
        opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
        # Gradients of the loss with respect to params; len(gradients)
        # equals len(params).
        gradients = tf.gradients(self.loss, self.params)
        # Clip the gradients by global norm.
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        # Op that applies the clipped gradients and increments global_step.
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)

        self.saver = tf.train.Saver()
示例#22
0
文件: model.py 项目: jcnlp/dssm-lstm
    def __init__(self,
                 num_lstm_units,
                 embed,
                 neg_num=4,
                 gradient_clip_threshold=5.0):
        """Assemble the query/document similarity graph and its training ops.

        A query and `neg_num + 1` candidate documents are each encoded by an
        LSTM. The cosine similarity between the query state and every
        document state is scaled by a learned factor and soft-maxed across
        the documents; the loss is the negative mean log-probability of the
        document in slot 0.

        Args:
            num_lstm_units: size passed to both `SimpleLSTMCell` instances.
            embed: initial value of the embedding variable.
            neg_num: number of documents per query in addition to slot 0.
            gradient_clip_threshold: threshold for `tf.clip_by_global_norm`.
        """
        doc_count = neg_num + 1

        # Inputs.
        self.queries = tf.placeholder(tf.string, [None, None])  # batch*len
        self.queries_length = tf.placeholder(tf.int32, [None])  # batch
        self.docs = tf.placeholder(
            tf.string, [doc_count, None, None])  # (neg_num + 1)*batch*len
        self.docs_length = tf.placeholder(
            tf.int32, [doc_count, None])  # (neg_num + 1)*batch

        # Token string -> vocabulary index; unknown tokens map to UNK_ID.
        table_name = "in_table"
        self.word2index = MutableHashTable(key_dtype=tf.string,
                                           value_dtype=tf.int64,
                                           default_value=UNK_ID,
                                           shared_name=table_name,
                                           name=table_name,
                                           checkpoint=True)

        # Non-trainable training state.
        self.learning_rate = tf.Variable(0.001,
                                         trainable=False,
                                         dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)
        self.momentum = tf.Variable(0.9, trainable=False, dtype=tf.float32)

        # Strings -> indices, one tensor per document slot.
        self.index_queries = self.word2index.lookup(self.queries)  # batch*len
        doc_indices = []
        for doc_strings in tf.unstack(self.docs):
            doc_indices.append(self.word2index.lookup(doc_strings))
        self.index_docs = doc_indices

        # Indices -> embedding vectors.
        self.embed = tf.get_variable('embed',
                                     dtype=tf.float32,
                                     initializer=embed)
        self.embed_queries = tf.nn.embedding_lookup(self.embed,
                                                    self.index_queries)
        doc_embeddings = []
        for doc_index in self.index_docs:
            doc_embeddings.append(
                tf.nn.embedding_lookup(self.embed, doc_index))
        self.embed_docs = doc_embeddings

        # One cell for queries and one shared by every document slot.
        with tf.variable_scope('query_lstm'):
            self.cell_q = SimpleLSTMCell(num_lstm_units)
        with tf.variable_scope('doc_lstm'):
            self.cell_d = SimpleLSTMCell(num_lstm_units)

        # Element [1] of the final state is kept as the sequence encoding.
        _, query_final = dynamic_rnn(self.cell_q,
                                     self.embed_queries,
                                     self.queries_length,
                                     dtype=tf.float32,
                                     scope="simple_lstm_cell_query")
        self.states_q = query_final[1]  # shape: batch*num_units

        doc_states = []
        for slot, doc_embedding in enumerate(self.embed_docs):
            slot_lengths = self.docs_length[slot]
            _, doc_final = dynamic_rnn(self.cell_d,
                                       doc_embedding,
                                       slot_lengths,
                                       dtype=tf.float32,
                                       scope="simple_lstm_cell_doc")
            doc_states.append(doc_final[1])
        self.states_d = doc_states  # shape: (neg_num + 1)*batch*num_units

        def _row_norm(states):
            # Euclidean norm of every row.
            squared = tf.square(states)
            return tf.sqrt(tf.reduce_sum(squared, axis=1))

        self.queries_norm = _row_norm(self.states_q)
        self.docs_norm = [_row_norm(state) for state in self.states_d]
        self.prods = [
            tf.reduce_sum(tf.multiply(self.states_q, state), axis=1)
            for state in self.states_d
        ]

        # Cosine similarity per document slot, stacked into one tensor.
        cosines = []
        for dot, doc_norm in zip(self.prods, self.docs_norm):
            denominator = self.queries_norm * doc_norm
            cosines.append(dot / denominator)
        stacked = tf.convert_to_tensor(cosines)  # (neg_num + 1)*batch

        self.gamma = tf.Variable(
            initial_value=1.0, expected_shape=[],
            dtype=tf.float32)  # scaling factor according to the paper
        self.sims = stacked * self.gamma
        # Softmax across the document slots.
        self.prob = tf.nn.softmax(self.sims,
                                  dim=0)  # shape: (neg_num + 1)*batch
        first_slot = self.prob[0]
        self.hit_prob = tf.transpose(first_slot)

        log_hit = tf.log(self.hit_prob)
        self.loss = -tf.reduce_mean(log_hit)

        # Momentum SGD (Nesterov, according to the paper) on globally
        # clipped gradients.
        self.params = tf.trainable_variables()
        optimizer = tf.train.MomentumOptimizer(
            learning_rate=self.learning_rate,
            momentum=self.momentum,
            use_nesterov=True)
        raw_grads = tf.gradients(self.loss, self.params)
        clipped, self.gradient_norm = tf.clip_by_global_norm(
            raw_grads, gradient_clip_threshold)
        grads_and_vars = zip(clipped, self.params)
        self.update = optimizer.apply_gradients(grads_and_vars,
                                                global_step=self.global_step)
        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=3,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)
示例#23
0
class RNN(object):
    """Sentence-pair classifier built from two LSTM cells (TensorFlow 1.x graph).

    The constructor builds the whole graph: string placeholders for two token
    sequences, a string-to-index lookup table, a shared embedding, a sentence
    cell (``lstm_s``) applied to both inputs, a second cell (``lstm_r``) whose
    final hidden state is projected to ``num_labels`` logits, an Adam training
    op, summaries and a saver.

    Args:
        num_symbols: vocabulary size; only used when ``embed`` is None.
        num_embed_units: embedding width, also used as the size of both LSTM
            cells.
        num_units: accepted for interface compatibility; not used by the
            graph.
        num_labels: number of output classes.
        batch_size: static batch size baked into the text placeholders and
            the LSTM zero states.
        embed: initializer holding pre-trained word vectors, or None to create
            a ``[num_symbols, num_embed_units]`` table with the default
            initializer.
        learning_rate: initial value of the non-trainable learning-rate
            variable.
        max_gradient_norm: accepted for interface compatibility; not used by
            the graph.
    """

    def __init__(self,
                 num_symbols,
                 num_embed_units,
                 num_units,
                 num_labels,
                 batch_size,
                 embed,
                 learning_rate=0.001,
                 max_gradient_norm=5.0
                 ):
        # Inputs: two batches of token strings, their lengths and the labels.
        self.texts1 = tf.placeholder(tf.string, [batch_size, None], name='texts1')
        self.texts2 = tf.placeholder(tf.string, [batch_size, None], name='texts2')  # shape: batch*len
        self.texts_length = tf.placeholder(tf.int32, [None], name='texts_length')  # shape: batch
        self.len = tf.constant(1.0, shape=[batch_size])
        self.labels = tf.placeholder(
            tf.int64, [None], name='labels')  # shape: batch
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        self.embed_units = num_embed_units
        self.batch_size = batch_size
        self._initializer = tf.truncated_normal_initializer(stddev=0.1)

        # Vocabulary table (string to index); unknown tokens map to UNK_ID.
        self.symbol2index = MutableHashTable(
            key_dtype=tf.string,
            value_dtype=tf.int64,
            default_value=UNK_ID,
            shared_name="in_table",
            name="in_table",
            checkpoint=True)

        # Training bookkeeping variables (none of them trainable).
        self.learning_rate = tf.Variable(
            float(learning_rate), trainable=False, dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)

        self.index_input1 = self.symbol2index.lookup(self.texts1)   # batch*len
        self.index_input2 = self.symbol2index.lookup(self.texts2)

        # Python-side histories of the states produced while the loop bodies
        # are being built; entry 0 of each list is a zero state (see below).
        self.h_s1 = []
        self.h_s2 = []
        self.h_r = []
        self.a1 = []
        self.a2 = []

        # Embedding table (index to vector).
        if embed is None:
            # initialize the embedding randomly
            self.embed = tf.get_variable(
                'embed', [num_symbols, num_embed_units], tf.float32)
        else:
            # initialize the embedding by pre-trained word vectors
            self.embed = tf.get_variable(
                'embed', dtype=tf.float32, initializer=embed)

        self.embed_input1 = tf.nn.embedding_lookup(
            self.embed, self.index_input1)  # batch*len*embed_unit
        self.embed_input2 = tf.nn.embedding_lookup(
            self.embed, self.index_input2)

        with tf.variable_scope('lstm_s'):
            self.lstm_s = rnn_cell.BasicLSTMCell(num_units=num_embed_units, forget_bias=0)
        with tf.variable_scope('lstm_r'):
            self.lstm_r = rnn_cell.BasicLSTMCell(num_units=num_embed_units, forget_bias=0)

        # Seed every history with a zero state.  h_r is seeded from lstm_s,
        # exactly as before; both cells are built with the same num_units.
        self.h_s1.append(self.lstm_s.zero_state(batch_size=batch_size, dtype=tf.float32))
        self.h_s2.append(self.lstm_s.zero_state(batch_size=batch_size, dtype=tf.float32))
        self.h_r.append(self.lstm_s.zero_state(batch_size=batch_size, dtype=tf.float32))
        self.a1.append(self.lstm_r.zero_state(batch_size=batch_size, dtype=tf.float32))
        self.a2.append(self.lstm_r.zero_state(batch_size=batch_size, dtype=tf.float32))

        # Output projection from the final lstm_r hidden state to the labels.
        W = tf.Variable(self._initializer(shape=[num_embed_units, num_labels], dtype=tf.float32))
        bias = tf.Variable(tf.constant(0.0, shape=[num_labels]), dtype=tf.float32)

        i = tf.constant(1, dtype=tf.int64)
        # Debug output.  Written as a function call so the module also parses
        # on Python 3 (the original used the Python 2 print statement).
        print(self.index_input1[1].get_shape())
        # NOTE(review): the loop bound is derived from batch row 1 only, not
        # from texts_length -- confirm this is intended.
        length = self._length(self.index_input1[1])
        self.ind = 1
        state_s1 = self.lstm_s.zero_state(batch_size=batch_size, dtype=tf.float32)
        state_s2 = self.lstm_s.zero_state(batch_size=batch_size, dtype=tf.float32)
        state_r = self.lstm_r.zero_state(batch_size=batch_size, dtype=tf.float32)

        def keep_stepping(t, s1, s2, sr):
            # Steps are 1-based, so run while t <= length.
            return tf.less(t, length + 1)

        i, state_s1, state_s2, state_r = tf.while_loop(
            cond=keep_stepping,
            body=self.attention,
            loop_vars=(i, state_s1, state_s2, state_r))

        logits = tf.matmul(state_r.h, W) + bias

        # Summed cross-entropy is exposed as self.loss; the optimizer
        # minimises the per-example mean.
        self.loss = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.labels, logits=logits), name='loss')
        mean_loss = self.loss / \
            tf.cast(tf.shape(self.labels)[0], dtype=tf.float32)
        predict_labels = tf.argmax(logits, 1, 'predict_labels')
        # Number of correct predictions in the batch (a count, not a ratio).
        self.accuracy = tf.reduce_sum(
            tf.cast(tf.equal(self.labels, predict_labels), tf.int32), name='accuracy')

        self.params = tf.trainable_variables()

        # NOTE(review): global_step is created a second time here, so the
        # variable made above is never incremented.  Kept as-is so variable
        # names in existing checkpoints do not shift.
        self.global_step = tf.Variable(0, trainable=False)
        self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(
            mean_loss, global_step=self.global_step, var_list=self.params)
        tf.summary.scalar('loss/step', self.loss)
        for each in tf.trainable_variables():
            tf.summary.histogram(each.name, each)

        self.merged_summary_op = tf.summary.merge_all()

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=3, pad_step_number=True, keep_checkpoint_every_n_hours=1.0)

    def attention(self, t, s1, s2, sr):
        """Body of the outer ``tf.while_loop``: advance every cell by one step.

        Args:
            t: int64 scalar tensor holding the 1-based time step.
            s1: state of ``lstm_s`` for the first input.
            s2: state of ``lstm_s`` for the second input.
            sr: state of ``lstm_r``.

        Returns:
            The tuple ``(t + 1, state_s1, state_s2, state_r)``.
        """
        # Each sentence cell sees the current token embedding concatenated
        # with the previous lstm_r hidden state.
        s1_t = tf.concat([self.embed_input1[:, t - 1], sr.h], 1)
        s2_t = tf.concat([self.embed_input2[:, t - 1], sr.h], 1)
        # NOTE(review): self.ind is a Python int, so this index is fixed when
        # the body is built; only entry 0 of a1/a2 (a zero state) has `.h`.
        r_t = tf.concat([self.a1[self.ind - 1].h, self.a2[self.ind - 1].h], 1)

        with tf.variable_scope('lstm_s'):
            out_s1, state_s1 = self.lstm_s(inputs=s1_t, state=s1)
            out_s2, state_s2 = self.lstm_s(inputs=s2_t, state=s2)
        with tf.variable_scope('lstm_r'):
            out_r, state_r = self.lstm_r(inputs=r_t, state=sr)

        self.h_s1.append(state_s1)
        self.h_s2.append(state_s2)
        self.h_r.append(state_r)

        a1t = tf.constant(0.0, shape=[self.batch_size, self.embed_units], dtype=tf.float32)
        a2t = tf.constant(0.0, shape=[self.batch_size, self.embed_units], dtype=tf.float32)

        def before_current_step(j, t, a1tj, a2tj):
            return tf.less(j, t)

        k = tf.constant(1, dtype=tf.int64)
        self.j = 1
        # Inner loop: accumulate weighted earlier states for j = 1 .. t - 1.
        k, q, a1t, a2t = tf.while_loop(
            cond=before_current_step,
            body=self.match,
            loop_vars=[k, t, a1t, a2t],
            shape_invariants=None)

        self.a1.append(a1t)
        self.a2.append(a2t)

        t = tf.add(t, 1)
        self.ind += 1
        return t, state_s1, state_s2, state_r

    def match(self, j, t, a1tj, a2tj):
        """Body of the inner ``tf.while_loop``: add one weighted state to each sum.

        Args:
            j: int64 scalar tensor, the inner loop counter.
            t: outer time step, passed through unchanged.
            a1tj: running accumulator for the first input.
            a2tj: running accumulator for the second input.

        Returns:
            The tuple ``(j + 1, t, a1tj, a2tj)`` with both accumulators
            updated.
        """
        with tf.variable_scope('Attn_'):
            W_s = tf.get_variable(shape=[self.embed_units, self.embed_units],
                                  initializer=self._initializer, name='W_s')
            W_o = tf.get_variable(shape=[self.embed_units, self.embed_units],
                                  initializer=self._initializer, name='W_o')
            W_e = tf.get_variable(shape=[self.embed_units, 1],
                                  initializer=self._initializer, name='W_e')
            W_a = tf.get_variable(shape=[self.embed_units, self.embed_units],
                                  initializer=self._initializer, name='W_a')

        # NOTE(review): the histories are indexed with the Python ints self.j
        # and self.ind rather than the tensors j and t, and the two scores mix
        # `h * W` with `W * h^T` operand orders -- confirm both are intended.
        e1_tj = tf.matmul(tf.tanh(tf.matmul(self.h_s1[self.j].h, W_s) +
                                  tf.matmul(W_o, self.h_s2[self.ind].h, transpose_b=True) +
                                  tf.matmul(W_a, self.h_r[self.ind - 1].h, transpose_b=True)), W_e)
        e2_tj = tf.matmul(tf.tanh(tf.matmul(W_s, self.h_s2[self.j].h, transpose_b=True) +
                                  tf.matmul(W_o, self.h_s1[self.ind].h, transpose_b=True) +
                                  tf.matmul(W_a, self.h_r[self.ind - 1].h, transpose_b=True)), W_e)

        alpha1_tj = tf.reshape(tf.nn.softmax(e1_tj, dim=1), [-1])
        alpha2_tj = tf.reshape(tf.nn.softmax(e2_tj, dim=1), [-1])

        a1tj = tf.add(a1tj, tf.transpose(self.h_s1[self.j].h) * alpha1_tj)
        a2tj = tf.add(a2tj, tf.transpose(self.h_s2[self.j].h) * alpha2_tj)
        j = tf.add(j, 1)
        self.j += 1
        return j, t, a1tj, a2tj

    def _length(self, sequence):
        """Return the number of non-zero entries along the last axis."""
        mask = tf.sign(tf.abs(sequence))
        length = tf.reduce_sum(mask, axis=-1)
        return length

    def print_parameters(self):
        """Print the name and static shape of every entry in ``self.params``."""
        for item in self.params:
            print('%s: %s' % (item.name, item.get_shape()))

    def train_step(self, session, data, summary=False):
        """Run one optimisation step.

        Args:
            session: object with a ``run(fetches, feed_dict)`` method.
            data: mapping with the keys ``'texts1'``, ``'texts2'``,
                ``'texts_length'``, ``'labels'`` and ``'keep_prob'``.
            summary: when true, the merged summary op is fetched as well.

        Returns:
            Whatever ``session.run`` returns for the fetches
            ``[loss, accuracy, train_op]`` (plus the summary when requested).
        """
        input_feed = {self.texts1: data['texts1'],
                      self.texts2: data['texts2'],
                      self.texts_length: data['texts_length'],
                      self.labels: data['labels'],
                      self.keep_prob: data['keep_prob']}
        output_feed = [self.loss, self.accuracy, self.train_op]
        if summary:
            output_feed.append(self.merged_summary_op)
        return session.run(output_feed, input_feed)
示例#24
0
class LSTMDSSM(object):
    """LSTM-DSSM sentence matching model (TensorFlow 1.x graph mode).

    Follows the paper "Deep Sentence Embedding Using Long Short-Term Memory
    Networks: Analysis and Application to Information Retrieval", available
    at https://arxiv.org/abs/1502.06922.
    """

    def __init__(self,
                 num_lstm_units,
                 embed,
                 neg_num=4,
                 gradient_clip_threshold=5.0):
        """Build placeholders, both LSTM encoders, the loss and the update op.

        Args:
            num_lstm_units: size passed to each ``SimpleLSTMCell``.
            embed: initializer for the ``'embed'`` variable.
            neg_num: the document placeholders get ``neg_num + 1`` slots on
                their leading axis.
            gradient_clip_threshold: global-norm clipping threshold.
        """
        num_docs = neg_num + 1

        # Inputs.
        self.queries = tf.placeholder(dtype=tf.string, shape=[None, None])  # shape: batch*len
        self.queries_length = tf.placeholder(dtype=tf.int32, shape=[None])  # shape: batch
        self.docs = tf.placeholder(dtype=tf.string, shape=[num_docs, None, None])  # shape: (neg_num + 1)*batch*len
        self.docs_length = tf.placeholder(dtype=tf.int32, shape=[num_docs, None])  # shape: (neg_num + 1)*batch

        # Word-to-index table; unknown words map to UNK_ID.
        self.word2index = MutableHashTable(
            key_dtype=tf.string,
            value_dtype=tf.int64,
            default_value=UNK_ID,
            shared_name="in_table",
            name="in_table",
            checkpoint=True
        )

        # Non-trainable training state.
        self.learning_rate = tf.Variable(0.001, trainable=False, dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)
        self.momentum = tf.Variable(0.9, trainable=False, dtype=tf.float32)

        # Token ids, then embeddings, for the query and every document slot.
        self.index_queries = self.word2index.lookup(self.queries)  # batch*len
        self.index_docs = [
            self.word2index.lookup(doc_slot)
            for doc_slot in tf.unstack(self.docs)
        ]

        self.embed = tf.get_variable('embed', dtype=tf.float32, initializer=embed)
        self.embed_queries = tf.nn.embedding_lookup(self.embed, self.index_queries)
        self.embed_docs = [
            tf.nn.embedding_lookup(self.embed, doc_ids)
            for doc_ids in self.index_docs
        ]

        with tf.variable_scope('query_lstm'):
            self.cell_q = SimpleLSTMCell(num_lstm_units)
        with tf.variable_scope('doc_lstm'):
            self.cell_d = SimpleLSTMCell(num_lstm_units)

        # Element [1][1] of the dynamic_rnn result is used as the encoding.
        query_rnn = dynamic_rnn(self.cell_q, self.embed_queries,
                                self.queries_length, dtype=tf.float32,
                                scope="simple_lstm_cell_query")
        self.states_q = query_rnn[1][1]  # shape: batch*num_units
        self.states_d = [
            dynamic_rnn(self.cell_d, embedded_doc, self.docs_length[slot],
                        dtype=tf.float32,
                        scope="simple_lstm_cell_doc")[1][1]
            for slot, embedded_doc in enumerate(self.embed_docs)
        ]  # shape: (neg_num + 1)*batch*num_units

        # Cosine similarity between the query and each document slot.
        self.queries_norm = tf.sqrt(tf.reduce_sum(tf.square(self.states_q), axis=1))
        self.docs_norm = [
            tf.sqrt(tf.reduce_sum(tf.square(doc_state), axis=1))
            for doc_state in self.states_d
        ]
        self.prods = [
            tf.reduce_sum(tf.multiply(self.states_q, doc_state), axis=1)
            for doc_state in self.states_d
        ]
        self.sims = [
            dot / (self.queries_norm * doc_norm)
            for dot, doc_norm in zip(self.prods, self.docs_norm)
        ]  # shape: (neg_num + 1)*batch
        self.sims = tf.convert_to_tensor(self.sims)

        # Scaling factor according to the paper; the unscaled similarities
        # are kept separately for evaluation.
        self.gamma = tf.Variable(initial_value=1.0, expected_shape=[], dtype=tf.float32)
        self.origin_sims = self.sims
        self.sims = self.sims * self.gamma
        self.prob = tf.nn.softmax(self.sims, dim=0)  # shape: (neg_num + 1)*batch
        self.hit_prob = tf.transpose(self.prob[0])

        # Negative mean log-probability of slot 0.
        self.loss = -tf.reduce_mean(tf.log(self.hit_prob))

        self.params = tf.trainable_variables()
        # Nesterov momentum, according to the paper.
        optimizer = tf.train.MomentumOptimizer(
            learning_rate=self.learning_rate,
            momentum=self.momentum,
            use_nesterov=True)
        raw_gradients = tf.gradients(self.loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            raw_gradients, gradient_clip_threshold)
        self.update = optimizer.apply_gradients(
            zip(clipped_gradients, self.params),
            global_step=self.global_step)
        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=3, pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)

    def print_parameters(self):
        """Print ``name: shape`` for every entry of ``self.params``."""
        for variable in self.params:
            print('%s: %s' % (variable.name, variable.get_shape()))

    def train_step(self, session, queries, docs):
        """Run one update and return the loss plus the intermediate tensors.

        ``queries`` and ``docs`` are mappings with the keys ``'texts'`` and
        ``'texts_length'``.
        """
        feed = {
            self.queries: queries['texts'],
            self.queries_length: queries['texts_length'],
            self.docs: docs['texts'],
            self.docs_length: docs['texts_length'],
        }
        fetches = [
            self.loss, self.update,
            self.states_q, self.states_d,
            self.queries_norm, self.docs_norm,
            self.prods, self.sims, self.prob, self.hit_prob,
        ]
        return session.run(fetches, feed)

    def test_step(self, session, queries, docs, ground_truths):
        """Return the mean log loss of slot-0 similarities against 0/1 labels.

        The unscaled similarity of document slot 0 is mapped through
        ``(x + 1) / 2`` and clipped to ``[1e-15, 1 - 1e-15]`` before the
        logarithm is taken.
        """
        feed = {
            self.queries: queries['texts'],
            self.queries_length: queries['texts_length'],
            self.docs: docs['texts'],
            self.docs_length: docs['texts_length'],
        }
        fetched = session.run([self.origin_sims], feed)
        scores = (fetched[0][0] + 1) / 2

        count = len(ground_truths)
        log_likelihood = 0
        for idx in range(count):
            clipped = min(max(scores[idx], 1e-15), 1 - 1e-15)
            likelihood = 1 - clipped if ground_truths[idx] == 0 else clipped
            log_likelihood += math.log(likelihood)
        return -log_likelihood / count
示例#25
0
文件: model.py 项目: jcnlp/dssm-lstm
class LSTMDSSM(object):
    """LSTM-DSSM sentence matching model (TensorFlow 1.x graph mode).

    Refers to the paper "Deep Sentence Embedding Using Long Short-Term Memory
    Networks: Analysis and Application to Information Retrieval", available
    at https://arxiv.org/abs/1502.06922.

    Args:
        num_lstm_units: size passed to each ``SimpleLSTMCell``.
        embed: initializer for the ``'embed'`` variable.
        neg_num: the document placeholders get ``neg_num + 1`` slots on their
            leading axis.
        gradient_clip_threshold: global-norm clipping threshold.
    """
    def __init__(self,
                 num_lstm_units,
                 embed,
                 neg_num=4,
                 gradient_clip_threshold=5.0):
        self.queries = tf.placeholder(dtype=tf.string,
                                      shape=[None, None])  # shape: batch*len
        self.queries_length = tf.placeholder(dtype=tf.int32,
                                             shape=[None])  # shape: batch
        self.docs = tf.placeholder(dtype=tf.string,
                                   shape=[neg_num + 1, None, None
                                          ])  # shape: (neg_num + 1)*batch*len
        self.docs_length = tf.placeholder(
            dtype=tf.int32, shape=[neg_num + 1,
                                   None])  # shape: (neg_num + 1)*batch

        # Word-to-index table; unknown words map to UNK_ID.
        self.word2index = MutableHashTable(key_dtype=tf.string,
                                           value_dtype=tf.int64,
                                           default_value=UNK_ID,
                                           shared_name="in_table",
                                           name="in_table",
                                           checkpoint=True)

        # Non-trainable training state.
        self.learning_rate = tf.Variable(0.001,
                                         trainable=False,
                                         dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)
        self.momentum = tf.Variable(0.9, trainable=False, dtype=tf.float32)

        self.index_queries = self.word2index.lookup(self.queries)  # batch*len
        self.index_docs = [
            self.word2index.lookup(doc) for doc in tf.unstack(self.docs)
        ]

        self.embed = tf.get_variable('embed',
                                     dtype=tf.float32,
                                     initializer=embed)
        self.embed_queries = tf.nn.embedding_lookup(self.embed,
                                                    self.index_queries)
        self.embed_docs = [
            tf.nn.embedding_lookup(self.embed, index_doc)
            for index_doc in self.index_docs
        ]

        with tf.variable_scope('query_lstm'):
            self.cell_q = SimpleLSTMCell(num_lstm_units)
        with tf.variable_scope('doc_lstm'):
            self.cell_d = SimpleLSTMCell(num_lstm_units)

        # Element [1][1] of the dynamic_rnn result is used as the encoding.
        self.states_q = dynamic_rnn(
            self.cell_q,
            self.embed_queries,
            self.queries_length,
            dtype=tf.float32,
            scope="simple_lstm_cell_query")[1][1]  # shape: batch*num_units
        self.states_d = [
            dynamic_rnn(self.cell_d,
                        self.embed_docs[i],
                        self.docs_length[i],
                        dtype=tf.float32,
                        scope="simple_lstm_cell_doc")[1][1]
            for i in range(neg_num + 1)
        ]  # shape: (neg_num + 1)*batch*num_units

        # Cosine similarity between the query and each document slot.
        self.queries_norm = tf.sqrt(
            tf.reduce_sum(tf.square(self.states_q), axis=1))
        self.docs_norm = [
            tf.sqrt(tf.reduce_sum(tf.square(self.states_d[i]), axis=1))
            for i in range(neg_num + 1)
        ]
        self.prods = [
            tf.reduce_sum(tf.multiply(self.states_q, self.states_d[i]), axis=1)
            for i in range(neg_num + 1)
        ]
        self.sims = [(self.prods[i] / (self.queries_norm * self.docs_norm[i]))
                     for i in range(neg_num + 1)]  # shape: (neg_num + 1)*batch
        self.sims = tf.convert_to_tensor(self.sims)
        self.gamma = tf.Variable(
            initial_value=1.0, expected_shape=[],
            dtype=tf.float32)  # scaling factor according to the paper
        # Keep the raw cosine similarities: test_step maps them to [0, 1]
        # with (x + 1) / 2, which is only valid before the gamma scaling.
        self.origin_sims = self.sims
        self.sims = self.sims * self.gamma
        self.prob = tf.nn.softmax(self.sims,
                                  dim=0)  # shape: (neg_num + 1)*batch
        self.hit_prob = tf.transpose(self.prob[0])

        # Negative mean log-probability of slot 0.
        self.loss = -tf.reduce_mean(tf.log(self.hit_prob))

        self.params = tf.trainable_variables()
        opt = tf.train.MomentumOptimizer(
            learning_rate=self.learning_rate,
            momentum=self.momentum,
            use_nesterov=True)  # use Nesterov's method, according to the paper
        gradients = tf.gradients(self.loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, gradient_clip_threshold)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)
        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=3,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)

    def print_parameters(self):
        """Print the name and static shape of every entry in ``self.params``."""
        for item in self.params:
            print('%s: %s' % (item.name, item.get_shape()))

    def train_step(self, session, queries, docs):
        """Run one update and return the loss plus the intermediate tensors.

        Args:
            session: object with a ``run(fetches, feed_dict)`` method.
            queries: mapping with the keys ``'texts'`` and ``'texts_length'``.
            docs: mapping with the keys ``'texts'`` and ``'texts_length'``.

        Returns:
            The result of ``session.run`` for the ten fetches listed below.
        """
        input_feed = {
            self.queries: queries['texts'],
            self.queries_length: queries['texts_length'],
            self.docs: docs['texts'],
            self.docs_length: docs['texts_length']
        }

        output_feed = [
            self.loss, self.update, self.states_q, self.states_d,
            self.queries_norm, self.docs_norm, self.prods, self.sims,
            self.prob, self.hit_prob
        ]
        return session.run(output_feed, input_feed)

    def test_step(self, session, queries, docs, ground_truths):
        """Return the mean log loss of slot-0 similarities against 0/1 labels.

        Args:
            session: object with a ``run(fetches, feed_dict)`` method.
            queries: mapping with the keys ``'texts'`` and ``'texts_length'``.
            docs: mapping with the keys ``'texts'`` and ``'texts_length'``.
            ground_truths: sequence of labels; ``0`` marks a negative pair,
                anything else a positive pair.

        Returns:
            The negative mean log-likelihood over ``ground_truths``.
        """
        input_feed = {
            self.queries: queries['texts'],
            self.queries_length: queries['texts_length'],
            self.docs: docs['texts'],
            self.docs_length: docs['texts_length']
        }
        # Evaluate on the unscaled cosine similarity so that (x + 1) / 2 is a
        # proper [0, 1] score regardless of the learned gamma.
        output_feed = [self.origin_sims]
        scores = (session.run(output_feed, input_feed)[0][0] + 1) / 2
        total = len(ground_truths)
        loss = 0
        for i in range(total):
            # Clip away exact 0 and 1 so the logarithm stays finite.
            predict = max(min(scores[i], 1 - 1e-15), 1e-15)
            if ground_truths[i] == 0:
                loss += math.log(1 - predict)
            else:
                loss += math.log(predict)
        return -loss / total
示例#26
0
class Model(object):
    def __init__(self,
                 word_embed,
                 entity_embed,
                 vocab_size=30000,
                 num_embed_units=300,
                 num_units=512,
                 num_layers=2,
                 num_entities=0,
                 num_trans_units=100,
                 max_length=60,
                 learning_rate=0.0001,
                 learning_rate_decay_factor=0.95,
                 max_gradient_norm=5.0,
                 num_samples=500,
                 output_alignments=True):
        """Build the complete graph: embeddings, inputs, encoder/decoder, update op.

        Args:
            word_embed: initializer for the word embedding, or None to create
                a ``[vocab_size, num_embed_units]`` variable.
            entity_embed: initializer for the non-trainable entity embedding,
                or None to create a ``[num_entities, num_trans_units]``
                variable.
            vocab_size, num_embed_units, num_units, num_layers, num_entities,
            num_trans_units, max_length, learning_rate, max_gradient_norm,
            num_samples, output_alignments: stored on the instance under the
                same names.
            learning_rate_decay_factor: accepted but not read in this method.
        """
        # Hyper-parameters kept on the instance.
        self.vocab_size = vocab_size
        self.num_embed_units = num_embed_units
        self.num_units = num_units
        self.num_layers = num_layers
        self.num_entities = num_entities
        self.num_trans_units = num_trans_units
        self.learning_rate = learning_rate
        self.max_gradient_norm = max_gradient_norm
        self.num_samples = num_samples
        self.max_length = max_length
        self.output_alignments = output_alignments

        # Word embedding table: random, or seeded from pre-trained vectors.
        if word_embed is None:
            word_embed_args = dict(
                shape=[self.vocab_size, self.num_embed_units],
                dtype=tf.float32)
        else:
            word_embed_args = dict(dtype=tf.float32, initializer=word_embed)
        self.word_embed = tf.get_variable('word_embed', **word_embed_args)

        # Entity embedding table: never trainable, random or pre-trained.
        if entity_embed is None:
            entity_embed_args = dict(
                shape=[num_entities, num_trans_units],
                dtype=tf.float32)
        else:
            entity_embed_args = dict(dtype=tf.float32,
                                     initializer=entity_embed)
        self.entity_trans = tf.get_variable('entity_embed',
                                            trainable=False,
                                            **entity_embed_args)

        # Inputs and outputs.
        self.posts = tf.placeholder(
            tf.string, shape=(None, None), name='enc_inps')  # batch*len
        # The original passed `(None)`, which is plain None rather than a
        # 1-tuple, so the rank of the two length placeholders is unspecified.
        self.posts_length = tf.placeholder(
            tf.int32, shape=None, name='enc_lens')
        self.responses = tf.placeholder(
            tf.string, shape=(None, None), name='dec_inps')  # batch*len
        self.responses_length = tf.placeholder(
            tf.int32, shape=None, name='dec_lens')
        self.entities = tf.placeholder(
            tf.string, shape=(None, None, None), name='entities')
        self.entity_masks = tf.placeholder(
            tf.string, shape=(None, None), name='entity_masks')
        self.triples = tf.placeholder(
            tf.string, shape=(None, None, None, 3), name='triples')
        self.posts_triple = tf.placeholder(
            tf.int32, shape=(None, None, 1), name='enc_triples')
        self.responses_triple = tf.placeholder(
            tf.string, shape=(None, None, 3), name='dec_triples')
        self.match_triples = tf.placeholder(
            tf.int32, shape=(None, None, None), name='match_triples')
        self._init_vocabs()

        # String-to-index lookups.
        self.posts_word_id = self.symbol2index.lookup(self.posts)  # batch*len
        self.posts_entity_id = self.entity2index.lookup(self.posts)  # batch*len
        self.responses_target = self.symbol2index.lookup(self.responses)  # batch*len

        batch_size = tf.shape(self.responses)[0]
        decoder_len = tf.shape(self.responses)[1]

        # Decoder input: a GO_ID column followed by the targets with their
        # last column dropped.
        go_column = tf.ones([batch_size, 1], dtype=tf.int64) * GO_ID
        targets_without_last = tf.split(
            self.responses_target, [decoder_len - 1, 1], 1)[0]
        self.responses_word_id = tf.concat(
            [go_column, targets_without_last], 1)  # batch*len

        # Reverse cumulative sum of a one-hot at position length - 1:
        # ones up to and including that position, zeros after it.
        last_position = tf.one_hot(self.responses_length - 1, decoder_len)
        self.decoder_mask = tf.reshape(
            tf.cumsum(last_position, reverse=True, axis=1),
            [-1, decoder_len])

        # Entity embeddings: a tanh dense transform of the entity table with
        # 7 zero-initialised rows prepended.
        transformed_entities = tf.layers.dense(
            self.entity_trans,
            self.num_trans_units,
            activation=tf.tanh,
            name='trans_transformation')
        padding_rows = tf.get_variable(
            'entity_padding_embed',
            [7, self.num_trans_units],
            dtype=tf.float32,
            initializer=tf.zeros_initializer())
        self.entity_embed = tf.concat(
            [padding_rows, transformed_entities], axis=0)

        # Knowledge-graph embeddings, graph, encoder and decoder.
        kg_embeddings = self._build_kg_embedding()
        (self.triples_embedding,
         self.entities_word_embedding,
         self.graph_embedding) = kg_embeddings

        graph_embed_input, triple_embed_input = self._build_kg_graph()
        encoder_output, encoder_state = self._build_encoder(graph_embed_input)
        self._build_decoder(encoder_output, encoder_state, triple_embed_input)

        # Training: Adam on globally-clipped gradients.  self.decoder_loss is
        # not assigned in this method (presumably by _build_decoder).
        self.global_step = tf.Variable(0, trainable=False)
        self.params = tf.global_variables()

        raw_gradients = tf.gradients(self.decoder_loss, self.params)
        self.clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            raw_gradients, self.max_gradient_norm)
        adam = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.update = adam.apply_gradients(
            zip(self.clipped_gradients, self.params),
            global_step=self.global_step)

        # Summaries: the loss plus a histogram per trainable variable.
        tf.summary.scalar('decoder_loss', self.decoder_loss)
        for variable in tf.trainable_variables():
            tf.summary.histogram(variable.name, variable)
        self.merged_summary_op = tf.summary.merge_all()

    def _init_vocabs(self):
        """Create the four checkpointed lookup tables used by the model.

        ``symbol2index`` / ``index2symbol`` map between word strings and ids;
        ``entity2index`` / ``index2entity`` do the same for entities.  Each
        table uses one string as both its ``name`` and its ``shared_name``.
        """
        # (attribute, key dtype, value dtype, default value, table name)
        table_specs = (
            ('symbol2index', tf.string, tf.int64, UNK_ID, "in_table"),
            ('index2symbol', tf.int64, tf.string, '_UNK', "out_table"),
            ('entity2index', tf.string, tf.int64, NONE_ID, "entity_in_table"),
            ('index2entity', tf.int64, tf.string, '_NONE', "entity_out_table"),
        )
        for attr, key_dtype, value_dtype, default, table_name in table_specs:
            table = MutableHashTable(key_dtype=key_dtype,
                                     value_dtype=value_dtype,
                                     default_value=default,
                                     shared_name=table_name,
                                     name=table_name,
                                     checkpoint=True)
            setattr(self, attr, table)

    def _build_kg_embedding(self):
        """Embed the retrieved triples and entities and pool every graph.

        Returns:
            A tuple ``(triples_embedding, entities_word_embedding,
            graph_embedding)``:

            * ``triples_embedding`` - entity-table embeddings of
              ``self.triples`` reshaped to
              ``[batch, triple_num, -1, 3 * num_trans_units]``;
            * ``entities_word_embedding`` - word embeddings of
              ``self.entities`` reshaped to ``[batch, -1, num_embed_units]``;
            * ``graph_embedding`` - attention-weighted sum (over axis 2) of
              the concatenated head/tail vectors of each graph.
        """
        # only the batch size of the posts is needed here
        encoder_batch_size, _ = tf.unstack(tf.shape(self.posts))
        triple_num = tf.shape(self.triples)[1]

        # triples: string -> entity id -> embedding
        triple_ids = self.entity2index.lookup(self.triples)
        triple_vectors = tf.nn.embedding_lookup(self.entity_embed, triple_ids)
        triples_embedding = tf.reshape(
            triple_vectors,
            [encoder_batch_size, triple_num, -1, 3 * self.num_trans_units])

        # entities: string -> word id -> word embedding
        entity_word_ids = self.symbol2index.lookup(self.entities)
        entity_word_vectors = tf.nn.embedding_lookup(self.word_embed,
                                                     entity_word_ids)
        entities_word_embedding = tf.reshape(
            entity_word_vectors,
            [encoder_batch_size, -1, self.num_embed_units])

        # every triple embedding is the concatenation [head; relation; tail]
        part_sizes = [self.num_trans_units] * 3
        head, relation, tail = tf.split(triples_embedding, part_sizes, axis=3)

        with tf.variable_scope('graph_attention', reuse=tf.AUTO_REUSE):
            head_tail = tf.concat([head, tail], axis=3)
            head_tail_proj = tf.layers.dense(head_tail,
                                             self.num_trans_units,
                                             activation=tf.tanh,
                                             name='head_tail_transform')
            relation_proj = tf.layers.dense(relation,
                                            self.num_trans_units,
                                            name='relation_transform')
            # score of a triple: dot product of the two projections
            scores = tf.reduce_sum(relation_proj * head_tail_proj, axis=3)
            # softmax runs over the last axis of the score tensor
            attention = tf.nn.softmax(scores)
            weighted_head_tail = tf.expand_dims(attention, 3) * head_tail
            graph_embedding = tf.reduce_sum(weighted_head_tail, axis=2)

        return triples_embedding, entities_word_embedding, graph_embedding

    def _build_kg_graph(self):
        """Build the knowledge inputs for the encoder and the decoder.

        Returns:
            A tuple ``(graph_embed_input, triple_embed_input)``:

            * ``graph_embed_input`` - for every post position, the row of
              ``self.graph_embedding`` selected by ``self.posts_triple``;
            * ``triple_embed_input`` - entity-table embeddings of
              ``self.responses_triple`` reshaped to
              ``[batch, decoder_len, 3 * num_trans_units]``.
        """
        encoder_batch_size, encoder_len = tf.unstack(tf.shape(self.posts))
        batch_size = tf.shape(self.responses)[0]
        decoder_len = tf.shape(self.responses)[1]

        # knowledge graph vectors: prefix each index in posts_triple with its
        # batch row so gather_nd addresses graph_embedding[row, index]
        batch_rows = tf.range(encoder_batch_size, dtype=tf.int32)
        batch_rows = tf.reshape(batch_rows, [-1, 1, 1])
        batch_rows = tf.tile(batch_rows, [1, encoder_len, 1])
        gather_index = tf.concat([batch_rows, self.posts_triple], axis=2)
        graph_embed_input = tf.gather_nd(self.graph_embedding, gather_index)

        # knowledge triple vectors
        response_triple_ids = self.entity2index.lookup(self.responses_triple)
        response_triple_vectors = tf.nn.embedding_lookup(
            self.entity_embed, response_triple_ids)
        triple_embed_input = tf.reshape(
            response_triple_vectors,
            [batch_size, decoder_len, 3 * self.num_trans_units])

        return graph_embed_input, triple_embed_input

    def _build_encoder(self, graph_embed_input):
        """Run a multi-layer GRU encoder over the post.

        Args:
            graph_embed_input: tensor concatenated to the post word
                embeddings along axis 2 before it is fed to the RNN.

        Returns:
            The ``(encoder_output, encoder_state)`` pair produced by
            ``dynamic_rnn`` under the ``"encoder"`` scope.
        """
        word_vectors = tf.nn.embedding_lookup(
            self.word_embed, self.posts_word_id)  # batch*len*unit

        gru_layers = []
        for _ in range(self.num_layers):
            gru_layers.append(GRUCell(self.num_units))
        stacked_cell = MultiRNNCell(gru_layers)

        # encoder input: e(x_t) = [w(x_t); g_i]
        rnn_input = tf.concat([word_vectors, graph_embed_input], axis=2)

        # output shape: [batch_size, max_time, cell.output_size]
        outputs, final_state = dynamic_rnn(stacked_cell,
                                           rnn_input,
                                           self.posts_length,
                                           dtype=tf.float32,
                                           scope="encoder")
        return outputs, final_state

    def _build_decoder(self, encoder_output, encoder_state,
                       triple_embed_input):
        """Build the training decoder, the inference decoder and the output.

        The training branch sets ``self.decoder_loss``, ``self.ppx_loss`` and
        ``self.sentence_ppx``.  The inference branch re-enters the
        ``decoder`` variable scope with ``reuse=True`` and sets
        ``self.generation``, a string tensor named ``generation``.

        Args:
            encoder_output: encoder outputs, handed to ``prepare_attention``.
            encoder_state: encoder state, handed to both decoder functions.
            triple_embed_input: triple embeddings concatenated to the response
                word embeddings along axis 2 to form the decoder input.
        """
        # decoder input: e(y_t) = [w(y_t); k_j]
        encoder_batch_size, encoder_len = tf.unstack(tf.shape(self.posts))
        response_word_input = tf.nn.embedding_lookup(
            self.word_embed, self.responses_word_id)  # batch*len*unit
        decoder_input = tf.concat([response_word_input, triple_embed_input],
                                  axis=2)
        print("decoder_input:", decoder_input.shape)

        # define cell (the same cell object serves training and inference)
        decoder_cell = MultiRNNCell(
            [GRUCell(self.num_units) for _ in range(self.num_layers)])

        # get loss functions (only total_loss is used below)
        sequence_loss, total_loss = loss_computation(
            self.vocab_size, num_samples=self.num_samples)

        # decoder training process
        with tf.variable_scope('decoder'):
            # prepare attention over the encoder output and the knowledge
            # memories (graph vectors and triple vectors)
            attention_keys, attention_values, attention_score_fn, attention_construct_fn \
                = prepare_attention(encoder_output, 'bahdanau', self.num_units, scope_name="decoder",
                                    imem=(self.graph_embedding, self.triples_embedding),
                                    output_alignments=self.output_alignments)
            print("graph_embedding:", self.graph_embedding.shape)
            print("triples_embedding:", self.triples_embedding.shape)
            decoder_fn_train = attention_decoder_fn_train(
                encoder_state,
                attention_keys,
                attention_values,
                attention_score_fn,
                attention_construct_fn,
                output_alignments=self.output_alignments,
                max_length=tf.reduce_max(self.responses_length))
            # train decoder
            decoder_output, _, decoder_context_state = dynamic_rnn_decoder(
                decoder_cell,
                decoder_fn_train,
                decoder_input,
                self.responses_length,
                scope="decoder_rnn")
            output_fn, selector_fn = output_projection(
                self.vocab_size, scope_name="decoder_rnn")
            output_logits = output_fn(decoder_output)
            selector_logits = selector_fn(decoder_output)
            print("decoder_output:",
                  decoder_output.shape)  # shape: [batch, seq, num_units]
            print("output_logits:", output_logits.shape)
            print("selector_fn:", selector_logits.name)

            # one-hot encode match_triples over the size of axis 2 of
            # self.triples, then sum the one-hot over axes 2 and 3
            # NOTE(review): presumably use_triples marks the decoding steps
            # that use a triple - confirm against total_loss
            triple_len = tf.shape(self.triples)[2]
            one_hot_triples = tf.one_hot(self.match_triples, triple_len)
            use_triples = tf.reduce_sum(one_hot_triples, axis=[2, 3])
            # the context state is stacked and its first two axes swapped
            # NOTE(review): presumably time-major -> batch-major - confirm
            alignments = tf.transpose(decoder_context_state.stack(),
                                      perm=[1, 0, 2, 3])
            self.decoder_loss, self.ppx_loss, self.sentence_ppx \
                = total_loss(output_logits,
                             selector_logits,
                             self.responses_target,
                             self.decoder_mask,
                             alignments,
                             use_triples,
                             one_hot_triples)
            # give the perplexity tensor the fixed name "ppx_loss"
            self.sentence_ppx = tf.identity(self.sentence_ppx, name="ppx_loss")

        # decoder inference process
        with tf.variable_scope('decoder', reuse=True):
            # prepare attention (reusing the variables created above)
            attention_keys, attention_values, attention_score_fn, attention_construct_fn \
                = prepare_attention(encoder_output, 'bahdanau', self.num_units, scope_name="decoder",
                                    imem=(self.graph_embedding, self.triples_embedding),
                                    output_alignments=self.output_alignments,
                                    reuse=True)
            output_fn, selector_fn = output_projection(self.vocab_size,
                                                       scope_name=None,
                                                       reuse=True)
            # the inference memory pairs the entity word embeddings with the
            # triple embeddings flattened to one list of triples per sample
            decoder_fn_inference \
                = attention_decoder_fn_inference(output_fn, encoder_state,
                                                 attention_keys, attention_values,
                                                 attention_score_fn, attention_construct_fn,
                                                 self.word_embed, GO_ID, EOS_ID, self.max_length, self.vocab_size,
                                                 imem=(self.entities_word_embedding,
                                                       tf.reshape(self.triples_embedding,
                                                                  [encoder_batch_size, -1, 3 * self.num_trans_units])),
                                                 selector_fn=selector_fn)

            # get decoder output
            decoder_distribution, _, infer_context_state \
                = dynamic_rnn_decoder(decoder_cell, decoder_fn_inference, scope="decoder_rnn")

            output_len = tf.shape(decoder_distribution)[1]
            output_ids = tf.transpose(
                infer_context_state.gather(tf.range(output_len)))
            # ids clipped to [0, vocab_size] are looked up as words below
            word_ids = tf.cast(
                tf.clip_by_value(output_ids, 0, self.vocab_size), tf.int64)
            # negated ids clipped to [0, vocab_size], offset by
            # row * entities-per-sample, index the flattened self.entities
            # NOTE(review): presumably the inference decoder emits
            # non-positive ids for entity selections - confirm in
            # attention_decoder_fn_inference
            entity_ids = tf.reshape(
                tf.clip_by_value(-output_ids, 0, self.vocab_size) + tf.reshape(
                    tf.range(encoder_batch_size) *
                    tf.shape(self.entities_word_embedding)[1], [-1, 1]), [-1])
            entities = tf.reshape(
                tf.gather(tf.reshape(self.entities, [-1]), entity_ids),
                [-1, output_len])
            words = self.index2symbol.lookup(word_ids)
            # positive ids yield the word string, all other ids the entity
            self.generation = tf.where(output_ids > 0, words, entities)
            self.generation = tf.identity(self.generation, name='generation')

    def set_vocabs(self, session, vocab, entity_vocab, relation_vocab):
        """Fill the four lookup tables and return the session.

        Words receive the ids ``0 .. vocab_size - 1`` in the order of
        ``vocab``.  Entities followed by relations receive the ids
        ``0 .. len(entity_vocab) + len(relation_vocab) - 1``.

        Args:
            session: session used to run each insert op.
            vocab: word strings.
            entity_vocab: entity strings.
            relation_vocab: relation strings, appended after the entities.

        Returns:
            The ``session`` that was passed in.
        """
        word_ids = list(range(self.vocab_size))

        # word string -> id
        fill_symbol2index = self.symbol2index.insert(
            constant_op.constant(vocab),
            constant_op.constant(word_ids, dtype=tf.int64))
        session.run(fill_symbol2index)

        # id -> word string
        fill_index2symbol = self.index2symbol.insert(
            constant_op.constant(word_ids, dtype=tf.int64),
            constant_op.constant(vocab))
        session.run(fill_index2symbol)

        # entities and relations share one id space, entities first
        kg_vocab = entity_vocab + relation_vocab
        kg_ids = list(range(len(entity_vocab) + len(relation_vocab)))

        # knowledge-graph string -> id
        fill_entity2index = self.entity2index.insert(
            constant_op.constant(kg_vocab),
            constant_op.constant(kg_ids, dtype=tf.int64))
        session.run(fill_entity2index)

        # id -> knowledge-graph string
        fill_index2entity = self.index2entity.insert(
            constant_op.constant(kg_ids, dtype=tf.int64),
            constant_op.constant(kg_vocab))
        session.run(fill_index2entity)

        return session

    def print_parameters(self):
        """Print one ``name: shape`` line for every entry of ``self.params``."""
        for variable in self.params:
            name = variable.name
            shape = variable.get_shape().as_list()
            print('%s: %s' % (name, shape))

    def step_train(self, session, data, forward_only=False, summary=False):
        """Run one step of the model and return the fetched values.

        Args:
            session: object whose ``run(fetches, feed_dict)`` executes the step.
            data: mapping that provides the keys ``posts``, ``posts_length``,
                ``responses``, ``responses_length``, ``triples``,
                ``posts_triple``, ``responses_triple`` and ``match_triples``.
            forward_only: when true, fetch only the sentence perplexity;
                otherwise also fetch the decoder loss and run the update op.
            summary: when true, additionally fetch the merged summary op.

        Returns:
            Whatever ``session.run`` returns for the assembled fetch list.
        """
        # each placeholder attribute has the same name as its key in ``data``
        fed_fields = ('posts', 'posts_length', 'responses',
                      'responses_length', 'triples', 'posts_triple',
                      'responses_triple', 'match_triples')
        input_feed = {getattr(self, field): data[field] for field in fed_fields}

        output_feed = [self.sentence_ppx]
        if not forward_only:
            output_feed.extend([self.decoder_loss, self.update])
        if summary:
            output_feed.append(self.merged_summary_op)

        return session.run(output_feed, input_feed)
示例#27
0
文件: model.py 项目: zyjcs/ccm
    def __init__(self,
            num_symbols,
            num_embed_units,
            num_units,
            num_layers,
            embed,
            entity_embed=None,
            num_entities=0,
            num_trans_units=100,
            learning_rate=0.0001,
            learning_rate_decay_factor=0.95,
            max_gradient_norm=5.0,
            num_samples=512,
            max_length=60,
            output_alignments=True,
            use_lstm=False):
        """Build the whole graph: inputs, encoder, decoders, loss and training.

        Args:
            num_symbols: word vocabulary size (rows of the word embedding when
                ``embed`` is None; also passed to the output projection and
                the inference decoder).
            num_embed_units: width of a word embedding.
            num_units: size of every GRU layer.
            num_layers: number of GRU layers in the encoder and the decoder.
            embed: initial value of the word embedding, or None for a random
                initialization.
            entity_embed: initial value of the (non-trainable) entity
                embedding, or None for a random initialization.
            num_entities: rows of the entity embedding when ``entity_embed``
                is None.
            num_trans_units: width of an entity embedding.
            learning_rate: step size given to the Adam optimizer; also the
                initial value of ``self.learning_rate``.
            learning_rate_decay_factor: factor applied to
                ``self.learning_rate`` by ``self.learning_rate_decay_op``.
            max_gradient_norm: global-norm threshold for gradient clipping.
            num_samples: passed to ``output_projection_layer``.
            max_length: passed to the inference decoder function.
            output_alignments: selects the alignment-aware loss and the
                word/entity generation path instead of the plain sequence
                loss and argmax generation.
            use_lstm: not read anywhere in this constructor.
        """
        
        self.posts = tf.placeholder(tf.string, (None, None), 'enc_inps')  # batch*len
        self.posts_length = tf.placeholder(tf.int32, (None), 'enc_lens')  # batch
        self.responses = tf.placeholder(tf.string, (None, None), 'dec_inps')  # batch*len
        self.responses_length = tf.placeholder(tf.int32, (None), 'dec_lens')  # batch
        self.entities = tf.placeholder(tf.string, (None, None), 'entities')  # batch
        self.entity_masks = tf.placeholder(tf.string, (None, None), 'entity_masks')  # batch
        self.triples = tf.placeholder(tf.string, (None, None, 3), 'triples')  # batch
        self.posts_triple = tf.placeholder(tf.int32, (None, None, 1), 'enc_triples')  # batch
        self.responses_triple = tf.placeholder(tf.string, (None, None, 3), 'dec_triples')  # batch
        self.match_triples = tf.placeholder(tf.int32, (None, None), 'match_triples')  # batch
        encoder_batch_size, encoder_len = tf.unstack(tf.shape(self.posts))
        triple_num = tf.shape(self.triples)[1]
        
        #use_triples = tf.reduce_sum(tf.cast(tf.greater_equal(self.match_triples, 0), tf.float32), axis=-1)
        # one-hot of match_triples over the triples of the sample, summed
        # over that one-hot axis
        one_hot_triples = tf.one_hot(self.match_triples, triple_num)
        use_triples = tf.reduce_sum(one_hot_triples, axis=[2])

        # lookup tables (string <-> id) for words and for entities
        self.symbol2index = MutableHashTable(
                key_dtype=tf.string,
                value_dtype=tf.int64,
                default_value=UNK_ID,
                shared_name="in_table",
                name="in_table",
                checkpoint=True)
        self.index2symbol = MutableHashTable(
                key_dtype=tf.int64,
                value_dtype=tf.string,
                default_value='_UNK',
                shared_name="out_table",
                name="out_table",
                checkpoint=True)
        self.entity2index = MutableHashTable(
                key_dtype=tf.string,
                value_dtype=tf.int64,
                default_value=NONE_ID,
                shared_name="entity_in_table",
                name="entity_in_table",
                checkpoint=True)
        self.index2entity = MutableHashTable(
                key_dtype=tf.int64,
                value_dtype=tf.string,
                default_value='_NONE',
                shared_name="entity_out_table",
                name="entity_out_table",
                checkpoint=True)
        # build the vocab table (string to index)


        self.posts_word_id = self.symbol2index.lookup(self.posts)   # batch*len
        self.posts_entity_id = self.entity2index.lookup(self.posts)   # batch*len
        #self.posts_word_id = tf.Print(self.posts_word_id, ['use_triples', use_triples, 'one_hot_triples', one_hot_triples], summarize=1e6)
        self.responses_target = self.symbol2index.lookup(self.responses)   #batch*len
        
        batch_size, decoder_len = tf.shape(self.responses)[0], tf.shape(self.responses)[1]
        # decoder input ids: GO_ID followed by the target with its last
        # column dropped
        self.responses_word_id = tf.concat([tf.ones([batch_size, 1], dtype=tf.int64)*GO_ID,
            tf.split(self.responses_target, [decoder_len-1, 1], 1)[0]], 1)   # batch*len
        # mask: 1 for positions up to index responses_length-1, 0 afterwards
        self.decoder_mask = tf.reshape(tf.cumsum(tf.one_hot(self.responses_length-1, 
            decoder_len), reverse=True, axis=1), [-1, decoder_len])
        
        # build the embedding table (index to vector)
        if embed is None:
            # initialize the embedding randomly
            self.embed = tf.get_variable('word_embed', [num_symbols, num_embed_units], tf.float32)
        else:
            # initialize the embedding by pre-trained word vectors
            self.embed = tf.get_variable('word_embed', dtype=tf.float32, initializer=embed)
        if entity_embed is None:
            # initialize the embedding randomly
            self.entity_trans = tf.get_variable('entity_embed', [num_entities, num_trans_units], tf.float32, trainable=False)
        else:
            # initialize the embedding by pre-trained entity vectors
            self.entity_trans = tf.get_variable('entity_embed', dtype=tf.float32, initializer=entity_embed, trainable=False)

        # the frozen entity vectors pass through a trainable tanh layer
        self.entity_trans_transformed = tf.layers.dense(self.entity_trans, num_trans_units, activation=tf.tanh, name='trans_transformation')
        # NOTE(review): 7 zero-initialized rows are prepended to the entity
        # table, presumably for special/padding entries - confirm the count
        # against the entity vocabulary
        padding_entity = tf.get_variable('entity_padding_embed', [7, num_trans_units], dtype=tf.float32, initializer=tf.zeros_initializer())

        self.entity_embed = tf.concat([padding_entity, self.entity_trans_transformed], axis=0)

        triples_embedding = tf.reshape(tf.nn.embedding_lookup(self.entity_embed, self.entity2index.lookup(self.triples)), [encoder_batch_size, triple_num, 3 * num_trans_units])
        entities_word_embedding = tf.reshape(tf.nn.embedding_lookup(self.embed, self.symbol2index.lookup(self.entities)), [encoder_batch_size, -1, num_embed_units])


        self.encoder_input = tf.nn.embedding_lookup(self.embed, self.posts_word_id) #batch*len*unit
        self.decoder_input = tf.nn.embedding_lookup(self.embed, self.responses_word_id) #batch*len*unit

        encoder_cell = MultiRNNCell([GRUCell(num_units) for _ in range(num_layers)])
        decoder_cell = MultiRNNCell([GRUCell(num_units) for _ in range(num_layers)])
        
        # rnn encoder
        encoder_output, encoder_state = dynamic_rnn(encoder_cell, self.encoder_input, 
                self.posts_length, dtype=tf.float32, scope="encoder")

        # get output projection function
        output_fn, selector_fn, sequence_loss, sampled_sequence_loss, total_loss = output_projection_layer(num_units, 
                num_symbols, num_samples)

        

        # training decoder: creates the decoder variables
        with tf.variable_scope('decoder'):
            # get attention function
            attention_keys_init, attention_values_init, attention_score_fn_init, attention_construct_fn_init \
                    = prepare_attention(encoder_output, 'bahdanau', num_units, imem=triples_embedding, output_alignments=output_alignments)#'luong', num_units)

            decoder_fn_train = attention_decoder_fn_train(
                    encoder_state, attention_keys_init, attention_values_init,
                    attention_score_fn_init, attention_construct_fn_init, output_alignments=output_alignments, max_length=tf.reduce_max(self.responses_length))
            self.decoder_output, _, alignments_ta = dynamic_rnn_decoder(decoder_cell, decoder_fn_train, 
                    self.decoder_input, self.responses_length, scope="decoder_rnn")
            if output_alignments: 
                # stacked alignments with their first two axes swapped
                self.alignments = tf.transpose(alignments_ta.stack(), perm=[1,0,2])
                #self.alignments = tf.Print(self.alignments, [self.alignments], summarize=1e8)
                self.decoder_loss, self.ppx_loss, self.sentence_ppx = total_loss(self.decoder_output, self.responses_target, self.decoder_mask, self.alignments, triples_embedding, use_triples, one_hot_triples)
                self.sentence_ppx = tf.identity(self.sentence_ppx, 'ppx_loss')
                #self.decoder_loss = tf.Print(self.decoder_loss, ['decoder_loss', self.decoder_loss], summarize=1e6)
            else:
                self.decoder_loss, self.sentence_ppx = sequence_loss(self.decoder_output, 
                        self.responses_target, self.decoder_mask)
                self.sentence_ppx = tf.identity(self.sentence_ppx, 'ppx_loss')
         
        # inference decoder: reuses the variables created above
        with tf.variable_scope('decoder', reuse=True):
            # get attention function
            attention_keys, attention_values, attention_score_fn, attention_construct_fn \
                    = prepare_attention(encoder_output, 'bahdanau', num_units, reuse=True, imem=triples_embedding, output_alignments=output_alignments)#'luong', num_units)
            decoder_fn_inference = attention_decoder_fn_inference(
                    output_fn, encoder_state, attention_keys, attention_values, 
                    attention_score_fn, attention_construct_fn, self.embed, GO_ID, 
                    EOS_ID, max_length, num_symbols, imem=entities_word_embedding, selector_fn=selector_fn)

                
            self.decoder_distribution, _, output_ids_ta = dynamic_rnn_decoder(decoder_cell,
                    decoder_fn_inference, scope="decoder_rnn")
            if output_alignments:
                output_len = tf.shape(self.decoder_distribution)[1]
                output_ids = tf.transpose(output_ids_ta.gather(tf.range(output_len)))
                # ids clipped to [0, num_symbols] are looked up as words;
                # negated ids (clipped likewise, offset per batch row) index
                # the flattened self.entities
                # NOTE(review): presumably non-positive ids denote entity
                # selections - confirm in attention_decoder_fn_inference
                word_ids = tf.cast(tf.clip_by_value(output_ids, 0, num_symbols), tf.int64)
                entity_ids = tf.reshape(tf.clip_by_value(-output_ids, 0, num_symbols) + tf.reshape(tf.range(encoder_batch_size) * tf.shape(entities_word_embedding)[1], [-1, 1]), [-1])
                entities = tf.reshape(tf.gather(tf.reshape(self.entities, [-1]), entity_ids), [-1, output_len])
                words = self.index2symbol.lookup(word_ids)
                # positive ids yield the word, all other ids the entity string
                self.generation = tf.where(output_ids > 0, words, entities, name='generation')
            else:
                self.generation_index = tf.argmax(self.decoder_distribution, 2)
                
                self.generation = self.index2symbol.lookup(self.generation_index, name='generation') 
        

        # initialize the training process
        self.learning_rate = tf.Variable(float(learning_rate), 
                trainable=False, dtype=tf.float32)
        self.learning_rate_decay_op = self.learning_rate.assign(
                self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # gradients are requested for every global variable created so far
        self.params = tf.global_variables()
            
        # calculate the gradient of parameters
        #opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        # NOTE(review): Adam is built from the Python float `learning_rate`,
        # not from the `self.learning_rate` variable, so running
        # `learning_rate_decay_op` does not change the optimizer's step size -
        # confirm this is intended
        opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
        # reads a private attribute of the optimizer
        self.lr = opt._lr
       
        gradients = tf.gradients(self.decoder_loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(gradients, 
                max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params), 
                global_step=self.global_step)

        # summaries: the loss scalar plus one histogram per trainable variable
        tf.summary.scalar('decoder_loss', self.decoder_loss)
        for each in tf.trainable_variables():
            tf.summary.histogram(each.name, each)

        self.merged_summary_op = tf.summary.merge_all()
        
        # rolling saver that keeps the 3 latest checkpoints (plus one every hour)
        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2, 
                max_to_keep=3, pad_step_number=True, keep_checkpoint_every_n_hours=1.0)
        
        # second saver that keeps up to 1000 checkpoints
        self.saver_epoch = tf.train.Saver(write_version=tf.train.SaverDef.V2, max_to_keep=1000, pad_step_number=True)
示例#28
0
    def __init__(self,
            num_symbols,
            num_embed_units,
            num_units,
            num_layers,
            num_labels,
            embed,
            learning_rate=0.005,
            max_gradient_norm=5.0):
        """Build the graph of a multi-layer RNN text classifier.

        Args:
            num_symbols: vocabulary size; rows of the embedding table when
                ``embed`` is None.
            num_embed_units: width of a word embedding.
            num_units: hidden size of every RNN layer.
            num_layers: number of stacked RNN layers.
            num_labels: number of target classes.
            embed: initial value of the embedding table, or None to
                initialize it randomly.
            learning_rate: initial value of the ``self.learning_rate``
                variable used by gradient descent.
            max_gradient_norm: global-norm threshold for gradient clipping.
        """
        self.texts = tf.placeholder(tf.string, (None, None), 'texts')  # shape: [batch, length]
        self.texts_length = tf.placeholder(tf.int32, (None,), 'texts_length')  # shape: [batch]
        # int64 so the labels compare directly with the int64 output of
        # tf.argmax in the accuracy computation below
        self.labels = tf.placeholder(tf.int64, (None,), 'labels')  # shape: [batch]

        # build the vocab table (string to index)
        self.symbol2index = MutableHashTable(
                key_dtype=tf.string,
                value_dtype=tf.int64,
                default_value=UNK_ID,
                shared_name="in_table",
                name="in_table",
                checkpoint=True)

        # initialize the training process
        self.learning_rate = tf.Variable(float(learning_rate),
                trainable=False, dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)

        self.index_input = self.symbol2index.lookup(self.texts)   # shape: [batch, length]

        # build the embedding table (index to vector)
        if embed is None:
            # initialize the embedding randomly
            self.embed = tf.get_variable('embed', [num_symbols, num_embed_units], tf.float32)
        else:
            # initialize the embedding by pre-trained word vectors
            self.embed = tf.get_variable('embed', dtype=tf.float32, initializer=embed)

        # word ids -> word vectors
        self.embed_input = tf.nn.embedding_lookup(self.embed, self.index_input)  # shape: [batch, length, num_embed_units]

        # TODO: BasicRNNCell may be replaced by another RNNCell; `vectors`
        # below must then be adapted to that cell's state structure
        cell = MultiRNNCell([BasicRNNCell(num_units) for _ in range(num_layers)])

        outputs, states = dynamic_rnn(cell, self.embed_input,
                self.texts_length, dtype=tf.float32, scope="rnn")

        # `states` holds one final state per layer; with BasicRNNCell the last
        # entry is the top layer's final hidden state, shape [batch, num_units]
        vectors = states[-1]

        with tf.variable_scope('logits'):
            weight = tf.get_variable("weights", [num_units, num_labels])
            bias = tf.get_variable("biases", [num_labels])
            # linear transformation: [batch, num_units] -> [batch, num_labels]
            logits = tf.matmul(vectors, weight) + bias

        # loss and accuracy are sums over the batch, not means
        self.loss = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels, logits=logits), name='loss')
        predict_labels = tf.argmax(logits, 1, 'predict_labels')
        self.accuracy = tf.reduce_sum(tf.cast(tf.equal(self.labels, predict_labels), tf.int32), name='accuracy')

        self.params = tf.trainable_variables()

        # calculate the gradient of parameters
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        gradients = tf.gradients(self.loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(gradients,
                max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                global_step=self.global_step)

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                max_to_keep=5, pad_step_number=True)
示例#29
0
    def __init__(self,
                 num_symbols,
                 num_embed_units,
                 num_units,
                 num_layers,
                 num_labels,
                 embed,
                 learning_rate,
                 max_gradient_norm=5.0,
                 param_da=150,
                 param_r=10):
        """Build a bidirectional-LSTM classifier with multi-hop self-attention.

        Args:
            num_symbols: vocabulary size; rows of the embedding table when
                ``embed`` is None.
            num_embed_units: width of a word embedding.
            num_units: hidden size of every LSTM layer (per direction).
            num_layers: number of stacked LSTM layers per direction.
            num_labels: number of target classes.
            embed: initial value of the embedding table, or None to
                initialize it randomly.
            learning_rate: initial value of the ``self.learning_rate``
                variable used by the momentum optimizer.
            max_gradient_norm: global-norm threshold for gradient clipping.
            param_da: size of the hidden attention projection.
            param_r: number of attention hops.
        """
        self.texts = tf.placeholder(tf.string, (None, None),
                                    'texts')  # shape: [batch, length]
        self.texts_length = tf.placeholder(tf.int32, None,
                                           'texts_length')  # shape: [batch]
        self.labels = tf.placeholder(tf.int32, None,
                                     'labels')  # shape: [batch]

        # build the vocab table (string to index)
        self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=UNK_ID,
                                             shared_name="in_table",
                                             name="in_table",
                                             checkpoint=True)

        batch_size = tf.shape(self.texts)[0]

        # initialize the training process
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)

        self.index_input = self.symbol2index.lookup(
            self.texts)  # shape: [batch, length]

        # build the embedding table (index to vector)
        if embed is None:
            # initialize the embedding randomly
            self.embed = tf.get_variable('embed',
                                         [num_symbols, num_embed_units],
                                         tf.float32)
        else:
            # initialize the embedding by pre-trained word vectors
            self.embed = tf.get_variable('embed',
                                         dtype=tf.float32,
                                         initializer=embed)

        self.embed_input = tf.nn.embedding_lookup(
            self.embed,
            self.index_input)  # shape: [batch, length, num_embed_units]

        # Each direction gets its own stack of freshly created cells.  Sharing
        # the same cell objects between the forward and the backward RNN ties
        # the weights of both directions (or is rejected outright by older
        # TensorFlow releases).
        def build_stack():
            return MultiRNNCell(
                [BasicLSTMCell(num_units) for _ in range(num_layers)])

        cell_fw = build_stack()
        cell_bw = build_stack()

        outputs, states = tf.nn.bidirectional_dynamic_rnn(cell_fw,
                                                          cell_bw,
                                                          self.embed_input,
                                                          self.texts_length,
                                                          dtype=tf.float32,
                                                          scope="rnn")
        H = tf.concat(outputs, 2)  # shape: (batch, length, 2*num_units)

        with tf.variable_scope('logits'):
            Ws1 = tf.get_variable("Ws1", [2 * num_units, param_da])
            Ws2 = tf.get_variable("Ws2", [param_da, param_r])

            # shape: (batch, length, param_da)
            hidden = tf.tanh(tf.einsum('aij,jr->air', H, Ws1))
            # shape: (batch, length, param_r)
            scores = tf.einsum('aij,jr->air', hidden, Ws2)
            # Normalize every hop over the sequence positions (axis 1).  A
            # softmax over the hop axis would make the weights of each
            # position sum to 1, so the sum over hops below would collapse to
            # a plain sum of H and the attention would have no effect on the
            # logits.  The axis is passed positionally because the keyword is
            # `dim` in older and `axis` in newer TensorFlow releases.
            # NOTE(review): padded time steps are not masked before the
            # softmax - confirm whether a sequence mask is wanted here.
            A = tf.nn.softmax(scores, 1)  # shape: (batch, length, param_r)
            # attention-weighted sums of H per hop, then summed over the hops
            M = tf.reduce_sum(tf.einsum('aij,aik->ajk', A, H),
                              axis=1)  # shape: (batch, 2*num_units)
            logits = tf.layers.dense(
                M, num_labels, activation=None,
                name='projection')  # shape: (batch, num_labels)

        # penalization term: norm of (A^T A - I), pushing the hops towards
        # attending to different positions
        identity = tf.reshape(
            tf.tile(tf.diag(tf.ones([param_r])), [batch_size, 1]),
            [batch_size, param_r, param_r])
        gram = tf.matmul(A, A, transpose_a=True)  # (batch, param_r, param_r)
        self.penalized_term = tf.norm(gram - identity)

        # loss and accuracy are sums over the batch, not means
        self.loss = tf.reduce_sum(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels,
                                                           logits=logits),
            name='loss') + 0.001 * self.penalized_term
        predict_labels = tf.argmax(logits, 1, 'predict_labels')
        # argmax yields int64 while the labels are int32, hence the cast
        self.accuracy = tf.reduce_sum(tf.cast(
            tf.equal(self.labels, tf.cast(predict_labels, tf.int32)),
            tf.int32),
                                      name='accuracy')

        self.params = tf.trainable_variables()

        # calculate the gradient of parameters
        opt = tf.train.MomentumOptimizer(self.learning_rate, 0.9)
        gradients = tf.gradients(self.loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=5,
                                    pad_step_number=True)
示例#30
0
    def __init__(self,
                 num_symbols,
                 num_embed_units,
                 num_units,
                 num_layers,
                 beam_size,
                 embed,
                 learning_rate=0.5,
                 remove_unk=False,
                 learning_rate_decay_factor=0.95,
                 max_gradient_norm=5.0,
                 num_samples=512,
                 max_length=8,
                 use_lstm=False):
        """Build the TensorFlow graph of an attention encoder/decoder model.

        The constructor creates, in this order: input placeholders, training
        bookkeeping variables, the two vocabulary hash tables, the embedding,
        an RNN encoder, three decoders living in the ``decoder`` variable
        scope (training, inference, beam inference), the SGD update op, a
        saver and a serving exporter.

        Args:
            num_symbols: vocabulary size; first dimension of the randomly
                initialised embedding and forwarded to the projection layer
                and the inference decoder functions.
            num_embed_units: embedding width; only used when ``embed`` is
                None.
            num_units: size handed to the RNN cells, the output projection
                layer and ``prepare_attention``.
            num_layers: number of entries in the ``MultiRNNCell`` list.
            beam_size: forwarded to the beam inference decoder function and
                used to reshape the beam tensors.
            embed: None for a randomly initialised embedding, otherwise the
                initial value of the ``embed`` variable.
            learning_rate: initial value of the non-trainable learning-rate
                variable.
            remove_unk: forwarded to the beam inference decoder function.
            learning_rate_decay_factor: factor the learning rate is
                multiplied by each time ``learning_rate_decay_op`` runs.
            max_gradient_norm: clip norm passed to
                ``tf.clip_by_global_norm``.
            num_samples: forwarded to ``output_projection_layer``
                (presumably the sampled-softmax sample count; confirm in
                that helper).
            max_length: forwarded to the inference decoder functions; the
                beam tensors are reshaped to ``max_length + 1`` steps.
            use_lstm: build ``LSTMCell`` layers when true, ``GRUCell``
                layers otherwise.
        """

        # Raw token strings and their lengths for the encoder (posts) and the
        # decoder (responses).
        # NOTE(review): `(None)` is just `None` (not the 1-tuple `(None,)`),
        # so both length placeholders are created with an unspecified shape.
        self.posts = tf.placeholder(tf.string, (None, None),
                                    'enc_inps')  # batch*len
        self.posts_length = tf.placeholder(tf.int32, (None),
                                           'enc_lens')  # batch
        self.responses = tf.placeholder(tf.string, (None, None),
                                        'dec_inps')  # batch*len
        self.responses_length = tf.placeholder(tf.int32, (None),
                                               'dec_lens')  # batch

        # initialize the training process: non-trainable learning rate, an op
        # that decays it in place, and the global step counter
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=tf.float32)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # Vocabulary tables: string -> index (unknown strings map to UNK_ID)
        # and index -> string (unknown indices map to '_UNK').
        self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=UNK_ID,
                                             shared_name="in_table",
                                             name="in_table",
                                             checkpoint=True)
        self.index2symbol = MutableHashTable(key_dtype=tf.int64,
                                             value_dtype=tf.string,
                                             default_value='_UNK',
                                             shared_name="out_table",
                                             name="out_table",
                                             checkpoint=True)
        # look the input strings up in the vocab table (string to index)

        self.posts_input = self.symbol2index.lookup(self.posts)  # batch*len
        self.responses_target = self.symbol2index.lookup(
            self.responses)  #batch*len

        batch_size, decoder_len = tf.shape(self.responses)[0], tf.shape(
            self.responses)[1]
        # Decoder input = target shifted right by one step: a GO_ID column is
        # prepended and the last target column is dropped.
        self.responses_input = tf.concat([
            tf.ones([batch_size, 1], dtype=tf.int64) * GO_ID,
            tf.split(self.responses_target, [decoder_len - 1, 1], 1)[0]
        ], 1)  # batch*len
        # Loss mask: the one-hot marks position `responses_length - 1`; the
        # reversed cumulative sum turns it into ones for every position up to
        # and including that one and zeros afterwards.
        self.decoder_mask = tf.reshape(
            tf.cumsum(tf.one_hot(self.responses_length - 1, decoder_len),
                      reverse=True,
                      axis=1), [-1, decoder_len])

        # build the embedding table (index to vector)
        if embed is None:
            # initialize the embedding randomly
            self.embed = tf.get_variable('embed',
                                         [num_symbols, num_embed_units],
                                         tf.float32)
        else:
            # initialize the embedding by pre-trained word vectors
            self.embed = tf.get_variable('embed',
                                         dtype=tf.float32,
                                         initializer=embed)

        # encoder and decoder share the same embedding matrix
        self.encoder_input = tf.nn.embedding_lookup(
            self.embed, self.posts_input)  #batch*len*unit
        self.decoder_input = tf.nn.embedding_lookup(self.embed,
                                                    self.responses_input)

        # NOTE(review): `[cell] * num_layers` puts the *same* cell object in
        # every layer slot, and the resulting `cell` is reused below for the
        # encoder and all three decoders. Whether that is accepted depends on
        # the TensorFlow version in use; confirm before upgrading.
        if use_lstm:
            cell = MultiRNNCell([LSTMCell(num_units)] * num_layers)
        else:
            cell = MultiRNNCell([GRUCell(num_units)] * num_layers)

        # rnn encoder
        encoder_output, encoder_state = dynamic_rnn(cell,
                                                    self.encoder_input,
                                                    self.posts_length,
                                                    dtype=tf.float32,
                                                    scope="encoder")

        # get output projection function and the matching sequence loss
        output_fn, sampled_sequence_loss = output_projection_layer(
            num_units, num_symbols, num_samples)

        # get attention function ('luong' scoring over the encoder outputs)
        attention_keys, attention_values, attention_score_fn, attention_construct_fn \
                = attention_decoder_fn.prepare_attention(encoder_output, 'luong', num_units)

        # Training decoder: fed with the shifted ground-truth responses; its
        # outputs go into the masked sequence loss.
        with tf.variable_scope('decoder'):
            decoder_fn_train = attention_decoder_fn.attention_decoder_fn_train(
                encoder_state, attention_keys, attention_values,
                attention_score_fn, attention_construct_fn)
            self.decoder_output, _, _ = dynamic_rnn_decoder(
                cell,
                decoder_fn_train,
                self.decoder_input,
                self.responses_length,
                scope="decoder_rnn")
            self.decoder_loss = sampled_sequence_loss(self.decoder_output,
                                                      self.responses_target,
                                                      self.decoder_mask)

        # Inference decoder: reuses the variables of the 'decoder' scope and
        # takes no decoder inputs.
        with tf.variable_scope('decoder', reuse=True):
            decoder_fn_inference = attention_decoder_fn.attention_decoder_fn_inference(
                output_fn, encoder_state, attention_keys, attention_values,
                attention_score_fn, attention_construct_fn, self.embed, GO_ID,
                EOS_ID, max_length, num_symbols)

            self.decoder_distribution, _, _ = dynamic_rnn_decoder(
                cell, decoder_fn_inference, scope="decoder_rnn")
            # Drop the first two symbol ids from the distribution before the
            # argmax, then add 2 to map back to vocabulary indices
            # (presumably ids 0 and 1 are PAD and UNK; confirm against the
            # vocabulary constants).
            self.generation_index = tf.argmax(
                tf.split(self.decoder_distribution, [2, num_symbols - 2],
                         2)[1], 2) + 2  # for removing UNK
            self.generation = self.index2symbol.lookup(self.generation_index,
                                                       name='generation')

        # Beam-search decoder: also reuses the 'decoder' variables; its
        # results are delivered through the decoder's context state.
        with tf.variable_scope('decoder', reuse=True):
            decoder_fn_beam_inference = attention_decoder_fn_beam_inference(
                output_fn, encoder_state, attention_keys, attention_values,
                attention_score_fn, attention_construct_fn, self.embed, GO_ID,
                EOS_ID, max_length, num_symbols, beam_size, remove_unk)
            _, _, self.context_state = dynamic_rnn_decoder(
                cell, decoder_fn_beam_inference, scope="decoder_rnn")
            (log_beam_probs, beam_parents, beam_symbols, result_probs,
             result_parents, result_symbols) = self.context_state

            # Each item is stacked (presumably a TensorArray with one entry
            # per step; confirm in the decoder fn), reshaped to
            # [max_length + 1, batch, width] and transposed to
            # [batch, max_length + 1, width]; width is beam_size for the
            # beam_* tensors and beam_size * 2 for the result_* tensors.
            self.beam_parents = tf.transpose(tf.reshape(
                beam_parents.stack(), [max_length + 1, -1, beam_size]),
                                             [1, 0, 2],
                                             name='beam_parents')
            self.beam_symbols = tf.transpose(
                tf.reshape(beam_symbols.stack(),
                           [max_length + 1, -1, beam_size]), [1, 0, 2])
            # map symbol ids back to strings
            self.beam_symbols = self.index2symbol.lookup(tf.cast(
                self.beam_symbols, tf.int64),
                                                         name="beam_symbols")

            self.result_probs = tf.transpose(tf.reshape(
                result_probs.stack(), [max_length + 1, -1, beam_size * 2]),
                                             [1, 0, 2],
                                             name='result_probs')
            self.result_symbols = tf.transpose(
                tf.reshape(result_symbols.stack(),
                           [max_length + 1, -1, beam_size * 2]), [1, 0, 2])
            self.result_parents = tf.transpose(tf.reshape(
                result_parents.stack(), [max_length + 1, -1, beam_size * 2]),
                                               [1, 0, 2],
                                               name='result_parents')
            # map symbol ids back to strings
            self.result_symbols = self.index2symbol.lookup(
                tf.cast(self.result_symbols, tf.int64), name='result_symbols')

        self.params = tf.trainable_variables()

        # calculate the gradient of parameters, clip by global norm and apply
        # with plain SGD; applying the update also increments global_step
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        gradients = tf.gradients(self.decoder_loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=3,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)

        # Exporter for serving: inputs are the post strings and lengths,
        # outputs are the beam-search tensors built above.
        self.model_exporter = exporter.Exporter(self.saver)
        inputs = {"enc_inps:0": self.posts, "enc_lens:0": self.posts_length}
        outputs = {
            "beam_symbols": self.beam_symbols,
            "beam_parents": self.beam_parents,
            "result_probs": self.result_probs,
            "result_symbols": self.result_symbols,
            "result_parents": self.result_parents
        }
        self.model_exporter.init(tf.get_default_graph().as_graph_def(),
                                 named_graph_signatures={
                                     "inputs":
                                     exporter.generic_signature(inputs),
                                     "outputs":
                                     exporter.generic_signature(outputs)
                                 })
示例#31
0
    def __init__(self,
                 num_symbols,
                 num_embed_units,
                 num_units,
                 num_layers,
                 num_labels,
                 embed,
                 learning_rate=0.5,
                 max_gradient_norm=5.0,
                 model='LSTM'):
        """Build the TensorFlow graph of an RNN text classifier.

        Args:
            num_symbols: vocabulary size; first dimension of the randomly
                initialised embedding.
            num_embed_units: embedding width; only used when ``embed`` is
                None.
            num_units: number of units of every recurrent cell.
            num_layers: number of stacked recurrent layers.
            num_labels: number of output classes.
            embed: None for a randomly initialised embedding, otherwise the
                initial value of the ``embed`` variable.
            learning_rate: initial value of the non-trainable learning-rate
                variable.
            max_gradient_norm: clip norm passed to
                ``tf.clip_by_global_norm``.
            model: cell type, one of ``'LSTM'``, ``'RNN'`` or ``'GRU'``.

        Raises:
            ValueError: if ``model`` is not a supported cell type. The
                original code printed a message and returned, leaving a
                half-built object without ``loss``/``update``/``saver``.
        """
        cell_types = {
            'LSTM': BasicLSTMCell,
            'RNN': BasicRNNCell,
            'GRU': GRUCell,
        }
        if model not in cell_types:
            raise ValueError("Wrong model! Expected one of %s, got %r" %
                             (sorted(cell_types), model))

        self.texts = tf.placeholder(dtype=tf.string,
                                    shape=[None, None])  # shape: batch*len
        self.texts_length = tf.placeholder(dtype=tf.int32,
                                           shape=None)  # shape: batch
        self.labels = tf.placeholder(dtype=tf.int64,
                                     shape=None)  # shape: batch

        # keep probability applied to the output of every recurrent cell
        self.keep_prob = tf.placeholder(dtype=tf.float32)

        # vocab table (string to index); unknown strings map to UNK_ID
        self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                             value_dtype=tf.int64,
                                             default_value=UNK_ID,
                                             shared_name="in_table",
                                             name="in_table",
                                             checkpoint=True)
        # initialize the training process
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False,
                                         dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_add_op = self.epoch.assign(self.epoch + 1)

        self.index_input = self.symbol2index.lookup(self.texts)  # batch*len

        # build the embedding table (index to vector)
        if embed is None:
            # initialize the embedding randomly
            self.embed = tf.get_variable('embed',
                                         [num_symbols, num_embed_units],
                                         tf.float32)
        else:
            # initialize the embedding by pre-trained word vectors
            self.embed = tf.get_variable('embed',
                                         dtype=tf.float32,
                                         initializer=embed)

        self.embed_input = tf.nn.embedding_lookup(
            self.embed, self.index_input)  #batch*len*embed_unit

        def make_cell():
            # One fresh cell (with output dropout) per call, so that stacked
            # layers never share a cell object.
            return tf.nn.rnn_cell.DropoutWrapper(
                cell_types[model](num_units),
                input_keep_prob=1.0,
                output_keep_prob=self.keep_prob)

        if num_layers == 1:
            outputs, states = dynamic_rnn(make_cell(),
                                          self.embed_input,
                                          self.texts_length,
                                          dtype=tf.float32,
                                          scope="rnn")
            if model == 'LSTM':
                # NOTE(review): for a tuple LSTM state, index 0 is the cell
                # state `c`, not the hidden output `h`. Kept as in the
                # original so existing single-layer models behave the same;
                # confirm this is intended.
                h_state = states[0]
            else:
                h_state = states
        else:
            # A distinct cell per layer: repeating one cell object
            # (`[cell] * num_layers`) makes the layers share (or fail to
            # share) a single set of weights.
            multi_cell = tf.contrib.rnn.MultiRNNCell(
                [make_cell() for _ in range(num_layers)],
                state_is_tuple=True)
            # No explicit initial_state: with `dtype` given, dynamic_rnn
            # builds a zero state for the actual batch size instead of the
            # previously hard-coded batch size of 16.
            outputs, _ = tf.nn.dynamic_rnn(multi_cell,
                                           self.embed_input,
                                           self.texts_length,
                                           dtype=tf.float32,
                                           scope="rnn",
                                           time_major=False)
            # dynamic_rnn zeroes outputs past each sequence's length, so
            # `outputs[:, -1, :]` is all zeros for every sequence shorter
            # than the padded length. Pick the output at the last *valid*
            # step (length - 1, clamped at 0 for empty sequences) instead.
            batch_size = tf.shape(outputs)[0]
            last_step = tf.maximum(
                tf.reshape(self.texts_length, [-1]) - 1, 0)
            last_index = tf.stack([tf.range(batch_size), last_step], axis=1)
            h_state = tf.gather_nd(outputs, last_index)  # batch*num_units

        logits = tf.layers.dense(h_state, num_labels)

        # summed cross entropy is reported; the batch mean is optimised
        self.loss = tf.reduce_sum(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels,
                                                           logits=logits),
            name='loss')
        mean_loss = self.loss / tf.cast(tf.shape(self.labels)[0],
                                        dtype=tf.float32)
        predict_labels = tf.argmax(logits, 1, 'predict_labels')
        # number of correct predictions in the batch (a count, not a ratio)
        self.accuracy = tf.reduce_sum(tf.cast(
            tf.equal(self.labels, predict_labels), tf.int32),
                                      name='accuracy')

        self.params = tf.trainable_variables()

        # calculate the gradient of parameters, clip by global norm and apply
        # with plain SGD; applying the update also increments global_step
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        gradients = tf.gradients(mean_loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)

        tf.summary.scalar('loss/step', self.loss)
        for each in tf.trainable_variables():
            tf.summary.histogram(each.name, each)

        self.merged_summary_op = tf.summary.merge_all()

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=3,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)
示例#32
0
    def __init__(self,
                 num_symbols,
                 num_embed_units,
                 num_units,
                 num_layers,
                 num_labels,
                 embed,
                 learning_rate=0.005,
                 max_gradient_norm=5.0,
                 param_da=150,
                 param_r=10):
        """Build the graph of a self-attentive bidirectional LSTM classifier.

        The attention follows the structured self-attention formulation
        (Lin et al., 2017): ``A = softmax(Ws2 tanh(Ws1 H^T))`` with the
        softmax taken over sequence positions, ``M = A H`` and the penalty
        ``||A A^T - I||_F^2``.

        Args:
            num_symbols: vocabulary size; first dimension of the randomly
                initialised embedding.
            num_embed_units: embedding width; only used when ``embed`` is
                None.
            num_units: number of units of every LSTM cell (per direction).
            num_layers: number of stacked LSTM layers per direction.
            num_labels: number of output classes.
            embed: None for a randomly initialised embedding, otherwise the
                initial value of the ``embed`` variable.
            learning_rate: initial value of the non-trainable learning-rate
                variable.
            max_gradient_norm: clip norm passed to
                ``tf.clip_by_global_norm``.
            param_da: size of the hidden attention projection (``Ws1``
                columns).
            param_r: number of attention hops (``Ws2`` columns).
        """
        self.texts = tf.placeholder(tf.string, (None, None), 'texts')  # shape: [batch, length]
        self.texts_length = tf.placeholder(tf.int32, (None), 'texts_length')  # shape: [batch]
        self.labels = tf.placeholder(tf.int64, (None), 'labels')  # shape: [batch]

        # vocab table (string to index); unknown strings map to UNK_ID
        self.symbol2index = MutableHashTable(
            key_dtype=tf.string,
            value_dtype=tf.int64,
            default_value=UNK_ID,
            shared_name="in_table",
            name="in_table",
            checkpoint=True)

        batch_size = tf.shape(self.texts)[0]
        # initialize the training process
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False, dtype=tf.float32)
        self.global_step = tf.Variable(0, trainable=False)

        self.index_input = self.symbol2index.lookup(self.texts)  # shape: [batch, length]

        # build the embedding table (index to vector)
        if embed is None:
            # initialize the embedding randomly
            self.embed = tf.get_variable('embed', [num_symbols, num_embed_units], tf.float32)
        else:
            # initialize the embedding by pre-trained word vectors
            self.embed = tf.get_variable('embed', dtype=tf.float32, initializer=embed)

        self.embed_input = tf.nn.embedding_lookup(self.embed, self.index_input)  # shape: [batch, length, num_embed_units]

        # one independent stack of LSTM cells per direction
        cell_fw = MultiRNNCell([BasicLSTMCell(num_units) for _ in range(num_layers)])
        cell_bw = MultiRNNCell([BasicLSTMCell(num_units) for _ in range(num_layers)])

        # the final states are not needed: the classifier reads the attended
        # per-step outputs
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, self.embed_input, self.texts_length,
            dtype=tf.float32, scope="rnn")
        H = tf.concat(outputs, 2)  # shape: (batch, length, 2*num_units)

        with tf.variable_scope('logits'):
            Ws1 = tf.get_variable("Ws1", shape=[2 * num_units, param_da])
            Ws2 = tf.get_variable("Ws2", shape=[param_da, param_r])

            hidden = tf.nn.tanh(tf.einsum('aij,jk->aik', H, Ws1))  # (batch, length, param_da)
            scores = tf.einsum('aij,jk->aik', hidden, Ws2)  # (batch, length, param_r)
            # Put the sequence axis last so that the softmax normalises each
            # hop over the tokens. The original normalised over the hops, so
            # the weights of one hop did not sum to 1 over the sentence.
            scores = tf.transpose(scores, [0, 2, 1])  # (batch, param_r, length)

            # Padded steps have all-zero H and therefore a score of exactly
            # 0, which would still receive attention mass; push them to a
            # large negative value before the softmax.
            valid = tf.sequence_mask(tf.reshape(self.texts_length, [-1]),
                                     tf.shape(H)[1],
                                     dtype=tf.float32)  # (batch, length)
            scores += tf.expand_dims(1.0 - valid, 1) * -1e9

            A = tf.nn.softmax(scores)  # (batch, param_r, length), rows sum to 1
            M = tf.matmul(A, H)  # (batch, param_r, 2*num_units)
            M = tf.reshape(M, [batch_size, param_r * 2 * num_units])
            logits = tf.layers.dense(M, num_labels, activation=None, name='projection')  # shape: (batch, num_labels)

        # Penalisation term: squared Frobenius norm of (A A^T - I), averaged
        # over the batch. The (param_r, param_r) identity broadcasts over the
        # batch dimension, so no per-batch tiling is needed.
        identity = tf.diag(tf.ones([param_r]))
        P = tf.matmul(A, A, transpose_b=True) - identity  # (batch, param_r, param_r)
        self.penalized_term = tf.reduce_mean(tf.reduce_sum(tf.square(P), axis=[1, 2]))

        self.loss = tf.reduce_sum(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels, logits=logits),
            name='loss') + 0.0001 * self.penalized_term
        predict_labels = tf.argmax(logits, 1, 'predict_labels')
        # number of correct predictions in the batch (a count, not a ratio)
        self.accuracy = tf.reduce_sum(tf.cast(tf.equal(self.labels, predict_labels), tf.int32), name='accuracy')

        self.params = tf.trainable_variables()

        # calculate the gradient of parameters, clip by global norm and apply
        # with plain SGD; applying the update also increments global_step
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        gradients = tf.gradients(self.loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)

        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=5, pad_step_number=True)