Example #1
    def train(self, xs, ys):
        '''
        Returns
        loss: scalar.
        train_op: training operation
        global_step: scalar.
        summaries: training summary node
        '''
        # forward
        memory, sents1, src_masks = self.encode(xs)
        logits, preds, y, sents2 = self.decode(ys, memory, src_masks)

        # train scheme
        y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                        labels=y_)
        nonpadding = tf.to_float(tf.not_equal(
            y, self.token2idx["<pad>"]))  # 0: <pad>
        loss = tf.reduce_sum(
            ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)
        # Optional orthogonality regularization on the (tied) embedding matrix.
        if self.hp.io_tie and self.hp.ortho_embedding:
            lmb = self.hp.ortho_lambda
            normlevel = self.hp.ortho_reg_norm
            if not self.hp.fac_embed:
                # Exclude row 0, which is the <pad> token.
                real_embedding = self.embeddings[1:, :]
                if not (self.hp.norm_embedding
                        or self.embedding_normalization):
                    # Penalize ||E^T E - 2I||^2 to push the embedding columns
                    # toward orthogonality.
                    loss = loss + (tf.norm(
                        tf.subtract(
                            tf.matmul(tf.transpose(real_embedding), real_embedding),
                            tf.scalar_mul(tf.constant(2.0, dtype=tf.float32),
                                          tf.eye(self.hp.d_model))),
                        ord=normlevel) ** 2) * lmb
                else:
                    # Embeddings are normalized: penalize only the off-diagonal
                    # entries of E^T E.
                    wtw = tf.matmul(tf.transpose(real_embedding), real_embedding)
                    wtw_diag = tf.linalg.diag(tf.linalg.diag_part(wtw))
                    loss = loss + (tf.norm(tf.subtract(wtw, wtw_diag)) ** 2) * lmb
            else:
                # Factorized embedding: push the projection factor toward orthogonality.
                loss = loss + (tf.norm(
                    tf.subtract(
                        tf.matmul(self.embeddings2, tf.transpose(self.embeddings2)),
                        tf.eye(self.hp.d_embed)),
                    ord=normlevel) ** 2) * lmb
                # loss = loss + tf.norm(tf.subtract(tf.matmul(tf.transpose(self.embeddings1), self.embeddings1), tf.eye(self.hp.d_embed)), ord=normlevel) * lmb  # if not good, delete this

        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = optimizer.minimize(loss, global_step=global_step)

        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("global_step", global_step)

        summaries = tf.summary.merge_all()

        return loss, train_op, global_step, summaries
Example #2
    def train(self, xs, ys):
        '''
        Returns
        loss: scalar.
        train_op: training operation
        global_step: scalar.
        summaries: training summary node
        '''
        # forward
        memory, sents1 = self.encode(xs)
        logits, y, sents2 = self.decode(xs, ys, memory)

        loss = self._calc_loss(y, logits)

        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)

        optimizer = tf.train.AdamOptimizer(lr)
        train_op = optimizer.minimize(loss, global_step=global_step)

        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("global_step", global_step)

        summaries = tf.summary.merge_all()
        return loss, train_op, global_step, summaries
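The trainers on this page all call a `noam_scheme` helper whose body is not shown. A minimal sketch, assuming the common Transformer warmup schedule and the `noam_scheme(init_lr, global_step, warmup_steps)` signature used by most examples (one multi-GPU example further down passes `self.hp.d_model` as the first argument instead, which implies a slightly different scaling):

import tensorflow as tf

def noam_scheme(init_lr, global_step, warmup_steps=4000.):
    # Assumed implementation; this page only shows call sites.
    # The learning rate rises linearly for warmup_steps, then decays ~ 1/sqrt(step).
    step = tf.cast(global_step + 1, dtype=tf.float32)
    return init_lr * warmup_steps ** 0.5 * tf.minimum(step * warmup_steps ** -1.5,
                                                      step ** -0.5)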
Example #3
    def train(self, xs, ys):
        # forward
        loss_weight = ys[-1]
        ys = ys[:-1]
        
        memory, sents1 = self.encode(xs)
        logits, preds, y, sents2 = self.decode(ys, memory)

        # train scheme
        y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
        nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>
        print('loss_weight.shape', loss_weight.shape)
        print('ce.shape', ce.shape)
        # Per-token loss, up-weighted by loss_weight and restricted to non-padding positions.
        a = ce * nonpadding * (1 + loss_weight)
        b = nonpadding
        loss = tf.reduce_sum(a) / (tf.reduce_sum(b) + 1e-7)

        # Token-level accuracy over non-padding positions.
        correct_prediction = tf.to_float(tf.equal(y, preds)) * nonpadding
        accuracy = tf.reduce_sum(correct_prediction) / (tf.reduce_sum(nonpadding) + 1e-7)

        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(learning_rate=lr,beta1=0.9, beta2=0.997, epsilon=1e-9)
        train_op = optimizer.minimize(loss, global_step=global_step)

        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("accuracy", accuracy)
        tf.summary.scalar("global_step", global_step)

        summaries = tf.summary.merge_all()

        return loss, train_op, global_step, summaries
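Most examples also smooth the one-hot targets with a `label_smoothing` helper before the cross-entropy. A minimal sketch, assuming the usual formulation that spreads an `epsilon` share of the probability mass uniformly over the vocabulary:

def label_smoothing(inputs, epsilon=0.1):
    # Assumed implementation; inputs are one-hot targets of shape (N, T, vocab_size).
    V = inputs.get_shape().as_list()[-1]  # vocabulary size (last dimension)
    return ((1 - epsilon) * inputs) + (epsilon / V)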
Example #4
    def train(self, xs, ys):
        # Forward
        memory, sents1 = self.encode(xs)
        logits, preds, y, sent2 = self.decode(ys, memory)

        # Train scheme
        y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                        labels=y_)
        nonpadding = tf.to_float(tf.not_equal(y, self.token2idx['<PAD>']))
        loss = tf.reduce_sum(
            ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = optimizer.minimize(loss, global_step=global_step)

        tf.summary.scalar('lr', lr)
        tf.summary.scalar('loss', loss)
        tf.summary.scalar('global_step', global_step)

        summaries = tf.summary.merge_all()

        return loss, train_op, global_step, summaries
Example #5
    def train_multi_gpu(self, xs, ys):
        tower_grads = []
        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        loss, summaries = None, None
        with tf.variable_scope(tf.get_variable_scope()):
            for i, no in enumerate(self.hp.gpu_list):
                with tf.device("/gpu:%d" % no):
                    with tf.name_scope("tower_%d" % no):
                        memory, sents1 = self.encode(xs)
                        logits, y, sents2 = self.decode(xs, ys, memory)
                        tf.get_variable_scope().reuse_variables()

                        loss = self._calc_loss(y, logits)

                        grads = optimizer.compute_gradients(loss)
                        tower_grads.append(grads)

        with tf.device("/cpu:0"):
            grads = self.average_gradients(tower_grads)
            train_op = optimizer.apply_gradients(grads, global_step=global_step)

            tf.summary.scalar('lr', lr)
            tf.summary.scalar("train_loss", loss)
            tf.summary.scalar("global_step", global_step)
            summaries = tf.summary.merge_all()

        return loss, train_op, global_step, summaries
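The multi-GPU trainer above averages per-tower gradients with `average_gradients`, which is also external to these snippets. A minimal sketch, assuming the standard tower pattern in which each entry of `tower_grads` is the list of `(grad, var)` pairs computed on one GPU:

def average_gradients(tower_grads):
    # Assumed implementation of the usual tower-gradient averaging.
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars is ((grad_gpu0, var), (grad_gpu1, var), ...) for one variable.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        mean_grad = tf.reduce_mean(tf.concat(grads, axis=0), axis=0)
        average_grads.append((mean_grad, grad_and_vars[0][1]))
    return average_grads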
Example #6
    def train(self, xs, ys, x_paraphrased_dict, synonym_label=None):
        # forward
        memory, sents1 = self.encode(xs)
        _, _, synonym_label_loss = self.labeling(synonym_label, memory)
        logits, preds, y, sents2 = self.decode(ys, x_paraphrased_dict, memory)

        # train scheme
        # generation loss
        y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
        nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>
        loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)
        # multi task loss
        tloss = self.hp.l_alpha * loss + (1.0-self.hp.l_alpha) * synonym_label_loss

        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = optimizer.minimize(tloss, global_step=global_step)

        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("tloss", tloss)
        tf.summary.scalar("global_step", global_step)

        summaries = tf.summary.merge_all()

        return loss, train_op, global_step, summaries
Example #7
    def train(self, xs, ys):
        '''
        Returns
        loss: scalar.
        train_op: training operation
        global_step: scalar.
        summaries: training summary node
        '''
        # forward
        memory, sents1, src_masks = self.encode(xs)
        logits, preds, y, sents2 = self.decode(ys, memory, src_masks)

        # train scheme
        y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                        labels=y_)
        nonpadding = tf.to_float(tf.not_equal(
            y, self.token2idx["<pad>"]))  # 0: <pad>
        loss = tf.reduce_sum(
            ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = optimizer.minimize(loss, global_step=global_step)

        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("global_step", global_step)

        summaries = tf.summary.merge_all()

        return loss, train_op, global_step, summaries
Example #8
    def train(self, xs1, xs2, scores):
        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.context.lr, global_step,
                         self.context.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        gpus = get_available_gpus()

        if gpus:
            num_gpu = len(gpus)
            assert self.context.hparams.batch_size % num_gpu == 0

            xs1s, xs2s = tf.split(xs1, num_gpu, axis=0), tf.split(xs2,
                                                                  num_gpu,
                                                                  axis=0)
            scoress = tf.split(scores, num_gpu, axis=0)

            tower_grads = []
            losses = []
            with tf.variable_scope(tf.get_variable_scope()) as scope:
                list_predictions = []
                for i in range(num_gpu):
                    with tf.device("/gpu:%d" % i):
                        with tf.name_scope("tower_%d" % i):
                            predictions = self._get_prediction(
                                xs1s[i], xs2s[i])
                            list_predictions.append(predictions)
                            # square loss
                            partial_loss = tf.reduce_sum(tf.squared_difference(
                                predictions, scoress[i]),
                                                         name="loss")
                            losses.append(partial_loss)
                            tf.get_variable_scope().reuse_variables()
                            grad = get_gradients_by_loss_and_optimizer(
                                partial_loss, optimizer)
                            tower_grads.append(grad)
                predictions = tf.concat(list_predictions, axis=0)
            loss = tf.reduce_mean(losses)
            grads_and_vars = average_gradients(tower_grads)
        else:
            predictions = self._get_prediction(xs1, xs2)
            loss = tf.reduce_sum(tf.squared_difference(predictions, scores),
                                 name="loss")
            grads_and_vars = get_gradients_by_loss_and_optimizer(
                loss, optimizer)
        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step)

        for g, v in grads_and_vars:
            tf.summary.histogram(v.name, v)
            tf.summary.histogram(v.name + '_grad', g)
        tf.summary.scalar("pred_avg", tf.reduce_mean(predictions))
        tf.summary.scalar("label_avg", tf.reduce_mean(scores))

        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("global_step", global_step)

        summaries = tf.summary.merge_all()
        return loss, train_op, global_step, summaries
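This example leans on two further helpers that are not defined here. Hypothetical sketches, assuming `get_available_gpus` enumerates local GPU devices through `device_lib` and `get_gradients_by_loss_and_optimizer` is a thin wrapper over `compute_gradients`:

from tensorflow.python.client import device_lib

def get_available_gpus():
    # Assumed implementation: names of all GPU devices visible to the process.
    return [d.name for d in device_lib.list_local_devices() if d.device_type == 'GPU']

def get_gradients_by_loss_and_optimizer(loss, optimizer):
    # Assumed implementation: (gradient, variable) pairs for the given loss,
    # returned as-is (callers above skip entries whose gradient is None).
    return optimizer.compute_gradients(loss)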
Example #9
    def train(self, inputs, targets):
        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self._context.lr, global_step,
                         self._context.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        gpus = get_available_gpus()

        loss_func = self._loss_func_dict.get(self._context.loss_func,
                                             self._get_loss)
        if gpus:
            num_gpu = len(gpus)
            assert self._context.hparams.batch_size % num_gpu == 0

            partial_inputs = [[] for _ in range(num_gpu)]
            for input_tmp in inputs:
                input_tmps = tf.split(input_tmp, num_gpu, axis=0)
                for i in range(num_gpu):
                    partial_inputs[i].append(input_tmps[i])
            targetses = tf.split(targets, num_gpu, axis=0)

            tower_grads = []
            losses = []
            with tf.variable_scope(tf.get_variable_scope()) as scope:
                for i in range(num_gpu):
                    with tf.device("/gpu:%d" % i):
                        with tf.name_scope("tower_%d" % i):
                            partial_loss = loss_func(partial_inputs[i],
                                                     targetses[i])
                            losses.append(partial_loss)
                            tf.get_variable_scope().reuse_variables()
                            grad = get_gradients_by_loss_and_optimizer(
                                partial_loss, optimizer)
                            tower_grads.append(grad)
            loss = tf.reduce_mean(losses)
            grads_and_vars = average_gradients(tower_grads)
        else:
            loss = tf.reduce_mean(loss_func(inputs, targets))
            grads_and_vars = get_gradients_by_loss_and_optimizer(
                loss, optimizer)
        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step)

        for g, v in grads_and_vars:
            if g is None:  # no gradient for this variable
                continue
            tf.summary.histogram(v.name, v)
            tf.summary.histogram(v.name + '_grad', g)
        tf.summary.scalar("pred_avg", tf.reduce_mean(self._outputs))
        tf.summary.scalar("infr_avg", tf.reduce_mean(self._inferences))
        tf.summary.scalar("label_avg", tf.reduce_mean(targets))

        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("global_step", global_step)

        summaries = tf.summary.merge_all()
        return loss, train_op, global_step, summaries
Example #10
    def add_optimizer(self, global_step):
        # global_step must be passed to the optimizer so that it is incremented automatically.
        lr = noam_scheme(self.hp.learning_rate, global_step, self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        self.train_op = optimizer.minimize(self.loss, global_step=global_step)

        return self.train_op
Example #11
    def __init__(self, hp):
        self.hp = hp
        self.token2idx, self.idx2token = load_vocab(hp.vocab)
        self.embeddings = get_token_embeddings(self.hp.vocab_size,
                                               self.hp.d_model,
                                               zero_pad=True)

        self.input_x = tf.placeholder(dtype=tf.int32,
                                      shape=(None, None),
                                      name="input_x")
        self.decoder_input = tf.placeholder(dtype=tf.int32,
                                            shape=(None, None),
                                            name="decoder_input")
        self.target = tf.placeholder(dtype=tf.int32,
                                     shape=(None, None),
                                     name="target")
        self.is_training = tf.placeholder(dtype=tf.bool, name="is_training")

        # encoder
        self.encoder_hidden = self.encode(self.input_x,
                                          training=self.is_training)

        # decoder
        self.logits = self.decode(self.decoder_input,
                                  self.encoder_hidden,
                                  training=self.is_training)

        self.y_hat = tf.to_int32(tf.argmax(self.logits, axis=-1),
                                 name="y_predict_v2")

        # loss
        self.smoothing_y = label_smoothing(
            tf.one_hot(self.target, depth=self.hp.vocab_size))
        self.ce_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=self.logits, labels=self.smoothing_y)
        nonpadding = tf.to_float(
            tf.not_equal(self.target, self.token2idx["<pad>"]))
        self.loss = tf.reduce_sum(
            self.ce_loss * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

        # optimize
        self.global_step = tf.train.get_or_create_global_step()
        self.lr = noam_scheme(self.hp.lr, self.global_step,
                              self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.minimize(self.loss,
                                           global_step=self.global_step)

        # tensorboard
        tf.summary.scalar('lr', self.lr)
        tf.summary.scalar("loss", self.loss)
        tf.summary.scalar("global_step", self.global_step)
        self.summaries = tf.summary.merge_all()

        # predict part
        self.y_predict = tf.identity(self.greedy_search(), name="y_predict")
Example #12
    def train(self, xs, ys):
        '''
        Returns
        loss: scalar.
        train_op: training operation
        global_step: scalar.
        summaries: training summary node
        '''
        # forward
        memory, sents1 = self.encode(xs)
        # memory = tf.Print(memory, [memory], message='memory =', summarize=10)
        logits, preds, y, sents2 = self.decode(ys, memory)
        # logits = tf.Print(logits, [logits], message='logits =', summarize=10)

        print('train logits.shape, y.shape =', logits.shape, y.shape)
        # train scheme
        y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
        # logits = tf.Print(logits, [logits], message='logits =', summarize=10)
        # y_ = tf.Print(y_, [y_], message='y_ =', summarize=10)
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                        labels=y_)
        nonpadding = tf.to_float(tf.not_equal(
            y, self.token2idx["<pad>"]))  # 0: <pad>
        # nonpadding = tf.Print(nonpadding, [nonpadding], message='nonpadding =',
        #     summarize=100)
        # nonpadding_print = tf.print('nonpadding =', tf.shape(nonpadding)
        #     , summarize=20)
        # with tf.control_dependencies([nonpadding_print]):
        loss = tf.reduce_sum(
            ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)
        # loss = tf.Print(loss, [loss], message='loss =', summarize=10)

        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        # gradients = optimizer.compute_gradients(loss)
        # # print_grad = tf.print('gradients =', gradients, summarize=10)
        # # with tf.control_dependencies([print_grad]):
        # clip_grads = [(tf.clip_by_value(grad, -100., 100.), var) for grad, var in gradients]
        # train_op = optimizer.apply_gradients(clip_grads, global_step=global_step)
        train_op = optimizer.minimize(loss, global_step=global_step)

        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("global_step", global_step)

        summaries = tf.summary.merge_all()

        return loss, train_op, global_step, summaries
Example #13
    def train_labeling(self, xs, synonym_label=None):
        # forward
        memory, sents1 = self.encode(xs)
        _, _, loss = self.labeling(synonym_label, memory)

        # train scheme
        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = optimizer.minimize(loss, global_step=global_step)

        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("global_step", global_step)
        summaries = tf.summary.merge_all()
        return loss, train_op, global_step, summaries
Example #14
    def train(self, xs, ys):
        logits, y_, dec = self.encode_decode(xs, ys)

        ce = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y_)
        loss = tf.reduce_sum(ce)

        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = optimizer.minimize(loss, global_step=global_step)

        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("global_step", global_step)

        summaries = tf.summary.merge_all()

        return loss, train_op, global_step, summaries
Example #15
    def train(self, xs, ys):
        '''
        Returns
        loss: scalar.
        train_op: training operation
        global_step: scalar.
        summaries: training summary node
        '''
        # forward
        memory, sents1, src_masks = self.encode(xs)
        logits, preds, y, sents2 = self.decode(ys, memory, src_masks)

        # train scheme
        # y: (N, T2) target token ids, e.g. [[5768, 7128, 7492, ...], [...]]
        # y_: (N, T2, vocab_size) smoothed one-hot targets, e.g. rows like [0, 0.999, 0, ..., 0]
        y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
        # Cross-entropy between the predictions and the labels gives the per-token loss.
        # logits: (N, T2, vocab_size) predicted scores over the vocabulary
        # ce: (N, T2), e.g. shape (4, 42), one loss value per token
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                        labels=y_)
        # nonpadding: (N, T2), e.g. shape (4, 42), 1.0 for real tokens and 0.0 for <pad>
        nonpadding = tf.to_float(tf.not_equal(
            y, self.token2idx["<pad>"]))  # 0: <pad>
        # tf.reduce_sum sums over all dimensions when no axis is given.
        # ce * nonpadding keeps only the loss of non-padded tokens; dividing by
        # tf.reduce_sum(nonpadding) averages over the number of real tokens.
        loss = tf.reduce_sum(
            ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

        global_step = tf.train.get_or_create_global_step()
        # Adjust the learning rate dynamically based on the training step.
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
        # Define the optimizer.
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = optimizer.minimize(loss, global_step=global_step)

        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("global_step", global_step)

        summaries = tf.summary.merge_all()

        return loss, train_op, global_step, summaries
Example #16
    def train(self, xs, ys):  # used to train the model
        '''
        Returns
        loss: scalar.
        train_op: training operation
        global_step: scalar.
        summaries: training summary node
        '''
        # forward
        # Call encode() and decode() to obtain the outputs of each part.
        memory, sents1, src_masks = self.encode(xs)
        logits, preds, y, sents2 = self.decode(ys, memory, src_masks)

        # train scheme
        # one_hot marks each target token's index in the vocabulary, which builds the
        # training label: the goal is for the logits (a vocab_size-sized vector) to be
        # large at the target index and small everywhere else. With outputs and labels
        # built, training uses tf.nn.softmax_cross_entropy_with_logits_v2().
        y_ = label_smoothing(tf.one_hot(
            y,
            depth=self.hp.vocab_size))  # label_smoothing smooths the one-hot labels
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                        labels=y_)
        # Sentences shorter than maxlen were padded, so the error at padded
        # positions is zeroed out before computing the loss.
        nonpadding = tf.to_float(tf.not_equal(
            y, self.token2idx["<pad>"]))  # 0: <pad>
        # Cross-entropy loss with the contribution of padding removed.
        loss = tf.reduce_sum(
            ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

        # Adjust the learning rate with a warmup schedule: lr rises gradually at the
        # start of training and decays in later iterations.
        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = optimizer.minimize(loss, global_step=global_step)
        # Finally, the Adam optimizer minimizes the loss.
        # tf.summary.scalar() records values as key-value pairs for visualization in TensorBoard.
        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("global_step", global_step)

        summaries = tf.summary.merge_all()

        return loss, train_op, global_step, summaries
Example #17
    def train(self, xs, ys):
        """
        train model
        :param xs: dataset xs
        :param ys: dataset ys
        :return: loss
                 train op
                 global step
                 tensorflow summary
        """
        tower_grads = []
        global_step = tf.train.get_or_create_global_step()
        global_step_ = global_step * self.hp.gpu_nums
        lr = noam_scheme(self.hp.d_model, global_step_, self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        losses = []
        xs, ys = split_input(xs, ys, self.hp.gpu_nums)
        with tf.variable_scope(tf.get_variable_scope()):
            for no in range(self.hp.gpu_nums):
                with tf.device("/gpu:%d" % no):
                    with tf.name_scope("tower_%d" % no):
                        memory_h, memory_u, sents1 = self.encode(xs[no])
                        logits, y, sents2 = self.decode(
                            xs[no], ys[no], memory_h, memory_u)
                        tf.get_variable_scope().reuse_variables()

                        loss = self._calc_loss(y, logits)
                        losses.append(loss)
                        grads = optimizer.compute_gradients(loss)
                        # print(grads)
                        tower_grads.append(grads)

        with tf.device("/cpu:0"):
            grads = self.average_gradients(tower_grads)
            train_op = optimizer.apply_gradients(grads,
                                                 global_step=global_step)
            loss = sum(losses) / len(losses)
            tf.summary.scalar('lr', lr)
            tf.summary.scalar("train_loss", loss)
            summaries = tf.summary.merge_all()

        return loss, train_op, global_step_, summaries
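Several trainers on this page delegate to `self._calc_loss`, whose body is not shown. A minimal sketch, assuming it mirrors the label-smoothed, padding-masked cross-entropy that the other examples write out inline:

    def _calc_loss(self, y, logits):
        # Assumed implementation: label-smoothed cross-entropy averaged over non-<pad> tokens.
        y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
        nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))
        return tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)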
Example #18
    def train(self, xs, ys):
        '''
        Returns
        loss: scalar.
        train_op: training operation
        global_step: scalar.
        summaries: training summary node
        '''
        # Build the encoder and decoder.
        memory, sents1 = self.encode(xs)
        logits, preds, y, sents2 = self.decode(ys, memory)

        # train scheme
        y_ = label_smoothing(tf.one_hot(
            y, depth=self.hp.vocab_size))  # batch_size * T * vocab_size
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits, labels=y_)  # logits are not softmaxed; the function applies softmax internally
        nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"])
                                 )  # 0: <pad>; acts as a mask so padding is excluded from the loss
        # Average loss over the batch; 1e-7 guards against a zero denominator.
        # ce * nonpadding keeps only the loss of non-padding positions, and
        # dividing by tf.reduce_sum(nonpadding) gives the per-token average over the batch.
        loss = tf.reduce_sum(
            ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = optimizer.minimize(loss, global_step=global_step)

        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("global_step", global_step)

        summaries = tf.summary.merge_all()

        return loss, train_op, global_step, summaries
Example #19
    def train(self, xs, ys, mode):
        '''
        Returns
        loss: scalar.
        train_op: training operation
        global_step: scalar.
        summaries: training summary node
        '''
        # forward

        mu, sigma = self.encoder_vae(xs, training=True, mode=mode)
        if mode == "TPAGE" or mode == "PPAGE":
            # 表示 训练VAE
            # 这里提醒自己一下 将embeding 全部设为True
            z = mu + sigma * tf.random_normal(
                tf.shape(mu), 0, 1, dtype=tf.float32)
        else:
            raise ("许海明在这里提醒你:出现非法mode")

        logits, preds, y, sents2 = self.decoder_vae(ys,
                                                    z,
                                                    training=True,
                                                    mode=mode)

        # train scheme

        ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                            labels=y)
        nonpadding = tf.to_float(tf.not_equal(
            y, self.token2idx["[PAD]"]))  # 0: <pad>
        loss_decoder = tf.reduce_sum(ce * nonpadding) / tf.to_float(
            get_shape_list(xs[0], expected_rank=2)[0])

        # Add the KL loss here.
        if mode == "TPAGE":
            KL_loss = tf.reduce_mean(0.5 * tf.reduce_sum(
                tf.square(mu) + tf.square(sigma) -
                tf.log(1e-8 + tf.square(sigma)) - 1, [1]))
        else:
            KL_loss = 0.0

        loss = loss_decoder + KL_loss

        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = optimizer.minimize(loss, global_step=global_step)

        # # monitor a random sample
        n = tf.random_uniform((), 0, tf.shape(preds)[0] - 1, tf.int32)
        print_demo = (xs[0][n], y[n], preds[n])

        tf.summary.scalar('lr', lr)
        tf.summary.scalar("KL_loss", KL_loss)
        tf.summary.scalar("loss_decoder", loss_decoder)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("global_step", global_step)

        summaries = tf.summary.merge_all()

        return loss, train_op, global_step, summaries, print_demo
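For reference, the `KL_loss` term above is the closed-form KL divergence between the approximate posterior N(mu, sigma^2) and the standard normal prior, 0.5 * sum(mu^2 + sigma^2 - log(sigma^2) - 1), summed over the latent dimensions and averaged over the batch; the 1e-8 inside the log only guards against log(0).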
Example #20
    def build(self):
        # truth vector
        self.T = tf.get_variable('truth_vector',
                                 shape=[1, 1, self.item_embedding_dim],
                                 dtype=tf.float32,
                                 trainable=False)
        # embedding matrix
        self.user_embedding_layer = tf.get_variable(
            name='user_embedding_layer',
            shape=[self.num_users, self.user_embedding_dim],
            dtype=tf.float32)
        self.item_embedding_layer = tf.get_variable(
            name='item_embedding_layer',
            shape=[self.num_items, self.item_embedding_dim],
            dtype=tf.float32)
        # embedding
        self.user_emb_vec = tf.nn.embedding_lookup(self.user_embedding_layer,
                                                   self.input_user)
        self.item_emb_vec = tf.nn.embedding_lookup(self.item_embedding_layer,
                                                   self.input_items)
        self.target_emb_vec = tf.nn.embedding_lookup(self.item_embedding_layer,
                                                     self.input_target)
        self.negative_sample_emb_vec = tf.nn.embedding_lookup(
            self.item_embedding_layer, self.input_negative_sample)

        # interaction
        self.encoder = interact_encoder(self.user_emb_vec,
                                        self.item_emb_vec,
                                        self.hidden1_dim,
                                        self.hidden2_dim,
                                        activation=self.activation,
                                        interact_type=self.interact_type)
        self.encoder_pos = interact_encoder(self.user_emb_vec,
                                            self.target_emb_vec,
                                            self.hidden1_dim,
                                            self.hidden2_dim,
                                            activation=self.activation,
                                            interact_type=self.interact_type)
        self.encoder_neg = interact_encoder(self.user_emb_vec,
                                            self.negative_sample_emb_vec,
                                            self.hidden1_dim,
                                            self.hidden2_dim,
                                            activation=self.activation,
                                            interact_type=self.interact_type)

        # NOT(*) operation
        feedback_to_oper = (self.input_feedback_score[:, :, tf.newaxis]
                            * tf.ones_like(self.encoder))
        applicable = tf.equal(feedback_to_oper, 1)
        encoder_to_oper = tf.where(applicable, self.encoder,
                                   tf.zeros_like(self.encoder))
        not_encoder = not_modules(encoder_to_oper,
                                  self.hidden1_dim,
                                  self.hidden2_dim,
                                  activation=self.activation)
        self.not_encoder = tf.where(applicable, not_encoder, self.encoder)

        # OR(*) operation
        self.or_cell = OrMoudleCell(self.hidden1_dim, self.hidden2_dim)
        self.or_encoder, _ = tf.nn.dynamic_rnn(
            self.or_cell,
            self.not_encoder[:, 1:, :],
            initial_state=self.not_encoder[:, 0, :],
            dtype=tf.float32)
        self.or_encoder_last = self.or_encoder[:, -1, :]

        self.or_encoder_pos, _ = tf.nn.dynamic_rnn(
            self.or_cell,
            self.encoder_pos,
            initial_state=self.or_encoder_last,
            dtype=tf.float32)
        self.or_encoder_neg, _ = tf.nn.dynamic_rnn(
            self.or_cell,
            self.encoder_neg,
            initial_state=self.or_encoder_last,
            dtype=tf.float32)

        # cosine similarity
        self.probability_pos = cosine_probability(self.or_encoder_pos, self.T)
        self.probability_neg = cosine_probability(self.or_encoder_neg, self.T)

        # pair-wise loss
        self.traget_loss = -tf.reduce_sum(
            tf.log_sigmoid(self.probability_pos - self.probability_neg))

        # L2 loss
        trainable_variables = tf.trainable_variables()
        self.l2_loss = tf.reduce_sum(
            [tf.nn.l2_loss(var) for var in trainable_variables])

        # model loss
        self.lnn_loss = self.traget_loss + self.l2_weight * self.l2_loss

        # logical regularizer loss
        event_space_vectors = [
            self.encoder, self.encoder_pos, self.encoder_neg, self.not_encoder,
            self.or_encoder, self.or_encoder_pos, self.or_encoder_neg
        ]
        event_space_vectors = tf.concat(event_space_vectors, axis=1)
        self.logical_loss = self.logical_regularizer(event_space_vectors)

        # sum
        self.loss = self.lnn_loss + self.logical_weight * self.logical_loss

        # Adam
        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.learning_rate, global_step, self.warmup_steps)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        # train
        self.train_op = self.optimizer.minimize(self.loss,
                                                global_step=global_step)

        # tensorboard scalar
        tf.summary.scalar('loss', self.loss)
        tf.summary.scalar('traget_loss', self.traget_loss)
        tf.summary.scalar('l2_loss', self.l2_loss)
        tf.summary.scalar('logical_loss', self.logical_loss)
        tf.summary.scalar('lr', lr)
        tf.summary.scalar('global_step', global_step)

        self.summaries = tf.summary.merge_all()
Example #21
    def batch_split_train(self, xs, ys, split_num=4):
        '''
        Returns
        loss: scalar.
        train_op: training operation
        global_step: scalar.
        summaries: training summary node
        '''
        # forward
        #xs_split=tf.split(xs,gpu_num)
        #ys_split=tf.split(ys,gpu_num)
        #print(xs)
        #print(ys)
        #xs_split=[]
        #ys_split=[]
        #batchsize=self.hp.batch_size
        '''
        divided_batch_size=batchsize//split_num
        for i in range(split_num):
            start=divided_batch_size*i
            end=start+divided_batch_size
            xs_split.append((xs[0][start:end],xs[1][start:end],xs[2][start:end]))
            ys_split.append((ys[0][start:end],ys[1][start:end],ys[2][start:end],ys[3][start:end]))
        '''
        global_step = tf.train.get_or_create_global_step()
        lr = noam_scheme(self.hp.lr, global_step // split_num,
                         self.hp.warmup_steps)
        optimizer = tf.train.AdamOptimizer(lr)
        #models=[]
        #for i in range(split_num):

        memory, sents1, src_masks = self.encode(xs)
        logits, preds, y, sents2 = self.decode(ys, memory, src_masks)
        # train scheme

        y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
        ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                        labels=y_)
        nonpadding = tf.to_float(tf.not_equal(
            y, self.token2idx["<pad>"]))  # 0: <pad>
        loss = tf.reduce_sum(
            ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)
        # Optional orthogonality regularization on the (tied) embedding matrix.
        if self.hp.io_tie and self.hp.ortho_embedding:
            lmb = self.hp.ortho_lambda
            normlevel = self.hp.ortho_reg_norm
            if not self.hp.fac_embed:
                # Exclude row 0, which is the <pad> token.
                real_embedding = self.embeddings[1:, :]
                if not (self.hp.norm_embedding
                        or self.embedding_normalization):
                    # Penalize ||E^T E - 2I||^2 to push the embedding columns
                    # toward orthogonality.
                    loss = loss + (tf.norm(
                        tf.subtract(
                            tf.matmul(tf.transpose(real_embedding), real_embedding),
                            tf.scalar_mul(tf.constant(2.0, dtype=tf.float32),
                                          tf.eye(self.hp.d_model))),
                        ord=normlevel) ** 2) * lmb
                else:
                    # Embeddings are normalized: penalize only the off-diagonal
                    # entries of E^T E.
                    wtw = tf.matmul(tf.transpose(real_embedding), real_embedding)
                    wtw_diag = tf.linalg.diag(tf.linalg.diag_part(wtw))
                    loss = loss + (tf.norm(tf.subtract(wtw, wtw_diag)) ** 2) * lmb
            else:
                # Factorized embedding: push the projection factor toward orthogonality.
                loss = loss + (tf.norm(
                    tf.subtract(
                        tf.matmul(self.embeddings2, tf.transpose(self.embeddings2)),
                        tf.eye(self.hp.d_embed)),
                    ord=normlevel) ** 2) * lmb
                # loss = loss + tf.norm(tf.subtract(tf.matmul(tf.transpose(self.embeddings1), self.embeddings1), tf.eye(self.hp.d_embed)), ord=normlevel) * lmb  # if not good, delete this

        grads = optimizer.compute_gradients(loss)
        self.steps.append((loss, grads))
        if len(self.steps) == split_num:
            tower_losses, tower_grads = zip(*self.steps)

            train_op = optimizer.apply_gradients(
                average_gradients(tower_grads), global_step=global_step)
            self.steps = []
        else:
            # Not enough accumulated micro-batches yet: just advance global_step
            # (apply_gradients with an empty list would raise "No variables provided").
            train_op = tf.assign_add(global_step, 1)
        #aver_loss=tf.reduce_mean(tower_losses)
        tf.summary.scalar('lr', lr)
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("global_step", global_step)
        summaries = tf.summary.merge_all()
        return loss, train_op, global_step, summaries
Example #22
    def build(self):
        position_encoding_outputs = modules.position_encoding(
            self.x_input, args.position_size)
        if args.position_encoding_type == 'add':
            outputs = position_encoding_outputs + self.x_input
        if args.position_encoding_type == "concat":
            outputs = tf.concat([self.x_input, position_encoding_outputs],
                                axis=2)

        for i in range(6):
            sublayer1 = modules.multi_head_attention(
                outputs,
                outputs,
                outputs,
                args.head_num,
                args.head_size,
                self.dropout,
                self.training,
                type=args.attention_unit_type)
            self.mhas.append(sublayer1)
            outputs = modules.residual_connection(outputs, sublayer1,
                                                  self.training)
            sublayer2 = modules.feed_forward(outputs, args.feed_forward_size,
                                             self.dropout, self.training)
            outputs = modules.residual_connection(outputs, sublayer2,
                                                  self.training)

        outputs = tf.layers.dense(outputs,
                                  1,
                                  use_bias=True,
                                  name='last_output')
        outputs = tf.squeeze(outputs, -1)  # (batch_size, seqlen)
        outputs = tf.layers.dense(outputs, args.nlabel, name='output_logit')

        self.logits = outputs
        self.logits_softmax = tf.nn.softmax(outputs,
                                            name='output_logit_softmax')

        if self.training is not None:
            util.params_usage(tf.trainable_variables())

        y = tf.one_hot(self.y_true, args.nlabel)
        self.y_smooth = modules.label_smoothing(y)
        loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.y_smooth,
                                                          logits=self.logits)
        self.loss = tf.reduce_mean(loss)
        self.global_step = tf.train.get_or_create_global_step()
        self.lr = modules.noam_scheme(args.eta,
                                      global_step=self.global_step,
                                      warmup_steps=args.warmup)
        train_op = tf.compat.v1.train.AdamOptimizer(self.lr).minimize(
            self.loss, global_step=self.global_step)
        update_ops = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.UPDATE_OPS)
        self.train_op = tf.group([train_op, update_ops])

        self.y_pred = tf.argmax(self.logits_softmax, axis=1, name="y_pred")
        pred_prob = tf.equal(tf.cast(self.y_pred, tf.int32), self.y_true)
        self.accuracy = tf.reduce_mean(tf.cast(pred_prob, tf.float32),
                                       name="accuracy")

        tf.compat.v1.summary.scalar('accuracy', self.accuracy)
        tf.compat.v1.summary.scalar('loss', self.loss)
        tf.compat.v1.summary.scalar('learning rate', self.lr)
        self.merged_summary_op = tf.compat.v1.summary.merge_all()