Example #1
    def _build(self, num_classifiers, learning_rate):
        # inputs
        self.X = tf.placeholder(tf.float32, [None, 28, 28])
        self.y = tf.placeholder(tf.int32, [None])
        one_hot_y = tf.one_hot(self.y, 10)

        networks = [layers.feedforward(self.X) for _ in range(num_classifiers)]
        self.individual_loss = [
            layers.loss(net, one_hot_y) for net in networks
        ]
        self.individual_accuracy = [
            layers.accuracy(net, one_hot_y) for net in networks
        ]

        logits = tf.reduce_mean(tf.stack(networks, axis=-1), axis=-1)
        # pairwise L2 distance between the ensemble members' outputs
        # (note: this hard-codes num_classifiers == 3)
        l2_distance = tf.add_n([
            tf.norm(networks[0] - networks[1]),
            tf.norm(networks[1] - networks[2]),
            tf.norm(networks[2] - networks[0])
        ])

        cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits, labels=one_hot_y)
        self.loss = tf.reduce_mean(cross_entropy) + 1e-4 * l2_distance
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        self.train_op = optimizer.minimize(self.loss)

        correct_prediction = tf.equal(tf.argmax(logits, axis=1),
                                      tf.argmax(one_hot_y, axis=1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        self.prediction = tf.argmax(logits, axis=1)
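A minimal driver sketch for this graph, assuming a hypothetical Model wrapper whose constructor calls _build, and random stand-in data in place of real MNIST batches:

import numpy as np
import tensorflow as tf  # TF1-style API, matching the snippet above

model = Model(num_classifiers=3, learning_rate=1e-3)  # hypothetical wrapper class
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch_x = np.random.rand(32, 28, 28).astype(np.float32)  # stand-in images
    batch_y = np.random.randint(0, 10, size=32)              # stand-in labels
    _, loss = sess.run([model.train_op, model.loss],
                       feed_dict={model.X: batch_x, model.y: batch_y})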
Example #2

def loss(y, y_):
    # mean of the per-sample losses computed by the module-level net.loss layer
    m = len(y)
    e = 0
    for i in range(m):
        e += net.loss().forward({
            'x': np.matrix(y_[i]),
            'y': np.matrix(y[i])
        })
    return e / m
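The function above just averages per-sample losses. A self-contained sketch of the same pattern, with a squared-error stand-in for the unspecified net.loss().forward:

import numpy as np

def mean_loss(y, y_, per_sample_loss):
    # average the per-sample losses, mirroring the loop above
    return sum(per_sample_loss(np.atleast_2d(t), np.atleast_2d(p))
               for t, p in zip(y, y_)) / len(y)

sse = lambda t, p: float(np.sum((t - p) ** 2))         # stand-in loss
print(mean_loss([[1.0], [0.0]], [[0.9], [0.2]], sse))  # ≈ 0.025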
Example #3
def build_model(input_data_tensor, input_label_tensor):
    num_classes = config["num_classes"]
    images = tf.image.resize_images(input_data_tensor, [224, 224])
    logits = vgg.build(images, n_classes=num_classes, training=True)
    probs = tf.nn.softmax(logits)
    loss = L.loss(logits, tf.one_hot(input_label_tensor, num_classes))
    error_top5 = L.topK_error(probs, input_label_tensor, K=5)
    error_top1 = L.topK_error(probs, input_label_tensor, K=1)

    # you must return a dictionary with at least "loss" as a key
    return dict(loss=loss,
                logits=logits,
                error_top5=error_top5,
                error_top1=error_top1)
Example #4

def gradCostFunc(x, y, w1):
    # returns the mean loss E and the mean gradient dE/dW1 over the m samples
    m = x.shape[0]
    dEdW1 = 0
    E = 0
    for i in range(m):

        # forward --->
        z1 = np.matrix(x[i, :]).T
        yi = np.matrix(y[i])
        z2 = net.inner().forward({'x': z1, 'w': w1})
        #z3 = net.sigm().forward({'x':z2});

        z = net.loss().forward({
            'x': z2,
            'y': yi
        })
        E += z

        # <--- backward
        l4 = [1]
        l3 = net.loss().backward({
            'x': z2,
            'y': yi,
            'dzdx': l4
        })
        #l2 = net.sigm().backward({'x':z2, 'dzdx':l3});
        _, dEdW1_i = net.inner().backward({
            'x': z1,
            'w': w1,
            'dzdx': l3
        })

        dEdW1 += dEdW1_i

    return E / m, dEdW1 / m
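Since gradCostFunc returns an analytic gradient, a central-difference check is a natural companion. A sketch, assuming only that the cost function's first return value is the scalar loss:

import numpy as np

def numeric_grad(cost_fn, x, y, w1, h=1e-5):
    # finite-difference estimate of dE/dW1, entry by entry
    g = np.zeros_like(w1, dtype=float)
    for idx in np.ndindex(*w1.shape):
        w_plus, w_minus = w1.copy(), w1.copy()
        w_plus[idx] += h
        w_minus[idx] -= h
        g[idx] = (cost_fn(x, y, w_plus)[0] - cost_fn(x, y, w_minus)[0]) / (2 * h)
    return g

# compare against the analytic gradient:
# E, dEdW1 = gradCostFunc(x, y, w1)
# assert np.allclose(dEdW1, numeric_grad(gradCostFunc, x, y, w1), atol=1e-4)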
Example #5
def build_model(input_data_tensor, input_label_tensor):
    num_classes = config["num_classes"]
    weight_decay = config["weight_decay"]
    images = tf.image.resize_images(input_data_tensor, [224, 224])
    logits = vgg.build(images, n_classes=num_classes, training=True)
    probs = tf.nn.softmax(logits)
    loss_classify = L.loss(logits, tf.one_hot(input_label_tensor, num_classes))
    loss_weight_decay = tf.reduce_sum(
        tf.stack([tf.nn.l2_loss(v) for v in tf.get_collection('variables')]))
    loss = loss_classify + weight_decay*loss_weight_decay
    error_top5 = L.topK_error(probs, input_label_tensor, K=5)
    error_top1 = L.topK_error(probs, input_label_tensor, K=1)

    # you must return a dictionary with at least "loss" as a key; the other entries are optional
    return dict(loss=loss,
                probs=probs,
                logits=logits,
                error_top5=error_top5,
                error_top1=error_top1)
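Note that the decay term above sums tf.nn.l2_loss over everything in the 'variables' collection, biases included. A common variant (an assumption, not part of the original code) restricts decay to weight tensors:

decay_vars = [v for v in tf.trainable_variables()
              if 'bias' not in v.name.lower()]
loss_weight_decay = tf.add_n([tf.nn.l2_loss(v) for v in decay_vars])
loss = loss_classify + weight_decay * loss_weight_decay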
Example #6
	def _build(self, num_classifiers, learning_rate):
		# inputs
		self.X = tf.placeholder(tf.float32, [None, 28, 28])
		self.y = tf.placeholder(tf.int32, [None])
		one_hot_y = tf.one_hot(self.y, 10)
		
		networks = [layers.convolutional(self.X) for _ in range(num_classifiers)]
		self.individual_loss = [layers.loss(net, one_hot_y) for net in networks]
		self.individual_accuracy = [layers.accuracy(net, one_hot_y) for net in networks]
		
		logits = layers.linear(tf.concat(networks, axis=-1), 10, bias=False)
		
		cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=one_hot_y)
		self.loss = tf.reduce_mean(cross_entropy)
		optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
		self.train_op = optimizer.minimize(self.loss)
		
		correct_prediction = tf.equal(tf.argmax(logits, axis=1), tf.argmax(one_hot_y, axis=1))
		self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
		self.prediction = tf.argmax(logits, axis=1)
Example #7
def training():

    pretrained_weights = './pretrain/vgg16.npy'
    data_dir = './data/cifar10_data/cifar-10-batches-bin'
    train_log_dir = './log/train/'
    val_log_dir = './log/val/'

    with tf.name_scope('input'):
        images_train, labels_train = input_data.read_cifar10(
            data_dir, is_train=True, batch_size=BATCH_SIZE, shuffle=True)
        images_val, labels_val = input_data.read_cifar10(
            data_dir, is_train=False, batch_size=BATCH_SIZE, shuffle=False)

    image_holder = tf.placeholder(tf.float32, shape=[BATCH_SIZE, IMG_W, IMG_H, 3])
    label_holder = tf.placeholder(tf.int32, shape=[BATCH_SIZE, N_CLASSES])

    logits = vgg.VGG16(image_holder, N_CLASSES, 0.8)
    loss = layers.loss(logits, label_holder)
    accuracy = layers.accuracy(logits, label_holder)

    global_steps = tf.Variable(0, name='global_step', trainable=False)
    train_op = layers.optimize(loss, LEARNING_RATE, global_steps)

    saver = tf.train.Saver(tf.global_variables())

    # Reference: https://stackoverflow.com/questions/35413618/tensorflow-placeholder-error-when-using-tf-merge-all-summaries
    summary_op = tf.summary.merge_all()
    # summary_op = tf.summary.merge([loss_summary, accuracy_summary], tf.GraphKeys.SUMMARIES)

    # The main thread
    init = tf.global_variables_initializer()
    sess = tf.InteractiveSession()
    sess.run(init)

    print('########################## Start Training ##########################')

    layers.load_with_skip(pretrained_weights, sess, ['fc6', 'fc7', 'fc8'])

    # Coordinate the relationship between threads
    # Reference: http://wiki.jikexueyuan.com/project/tensorflow-zh/how_tos/threading_and_queues.html
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    train_summary_writer = tf.summary.FileWriter(train_log_dir, graph=sess.graph)
    val_summary_writer = tf.summary.FileWriter(val_log_dir, graph=sess.graph)

    try:
        for step in np.arange(MAX_STEP):
            if coord.should_stop():
                break
            # start_time = time.time()

            train_images, train_labels = sess.run([images_train, labels_train])
            _, train_loss, train_acc, summary_str = sess.run([train_op, loss, accuracy, summary_op],
                                                feed_dict={image_holder: train_images, label_holder: train_labels})
            # duration = time.time() - start_time

            if step % 50 == 0 or (step + 1) == MAX_STEP:
                print('step %d, loss = %.4f, accuracy = %.4f%%' % (step, train_loss, train_acc))
                #summary_str = sess.run(summary_op)
                train_summary_writer.add_summary(summary_str, step)

            if step % 200 == 0 or (step + 1) == MAX_STEP:
                val_images, val_labels = sess.run([images_val, labels_val])
                val_loss, val_acc, val_summary_str = sess.run(
                    [loss, accuracy, summary_op],
                    feed_dict={image_holder: val_images, label_holder: val_labels})
                print('step %d, val loss = %.2f, val accuracy = %.2f%%' % (step, val_loss, val_acc))

                # write a summary evaluated on the validation batch (the original
                # code reused the training summary string here)
                val_summary_writer.add_summary(val_summary_str, step)

            # Note: global_step=global_steps would number checkpoints by the
            # optimizer's own counter; step works here because the loop advances
            # exactly one training step per iteration.
            if step % 2000 == 0 or (step + 1) == MAX_STEP:
                checkpoint_path = os.path.join(train_log_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)

    except tf.errors.OutOfRangeError:
        coord.request_stop()

    coord.request_stop()
    coord.join(threads)

    sess.close()
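To resume or evaluate from the checkpoints written above, a minimal restore sketch (assuming the same graph-construction code has already run):

saver = tf.train.Saver()
with tf.Session() as sess:
    ckpt = tf.train.latest_checkpoint('./log/train/')
    saver.restore(sess, ckpt)
    # ... run the loss/accuracy ops with a feed_dict as in the loop above ...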
Example #8
def training():
    pretrained_weights = './pretrain/vgg16.npy'

    train_log_dir = './log_dr50000/train/'
    val_log_dir = './log_dr50000/val/'

    with tf.name_scope('input'):
        images_train, labels_train = dr5_input.input_data(True, BATCH_SIZE)
        images_val, labels_val = dr5_input.input_data(False, BATCH_SIZE)

    image_holder = tf.placeholder(tf.float32,
                                  shape=[BATCH_SIZE, IMG_W, IMG_H, 3])
    label_holder = tf.placeholder(tf.int32, shape=[BATCH_SIZE, N_CLASSES])

    logits = vgg.VGG16(image_holder, N_CLASSES, 0.5)
    loss = layers.loss(logits, label_holder)
    accuracy = layers.accuracy(logits, label_holder)

    global_steps = tf.Variable(0, name='global_step', trainable=False)
    LEARNING_RATE = tf.train.exponential_decay(start_rate,
                                               global_steps,
                                               decay_steps,
                                               decay_rate,
                                               staircase=True)
    train_op = layers.optimize(loss, LEARNING_RATE, global_steps)

    saver = tf.train.Saver(tf.global_variables())

    summary_op = tf.summary.merge_all()

    # The main thread
    init = tf.group(tf.global_variables_initializer(),
                    tf.local_variables_initializer())
    sess = tf.InteractiveSession()
    sess.run(init)

    print(
        '########################## Start Training ##########################')

    layers.load_with_skip(pretrained_weights, sess, ['fc6', 'fc7', 'fc8'])

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    train_summary_writer = tf.summary.FileWriter(train_log_dir,
                                                 graph=sess.graph)
    val_summary_writer = tf.summary.FileWriter(val_log_dir, graph=sess.graph)

    try:
        for step in np.arange(MAX_STEP):
            if coord.should_stop():
                break
            # start_time = time.time()

            train_images, train_labels = sess.run([images_train, labels_train])
            _, train_loss, train_acc, summary_str = sess.run(
                [train_op, loss, accuracy, summary_op],
                feed_dict={
                    image_holder: train_images,
                    label_holder: train_labels
                })
            # duration = time.time() - start_time

            if step % 50 == 0 or (step + 1) == MAX_STEP:
                print('step %d, loss = %.4f, accuracy = %.4f%%' %
                      (step, train_loss, train_acc))
                train_summary_writer.add_summary(summary_str, step)

            if step % 200 == 0 or (step + 1) == MAX_STEP:
                val_images, val_labels = sess.run([images_val, labels_val])
                val_loss, val_acc, val_summary_str = sess.run(
                    [loss, accuracy, summary_op],
                    feed_dict={
                        image_holder: val_images,
                        label_holder: val_labels
                    })
                print('step %d, val loss = %.2f, val accuracy = %.2f%%' %
                      (step, val_loss, val_acc))
                # write a summary evaluated on the validation batch (the original
                # code reused the training summary string here)
                val_summary_writer.add_summary(val_summary_str, step)

            if step % 2000 == 0 or (step + 1) == MAX_STEP:
                checkpoint_path = os.path.join(train_log_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
                lr = sess.run(LEARNING_RATE)
                print("step %d, learning_rate= %f" % (step, lr))

    except tf.errors.OutOfRangeError:
        coord.request_stop()

    coord.request_stop()
    coord.join(threads)

    sess.close()
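For reference, tf.train.exponential_decay with staircase=True computes lr = start_rate * decay_rate ** (global_step // decay_steps). A plain-Python sketch:

def decayed_lr(start_rate, global_step, decay_steps, decay_rate):
    # staircase=True floors the exponent, so the rate drops in discrete steps
    return start_rate * decay_rate ** (global_step // decay_steps)

print(decayed_lr(0.01, 4000, 2000, 0.9))  # 0.01 * 0.9**2 = 0.0081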
Example #9
def main(N=300, K=3, D=2, nodes=100, lr=1e-3, reg=1e-8):
    """Main"""
    # Generate and plot data set
    X, Y = gen_data(N, K, D)

    print("Plotting data...")
    col_levels = np.array(list(range(K + 1)), dtype=float) - 0.5
    col_cmap = plt.cm.gist_rainbow
    col_norm = col.BoundaryNorm(col_levels, col_cmap.N)

    plt.ion()
    plt.subplot(1, 1, 1)
    plt.scatter(X[:, 0],
                X[:, 1],
                c=Y,
                cmap=col_cmap,
                norm=col_norm,
                vmin=np.min(Y),
                vmax=np.max(Y))
    plt.draw()

    input("Press <ENTER> to continue.")

    # Set up layers
    layers = []
    layers += [L.input(X)]
    layers += [L.fc(layers[-1].Y, nodes)]
    layers += [L.sigmoid(layers[-1].Y)]
    layers += [L.dropout(layers[-1].Y, 0.25)]
    layers += [L.fc(layers[-1].Y, nodes)]
    layers += [L.sigmoid(layers[-1].Y)]
    layers += [L.dropout(layers[-1].Y, 0.25)]
    layers += [L.fc(layers[-1].Y, K)]
    layers += [L.softmax(layers[-1].Y)]
    layers += [L.loss(layers[-1].Y, Y)]

    nlayers = len(layers)

    # TODO (architecture): Instead of calling fwd on each layer, connect layers
    # with "pointers" and call fwd only on the first layer

    try:
        itx = 1
        while True:
            # Forward propagation
            for i, layer in enumerate(layers):
                if i == 0:
                    layer.X = X
                else:
                    layer.reshape(layers[i - 1].Y.shape)
                    layer.X = layers[i - 1].Y
                layer.fwd()

            if np.isnan(layers[-1].Y[0, 0]):
                pdb.set_trace()

            print("Iteration {}, Loss = {:.4f}".format(
                itx, np.asscalar(layers[-1].Y)),
                  end='\r')

            if itx % 1000 == 0:
                print("")

            # Backprop
            for i in list(range(nlayers))[::-1]:
                if i == nlayers - 1:
                    layers[i].dy = 1
                else:
                    layers[i].dy = layers[i + 1].dx

                layers[i].bck()

                if itx % 5000 == 0:  # Gradient check
                    if np.all(layers[i].dx == 0):
                        continue

                    r, c = [
                        np.random.choice(layers[i].X.shape[j]) for j in (0, 1)
                    ]
                    h = 1e-4

                    if abs(layers[i].dx[r, c]) < 1e-5:
                        continue

                    print("Checking gradient on {}...".format(layers[i]),
                          end=' ')

                    X_store = layers[i].X

                    Y_ = []
                    for X_ in [layers[i].X[r, c] + s * h for s in (-1, 1)]:
                        layers[i].X[r, c] = X_

                        for j in range(i, nlayers):
                            if j > i:
                                layers[j].X = layers[j - 1].Y

                            stochastic_store = layers[j].stochastic
                            layers[j].stochastic = False
                            layers[j].fwd()
                            layers[j].stochastic = stochastic_store

                        Y_.append(layers[-1].Y.item())

                    layers[i].X = X_store

                    dx = layers[i].dx[r, c]
                    ndx = (Y_[1] - Y_[0]) / (2 * h)
                    diff = abs(ndx - dx) / max(abs(ndx), abs(dx), 1e-10)

                    print("Diff: {:.8f}".format(diff))
                    if diff > 1e-2:
                        pdb.set_trace()

            for layer in layers:
                layer.step(lr, reg)

            if itx % 1000 == 0:
                range_ = [np.max(X[:, i]) - np.min(X[:, i]) for i in (0, 1)]
                x, y = [
                    np.linspace(
                        np.min(X[:, i]) - range_[i] / 2,
                        np.max(X[:, i]) + range_[i] / 2, 400) for i in (0, 1)
                ]
                xx, yy = np.meshgrid(x, y)

                X_ = np.c_[xx.flatten(), yy.flatten()]
                for i, layer in enumerate(layers[:-1]):
                    if i == 0:
                        layer.X = X_
                    else:
                        layer.reshape(layers[i - 1].Y.shape)
                        layer.X = layers[i - 1].Y

                    temp = layer.stochastic
                    layer.stochastic = False
                    layer.fwd()
                    layer.stochastic = temp

                z = np.argmax(layers[-2].Y, axis=1).reshape(xx.shape)

                plt.clf()
                plt.contourf(xx,
                             yy,
                             z,
                             levels=col_levels,
                             cmap=col_cmap,
                             norm=col_norm)
                plt.scatter(X[:, 0],
                            X[:, 1],
                            c=Y,
                            cmap=col_cmap,
                            norm=col_norm)
                plt.draw()
                plt.pause(1e-10)

            itx += 1

    except KeyboardInterrupt:
        # print(layers[-2].Y)
        pass
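The plotting block above is the standard meshgrid decision-boundary trick. A self-contained sketch, with a hypothetical predict(X) -> class indices standing in for the layer stack:

import numpy as np
import matplotlib.pyplot as plt

def plot_boundary(X, Y, predict, n=200):
    # evaluate the classifier on a dense grid covering the data
    x = np.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, n)
    y = np.linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, n)
    xx, yy = np.meshgrid(x, y)
    z = predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.contourf(xx, yy, z, alpha=0.3)
    plt.scatter(X[:, 0], X[:, 1], c=Y)
    plt.show()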
Example #10

    def run(self, run_type):

        is_training = (run_type == 'train')

        self.log('{} epoch: {}'.format(run_type, self.epoch))

        image_filenames, label_filenames = self.dataset.load_filenames(
            run_type)

        global_step = tf.Variable(1, name='global_step', trainable=False)

        images, labels = inputs.load_batches(image_filenames,
                                             label_filenames,
                                             shape=self.dataset.SHAPE,
                                             batch_size=self.batch_size,
                                             resize_shape=self.dataset.SHAPE,
                                             crop_shape=(256, 512),
                                             augment=True)

        with tf.name_scope('labels'):
            color_labels = util.colorize(labels, self.dataset.augmented_labels)
            labels = tf.cast(labels, tf.int32)
            ignore_mask = util.get_ignore_mask(labels,
                                               self.dataset.augmented_labels)
            tf.summary.image('label', color_labels, 1)
            tf.summary.image('weights', tf.cast(ignore_mask * 255, tf.uint8),
                             1)

        tf.summary.image('image', images, 1)

        logits = self.model.inference(images,
                                      num_classes=self.num_classes,
                                      is_training=is_training)

        with tf.name_scope('outputs'):
            predictions = layers.predictions(logits)
            color_predictions = util.colorize(predictions,
                                              self.dataset.augmented_labels)
            tf.summary.image('prediction', color_predictions, 1)

        # Add some metrics
        with tf.name_scope('metrics'):
            accuracy_op, accuracy_update_op = tf.contrib.metrics.streaming_accuracy(
                predictions, labels, weights=ignore_mask)
            mean_iou_op, mean_iou_update_op = tf.contrib.metrics.streaming_mean_iou(
                predictions,
                labels,
                num_classes=self.num_classes,
                weights=ignore_mask)

        if is_training:
            loss_op = layers.loss(logits,
                                  labels,
                                  mask=ignore_mask,
                                  weight_decay=self.weight_decay)
            train_op = layers.optimize(loss_op,
                                       learning_rate=self.learning_rate,
                                       global_step=global_step)

        # Merge all summaries into summary op
        summary_op = tf.summary.merge_all()

        # Create restorer for restoring
        saver = tf.train.Saver()

        # Initialize session and local variables (for input pipeline and metrics)
        sess = tf.Session()
        sess.run(tf.local_variables_initializer())

        if self.checkpoint is None:
            sess.run(tf.global_variables_initializer())
            self.log('{} {} from scratch.'.format(run_type, self.model_name))
        else:
            start_time = time.time()
            saver.restore(sess, self.checkpoint)
            duration = time.time() - start_time
            self.log('{} from previous checkpoint {:s} ({:.2f}s)'.format(
                run_type, self.checkpoint, duration))

        # Create summary writer
        summary_path = os.path.join(self.model_path, run_type)
        step_writer = tf.summary.FileWriter(summary_path, sess.graph)

        # Start filling the input queues
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        num_examples = self.dataset.NUM_TRAIN_EXAMPLES if is_training else self.dataset.NUM_VALID_EXAMPLES

        for local_step in range(num_examples // self.batch_size):

            # Take time!
            start_time = time.time()

            if is_training:
                _, loss, accuracy, mean_iou, summary = sess.run([
                    train_op, loss_op, accuracy_update_op, mean_iou_update_op,
                    summary_op
                ])
                duration = time.time() - start_time
                self.log('Epoch: {} train step: {} loss: {:.4f} accuracy: {:.2f}% duration: {:.2f}s' \
                    .format(self.epoch, local_step + 1, loss, accuracy * 100, duration))
            else:
                accuracy, mean_iou, summary = sess.run(
                    [accuracy_update_op, mean_iou_update_op, summary_op])
                duration = time.time() - start_time
                self.log('Epoch: {} eval step: {} accuracy: {:.2f}% duration: {:.2f}s'\
                    .format(self.epoch, local_step + 1, accuracy * 100, duration))

            # Save summary and print stats
            step_writer.add_summary(summary,
                                    global_step=global_step.eval(session=sess))

        # Write additional epoch summaries
        epoch_writer = tf.summary.FileWriter(summary_path)
        epoch_summaries = []
        if is_training:
            epoch_summaries.append(
                tf.summary.scalar('params/weight_decay', self.weight_decay))
            epoch_summaries.append(
                tf.summary.scalar('params/learning_rate', self.learning_rate))
        epoch_summaries.append(
            tf.summary.scalar('params/batch_size', self.batch_size))
        epoch_summaries.append(
            tf.summary.scalar('metrics/accuracy', accuracy_op))
        epoch_summaries.append(
            tf.summary.scalar('metrics/mean_iou', mean_iou_op))
        epoch_summary_op = tf.summary.merge(epoch_summaries)
        summary = sess.run(epoch_summary_op)
        epoch_writer.add_summary(summary, global_step=self.epoch)

        # Save after each epoch when training
        if is_training:
            checkpoint_path = os.path.join(self.model_path,
                                           self.model_name + '.checkpoint')
            start_time = time.time()
            self.checkpoint = saver.save(sess,
                                         checkpoint_path,
                                         global_step=self.epoch)
            duration = time.time() - start_time
            self.log('Model saved as {:s} ({:.2f}s)'.format(
                self.checkpoint, duration))

        # Stop queue runners and reset the graph
        coord.request_stop()
        coord.join(threads)
        sess.close()
        tf.reset_default_graph()
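A note on the TF1 streaming metrics used above: each *_update_op accumulates internal counters and returns the running value, while accuracy_op / mean_iou_op only read the current aggregate; the counters live in local variables, so re-initializing those resets the metric. In sketch form:

sess.run(tf.local_variables_initializer())  # resets the metric counters
for _ in range(num_batches):                # num_batches: length of the eval pass
    sess.run(accuracy_update_op)            # accumulate one batch
final_accuracy = sess.run(accuracy_op)      # read the aggregate once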
Example #11
    def create_network(self):
        """
        Create network
        """
        mask_cache = dict() if self.use_mask_cache else None

        response_emb = fluid.layers.embedding(
            input=self.response,
            size=[self._vocab_size + 1, self._emb_size],
            is_sparse=self.use_sparse_embedding,
            param_attr=fluid.ParamAttr(
                name=self.word_emb_name,
                initializer=fluid.initializer.Normal(scale=0.1)))

        # response part
        Hr = response_emb
        Hr_stack = [Hr]

        for index in six.moves.xrange(self._stack_num):
            Hr = layers.block(name="response_self_stack" + str(index),
                              query=Hr,
                              key=Hr,
                              value=Hr,
                              d_key=self._emb_size,
                              q_mask=self.response_mask,
                              k_mask=self.response_mask,
                              mask_cache=mask_cache)
            Hr_stack.append(Hr)

        # context part
        sim_turns = []
        for t in six.moves.xrange(self._max_turn_num):
            Hu = fluid.layers.embedding(
                input=self.turns_data[t],
                size=[self._vocab_size + 1, self._emb_size],
                is_sparse=self.use_sparse_embedding,
                param_attr=fluid.ParamAttr(
                    name=self.word_emb_name,
                    initializer=fluid.initializer.Normal(scale=0.1)))
            Hu_stack = [Hu]

            for index in six.moves.xrange(self._stack_num):
                # share parameters
                Hu = layers.block(name="turn_self_stack" + str(index),
                                  query=Hu,
                                  key=Hu,
                                  value=Hu,
                                  d_key=self._emb_size,
                                  q_mask=self.turns_mask[t],
                                  k_mask=self.turns_mask[t],
                                  mask_cache=mask_cache)
                Hu_stack.append(Hu)

            # cross attention
            r_a_t_stack = []
            t_a_r_stack = []
            for index in six.moves.xrange(self._stack_num + 1):
                t_a_r = layers.block(name="t_attend_r_" + str(index),
                                     query=Hu_stack[index],
                                     key=Hr_stack[index],
                                     value=Hr_stack[index],
                                     d_key=self._emb_size,
                                     q_mask=self.turns_mask[t],
                                     k_mask=self.response_mask,
                                     mask_cache=mask_cache)
                r_a_t = layers.block(name="r_attend_t_" + str(index),
                                     query=Hr_stack[index],
                                     key=Hu_stack[index],
                                     value=Hu_stack[index],
                                     d_key=self._emb_size,
                                     q_mask=self.response_mask,
                                     k_mask=self.turns_mask[t],
                                     mask_cache=mask_cache)

                t_a_r_stack.append(t_a_r)
                r_a_t_stack.append(r_a_t)

            t_a_r_stack.extend(Hu_stack)
            r_a_t_stack.extend(Hr_stack)

            if self.use_stack_op:
                t_a_r = fluid.layers.stack(t_a_r_stack, axis=1)
                r_a_t = fluid.layers.stack(r_a_t_stack, axis=1)
            else:
                for index in six.moves.xrange(len(t_a_r_stack)):
                    t_a_r_stack[index] = fluid.layers.unsqueeze(
                        input=t_a_r_stack[index], axes=[1])
                    r_a_t_stack[index] = fluid.layers.unsqueeze(
                        input=r_a_t_stack[index], axes=[1])

                t_a_r = fluid.layers.concat(input=t_a_r_stack, axis=1)
                r_a_t = fluid.layers.concat(input=r_a_t_stack, axis=1)

            # sim shape: [batch_size, 2*(stack_num+1), max_turn_len, max_turn_len]
            sim = fluid.layers.matmul(x=t_a_r,
                                      y=r_a_t,
                                      transpose_y=True,
                                      alpha=1 / np.sqrt(200.0))
            sim_turns.append(sim)

        if self.use_stack_op:
            sim = fluid.layers.stack(sim_turns, axis=2)
        else:
            for index in six.moves.xrange(len(sim_turns)):
                sim_turns[index] = fluid.layers.unsqueeze(
                    input=sim_turns[index], axes=[2])
            # sim shape: [batch_size, 2*(stack_num+1), max_turn_num, max_turn_len, max_turn_len]
            sim = fluid.layers.concat(input=sim_turns, axis=2)

        final_info = layers.cnn_3d(sim, self._channel1_num, self._channel2_num)
        loss, logits = layers.loss(final_info, self.label)
        return loss, logits
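The matmul above is a scaled dot-product similarity: alpha = 1/sqrt(200) plays the usual 1/sqrt(d) attention scaling. A NumPy sketch of the per-turn computation, dropping the batch axis and assuming d = 200 to match the alpha in the snippet:

import numpy as np

def scaled_sim(t_a_r, r_a_t, d=200.0):
    # t_a_r, r_a_t: [stack, turn_len, emb] -> sim: [stack, turn_len, turn_len]
    return np.einsum('sie,sje->sij', t_a_r, r_a_t) / np.sqrt(d)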