Example #1
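# NOTE: this snippet assumes `config`, `init`, `image`, `sae`, `plane_image`, and
# `log_dir` are defined earlier in the source file from which it was extracted.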
# config1 = tf.ConfigProto(device_count={"CPU": cpu_num},
#                          inter_op_parallelism_threads=cpu_num,
#                          intra_op_parallelism_threads=cpu_num, log_device_placement=True)
# with tf.Session(config=config1) as sess:
with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    sess.run(init)
    summary_writer = tf.summary.FileWriter(log_dir, sess.graph)  # create the summary writer and write the graph
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    print("Finetune starting!")
    start_time = time.time()
    for epoch in range(config.training_epochs):
        avg_cost = 0.
        total_batch = int(config.n_samples / config.batch_size)
        for _ in range(total_batch):
            batch_xs = get_random_block_from_data(plane_image, config.batch_size)
            cost, _ = sess.run((sae.cost, sae.optimizer),
                               feed_dict={image: batch_xs})

            avg_cost += cost / config.n_samples * config.batch_size
        # Display logs per epoch step
        # if epoch % config.display_step == 0:
        #     print("Epoch:", '%d,' % (epoch + 1), "Cost:", "{:.9f}".format(avg_cost),
        #           "Time/Epoch is ", (time.time() - start_time))
    print ("total time is ", (time.time()-start_time)/(total_batch*config.training_epochs))
    coord.request_stop()
    coord.join(threads)
    #
    # # Compute AUC using the reconstruction error
    # hidden_tmp, recon_err_tmp = sess.run((sae.hidden_out, sae.errtmp), feed_dict={image: plane_image})
    # recon_err_tmp = np.sum(recon_err_tmp, axis=1)
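
Both examples call get_random_block_from_data, which is defined elsewhere in the source file. A minimal sketch of such a helper, assuming the data (e.g. plane_image) is a 2-D NumPy array with one flattened sample per row:

import numpy as np

def get_random_block_from_data(data, batch_size):
    # Pick a random start row and return a contiguous block of batch_size rows.
    start_index = np.random.randint(0, len(data) - batch_size)
    return data[start_index:(start_index + batch_size)]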
Example #2
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Create a variable to count the number of train() calls. This equals the
        # number of batches processed * config.num_gpus.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (config.n_samples /
                                 config.batch_size)  # number of parameter updates per epoch
        decay_steps = int(num_batches_per_epoch *
                          config.num_epochs_per_decay)  # decay the learning rate after this many steps

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(config.initial_lr,
                                        global_step,
                                        decay_steps,
                                        config.lr_dacay_factor,
                                        staircase=True)
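        # NOTE: the decayed `lr` above is only recorded in the learning-rate summary;
        # the Adam optimizer below uses a fixed learning rate of 5e-5.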
        opt = tf.train.AdamOptimizer(0.00005)
        # dataset = tf.data.Dataset.from_tensor_slices(plane_image)
        # dataset = dataset.repeat(config.training_epochs*config.num_gpus)  #
        # dataset = dataset.batch(config.batch_size)
        # iterator = dataset.make_initializable_iterator()
        # images = iterator.get_next()

        # input_image = tf.constant(plane_image)
        input1 = tf.placeholder(tf.float32, [None, config.input_size])
        input2 = tf.placeholder(tf.float32, [None, config.input_size])
        input_image = [input1, input2]
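        # NOTE: two placeholders are created explicitly and both are fed below,
        # so this example implicitly assumes config.num_gpus == 2.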

        # images = tf.placeholder(tf.float32, [None, config.input_size])
        # batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue(
        #       [images, labels], capacity=2 * FLAGS.num_gpus)
        # Calculate the gradients for each model tower.
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(config.num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' % ("SAE_Tower", i)) as scope:
                        # Dequeues one batch for the GPU
                        # image_batch, label_batch = batch_queue.dequeue()
                        # Calculate the loss for one tower of the SAE model. tower_loss
                        # constructs the entire model but shares the variables across
                        # all towers.
                        # images = tf.train.shuffle_batch([input_image],
                        #                                 batch_size=config.batch_size,
                        #                                 capacity=10000,
                        #                                 num_threads=10,
                        #                                 min_after_dequeue=100,
                        #                                 enqueue_many=True)
                        # loss = tower_loss(scope, images)
                        loss = tower_loss(scope, input_image[i])

                        # Reuse variables for the next tower.
                        tf.get_variable_scope().reuse_variables()

                        # Retain the summaries from the final tower.
                        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                      scope)

                        # Calculate the gradients for the batch of data on this tower.
                        grads = opt.compute_gradients(loss)

                        # Keep track of the gradients across all towers.
                        tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        # Add a summary to track the learning rate.
        summaries.append(tf.summary.scalar('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.summary.histogram(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.summary.histogram(var.op.name, var))

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            0.9999, global_step)
        variables_averages_op = variable_averages.apply(
            tf.trainable_variables())

        # Group all updates into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        # Create a saver.
        # saver = tf.train.Saver(tf.global_variables())

        # Build the summary operation from the last tower summaries.
        # summary_op = tf.summary.merge(summaries)

        # Build an initialization operation to run below.
        # init = tf.global_variables_initializer()
        init = tf.group(tf.global_variables_initializer(),
                        tf.local_variables_initializer())

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                                log_device_placement=True))
        sess.run(init)
        # sess.run(iterator.initializer)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)
        start_time = time.time()
        # summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
        for epoch in range(config.training_epochs):

            avg_cost = 0.
            total_batch = int(config.n_samples /
                              (config.num_gpus * config.batch_size))
            for _ in range(total_batch):
                batch_0 = get_random_block_from_data(plane_image,
                                                     config.batch_size)
                batch_1 = get_random_block_from_data(plane_image,
                                                     config.batch_size)

                # cost, _ = sess.run([loss, train_op], feed_dict={images: batch_xs})
                cost, _ = sess.run([loss, train_op],
                                   feed_dict={
                                       input_image[0]: batch_0,
                                       input_image[1]: batch_1
                                   })

                # cost = sess.run(sae.cost, feed_dict={image: batch_xs})

                avg_cost += cost / config.n_samples * config.batch_size
            # Generating the timeline file adds memory and runtime records to TensorBoard, but it increases the running time.
            # summary_writer.add_run_metadata(run_metadata, 'step%03d' % epoch)
            # fetched_timeline = timeline.Timeline(run_metadata.step_stats)
            # chrome_trace = fetched_timeline.generate_chrome_trace_format()
            # with open('timeline_gpu.json', 'w') as f:
            #     f.write(chrome_trace)

            # Display logs per epoch step
            # if epoch % config.display_step == 0:
            #     print("Epoch:", '%d,' % (epoch + 1), "Cost:", "{:.9f}".format(avg_cost),
            #           "Time/Epoch is ", (time.time() - start_time))
        print("TOTAL TIME IS ", time.time() - start_time)