Example #1
def run_test():
    # Get all ckpt names in log dir (without meta ext)
    meta_list = get_checkpoints(FLAGS.log_dir)

    # GPU/CPU Flag
    if FLAGS.gpu is not None:
        compute_string = '/gpu:' + str(FLAGS.gpu)
    else:
        compute_string = '/cpu:0'

    # Iterate through the checkpoints
    for ckpt_path in meta_list:
        tf.reset_default_graph()

        ####################
        # Setup Data Queue #
        ####################
        with tf.device("/cpu:0"):
            with tf.variable_scope('test') as scope:
                data_pipeline = DataPipeline(augment=False,
                                             num_epochs=1,
                                             shuffle=False)
                validate_x, validate_y, ids = data_pipeline.batch_ops()

        with tf.device(compute_string):
            ##########################
            # Declare Validate Graph #
            ##########################
            # Sets train/test mode; currently only used for BatchNormalization
            # True: Train   False: Test
            phase = tf.placeholder(tf.bool, name='phase')
            validate_model = model(validate_x, validate_y, phase)

            # Delete extraneous info when done debugging
            validate_pred = validate_model.inference()
            pool5 = validate_model.fc2

        init = tf.group(tf.global_variables_initializer(),
                        tf.local_variables_initializer())

        ids_file = open(os.path.join(FLAGS.log_dir, 'ids.txt'), 'w')
        predictions_file = open(os.path.join(FLAGS.log_dir, 'predictions.txt'),
                                'w')

        session_config = tf.ConfigProto(allow_soft_placement=True)
        with tf.Session(config=session_config) as sess:
            sess.run(init)

            # Coordinator handles the data fetching threads
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord)

            optimistic_restore(sess, ckpt_path)
            try:
                step = 0
                cum_time = 0
                while True:
                    if coord.should_stop():
                        break
                    step += 1
                    start_time = time()
                    prediction_value, pool5_value, ids_value = sess.run(
                        [validate_pred, pool5, ids], feed_dict={phase: False})
                    duration_time = time() - start_time

                    cum_time += duration_time

                    feature_file = os.path.join(FLAGS.log_dir,
                                                "feature_%d" % step)
                    #pool5_value = np.sum(pool5_value, (1,2)) #spatial average
                    pool5_value = pool5_value.reshape(FLAGS.batch_size, -1)
                    np.save(feature_file, pool5_value)

                    for id in ids_value:
                        ids_file.write("%s\n" % id)

                    # Save prediction and ground truth info
                    predictions_file.write(np.array_str(
                        prediction_value,
                        max_line_width=1000,
                        precision=10,
                        suppress_small=True))
                    predictions_file.write('\n')
                    predictions_file.flush()

            except tf.errors.OutOfRangeError:
                # Input queue exhausted; the last increment did not process a batch
                step -= 1
            except Exception as e:
                print("Exception encountered: ", e)
                sys.stdout.flush()
                step -= 1

            # Stop Queueing data, we're done!
            coord.request_stop()
            coord.join(threads)

        # Close the per-checkpoint output files
        ids_file.close()
        predictions_file.close()
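
Example #1 relies on two helpers, get_checkpoints and optimistic_restore, that are not shown in the listing. Below is a minimal sketch of what they might look like, assuming checkpoint files are named model-<step> and that optimistic_restore should restore only the variables present (with matching shapes) in both the current graph and the checkpoint; the project's actual implementations may differ.

import os
import tensorflow as tf

def get_checkpoints(log_dir):
    # Return checkpoint prefixes (the .meta extension stripped), oldest first
    meta_files = [f for f in os.listdir(log_dir) if f.endswith('.meta')]
    prefixes = [os.path.join(log_dir, f[:-len('.meta')]) for f in meta_files]
    # Sort by the global step encoded after the last '-', e.g. model-1000
    return sorted(prefixes, key=lambda p: int(p.rsplit('-', 1)[-1]))

def optimistic_restore(sess, ckpt_path):
    # Restore only variables that exist in both the graph and the checkpoint
    reader = tf.train.NewCheckpointReader(ckpt_path)
    ckpt_shapes = reader.get_variable_to_shape_map()
    restorable = [v for v in tf.global_variables()
                  if v.name.split(':')[0] in ckpt_shapes and
                  v.shape.as_list() == ckpt_shapes[v.name.split(':')[0]]]
    tf.train.Saver(restorable).restore(sess, ckpt_path)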
Example #2
def run_training():
    '''
    Run Training Loop
    '''
    # GPU/CPU Flag
    if FLAGS.gpu is not None:
        compute_string = '/gpu:' + str(FLAGS.gpu)
    else:
        compute_string = '/cpu:0'

    #####################
    # Setup Data Queues #
    #####################
    with tf.device("/cpu:0"):
        with tf.variable_scope('train'):
            data_pipeline = DataPipeline(augment=True)
            train_x, train_y = data_pipeline.batch_ops()

    #######################
    # Declare train graph #
    #######################
    with tf.device(compute_string):
        phase = tf.placeholder(tf.bool, name='phase')
        train_model = model(train_x, train_y, phase)
        train_predictions = train_model.inference()
        train_acc = train_model.evaluate()
        train_loss, gt_y = train_model.loss()
        train_op = train_model.optimize()
        global_step = train_model.get_global_step()
        tf.summary.scalar('train_loss', train_loss)
        tf.summary.scalar('train_acc', train_acc)

    #############################
    # Setup Summaries and Saver #
    #############################

    # Collect summaries for TensorBoard
    summary = tf.summary.merge_all()
    # Create variable initializer op
    init = tf.group(tf.global_variables_initializer(),
                    tf.local_variables_initializer())
    # Create checkpoint saver
    saver = tf.train.Saver(max_to_keep=100)

    # Begin TensorFlow Session
    session_config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=session_config) as sess:
        # Run the Variable Initializer Op, then try to resume from the
        # latest checkpoint if requested
        sess.run(init)
        resume_status = False
        if FLAGS.resume:
            try:
                meta_list = get_checkpoints(FLAGS.log_dir)
                optimistic_restore(sess, meta_list[-1])
                resume_status = True
            except Exception:
                print('Checkpoint Load Failed')
                print('Training from scratch')
        if not resume_status:
            try:
                train_model.load_pretrained_weights(sess)
            except Exception:
                print('Failed to load pretrained weights.')
                print('Training from scratch')
                sys.stdout.flush()

        # Coordinator handles the data fetching threads
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)

        # Instantiate a summary writer to output summaries and the Graph.
        summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)

        # Actually begin the training process
        try:
            for step in xrange(FLAGS.max_steps):
                if coord.should_stop():
                    break
                start_time = time()

                # Run one step of the model.
                _, loss_value, acc = sess.run(
                    [train_op, train_loss, train_acc], feed_dict={phase: True})
                global_step_value = global_step.eval()
                duration_time = time() - start_time

                # debug profiler on step 3
                # open timeline.json in chrome://tracing/
                if FLAGS.profile and step == 3:
                    run_metadata = tf.RunMetadata()
                    # phase must still be fed for this traced run
                    _, loss, acc = sess.run(
                        [train_op, train_loss, train_acc],
                        feed_dict={phase: True},
                        options=tf.RunOptions(
                            trace_level=tf.RunOptions.FULL_TRACE),
                        run_metadata=run_metadata)
                    tl = timeline.Timeline(run_metadata.step_stats)
                    ctf = tl.generate_chrome_trace_format()
                    with open('timeline.json', 'w') as f:
                        f.write(ctf)

                # Display progress
                if global_step_value % 1 == 0:
                    # Print progress to stdout
                    print('Step %d: loss = %.2f, acc = %.2f (%.3f sec)' %
                          (global_step_value, loss_value, acc, duration_time))
                    sys.stdout.flush()

                # Write the summaries
                if global_step_value % 20 == 0:
                    # Update the summary file
                    summary_str = sess.run(summary, feed_dict={phase: False})
                    summary_writer.add_summary(summary_str, global_step_value)
                    summary_writer.flush()

                # Save Model Checkpoint
                if (global_step_value)%FLAGS.checkpoint_freq==0 or \
                        (global_step_value+1)==FLAGS.max_steps:
                    checkpoint_path = os.path.join(FLAGS.log_dir, 'model')
                    saver.save(sess, checkpoint_path, global_step=global_step)
                #loop_time = time() - start_time
                #print('Total Loop Time: %.3f' % loop_time)
        except tf.errors.OutOfRangeError:
            print('Done Training -- Epoch limit reached.')
            sys.stdout.flush()
        except Exception as e:
            print("Exception encountered: ", e)
            sys.stdout.flush()

        # Stop Queueing data, we're done!
        coord.request_stop()
        coord.join(threads)
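
All three examples read their configuration from a module-level FLAGS object that the listing does not show. A hedged sketch of how those flags could be declared with tf.app.flags follows; the flag names match the code above, while the defaults and help strings are assumptions.

import tensorflow as tf

flags = tf.app.flags
flags.DEFINE_string('log_dir', '/tmp/logs', 'Directory for checkpoints and summaries')
flags.DEFINE_integer('gpu', None, 'GPU index to use; None falls back to /cpu:0')
flags.DEFINE_boolean('resume', False, 'Resume training from the latest checkpoint')
flags.DEFINE_integer('max_steps', 100000, 'Maximum number of training steps')
flags.DEFINE_integer('checkpoint_freq', 1000, 'Save a checkpoint every N steps')
flags.DEFINE_integer('batch_size', 32, 'Batch size used by DataPipeline')
flags.DEFINE_boolean('profile', False, 'Write timeline.json for step 3')
flags.DEFINE_boolean('print_pred', False, 'Print predictions during validation')
FLAGS = flags.FLAGS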
Example #3
def run_validate():
    # Get all ckpt names in log dir (without meta ext)
    meta_list = get_checkpoints(FLAGS.log_dir)

    # GPU/CPU Flag
    if FLAGS.gpu is not None:
        compute_string = '/gpu:' + str(FLAGS.gpu)
    else:
        compute_string = '/cpu:0'

    # Iterate through the checkpoints
    val_loss = []
    val_acc = []
    val_itr = []
    for ckpt_path in meta_list:
        tf.reset_default_graph()

        ####################
        # Setup Data Queue #
        ####################
        with tf.device("/cpu:0"):
            with tf.variable_scope('validate') as scope:
                data_pipeline = DataPipeline(augment=False,
                                             num_epochs=1,
                                             shuffle=False)
                validate_x, validate_y, ids = data_pipeline.batch_ops()

        with tf.device(compute_string):
            ##########################
            # Declare Validate Graph #
            ##########################
            # Sets train/test mode; currently only used for BatchNormalization
            # True: Train   False: Test
            phase = tf.placeholder(tf.bool, name='phase')
            validate_model = model(validate_x, validate_y, phase)

            # Delete extraneous info when done debugging
            validate_pred = validate_model.inference()
            validate_acc = validate_model.evaluate()
            validate_loss, gt_y = validate_model.loss()
            global_step = validate_model.get_global_step()
            # Add summaries analogous to the train graph so merge_all()
            # has something to collect
            tf.summary.scalar('validate_loss', validate_loss)
            tf.summary.scalar('validate_acc', validate_acc)
        summary = tf.summary.merge_all()

        init = tf.group(tf.global_variables_initializer(),
                        tf.local_variables_initializer())

        session_config = tf.ConfigProto(allow_soft_placement=True)
        with tf.Session(config=session_config) as sess:
            sess.run(init)
            summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)

            # Coordinator handles the data fetching threads
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord)

            optimistic_restore(sess, ckpt_path)
            global_step_value = global_step.eval()
            try:
                step = 0
                cum_loss = 0
                cum_acc = 0
                cum_time = 0
                while True:
                    if coord.should_stop():
                        break
                    step += 1
                    start_time = time()
                    loss_value, acc_value, prediction_value, gt_value, ids_value = sess.run(
                        [
                            validate_loss, validate_acc, validate_pred, gt_y,
                            ids
                        ],
                        feed_dict={phase: False})
                    duration_time = time() - start_time

                    cum_loss += loss_value
                    cum_acc += acc_value
                    cum_time += duration_time

                    if step % 1 == 0:
                        # Print progress to stdout
                        if FLAGS.print_pred:
                            print(
                                'Step %d: loss = %.4f acc = %.4f (%.3f sec)' %
                                (step, loss_value, acc_value, duration_time))
                            print('Prediction:{}'.format(prediction_value))
                            print('GT:{}'.format(gt_value))
                        sys.stdout.flush()

                    # Write the summaries
                    if step % 25 == 0:
                        # Update the summary file
                        summary_str = sess.run(summary,
                                               feed_dict={phase: False})
                        summary_writer.add_summary(summary_str,
                                                   global_step_value)
                        summary_writer.flush()

            except tf.errors.OutOfRangeError:
                # Input queue exhausted; the last increment did not process a batch
                step -= 1
            except Exception as e:
                print("Exception encountered: ", e)
                sys.stdout.flush()
                step -= 1

            # Stop Queueing data, we're done!
            coord.request_stop()
            coord.join(threads)

        avg_loss = cum_loss / step
        avg_acc = cum_acc / step
        avg_time = cum_time / step

        val_loss.append(float(avg_loss))
        val_acc.append(float(avg_acc))
        val_itr.append(int(global_step_value))

        print('Results For Load File: %s' % ckpt_path)
        print('Average_Loss = %.4f' % avg_loss)
        print('Average_Acc = %.4f' % avg_acc)
        print('Run Time: %.2f' % cum_time)
        sys.stdout.flush()

    val_loss = np.asarray(val_loss)
    val_acc = np.asarray(val_acc)
    val_itr = np.asarray(val_itr)

    best_loss = np.amin(val_loss)
    best_acc = np.amax(val_acc)
    best_itr = val_itr[np.argmax(val_acc)]

    print('Overall Results')
    print('Minimum Loss: %.4f' % best_loss)
    print('Maximum Acc: %.4f' % best_acc)
    print('Best Checkpoint: %d' % best_itr)

    save_path = os.path.join(FLAGS.log_dir, 'validation_results.npz')
    np.savez(save_path, val_loss=val_loss, val_acc=val_acc, val_itr=val_itr)
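
For reference, the .npz file written at the end of run_validate can be read back to recover the best checkpoint. The snippet below is only an illustrative consumer of the saved arrays and assumes the FLAGS setup sketched earlier.

import os
import numpy as np

results = np.load(os.path.join(FLAGS.log_dir, 'validation_results.npz'))
best_idx = np.argmax(results['val_acc'])
print('Best checkpoint step: %d (acc = %.4f, loss = %.4f)' %
      (results['val_itr'][best_idx],
       results['val_acc'][best_idx],
       results['val_loss'][best_idx]))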