Example #1
def set_up_summary_writer(model_config,
                          sess):
    """ Helper function to set up log directories and summary
        handlers.
    Args:
        model_config: Model protobuf configuration
        sess: A TensorFlow session
    """

    paths_config = model_config.paths_config

    logdir = paths_config.logdir
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    logdir = logdir + '/eval'

    datetime_str = str(datetime.datetime.now())
    summary_writer = tf.summary.FileWriter(logdir + '/' + datetime_str,
                                           sess.graph)

    global_summaries = set()
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
    summary_merged = summary_utils.summaries_to_keep(summaries,
                                                     global_summaries,
                                                     histograms=False,
                                                     input_imgs=False,
                                                     input_bevs=False)

    return summary_writer, summary_merged
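For context, here is a minimal sketch of how a helper like this is typically consumed in a TF1-style evaluation loop. The model_config value, feed_dict, and num_eval_steps are assumptions for illustration, not part of the example above.

# Hypothetical usage sketch (TF1 style). model_config, feed_dict and
# num_eval_steps are assumed to exist; they are not defined above.
with tf.Session() as sess:
    summary_writer, summary_merged = set_up_summary_writer(model_config, sess)

    for step in range(num_eval_steps):
        # Evaluate the merged summary op and log it for TensorBoard
        summary_out = sess.run(summary_merged, feed_dict=feed_dict)
        summary_writer.add_summary(summary_out, step)

    summary_writer.close()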
Example #2
def set_up_summary_writer(model_config,
                          sess):
    """ Helper function to set up log directories and summary
        handlers.
    Args:
        model_config: Model protobuf configuration
        sess: A TensorFlow session
    """

    paths_config = model_config.paths_config

    logdir = paths_config.logdir
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    logdir = logdir + '/eval'

    datetime_str = str(datetime.datetime.now())
    summary_writer = tf.summary.FileWriter(logdir + '/' + datetime_str,
                                           sess.graph)

    global_summaries = set()
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
    # Remove some image visualization summaries (meant for inspecting
    # training rather than evaluation) to shorten evaluation time.
    for summary in summaries.copy():
        name = summary.name
        # if 'vis_' in name:
        #     summaries.remove(summary)
        # elif 'nms' in name:
        #     summaries.remove(summary)
        # elif 'retinanet_losses' in name:
        if 'retinanet_losses' in name:
            summaries.remove(summary)

    if not summaries:
        return summary_writer, None

    summary_merged = summary_utils.summaries_to_keep(summaries,
                                                     global_summaries,
                                                     histograms=False,
                                                     input_imgs=False,
                                                     input_bevs=False)

    return summary_writer, summary_merged
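The filtering loop above is an instance of a more general pattern: prune the summary collection by substring before merging. A small helper makes that reusable; the name filter_summaries is ours for illustration, not part of the original codebase.

def filter_summaries(summaries, excluded_substrings):
    """Return the summaries whose names contain none of the given
    substrings. A sketch of the pruning pattern used above."""
    return set(
        summary for summary in summaries
        if not any(sub in summary.name for sub in excluded_substrings))

# e.g. summaries = filter_summaries(summaries, ['retinanet_losses'])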
Example #3
def train(model, train_config):
    """Training function for detection models.

    Args:
        model: The detection model object.
        train_config: A train_*pb2 protobuf that configures
            training, e.g. loading RPN weights onto the AVOD model.
    """

    # Get model configurations
    model_config = model.model_config

    # Create a variable tensor to hold the global step
    global_step_tensor = tf.Variable(
        0, trainable=False, name='global_step')

    #############################
    # Get training configurations
    #############################
    max_iterations = train_config.max_iterations
    summary_interval = train_config.summary_interval
    checkpoint_interval = train_config.checkpoint_interval
    max_checkpoints = train_config.max_checkpoints_to_keep

    paths_config = model_config.paths_config
    logdir = paths_config.logdir
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    checkpoint_dir = paths_config.checkpoint_dir
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    checkpoint_path = os.path.join(checkpoint_dir,
                                   model_config.checkpoint_name)

    global_summaries = set()

    # The model should return a dictionary of predictions
    prediction_dict = model.build()


    # WZN: for debug only
    # img_input_debug = model._rpn_model._img_preprocessed
    # bev_input_debug = model._rpn_model._bev_preprocessed
    # bev_pooled_debug = model._rpn_model.bev_input_pooled
    # img_pooled_debug = model._rpn_model.img_input_pooled
    # import numpy as np
    # import matplotlib.pyplot as plt

    summary_histograms = train_config.summary_histograms
    summary_img_images = train_config.summary_img_images
    summary_bev_images = train_config.summary_bev_images

    ##############################
    # Setup loss
    ##############################
    losses_dict, total_loss = model.loss(prediction_dict)

    # Optimizer
    training_optimizer = optimizer_builder.build(
        train_config.optimizer,
        global_summaries,
        global_step_tensor)

    # Create the train op
    with tf.variable_scope('train_op'):
        train_op = slim.learning.create_train_op(
            total_loss,
            training_optimizer,
            clip_gradient_norm=1.0,
            global_step=global_step_tensor)

    # Save checkpoints regularly.
    saver = tf.train.Saver(max_to_keep=max_checkpoints,
                           pad_step_number=True)

    # Add the result of the train_op to the summary
    tf.summary.scalar("training_loss", train_op)

    # Add maximum memory usage summary op.
    # This op can only be run on a device with a GPU,
    # so it is skipped on Travis.
    is_travis = 'TRAVIS' in os.environ
    if not is_travis:
        # tf.summary.scalar('bytes_in_use',
        #                   tf.contrib.memory_stats.BytesInUse())
        tf.summary.scalar('max_bytes',
                          tf.contrib.memory_stats.MaxBytesInUse())

    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
    summary_merged = summary_utils.summaries_to_keep(
        summaries,
        global_summaries,
        histograms=summary_histograms,
        input_imgs=summary_img_images,
        input_bevs=summary_bev_images
    )

    allow_gpu_mem_growth = train_config.allow_gpu_mem_growth
    if allow_gpu_mem_growth:
        # GPU memory config
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = allow_gpu_mem_growth
        sess = tf.Session(config=config)
    else:
        sess = tf.Session()

    # Create unique folder name using datetime for summary writer
    datetime_str = str(datetime.datetime.now())
    logdir = logdir + '/train'
    train_writer = tf.summary.FileWriter(logdir + '/' + datetime_str,
                                         sess.graph)

    # Create init op
    init = tf.global_variables_initializer()

    # Continue from last saved checkpoint
    if not train_config.overwrite_checkpoints:
        trainer_utils.load_checkpoints(checkpoint_dir,
                                       saver)
        if len(saver.last_checkpoints) > 0:
            checkpoint_to_restore = saver.last_checkpoints[-1]
            saver.restore(sess, checkpoint_to_restore)
        else:
            # Initialize the variables
            sess.run(init)
    else:
        # Initialize the variables
        sess.run(init)

    # Read the global step if restored
    global_step = tf.train.global_step(sess,
                                       global_step_tensor)
    print('Starting from step {} / {}'.format(
        global_step, max_iterations))

    # Main Training Loop
    last_time = time.time()
    for step in range(global_step, max_iterations + 1):

        # Save checkpoint
        if step % checkpoint_interval == 0:
            global_step = tf.train.global_step(sess,
                                               global_step_tensor)

            saver.save(sess,
                       save_path=checkpoint_path,
                       global_step=global_step)

            print('Step {} / {}, Checkpoint saved to {}-{:08d}'.format(
                step, max_iterations,
                checkpoint_path, global_step))

        # Create feed_dict for inference
        feed_dict = model.create_feed_dict()

        # WZN: only run for debug
        # bev_p, img_p = sess.run([bev_pooled_debug, img_pooled_debug], feed_dict)
        # bev_p = np.squeeze(bev_p)
        # img_p = np.squeeze(img_p)
        # import pdb
        # pdb.set_trace()

        # Write summaries and train op
        if step % summary_interval == 0:
            current_time = time.time()
            time_elapsed = current_time - last_time
            last_time = current_time

            train_op_loss, summary_out = sess.run(
                [train_op, summary_merged], feed_dict=feed_dict)

            print('Step {}, Total Loss {:0.3f}, Time Elapsed {:0.3f} s'.format(
                step, train_op_loss, time_elapsed))
            train_writer.add_summary(summary_out, step)

        else:
            # Run the train op only
            sess.run(train_op, feed_dict)

    # Close the summary writers
    train_writer.close()
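The restore-or-initialize block in the middle of train() is a standard TF1 idiom. For reference, an equivalent standalone sketch using tf.train.latest_checkpoint, as an alternative to the trainer_utils.load_checkpoints helper whose source is not shown here:

# Minimal restore-or-initialize sketch (TF1). checkpoint_dir and saver
# are assumed to match the ones created in train() above.
ckpt = tf.train.latest_checkpoint(checkpoint_dir)
if ckpt is not None:
    saver.restore(sess, ckpt)  # resume from the newest checkpoint
else:
    sess.run(tf.global_variables_initializer())  # cold start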
Example #4
def train(model, train_config):
    """Training function for detection models.

    Args:
        model: The detection model object.
        train_config: A train_*pb2 protobuf that configures
            training, e.g. loading RPN weights onto the AVOD model.
    """

    # Get model configurations
    model_config = model.model_config

    # Create a variable tensor to hold the global step
    global_step_tensor = tf.Variable(0, trainable=False, name='global_step')

    #############################
    # Get training configurations
    #############################
    max_iterations = train_config.max_iterations
    summary_interval = train_config.summary_interval
    checkpoint_interval = train_config.checkpoint_interval
    max_checkpoints = train_config.max_checkpoints_to_keep

    paths_config = model_config.paths_config
    logdir = paths_config.logdir
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    checkpoint_dir = paths_config.checkpoint_dir
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    checkpoint_path = os.path.join(checkpoint_dir,
                                   model_config.checkpoint_name)

    global_summaries = set()

    # The model should return a dictionary of predictions
    prediction_dict = model.build()

    summary_histograms = train_config.summary_histograms
    summary_img_images = train_config.summary_img_images
    summary_bev_images = train_config.summary_bev_images

    ##############################
    # Setup loss
    ##############################
    (loss_dict, total_loss, rpn_score_2d_loss,
     rpn_acc_score_neg, rpn_acc_score_pos,
     rpn_class_loss, rpn_reg_loss, rpn_acc_all,
     rpn_acc_pos, refine_class_loss, refine_reg_loss,
     avod_acc_all, avod_acc_pos) = model.loss(prediction_dict)

    # Optimizer
    training_optimizer = optimizer_builder.build(train_config.optimizer,
                                                 global_summaries,
                                                 global_step_tensor)

    # Create the train op
    with tf.variable_scope('train_op'):
        train_op = slim.learning.create_train_op(
            total_loss,
            training_optimizer,
            clip_gradient_norm=1.0,
            global_step=global_step_tensor)

    # Save checkpoints regularly.
    saver = tf.train.Saver(max_to_keep=max_checkpoints, pad_step_number=True)

    # Add the result of the train_op to the summary
    tf.summary.scalar("training_loss", train_op)

    # Add maximum memory usage summary op.
    # This op can only be run on a device with a GPU,
    # so it is skipped on Travis.
    is_travis = 'TRAVIS' in os.environ
    if not is_travis:
        # tf.summary.scalar('bytes_in_use',
        #                   tf.contrib.memory_stats.BytesInUse())
        tf.summary.scalar('max_bytes', tf.contrib.memory_stats.MaxBytesInUse())

    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
    summary_merged = summary_utils.summaries_to_keep(
        summaries,
        global_summaries,
        histograms=summary_histograms,
        input_imgs=summary_img_images,
        input_bevs=summary_bev_images)

    allow_gpu_mem_growth = train_config.allow_gpu_mem_growth
    if allow_gpu_mem_growth:
        # GPU memory config
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = allow_gpu_mem_growth
        sess = tf.Session(config=config)
    else:
        sess = tf.Session()

    # Create unique folder name using datetime for summary writer
    datetime_str = str(datetime.datetime.now())
    logdir = logdir + '/train'
    train_writer = tf.summary.FileWriter(logdir + '/' + datetime_str,
                                         sess.graph)

    # Create init op
    init = tf.global_variables_initializer()

    # Continue from last saved checkpoint
    step_base = 0  # default, so the offset below is defined on a cold start
    if not train_config.overwrite_checkpoints:
        trainer_utils.load_checkpoints(checkpoint_dir, saver)

        all_variables = tf.get_collection_ref(tf.GraphKeys.GLOBAL_VARIABLES)
        sess.run(tf.variables_initializer(all_variables))
        var_list = [
            var for var in all_variables if "beta" not in var.name
            and 'Adam' not in var.name and 'Average' not in var.name
        ]

        saver_part = tf.train.Saver(var_list=var_list)

        if train_config.use_pretrained:
            saver_part.restore(sess, train_config.pretrained)
            print('Model loaded from: {}'.format(train_config.pretrained))
            step_base = tf.train.global_step(sess, global_step_tensor)

        elif len(saver.last_checkpoints) > 0:
            checkpoint_to_restore = saver.last_checkpoints[-1]
            saver_part.restore(sess, checkpoint_to_restore)
            step_base = 0

        else:
            # Initialize the variables
            sess.run(init)
    else:
        # Initialize the variables
        sess.run(init)

    global_step = tf.train.global_step(sess, global_step_tensor) - step_base

    print('Starting from step {} / {}'.format(global_step, max_iterations))

    # Main Training Loop
    last_time = time.time()
    for step in range(global_step, max_iterations + 1):

        # Save checkpoint
        if step % checkpoint_interval == 0:
            global_step = tf.train.global_step(sess,
                                               global_step_tensor) - step_base

            saver.save(sess,
                       save_path=checkpoint_path,
                       global_step=global_step)

            print('Step {} / {}, Checkpoint saved to {}-{:08d}'.format(
                step, max_iterations, checkpoint_path, global_step))

        # Create feed_dict for inference
        feed_dict = model.create_feed_dict()

        # Write summaries and train op
        if step % summary_interval == 0:
            current_time = time.time()
            time_elapsed = current_time - last_time
            last_time = current_time

            (train_op_loss, summary_out,
             rpn_score_2d_loss_np,
             rpn_acc_score_neg_np,
             rpn_acc_score_pos_np,
             rpn_class_loss_np, rpn_reg_loss_np,
             rpn_acc_all_np, rpn_acc_pos_np,
             refine_class_loss_np, refine_reg_loss_np,
             avod_acc_all_np, avod_acc_pos_np) = sess.run(
                 [train_op, summary_merged,
                  rpn_score_2d_loss,
                  rpn_acc_score_neg,
                  rpn_acc_score_pos,
                  rpn_class_loss, rpn_reg_loss,
                  rpn_acc_all, rpn_acc_pos,
                  refine_class_loss, refine_reg_loss,
                  avod_acc_all, avod_acc_pos], feed_dict=feed_dict)

            print('Step {}, Total Loss {:0.3f} | '
                  'Score {:0.3f}, Acc {:0.2f} {:0.2f} | '
                  'RPN Class {:0.3f}, Reg {:0.3f}, Acc {:0.2f} {:0.2f} | '
                  'Final Class {:0.3f}, Reg {:0.3f}, Acc {:0.2f} {:0.2f}'.format(
                      step, train_op_loss, rpn_score_2d_loss_np,
                      rpn_acc_score_neg_np * 100, rpn_acc_score_pos_np * 100,
                      rpn_class_loss_np, rpn_reg_loss_np,
                      rpn_acc_all_np * 100, rpn_acc_pos_np * 100,
                      refine_class_loss_np, refine_reg_loss_np,
                      avod_acc_all_np * 100, avod_acc_pos_np * 100))
            train_writer.add_summary(summary_out, step)

        else:
            # Run the train op only
            sess.run(train_op, feed_dict)

    # Close the summary writers
    train_writer.close()
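When assembling a partial-restore var_list like the one above (dropping variables whose names contain 'beta', 'Adam', or 'Average'), it helps to inspect what the checkpoint actually contains first. A hedged sketch using tf.train.list_variables, which returns (name, shape) pairs:

# Sketch: list checkpoint contents before deciding what to restore.
# checkpoint_to_restore is the path restored in the code above.
for name, shape in tf.train.list_variables(checkpoint_to_restore):
    print(name, shape)

Note that filtering on the substring 'beta' also excludes any batch-norm beta variables, not only optimizer accumulators such as beta1_power and beta2_power; whether that is intended depends on the model.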
Example #5
def train(model, train_config):
    """Training function for detection models.

    Args:
        model: The detection model object.
        train_config: A train_*pb2 protobuf that configures
            training, e.g. loading RPN weights onto the AVOD model.
    """

    # Get model configurations
    model_config = model.model_config

    # Create a variable tensor to hold the global step
    global_step_tensor = tf.Variable(0, trainable=False, name='global_step')

    #############################
    # Get training configurations
    #############################
    max_iterations = train_config.max_iterations
    summary_interval = train_config.summary_interval
    checkpoint_interval = train_config.checkpoint_interval
    max_checkpoints = train_config.max_checkpoints_to_keep

    paths_config = model_config.paths_config
    logdir = paths_config.logdir
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    checkpoint_dir = paths_config.checkpoint_dir
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    checkpoint_path = os.path.join(checkpoint_dir,
                                   model_config.checkpoint_name)

    global_summaries = set()

    # The model should return a dictionary of predictions
    prediction_dict = model.build()

    summary_histograms = train_config.summary_histograms
    summary_img_images = train_config.summary_img_images
    summary_bev_images = train_config.summary_bev_images

    ##############################
    # Setup loss
    ##############################
    losses_dict, total_loss = model.loss(prediction_dict)

    ##############################################################################################
    # Select the trainable variables of the original AVOD model whose
    # gradients will be zeroed out (var0)

    var_moe = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                scope='mix_of_experts')
    var0 = list(tf.trainable_variables())
    var_all_but_var_moe = list(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

    for var in var_moe:
        var0.remove(var)
        var_all_but_var_moe.remove(var)

    ##############################################################################################
    # Create optimizer with 0 gradient for the AVOD model and an optimizer for the MoE
    training_optimizer0 = tf.train.GradientDescentOptimizer(0.0)
    training_optimizer1 = optimizer_builder.build(train_config.optimizer,
                                                  global_summaries,
                                                  global_step_tensor)
    ##############################################################################################

    # Create the train op. train_op1 is for MoE and train_op0 is for AVOD
    with tf.variable_scope('train_op'):
        # Create the training operations
        train_op1 = slim.learning.create_train_op(
            total_loss,
            training_optimizer1,
            variables_to_train=var_moe,
            clip_gradient_norm=1.0,
            global_step=global_step_tensor)
        train_op0 = slim.learning.create_train_op(
            total_loss,
            training_optimizer0,
            variables_to_train=var0,
            clip_gradient_norm=1.0,
            global_step=global_step_tensor)
        train_op = tf.group(train_op1, train_op0)

    ##############################################################################################

    # Save checkpoints regularly.
    saver = tf.train.Saver(max_to_keep=max_checkpoints, pad_step_number=True)

    # Add the result of the train_op to the summary
    tf.summary.scalar("training_loss", train_op1)
    # Add maximum memory usage summary op.
    # This op can only be run on a device with a GPU,
    # so it is skipped on Travis.
    is_travis = 'TRAVIS' in os.environ
    if not is_travis:
        # tf.summary.scalar('bytes_in_use',
        #                   tf.contrib.memory_stats.BytesInUse())
        tf.summary.scalar('max_bytes', tf.contrib.memory_stats.MaxBytesInUse())

    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
    summary_merged = summary_utils.summaries_to_keep(
        summaries,
        global_summaries,
        histograms=summary_histograms,
        input_imgs=summary_img_images,
        input_bevs=summary_bev_images)

    allow_gpu_mem_growth = train_config.allow_gpu_mem_growth
    if allow_gpu_mem_growth:
        # GPU memory config
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = allow_gpu_mem_growth
        sess = tf.Session(config=config)
    else:
        sess = tf.Session()

    # Create unique folder name using datetime for summary writer
    datetime_str = str(datetime.datetime.now())
    logdir = logdir + '/train'
    train_writer = tf.summary.FileWriter(logdir + '/' + datetime_str,
                                         sess.graph)

    # Create init op
    init = tf.global_variables_initializer()

    # Continue from last saved checkpoint
    if not train_config.overwrite_checkpoints:
        trainer_utils.load_checkpoints(checkpoint_dir, saver)
        if len(saver.last_checkpoints) > 0:
            checkpoint_to_restore = saver.last_checkpoints[-1]
            saver.restore(sess, checkpoint_to_restore)
        else:
            # Initialize the variables, then restore the original AVOD
            # weights from the given checkpoint path.
            checkpoint_path_start = \
                train_config.moe_config.initial_avod_checkpoint_path
            variables_to_restore = {
                var.op.name: slim.get_unique_variable(var.op.name)
                for var in var_all_but_var_moe
            }
            init_assign_op, init_feed_dict = slim.assign_from_checkpoint(
                checkpoint_path_start, variables_to_restore)
            sess.run(init)
            sess.run(init_assign_op, init_feed_dict)
    else:
        # Initialize the variables
        sess.run(init)

    # Read the global step if restored
    global_step = tf.train.global_step(sess, global_step_tensor)
    print('Starting from step {} / {}'.format(global_step, max_iterations))

    # Main Training Loop
    last_time = time.time()
    for step in range(global_step, max_iterations + 1):

        # Save checkpoint
        if step % checkpoint_interval == 0:
            global_step = tf.train.global_step(sess, global_step_tensor)

            saver.save(sess,
                       save_path=checkpoint_path,
                       global_step=global_step)

            print('Step {} / {}, Checkpoint saved to {}-{:08d}'.format(
                step, max_iterations, checkpoint_path, global_step))

        # Create feed_dict for inference
        feed_dict = model.create_feed_dict()

        # Write summaries and train op
        if step % summary_interval == 0:
            current_time = time.time()
            time_elapsed = current_time - last_time
            last_time = current_time

            train_op_loss, summary_out = sess.run([train_op1, summary_merged],
                                                  feed_dict=feed_dict)

            print('Step {}, Total Loss {:0.3f}, Time Elapsed {:0.3f} s'.format(
                step, train_op_loss, time_elapsed))
            train_writer.add_summary(summary_out, step)

        else:
            # Run the train op only
            sess.run(train_op1, feed_dict)

    # Close the summary writers
    train_writer.close()
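The zero-learning-rate optimizer (training_optimizer0) is one way to keep the AVOD weights untouched while still producing a grouped train op. A simpler sketch, under the assumption that the extra op is not otherwise needed, is to train only the MoE variables with a single op; note also that grouping two create_train_op calls that share global_step_tensor advances the step counter twice per iteration, which the single-op form avoids.

# Hedged alternative sketch: freeze the AVOD weights by training only
# the MoE variables, instead of pairing them with a zero-LR optimizer.
with tf.variable_scope('train_op'):
    train_op = slim.learning.create_train_op(
        total_loss,
        training_optimizer1,
        variables_to_train=var_moe,  # gradients flow only into the MoE
        clip_gradient_norm=1.0,
        global_step=global_step_tensor)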