Example #1
File: train.py Project: czbiohub/luminoth
def run(
    config,
    target="",
    cluster_spec=None,
    is_chief=True,
    job_name=None,
    task_index=None,
    get_model_fn=get_model,
    get_dataset_fn=get_dataset,
    environment=None,
):
    model_class = get_model_fn(config.model.type)

    image_vis = config.train.get("image_vis")
    var_vis = config.train.get("var_vis")

    if config.train.get("seed") is not None:
        tf.set_random_seed(config.train.seed)

    log_prefix = (
        "[{}-{}] - ".format(job_name, task_index)
        if job_name is not None and task_index is not None
        else ""
    )

    if config.train.debug or config.train.tf_debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    model = model_class(config)

    # Placement of ops on devices using replica device setter
    # which automatically places the parameters on the `ps` server
    # and the `ops` on the workers
    #
    # See:
    # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
    with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
        try:
            config["dataset"]["type"]
        except KeyError:
            raise KeyError("dataset.type should be set on the custom config.")

        try:
            dataset_class = get_dataset_fn(config.dataset.type)
            dataset = dataset_class(config)
            train_dataset = dataset()
        except InvalidDataDirectory as exc:
            tf.logging.error("Error while reading dataset, {}".format(exc))
            sys.exit(1)

        train_image = train_dataset["image"]
        train_filename = train_dataset["filename"]
        train_bboxes = train_dataset["bboxes"]

        prediction_dict = model(train_image, train_bboxes, is_training=True)
        total_loss = model.loss(prediction_dict)

        global_step = tf.train.get_or_create_global_step()

        optimizer = get_optimizer(config.train, global_step)

        # TODO: Is this necessary? Couldn't we just get them from the
        # trainable vars collection? We should probably improve our
        # usage of collections.
        trainable_vars = model.get_trainable_vars()

        # Compute, clip and apply gradients
        with tf.name_scope("gradients"):
            grads_and_vars = optimizer.compute_gradients(total_loss, trainable_vars)

            if config.train.clip_by_norm:
                grads_and_vars = clip_gradients_by_norm(grads_and_vars)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.apply_gradients(
                grads_and_vars, global_step=global_step
            )

        # Create custom init for slots in optimizer, as we don't save them to
        # our checkpoints. An example of slots in an optimizer are the Momentum
        # variables in MomentumOptimizer. We do this because slot variables can
        # effectively duplicate the size of your checkpoint!
        slot_variables = [
            optimizer.get_slot(var, name)
            for name in optimizer.get_slot_names()
            for var in trainable_vars
        ]
        slot_init = tf.variables_initializer(
            slot_variables, name="optimizer_slots_initializer"
        )

        # Create saver for saving/restoring model
        model_saver = tf.train.Saver(
            set(tf.global_variables()) - set(slot_variables),
            name="model_saver",
            max_to_keep=config.train.get("checkpoints_max_keep", 1),
        )

        # Create saver for loading pretrained checkpoint into base network
        base_checkpoint_vars = model.get_base_network_checkpoint_vars()
        checkpoint_file = model.get_checkpoint_file()
        if base_checkpoint_vars and checkpoint_file:
            base_net_checkpoint_saver = tf.train.Saver(
                base_checkpoint_vars, name="base_net_checkpoint_saver"
            )

            # We'll send this fn to Scaffold init_fn
            def load_base_net_checkpoint(_, session):
                base_net_checkpoint_saver.restore(session, checkpoint_file)

        else:
            load_base_net_checkpoint = None

    tf.logging.info("{}Starting training for {}".format(log_prefix, model))

    run_options = None
    if config.train.full_trace:
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)

    # Create custom Scaffold to make sure we run our own init_op when model
    # is not restored from checkpoint.
    summary_op = [model.summary]
    summaries = tf.summary.merge_all()
    if summaries is not None:
        summary_op.append(summaries)
    summary_op = tf.summary.merge(summary_op)

    # `ready_for_local_init_op` is hardcoded to 'ready' as local init doesn't
    # depend on global init and `local_init_op` only runs when it is set as
    # 'ready' (an empty string tensor sets it as ready).
    scaffold = tf.train.Scaffold(
        saver=model_saver,
        init_op=tf.global_variables_initializer() if is_chief else tf.no_op(),
        local_init_op=tf.group(tf.local_variables_initializer(), slot_init),
        ready_for_local_init_op=tf.constant([], dtype=tf.string),
        summary_op=summary_op,
        init_fn=load_base_net_checkpoint,
    )

    # Custom hooks for our session
    hooks = []
    chief_only_hooks = []

    if config.train.tf_debug:
        debug_hook = tf_debug.LocalCLIDebugHook()
        debug_hook.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
        hooks.extend([debug_hook])

    if not config.train.job_dir:
        tf.logging.warning(
            "`job_dir` is not defined. Checkpoints and logs will not be saved."
        )
        checkpoint_dir = None
    elif config.train.run_name:
        # Use run_name when available
        checkpoint_dir = os.path.join(config.train.job_dir, config.train.run_name)
    else:
        checkpoint_dir = config.train.job_dir

    should_add_hooks = (
        config.train.display_every_steps
        or config.train.display_every_secs
        and checkpoint_dir is not None
    )
    if should_add_hooks:
        if not config.train.debug and image_vis == "debug":
            tf.logging.warning("ImageVisHook will not run without debug mode.")
        elif image_vis is not None:
            # ImageVis only runs on the chief.
            chief_only_hooks.append(
                ImageVisHook(
                    prediction_dict,
                    image=train_dataset["image"],
                    gt_bboxes=train_dataset["bboxes"],
                    config=config.model,
                    output_dir=checkpoint_dir,
                    every_n_steps=config.train.display_every_steps,
                    every_n_secs=config.train.display_every_secs,
                    image_visualization_mode=image_vis,
                )
            )

        if var_vis is not None:
            # VarVis only runs on the chief.
            chief_only_hooks.append(
                VarVisHook(
                    every_n_steps=config.train.display_every_steps,
                    every_n_secs=config.train.display_every_secs,
                    mode=var_vis,
                    output_dir=checkpoint_dir,
                    vars_summary=model.vars_summary,
                )
            )

    step = -1
    with tf.train.MonitoredTrainingSession(
        master=target,
        is_chief=is_chief,
        checkpoint_dir=checkpoint_dir,
        scaffold=scaffold,
        hooks=hooks,
        chief_only_hooks=chief_only_hooks,
        save_checkpoint_secs=config.train.save_checkpoint_secs,
        save_summaries_steps=config.train.save_summaries_steps,
        save_summaries_secs=config.train.save_summaries_secs,
    ) as sess:

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            while not coord.should_stop():
                before = time.time()
                _, train_loss, step, filename = sess.run(
                    [train_op, total_loss, global_step, train_filename],
                    options=run_options,
                )

                # TODO: Add image summary every once in a while.

                tf.logging.info(
                    "{}step: {}, file: {}, train_loss: {}, in {:.2f}s".format(
                        log_prefix, step, filename, train_loss, time.time() - before
                    )
                )

                if is_chief and step == 1:
                    # We save the run after first batch to make sure everything
                    # works properly.
                    save_run(config, environment=environment)

        except tf.errors.OutOfRangeError:
            tf.logging.info(
                "{}finished training after {} epoch limit".format(
                    log_prefix, config.train.num_epochs
                )
            )

            # TODO: Print summary
        finally:
            coord.request_stop()

        # Wait for all threads to stop.
        coord.join(threads)

        return step
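A minimal sketch of the slot-variable trick used in Example #1, assuming TensorFlow 1.x: optimizer slots (here the Momentum accumulators) are excluded from the Saver and re-created through a custom local initializer on the Scaffold, which keeps checkpoints small. The tiny quadratic loss and the variable `x` are made up for illustration; only the Scaffold wiring mirrors the example.

import tensorflow as tf

x = tf.Variable(0.0, name="x")
loss = tf.square(x - 3.0)
global_step = tf.train.get_or_create_global_step()
optimizer = tf.train.MomentumOptimizer(learning_rate=0.1, momentum=0.9)
train_op = optimizer.minimize(loss, global_step=global_step)

# Momentum accumulators are slot variables; keep them out of the checkpoint.
slot_variables = [
    optimizer.get_slot(var, name)
    for name in optimizer.get_slot_names()
    for var in [x]
]
slot_init = tf.variables_initializer(
    slot_variables, name="optimizer_slots_initializer"
)
model_saver = tf.train.Saver(
    list(set(tf.global_variables()) - set(slot_variables)), name="model_saver"
)

scaffold = tf.train.Scaffold(
    saver=model_saver,
    init_op=tf.global_variables_initializer(),
    # Re-create the slots locally; the empty string tensor marks local init
    # as "ready" so it never waits on global init.
    local_init_op=tf.group(tf.local_variables_initializer(), slot_init),
    ready_for_local_init_op=tf.constant([], dtype=tf.string),
)

with tf.train.MonitoredTrainingSession(scaffold=scaffold) as sess:
    for _ in range(5):
        sess.run(train_op)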
Example #2
def run(config,
        target='',
        cluster_spec=None,
        is_chief=True,
        job_name=None,
        task_index=None,
        get_model_fn=get_model,
        get_dataset_fn=get_dataset,
        environment=None):
    model_class = get_model_fn(config.model.type)

    image_vis = config.train.get('image_vis')
    var_vis = config.train.get('var_vis')

    if config.train.get('seed') is not None:
        tf.set_random_seed(config.train.seed)

    log_prefix = '[{}-{}] - '.format(job_name, task_index) \
        if job_name is not None and task_index is not None else ''

    if config.train.debug or config.train.tf_debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    model = model_class(config)

    # Placement of ops on devices using replica device setter
    # which automatically places the parameters on the `ps` server
    # and the `ops` on the workers
    #
    # See:
    # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
    with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
        try:
            config['dataset']['type']
        except KeyError:
            raise KeyError('dataset.type should be set on the custom config.')

        try:
            dataset_class = get_dataset_fn(config.dataset.type)
            dataset = dataset_class(config)
            train_dataset = dataset()
        except InvalidDataDirectory as exc:
            tf.logging.error("Error while reading dataset, {}".format(exc))
            sys.exit(1)

        train_image = train_dataset['image']
        train_filename = train_dataset['filename']
        train_bboxes = train_dataset['bboxes']

        prediction_dict = model(train_image, train_bboxes, is_training=True)
        total_loss = model.loss(prediction_dict)

        global_step = tf.train.get_or_create_global_step()

        optimizer = get_optimizer(config.train, global_step)

        trainable_vars = model.get_trainable_vars()

        with tf.name_scope('gradients'):
            # Compute, clip and apply gradients
            grads_and_vars = optimizer.compute_gradients(
                total_loss, trainable_vars)

            # Clip by norm.
            if config.train.clip_by_norm:
                grads_and_vars = clip_gradients_by_norm(grads_and_vars)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

    tf.logging.info('{}Starting training for {}'.format(log_prefix, model))

    run_options = None
    if config.train.full_trace:
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)

    if is_chief:
        # Load pretrained weights needs to be called before defining the train
        # op. After it, variables for the optimizer are created.
        with tf.control_dependencies([tf.global_variables_initializer()]):
            with tf.control_dependencies([model.load_pretrained_weights()]):
                init_op = tf.no_op(name='global_init_load_pretrained')
    else:
        init_op = tf.no_op()

    # Create custom Scaffold to make sure we run our own init_op when model
    # is not restored from checkpoint.
    summary_op = [model.summary]
    summaries = tf.summary.merge_all()
    if summaries is not None:
        summary_op.append(summaries)
    summary_op = tf.summary.merge(summary_op)

    scaffold = tf.train.Scaffold(
        # Initialize local and global variables.
        init_op=init_op,
        # Queue-related variables need a special initializer.
        local_init_op=tf.local_variables_initializer(),
        summary_op=summary_op,
    )

    # Custom hooks for our session
    hooks = []
    chief_only_hooks = []

    if config.train.tf_debug:
        debug_hook = tf_debug.LocalCLIDebugHook()
        debug_hook.add_tensor_filter('has_inf_or_nan', tf_debug.has_inf_or_nan)
        hooks.extend([debug_hook])

    if not config.train.job_dir:
        tf.logging.warning(
            '`job_dir` is not defined. Checkpoints and logs will not be saved.'
        )
        checkpoint_dir = None
    elif config.train.run_name:
        # Use run_name when available
        checkpoint_dir = os.path.join(config.train.job_dir,
                                      config.train.run_name)
    else:
        checkpoint_dir = config.train.job_dir

    should_add_hooks = (config.train.display_every_steps
                        or config.train.display_every_secs
                        and checkpoint_dir is not None)
    if should_add_hooks:
        if not config.train.debug and image_vis == 'debug':
            tf.logging.warning('ImageVisHook will not run without debug mode.')
        elif image_vis is not None:
            # ImageVis only runs on the chief.
            chief_only_hooks.append(
                ImageVisHook(prediction_dict,
                             image=train_dataset['image'],
                             gt_bboxes=train_dataset['bboxes'],
                             with_rcnn=config.model.network.with_rcnn,
                             output_dir=checkpoint_dir,
                             every_n_steps=config.train.display_every_steps,
                             every_n_secs=config.train.display_every_secs,
                             image_visualization_mode=image_vis))

        if var_vis is not None:
            # VarVis only runs on the chief.
            chief_only_hooks.append(
                VarVisHook(
                    every_n_steps=config.train.display_every_steps,
                    every_n_secs=config.train.display_every_secs,
                    mode=var_vis,
                    output_dir=checkpoint_dir,
                    vars_summary=model.vars_summary,
                ))

    step = -1
    with tf.train.MonitoredTrainingSession(
            master=target,
            is_chief=is_chief,
            checkpoint_dir=checkpoint_dir,
            scaffold=scaffold,
            hooks=hooks,
            chief_only_hooks=chief_only_hooks,
            save_checkpoint_secs=config.train.save_checkpoint_secs,
            save_summaries_steps=config.train.save_summaries_steps,
            save_summaries_secs=config.train.save_summaries_secs,
    ) as sess:

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            while not coord.should_stop():
                before = time.time()
                _, train_loss, step, filename = sess.run(
                    [train_op, total_loss, global_step, train_filename],
                    options=run_options)

                # TODO: Add image summary every once in a while.

                tf.logging.info(
                    '{}step: {}, file: {}, train_loss: {}, in {:.2f}s'.format(
                        log_prefix, step, filename, train_loss,
                        time.time() - before))

                if is_chief and step == 1:
                    # We save the run after first batch to make sure everything
                    # works properly.
                    save_run(config, environment=environment)

        except tf.errors.OutOfRangeError:
            tf.logging.info('{}finished training after {} epoch limit'.format(
                log_prefix, config.train.num_epochs))

            # TODO: Print summary
        finally:
            coord.request_stop()

        # Wait for all threads to stop.
        coord.join(threads)

        return step
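Examples #2 through #4 build their init_op by nesting control dependencies so that loading pretrained weights only happens after global variable initialization. Below is a minimal stand-alone sketch of that pattern, assuming TensorFlow 1.x; the `weights` variable and the `tf.assign` op stand in for whatever `model.load_pretrained_weights()` returns in Luminoth.

import numpy as np
import tensorflow as tf

weights = tf.get_variable("weights", shape=[2, 2])

with tf.control_dependencies([tf.global_variables_initializer()]):
    # Ops created here only run after global init has finished; this assign
    # plays the role of model.load_pretrained_weights().
    load_pretrained = tf.assign(weights, np.eye(2, dtype=np.float32))
    with tf.control_dependencies([load_pretrained]):
        init_op = tf.no_op(name="global_init_load_pretrained")

with tf.Session() as sess:
    sess.run(init_op)
    print(sess.run(weights))  # identity matrix written during init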
Example #3
def run(config, target='', cluster_spec=None, is_chief=True, job_name=None,
        task_index=None, get_model_fn=get_model, get_dataset_fn=get_dataset,
        environment=None):
    model_class = get_model_fn(config.model.type)

    image_vis = config.train.get('image_vis')
    var_vis = config.train.get('var_vis')

    if config.train.get('seed') is not None:
        tf.set_random_seed(config.train.seed)

    log_prefix = '[{}-{}] - '.format(job_name, task_index) \
        if job_name is not None and task_index is not None else ''

    if config.train.debug or config.train.tf_debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    model = model_class(config)

    # Placement of ops on devices using replica device setter
    # which automatically places the parameters on the `ps` server
    # and the `ops` on the workers
    #
    # See:
    # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
    with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
        try:
            config['dataset']['type']
        except KeyError:
            raise KeyError('dataset.type should be set on the custom config.')

        try:
            dataset_class = get_dataset_fn(config.dataset.type)
            dataset = dataset_class(config)
            train_dataset = dataset()
        except InvalidDataDirectory as exc:
            tf.logging.error(
                "Error while reading dataset, {}".format(exc)
            )
            sys.exit(1)

        train_image = train_dataset['image']
        train_filename = train_dataset['filename']
        train_bboxes = train_dataset['bboxes']

        prediction_dict = model(train_image, train_bboxes, is_training=True)
        total_loss = model.loss(prediction_dict)

        global_step = tf.train.get_or_create_global_step()

        optimizer = get_optimizer(config.train, global_step)

        trainable_vars = model.get_trainable_vars()

        with tf.name_scope('gradients'):
            # Compute, clip and apply gradients
            grads_and_vars = optimizer.compute_gradients(
                total_loss, trainable_vars
            )

            # Clip by norm.
            if config.train.clip_by_norm:
                grads_and_vars = clip_gradients_by_norm(grads_and_vars)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.apply_gradients(
                grads_and_vars, global_step=global_step
            )

    tf.logging.info('{}Starting training for {}'.format(log_prefix, model))

    run_options = None
    if config.train.full_trace:
        run_options = tf.RunOptions(
            trace_level=tf.RunOptions.FULL_TRACE
        )

    if is_chief:
        # Load pretrained weights needs to be called before defining the train
        # op. After it, variables for the optimizer are created.
        with tf.control_dependencies([tf.global_variables_initializer()]):
            with tf.control_dependencies([model.load_pretrained_weights()]):
                init_op = tf.no_op(name='global_init_load_pretrained')
    else:
        init_op = tf.no_op()

    # Create custom Scaffold to make sure we run our own init_op when model
    # is not restored from checkpoint.
    summary_op = [model.summary]
    summaries = tf.summary.merge_all()
    if summaries is not None:
        summary_op.append(summaries)
    summary_op = tf.summary.merge(summary_op)

    scaffold = tf.train.Scaffold(
        # Initialize local and global variables.
        init_op=init_op,
        # Queue-related variables need a special initializer.
        local_init_op=tf.local_variables_initializer(),
        summary_op=summary_op,
    )

    # Custom hooks for our session
    hooks = []
    chief_only_hooks = []

    if config.train.tf_debug:
        debug_hook = tf_debug.LocalCLIDebugHook()
        debug_hook.add_tensor_filter(
            'has_inf_or_nan', tf_debug.has_inf_or_nan
        )
        hooks.extend([debug_hook])

    if not config.train.job_dir:
        tf.logging.warning(
            '`job_dir` is not defined. Checkpoints and logs will not be saved.'
        )
        checkpoint_dir = None
    elif config.train.run_name:
        # Use run_name when available
        checkpoint_dir = os.path.join(
            config.train.job_dir, config.train.run_name
        )
    else:
        checkpoint_dir = config.train.job_dir

    should_add_hooks = (
        config.train.display_every_steps
        or config.train.display_every_secs
        and checkpoint_dir is not None
    )
    if should_add_hooks:
        if not config.train.debug and image_vis == 'debug':
            tf.logging.warning('ImageVisHook will not run without debug mode.')
        elif image_vis is not None:
            # ImageVis only runs on the chief.
            chief_only_hooks.append(
                ImageVisHook(
                    prediction_dict,
                    image=train_dataset['image'],
                    gt_bboxes=train_dataset['bboxes'],
                    with_rcnn=config.model.network.with_rcnn,
                    output_dir=checkpoint_dir,
                    every_n_steps=config.train.display_every_steps,
                    every_n_secs=config.train.display_every_secs,
                    image_visualization_mode=image_vis
                )
            )

        if var_vis is not None:
            # VarVis only runs on the chief.
            chief_only_hooks.append(
                VarVisHook(
                    every_n_steps=config.train.display_every_steps,
                    every_n_secs=config.train.display_every_secs,
                    mode=var_vis,
                    output_dir=checkpoint_dir,
                    vars_summary=model.vars_summary,
                )
            )

    step = -1
    with tf.train.MonitoredTrainingSession(
        master=target,
        is_chief=is_chief,
        checkpoint_dir=checkpoint_dir,
        scaffold=scaffold,
        hooks=hooks,
        chief_only_hooks=chief_only_hooks,
        save_checkpoint_secs=config.train.save_checkpoint_secs,
        save_summaries_steps=config.train.save_summaries_steps,
        save_summaries_secs=config.train.save_summaries_secs,
    ) as sess:

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            while not coord.should_stop():
                before = time.time()
                _, train_loss, step, filename = sess.run([
                    train_op, total_loss, global_step, train_filename
                ], options=run_options)

                # TODO: Add image summary every once in a while.

                tf.logging.info(
                    '{}step: {}, file: {}, train_loss: {}, in {:.2f}s'.format(
                        log_prefix, step, filename, train_loss,
                        time.time() - before
                    ))

                if is_chief and step == 1:
                    # We save the run after first batch to make sure everything
                    # works properly.
                    save_run(config, environment=environment)

        except tf.errors.OutOfRangeError:
            tf.logging.info(
                '{}finished training after {} epoch limit'.format(
                    log_prefix, config.train.num_epochs
                )
            )

            # TODO: Print summary
        finally:
            coord.request_stop()

        # Wait for all threads to stop.
        coord.join(threads)

        return step
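The `run` variants above accept `cluster_spec`, `job_name` and `task_index` and rely on `tf.train.replica_device_setter` for variable placement. A hedged sketch of the surrounding setup they expect, assuming TensorFlow 1.x; the host names are placeholders, and the commented call shows how `run` would plausibly be invoked rather than code taken from the project.

import tensorflow as tf

# One parameter server and two workers; addresses are illustrative only.
cluster_spec = tf.train.ClusterSpec({
    "ps": ["ps0.example.com:2222"],
    "worker": ["worker0.example.com:2222", "worker1.example.com:2222"],
})

with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
    weights = tf.get_variable("weights", shape=[10, 10])  # pinned to the ps job
    logits = tf.matmul(tf.ones([1, 10]), weights)         # stays on the worker

print(weights.device)  # /job:ps/task:0
print(logits.device)   # /job:worker

# In a real job each task would also create a tf.train.Server and pass
# server.target as `target`, e.g.:
# run(config, target=server.target, cluster_spec=cluster_spec,
#     is_chief=(task_index == 0), job_name=job_name, task_index=task_index)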
Example #4
File: train.py Project: hiredd/luminoth
def run(custom_config,
        model_type,
        override_params,
        target='',
        cluster_spec=None,
        is_chief=True,
        job_name=None,
        task_index=None,
        get_model_fn=get_model,
        get_dataset_fn=get_dataset):
    model_class = get_model_fn(model_type)

    config = get_model_config(
        model_class.base_config,
        custom_config,
        override_params,
    )

    if config.train.get('seed') is not None:
        tf.set_random_seed(config.train.seed)

    log_prefix = '[{}-{}] - '.format(job_name, task_index) \
        if job_name is not None and task_index is not None else ''

    if config.train.debug or config.train.tf_debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    model = model_class(config)

    # Placement of ops on devices using replica device setter
    # which automatically places the parameters on the `ps` server
    # and the `ops` on the workers
    #
    # See:
    # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
    with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
        try:
            config['dataset']['type']
        except KeyError:
            raise KeyError('dataset.type should be set on the custom config.')
        dataset_class = get_dataset_fn(config.dataset.type)
        dataset = dataset_class(config)
        train_dataset = dataset()

        train_image = train_dataset['image']
        train_filename = train_dataset['filename']
        train_bboxes = train_dataset['bboxes']

        # TODO: This is not the best place to configure rank? Why is rank not
        # transmitted through the queue
        train_image.set_shape((None, None, 3))
        # We add fake batch dimension to train data.
        # TODO: DEFINITELY NOT THE BEST PLACE
        train_image = tf.expand_dims(train_image, 0)

        prediction_dict = model(train_image, train_bboxes, is_training=True)
        total_loss = model.loss(prediction_dict)

        global_step = tf.contrib.framework.get_or_create_global_step()

        optimizer = get_optimizer(config.train, global_step)

        trainable_vars = model.get_trainable_vars()

        with tf.name_scope('gradients'):
            # Compute, clip and apply gradients
            grads_and_vars = optimizer.compute_gradients(
                total_loss, trainable_vars)

            # Clip by norm. TODO: Configurable
            grads_and_vars = clip_gradients_by_norm(grads_and_vars)

        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step)

    tf.logging.info('{}Starting training for {}'.format(log_prefix, model))

    run_options = None
    if config.train.full_trace:
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)

    if is_chief:
        # Load pretrained weights needs to be called before defining the train
        # op. After it, variables for the optimizer are created.
        with tf.control_dependencies([tf.global_variables_initializer()]):
            with tf.control_dependencies([model.load_pretrained_weights()]):
                init_op = tf.no_op(name='global_init_load_pretrained')
    else:
        init_op = tf.no_op()

    # Create custom Scaffold to make sure we run our own init_op when model
    # is not restored from checkpoint.
    scaffold = tf.train.Scaffold(
        # Initialize local and global variables.
        init_op=init_op,
        # Queue-related variables need a special initializer.
        local_init_op=tf.local_variables_initializer(),
        summary_op=tf.summary.merge([
            tf.summary.merge_all(),
            model.summary,
        ]))

    # Custom hooks for our session
    hooks = []
    chief_only_hooks = []

    if config.train.tf_debug:
        debug_hook = tf_debug.LocalCLIDebugHook()
        debug_hook.add_tensor_filter('has_inf_or_nan', tf_debug.has_inf_or_nan)
        hooks.extend([debug_hook])

    if not config.train.job_dir:
        tf.logging.warning(
            '`job_dir` is not defined. Checkpoints and logs will not be saved.'
        )
        checkpoint_dir = None
    elif config.train.run_name:
        # Use run_name when available
        checkpoint_dir = os.path.join(config.train.job_dir,
                                      config.train.run_name)
    else:
        checkpoint_dir = config.train.job_dir

    if config.train.display_every_steps or config.train.display_every_secs:
        if not config.train.debug:
            tf.logging.warning('ImageVisHook will not run without debug mode.')
        else:
            # ImageVis only runs on the chief.
            chief_only_hooks.append(
                ImageVisHook(prediction_dict,
                             with_rcnn=config.network.with_rcnn,
                             output_dir=checkpoint_dir,
                             every_n_steps=config.train.display_every_steps,
                             every_n_secs=config.train.display_every_secs))

    with tf.train.MonitoredTrainingSession(
            master=target,
            is_chief=is_chief,
            checkpoint_dir=checkpoint_dir,
            scaffold=scaffold,
            hooks=hooks,
            chief_only_hooks=chief_only_hooks,
            save_checkpoint_secs=config.train.save_checkpoint_secs,
            save_summaries_steps=config.train.save_summaries_steps,
            save_summaries_secs=config.train.save_summaries_secs,
    ) as sess:

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            while not coord.should_stop():
                before = time.time()
                _, train_loss, step, filename = sess.run(
                    [train_op, total_loss, global_step, train_filename],
                    options=run_options)

                # TODO: Add image summary every once in a while.

                tf.logging.info(
                    '{}step: {}, file: {}, train_loss: {}, in {:.2f}s'.format(
                        log_prefix, step, filename, train_loss,
                        time.time() - before))

        except tf.errors.OutOfRangeError:
            tf.logging.info('{}finished training after {} epoch limit'.format(
                log_prefix, config.train.num_epochs))

            # TODO: Print summary
        finally:
            coord.request_stop()

        # Wait for all threads to stop.
        coord.join(threads)
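Every variant splits gradient handling into compute, clip and apply steps. A minimal sketch of that split, assuming TensorFlow 1.x; `clip_gradients_by_norm` is Luminoth's own helper, so the per-gradient `tf.clip_by_norm` below is a generic stand-in rather than its actual implementation.

import tensorflow as tf

x = tf.Variable([2.0, -3.0], name="x")
loss = tf.reduce_sum(tf.square(x))
global_step = tf.train.get_or_create_global_step()
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)

with tf.name_scope("gradients"):
    grads_and_vars = optimizer.compute_gradients(loss, [x])
    # Clip each gradient to a maximum L2 norm of 5.0.
    grads_and_vars = [(tf.clip_by_norm(g, 5.0), v) for g, v in grads_and_vars]

# Make sure pending update ops (e.g. batch-norm moving averages) run first.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    _, loss_val, step_val = sess.run([train_op, loss, global_step])
    print(step_val, loss_val)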
Example #5
def run_local(config, environment=None):
    model_class = get_model(config.model.type)
    image_vis = config.train.get('image_vis')
    var_vis = config.train.get('var_vis')

    if config.train.get('seed') is not None:
        tf.set_random_seed(config.train.seed)

    if config.train.debug or config.train.tf_debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    model = model_class(config)

    num_gpus = config.train.get('num_gpus')
    if num_gpus is None:
        num_gpus = 1
    gpu_devices = ['gpu:{}'.format(i) for i in range(num_gpus)]
    gpu_indices = [i for i in range(num_gpus)]

    global_step = tf.train.get_or_create_global_step()

    optimizer = get_optimizer(config.train, global_step)

    def forward_pass_and_gradients(train_dataset):
        """
        Create forward loss and grads on each device
        """
        train_image = train_dataset['image']
        train_filename = train_dataset['filename']
        train_bboxes = train_dataset['bboxes']

        prediction_dict = model(train_image, train_bboxes, is_training=True)
        total_loss = model.loss(prediction_dict)

        # TODO: Is this necessary? Couldn't we just get them from the
        # trainable vars collection? We should probably improve our
        # usage of collections.
        trainable_vars = model.get_trainable_vars()

        # Compute, clip and apply gradients
        with tf.name_scope('gradients'):
            grads_and_vars = optimizer.compute_gradients(
                total_loss, trainable_vars)

            if config.train.clip_by_norm:
                grads_and_vars = clip_gradients_by_norm(grads_and_vars)

        return prediction_dict, total_loss, grads_and_vars

    def build_train_ops(device_grads):
        training_ops = []
        # average all gradients
        grads_to_reduce = [[g for g, _ in grad_vars]
                           for grad_vars in device_grads]
        algorithm = batch_allreduce.AllReduceSpecAlgorithm(
            'nccl', gpu_indices, 0, 10)
        reduced_grads, _ = algorithm.batch_all_reduce(grads_to_reduce, 0, 0, 0)
        reduced_device_grads = [[
            (g, v) for g, (_, v) in zip(grads, grad_vars)
        ] for grads, grad_vars in zip(reduced_grads, device_grads)]

        for i, device in enumerate(gpu_devices):
            with tf.device(device):
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                with tf.control_dependencies(update_ops):
                    train_op = optimizer.apply_gradients(
                        reduced_device_grads[i], global_step=global_step)
                    training_ops.append(train_op)
        train_ops = tf.group(*(training_ops), name='train_ops_group')
        return train_ops

    try:
        dataset_class = get_dataset(config.dataset.type)
        dataset = dataset_class(config)
    except InvalidDataDirectory as exc:
        tf.logging.error("Error while reading dataset, {}".format(exc))
        sys.exit(1)
    device_losses = []
    device_gradvars = []

    for device in gpu_devices:
        train_dataset = dataset()
        with tf.device(device):
            prediction_dict, loss, gradvars = forward_pass_and_gradients(
                train_dataset)
            device_losses.append(loss)
            device_gradvars.append(gradvars)

    train_filename = train_dataset['filename']

    train_op = build_train_ops(device_gradvars)
    # average losses
    average_loss = tf.reduce_mean(device_losses)

    # Create custom init for slots in optimizer, as we don't save them to
    # our checkpoints. An example of slots in an optimizer are the Momentum
    # variables in MomentumOptimizer. We do this because slot variables can
    # effectively duplicate the size of your checkpoint!
    trainable_vars = model.get_trainable_vars()
    slot_variables = [
        optimizer.get_slot(var, name) for name in optimizer.get_slot_names()
        for var in trainable_vars
    ]
    slot_init = tf.variables_initializer(slot_variables,
                                         name='optimizer_slots_initializer')

    # Create saver for saving/restoring model
    model_saver = tf.train.Saver(
        set(tf.global_variables()) - set(slot_variables),
        name='model_saver',
        max_to_keep=config.train.get('checkpoints_max_keep', 1),
    )

    # Create saver for loading pretrained checkpoint into base network
    base_checkpoint_vars = model.get_base_network_checkpoint_vars()
    checkpoint_file = model.get_checkpoint_file()
    if base_checkpoint_vars and checkpoint_file:
        base_net_checkpoint_saver = tf.train.Saver(
            base_checkpoint_vars, name='base_net_checkpoint_saver')

        # We'll send this fn to Scaffold init_fn
        def load_base_net_checkpoint(_, session):
            base_net_checkpoint_saver.restore(session, checkpoint_file)
    else:
        load_base_net_checkpoint = None

    tf.logging.info('Starting training for {}'.format(model))

    run_options = None
    if config.train.full_trace:
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)

    # Create custom Scaffold to make sure we run our own init_op when model
    # is not restored from checkpoint.
    summary_op = [model.summary]
    summaries = tf.summary.merge_all()
    if summaries is not None:
        summary_op.append(summaries)
    summary_op = tf.summary.merge(summary_op)

    # `ready_for_local_init_op` is hardcoded to 'ready' as local init doesn't
    # depend on global init and `local_init_op` only runs when it is set as
    # 'ready' (an empty string tensor sets it as ready).
    is_chief = True
    local_var_init_op = tf.local_variables_initializer()
    table_init_ops = tf.tables_initializer()
    variable_mgr_init_ops = [local_var_init_op]
    variable_mgr_init_ops.extend([table_init_ops])
    variable_mgr_init_ops.extend([slot_init])
    local_var_init_op_group = tf.group(*variable_mgr_init_ops)
    scaffold = tf.train.Scaffold(
        saver=model_saver,
        init_op=tf.global_variables_initializer() if is_chief else tf.no_op(),
        local_init_op=local_var_init_op_group,
        ready_for_local_init_op=tf.constant([], dtype=tf.string),
        summary_op=summary_op,
        init_fn=load_base_net_checkpoint,
    )

    # Custom hooks for our session
    hooks = []
    chief_only_hooks = []

    if config.train.tf_debug:
        debug_hook = tf_debug.LocalCLIDebugHook()
        debug_hook.add_tensor_filter('has_inf_or_nan', tf_debug.has_inf_or_nan)
        hooks.extend([debug_hook])

    if not config.train.job_dir:
        tf.logging.warning(
            '`job_dir` is not defined. Checkpoints and logs will not be saved.'
        )
        checkpoint_dir = None
    elif config.train.run_name:
        # Use run_name when available
        checkpoint_dir = os.path.join(config.train.job_dir,
                                      config.train.run_name)
    else:
        checkpoint_dir = config.train.job_dir

    should_add_hooks = (config.train.display_every_steps
                        or config.train.display_every_secs
                        and checkpoint_dir is not None)
    if should_add_hooks:
        if not config.train.debug and image_vis == 'debug':
            tf.logging.warning('ImageVisHook will not run without debug mode.')
        elif image_vis is not None:
            # ImageVis only runs on the chief.
            chief_only_hooks.append(
                ImageVisHook(prediction_dict,
                             image=train_dataset['image'],
                             gt_bboxes=train_dataset['bboxes'],
                             config=config.model,
                             output_dir=checkpoint_dir,
                             every_n_steps=config.train.display_every_steps,
                             every_n_secs=config.train.display_every_secs,
                             image_visualization_mode=image_vis))

        if var_vis is not None:
            # VarVis only runs on the chief.
            chief_only_hooks.append(
                VarVisHook(
                    every_n_steps=config.train.display_every_steps,
                    every_n_secs=config.train.display_every_secs,
                    mode=var_vis,
                    output_dir=checkpoint_dir,
                    vars_summary=model.vars_summary,
                ))

    step = -1
    target = ''
    config_proto = tf.ConfigProto()
    config_proto.allow_soft_placement = True
    with tf.train.MonitoredTrainingSession(
            master=target,
            is_chief=is_chief,
            checkpoint_dir=checkpoint_dir,
            scaffold=scaffold,
            hooks=hooks,
            chief_only_hooks=chief_only_hooks,
            save_checkpoint_secs=config.train.save_checkpoint_secs,
            save_summaries_steps=config.train.save_summaries_steps,
            save_summaries_secs=config.train.save_summaries_secs,
            config=config_proto,
    ) as sess:

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            while not coord.should_stop():
                before = time.time()
                _, train_loss, step, filename = sess.run(
                    [train_op, average_loss, global_step, train_filename],
                    options=run_options)

                # TODO: Add image summary every once in a while.

                tf.logging.info(
                    'step: {}, file: {}, train_loss: {}, in {:.2f}s'.format(
                        step, filename, train_loss,
                        time.time() - before))

                if is_chief and step == 1:
                    # We save the run after first batch to make sure everything
                    # works properly.
                    save_run(config, environment=environment)

        except tf.errors.OutOfRangeError:
            tf.logging.info('finished training after {} epoch limit'.format(
                config.train.num_epochs))

            # TODO: Print summary
        finally:
            coord.request_stop()

        # Wait for all threads to stop.
        coord.join(threads)

        return step
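Example #5 runs one tower per GPU and combines gradients with an NCCL batch all-reduce (`batch_allreduce.AllReduceSpecAlgorithm`, in the style of tf_cnn_benchmarks). Below is a hedged sketch of the same in-graph multi-tower idea, assuming TensorFlow 1.x, with a plain `tf.add_n` average standing in for the all-reduce; the device names assume two visible GPUs and soft placement falls back to CPU otherwise.

import tensorflow as tf

gpu_devices = ["/gpu:0", "/gpu:1"]
global_step = tf.train.get_or_create_global_step()
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)

weights = tf.get_variable("weights", shape=[4], initializer=tf.ones_initializer())

tower_losses = []
tower_grads = []
for device in gpu_devices:
    with tf.device(device):
        # Each tower computes its own loss against the shared variables.
        loss = tf.reduce_sum(tf.square(weights - tf.random_normal([4])))
        tower_losses.append(loss)
        tower_grads.append(optimizer.compute_gradients(loss, [weights]))

# Average the per-tower gradients variable by variable.
averaged_grads_and_vars = []
for grad_vars in zip(*tower_grads):
    grads = [g for g, _ in grad_vars]
    averaged_grads_and_vars.append((tf.add_n(grads) / len(grads), grad_vars[0][1]))

train_op = optimizer.apply_gradients(averaged_grads_and_vars,
                                     global_step=global_step)
average_loss = tf.reduce_mean(tower_losses)

config_proto = tf.ConfigProto(allow_soft_placement=True)
with tf.Session(config=config_proto) as sess:
    sess.run(tf.global_variables_initializer())
    _, loss_val = sess.run([train_op, average_loss])
    print(loss_val)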