def continue_training(logdir):
    """Resume an aborted training run.

    Loads the model definition and the latest weights found in ``logdir``
    and continues training from the stored global step.

    Parameters
    ----------
    logdir : string
        Directory with logs.
    """
    hypes = utils.load_hypes_from_logdir(logdir)
    modules = utils.load_modules_from_logdir(logdir)

    # Build the model into the default graph.
    with tf.Session() as sess:

        # Input queues come from the run's own input module.
        with tf.name_scope("Queues"):
            train_queue = modules['input'].create_queues(hypes, 'train')

        graph_ops = core.build_training_graph(hypes, train_queue, modules)

        # Prepare the tv session; from here on we use the session and saver
        # it created.
        session_objs = core.start_tv_session(hypes)
        sess = session_objs['sess']
        weight_saver = session_objs['saver']

        # Append further output to the run's existing log file.
        logging_file = os.path.join(logdir, 'output.log')
        utils.create_filewrite_handler(logging_file, mode='a')

        logging.info("Continue training.")
        start_step = core.load_weights(logdir, sess, weight_saver)
        if start_step is None:
            logging.warning("Loaded global_step is None.")
            logging.warning("This could mean,"
                            " that no weights have been loaded.")
            logging.warning("Starting Training with step 0.")
            start_step = 0

        # Share the training variables with the validation/inference graph.
        with tf.name_scope('Validation'):
            tf.get_variable_scope().reuse_variables()
            input_pl = tf.placeholder(tf.float32)
            batched = tf.expand_dims(input_pl, 0)
            batched.set_shape([1, None, None, 3])
            graph_ops['image_pl'] = input_pl
            graph_ops['inf_out'] = core.build_inference_graph(
                hypes, modules, image=batched)

        # Launch the input pipeline threads.
        modules['input'].start_enqueuing_threads(hypes, train_queue,
                                                 'train', sess)

        # Everything is built: run the loop, resuming at the restored step.
        run_training(hypes, modules, graph_ops, session_objs, start_step)

        # Shut the input threads down cleanly.
        session_objs['coord'].request_stop()
        session_objs['coord'].join(session_objs['threads'])
def do_training(hypes):
    """Train a model for up to hypes['solver']['max_steps'] steps.

    Progress is shown every utils.cfg.step_show steps and the model is
    written to hypes['dirs']['output_dir'] every utils.cfg.step_eval steps.

    Parameters
    ----------
    hypes : dict
        Hyperparameters.
    """
    modules = utils.load_modules_from_hypes(hypes)

    # Grow GPU memory on demand instead of grabbing it all up front.
    # For more details, look at
    # https://stackoverflow.com/questions/36927607/how-can-i-solve-ran-out-of-gpu-memory-in-tensorflow
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True

    # Build the model into the default graph.
    with tf.Session(config=sess_config) as sess:

        # Input queues come from the loaded input module.
        with tf.name_scope("Queues"):
            train_queue = modules['input'].create_queues(hypes, 'train')

        graph_ops = core.build_training_graph(hypes, train_queue, modules)

        # Prepare the tv session.
        session_objs = core.start_tv_session(hypes)

        # Share the training variables with the validation/inference graph.
        with tf.name_scope('Validation'):
            tf.get_variable_scope().reuse_variables()
            input_pl = tf.placeholder(tf.float32)
            batched = tf.expand_dims(input_pl, 0)
            batched.set_shape([1, None, None, 3])
            graph_ops['image_pl'] = input_pl
            graph_ops['inf_out'] = core.build_inference_graph(
                hypes, modules, image=batched)

        # Launch the input pipeline threads.
        modules['input'].start_enqueuing_threads(hypes, train_queue,
                                                 'train', sess)

        # Everything is built: run the training loop.
        train.run_training(hypes, modules, graph_ops, session_objs)

        # Shut the input threads down cleanly.
        session_objs['coord'].request_stop()
        session_objs['coord'].join(session_objs['threads'])
def do_training(hypes):
    """Train a model for up to hypes['solver']['max_steps'] steps.

    Progress is shown every utils.cfg.step_show steps and the model is
    written to hypes['dirs']['output_dir'] every utils.cfg.step_eval steps.

    Parameters
    ----------
    hypes : dict
        Hyperparameters.
    """
    modules = utils.load_modules_from_hypes(hypes)

    # Build the model into the default graph.
    with tf.Session() as sess:

        logging.info("..Creating queues")
        with tf.name_scope("Queues"):
            train_queue = modules['input'].create_queues(hypes, 'train')

        logging.info("..Building training graph")
        graph_ops = core.build_training_graph(hypes, train_queue, modules)

        # Prepare the tv session.
        session_objs = core.start_tv_session(hypes)

        # Share the training variables with the validation/inference graph.
        with tf.name_scope('Validation'):
            tf.get_variable_scope().reuse_variables()
            input_pl = tf.placeholder(tf.float32)
            batched = tf.expand_dims(input_pl, 0)
            batched.set_shape([1, None, None, 3])
            graph_ops['image_pl'] = input_pl
            graph_ops['inf_out'] = core.build_inference_graph(
                hypes, modules, image=batched)

        # Launch the input pipeline threads.
        logging.info("..Enqueuing files")
        modules['input'].start_enqueuing_threads(hypes, train_queue,
                                                 'train', sess)

        # Everything is built: run the training loop.
        logging.info("..Initializing the training")
        run_training(hypes, modules, graph_ops, session_objs)

        # Shut the input threads down cleanly.
        session_objs['coord'].request_stop()
        session_objs['coord'].join(session_objs['threads'])
def do_training(hypes):
    """Train a model for up to hypes['solver']['max_steps'] steps.

    Progress is shown every utils.cfg.step_show steps and the model is
    written to hypes['dirs']['output_dir'] every utils.cfg.step_eval steps.

    Parameters
    ----------
    hypes : dict
        Hyperparameters.
    """
    modules = utils.load_modules_from_hypes(hypes)

    # Build the model into the default graph.
    with tf.Session() as sess:

        # Input queues come from the loaded input module.
        with tf.name_scope("Queues"):
            train_queue = modules['input'].create_queues(hypes, 'train')

        graph_ops = core.build_training_graph(hypes, train_queue, modules)

        # Prepare the tv session.
        session_objs = core.start_tv_session(hypes)

        # Share the training variables with the validation/inference graph.
        with tf.name_scope('Validation'):
            tf.get_variable_scope().reuse_variables()
            input_pl = tf.placeholder(tf.float32)
            batched = tf.expand_dims(input_pl, 0)
            batched.set_shape([1, None, None, 3])
            graph_ops['image_pl'] = input_pl
            graph_ops['inf_out'] = core.build_inference_graph(
                hypes, modules, image=batched)

        # Launch the input pipeline threads.
        modules['input'].start_enqueuing_threads(hypes, train_queue,
                                                 'train', sess)

        # Everything is built: run the training loop.
        run_training(hypes, modules, graph_ops, session_objs)

        # Shut the input threads down cleanly.
        session_objs['coord'].request_stop()
        session_objs['coord'].join(session_objs['threads'])
def continue_training(logdir):
    """
    Continues training of a model.

    This will load model files and weights found in logdir and continues
    an aborted training.

    Parameters
    ----------
    logdir : string
        Directory with logs.
    """
    hypes = utils.load_hypes_from_logdir(logdir)
    modules = utils.load_modules_from_logdir(logdir)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Session() as sess:

        # build the graph based on the loaded modules
        with tf.name_scope("Queues"):
            queue = modules['input'].create_queues(hypes, 'train')

        tv_graph = core.build_training_graph(hypes, queue, modules)

        # prepare the tv session
        tv_sess = core.start_tv_session(hypes)
        # NOTE(review): the `sess` bound by the `with` header is shadowed
        # here; everything below runs in the session start_tv_session made.
        sess = tv_sess['sess']
        saver = tv_sess['saver']

        # Append log output to the run's existing output.log.
        logging_file = os.path.join(logdir, 'output.log')
        utils.create_filewrite_handler(logging_file, mode='a')

        logging.info("Continue training.")
        # Restore the newest checkpoint; returns the stored global step.
        cur_step = core.load_weights(logdir, sess, saver)
        if cur_step is None:
            logging.warning("Loaded global_step is None.")
            logging.warning("This could mean,"
                            " that no weights have been loaded.")
            logging.warning("Starting Training with step 0.")
            cur_step = 0

        with tf.name_scope('Validation'):
            # Reuse the training variables for the inference graph.
            tf.get_variable_scope().reuse_variables()
            image_pl = tf.placeholder(tf.float32)
            image = tf.expand_dims(image_pl, 0)
            # One RGB image of arbitrary height/width.
            image.set_shape([1, None, None, 3])
            inf_out = core.build_inference_graph(hypes, modules,
                                                 image=image)
            tv_graph['image_pl'] = image_pl
            tv_graph['inf_out'] = inf_out

        # Start the data load
        modules['input'].start_enqueuing_threads(hypes, queue, 'train',
                                                 sess)

        # And then after everything is built, start the training loop,
        # resuming at the restored step.
        run_training(hypes, modules, tv_graph, tv_sess, cur_step)

        # stopping input Threads
        tv_sess['coord'].request_stop()
        tv_sess['coord'].join(tv_sess['threads'])
def main(_):
    """Trim a trained run's weights and resume training on the result.

    Loads the run named by FLAGS.RUN, copies it to 'RUNS/trimmed', zeroes
    the lowest-L1 kernels of the layers listed in
    hypes['layer_pruning']['layers'], saves the trimmed checkpoint and
    hands it to train.continue_training.
    """
    utils.set_gpus_to_use()

    try:
        # Imported only to verify the git submodules are checked out.
        import tensorvision.train
        import tensorflow_fcn.utils
    except ImportError:
        logging.error("Could not import the submodules.")
        logging.error("Please execute:"
                      "'git submodule update --init --recursive'")
        exit(1)

    with open(tf.app.flags.FLAGS.hypes, 'r') as f:
        logging.info("f: %s", f)
        hypes = json.load(f)

    utils.load_plugins()

    # Runs live under $TV_DIR_RUNS/KittiSeg when the env var is set.
    if 'TV_DIR_RUNS' in os.environ:
        runs_dir = os.path.join(os.environ['TV_DIR_RUNS'],
                                'KittiSeg')
    else:
        runs_dir = 'RUNS'

    utils.set_dirs(hypes, tf.app.flags.FLAGS.hypes)
    utils._add_paths_to_sys(hypes)
    train.maybe_download_and_extract(hypes)
    maybe_download_and_extract(runs_dir)

    logging.info("Trimming weights.")
    logdir = os.path.join(runs_dir, FLAGS.RUN)
    modules = utils.load_modules_from_hypes(hypes)

    with tf.Graph().as_default():

        # build the graph based on the loaded modules
        with tf.name_scope("Queues"):
            queue = modules['input'].create_queues(hypes, 'train')

        tv_graph = core.build_training_graph(hypes, queue, modules)

        # prepare the tv session
        with tf.Session().as_default():
            tv_sess = core.start_tv_session(hypes)
        sess = tv_sess['sess']
        saver = tv_sess['saver']

        # Restore the run's newest checkpoint; returns the global step.
        cur_step = core.load_weights(logdir, sess, saver)
        if cur_step is None:
            logging.warning("Loaded global_step is None.")
            logging.warning("This could mean,"
                            " that no weights have been loaded.")
            logging.warning("Starting Training with step 0.")
            cur_step = 0

        with tf.name_scope('Validation'):
            # Reuse the training variables for the inference graph.
            tf.get_variable_scope().reuse_variables()
            image_pl = tf.placeholder(tf.float32)
            image = tf.expand_dims(image_pl, 0)
            image.set_shape([1, None, None, 3])
            inf_out = core.build_inference_graph(hypes, modules,
                                                 image=image)
            tv_graph['image_pl'] = image_pl
            tv_graph['inf_out'] = inf_out

        # NOTE(review): this second inference build duplicates the
        # Validation block above and its outputs are never used — looks
        # like copy-paste leftovers; confirm before removing.
        image_pl = tf.placeholder(tf.float32)
        image = tf.expand_dims(image_pl, 0)
        image.set_shape([1, None, None, 3])
        inf_out = core.build_inference_graph(hypes, modules,
                                             image=image)

        # Create a session for running Ops on the Graph.
        trim_dir = 'RUNS/trimmed'
        shutil.copytree(logdir, trim_dir)
        shutil.copy(tf.app.flags.FLAGS.hypes,
                    os.path.join(trim_dir, 'model_files', 'hypes.json'))
        # NOTE(review): `sess`/`saver` are rebound to a fresh session here
        # while the pruning loop below still runs in tv_sess['sess'];
        # verify the intended session for each step.
        sess = tf.Session()
        saver = tf.train.Saver()
        core.load_weights(trim_dir, sess, saver)

        # Zero out the least important kernels of each configured layer.
        for weight in tf.contrib.model_pruning.get_masks():
            if any([
                    layer in weight.name
                    for layer in hypes['layer_pruning']['layers']
            ]):
                weight_value = tv_sess['sess'].run(weight)
                # Number of output kernels to drop for the target sparsity.
                kernel_count = int(weight_value.shape[3] *
                                   hypes['layer_pruning']['layer_sparsity'])
                # Rank kernels by the L1 norm over their input extent.
                l1_values = np.sum(np.abs(weight_value), axis=(0, 1, 2))
                toss_kernels = l1_values.argsort()[:kernel_count]
                weight_value[:, :, :, toss_kernels] = 0
                assign_op = tf.assign(weight, tf.constant(weight_value))
                tv_sess['sess'].run(assign_op)

        # Save the trimmed weights into the copied run directory.
        checkpoint_path = os.path.join(trim_dir, 'model.ckpt')
        tv_sess['saver'].save(sess, checkpoint_path, global_step=cur_step)

    train.continue_training(trim_dir)
def do_training(hypes): """ Train model for a number of steps. This trains the model for at most hypes['solver']['max_steps']. It shows an update every utils.cfg.step_show steps and writes the model to hypes['dirs']['output_dir'] every utils.cfg.step_eval steps. Paramters --------- hypes : dict Hyperparameters """ # Get the sets of images and labels for training, validation, and # test on MNIST. modules = utils.load_modules_from_hypes(hypes) # Tell TensorFlow that the model will be built into the default Graph. with tf.Session() as sess: # build the graph based on the loaded modules with tf.name_scope("Queues"): queue = modules['input'].create_queues(hypes, 'train') regression_weights = tf.placeholder(dtype=tf.float32, shape=(3,)) hypes['solver']['regression_weights'] = regression_weights tv_graph = core.build_training_graph(hypes, queue, modules) # prepaire the tv session tv_sess = core.start_tv_session(hypes) with tf.name_scope('Validation'): tf.get_variable_scope().reuse_variables() image_pl = tf.placeholder(tf.float32) calib = tf.placeholder(tf.float32, shape=[1, hypes['grid_height'], hypes['grid_width'], 3, 4]) xy_scale = tf.placeholder(tf.float32, shape=[1, hypes['grid_height'], hypes['grid_width'], 2]) image = tf.expand_dims(image_pl, 0) image.set_shape([1, 384, 1248, 3]) inf_out, encoder_out = core.build_inference_graph(hypes, modules, image, calib, xy_scale) tv_graph['image_pl'] = image_pl tv_graph['inf_out'] = inf_out tv_graph['calib_pl'] = calib tv_graph['xy_scale_pl'] = xy_scale tv_graph['encoder_out'] = encoder_out all_variables = tf.get_collection_ref(tf.GraphKeys.GLOBAL_VARIABLES) sess.run(tf.variables_initializer(all_variables)) var_list = [var for var in all_variables if "beta" not in var.name and 'Adam' not in var.name] saver = tf.train.Saver(var_list=var_list) saver.restore(sess, hypes['pretrained']) # Start the data load modules['input'].start_enqueuing_threads(hypes, queue, 'train', sess) # And then after everything is built, start the training 
loop. run_training(hypes, modules, tv_graph, tv_sess) # stopping input Threads tv_sess['coord'].request_stop() tv_sess['coord'].join(tv_sess['threads'])
def do_evaling(hypes):
    """Evaluate a model restored from hypes["dirs"]["ckpt_dir"].

    Builds the same graphs as training, restores the newest checkpoint
    and hands control to run_evaling.

    Parameters
    ----------
    hypes : dict
        Hyperparameters.
    """
    modules = utils.load_modules_from_hypes(hypes)

    # Build the model into the default graph.
    with tf.Session() as sess:

        # Input queues come from the loaded input module.
        with tf.name_scope("Queues"):
            train_queue = modules['input'].create_queues(hypes, 'train')

        graph_ops = core.build_training_graph(hypes, train_queue, modules)

        # Prepare the tv session.
        session_objs = core.start_tv_session(hypes)

        # Share the training variables with the inference graph.
        with tf.name_scope('Validation'):
            tf.get_variable_scope().reuse_variables()
            input_pl = tf.placeholder(tf.float32)
            calib_pl = tf.placeholder(
                tf.float32,
                shape=[1, hypes['grid_height'], hypes['grid_width'], 3, 4])
            scale_pl = tf.placeholder(
                tf.float32,
                shape=[1, hypes['grid_height'], hypes['grid_width'], 2])
            batched = tf.expand_dims(input_pl, 0)
            batched.set_shape([1, 384, 1248, 3])
            graph_ops['inf_out'] = core.build_inference_graph(
                hypes, modules, batched, calib_pl, scale_pl)
            graph_ops['image_pl'] = input_pl
            graph_ops['calib_pl'] = calib_pl
            graph_ops['xy_scale_pl'] = scale_pl

        # Initialize every global variable before restoring on top.
        global_vars = tf.get_collection_ref(tf.GraphKeys.GLOBAL_VARIABLES)
        sess.run(tf.variables_initializer(global_vars))

        # Launch the input pipeline threads.
        modules['input'].start_enqueuing_threads(hypes, train_queue,
                                                 'train', sess)

        # Restore the newest checkpoint, if one exists.
        ckpt_saver = tf.train.Saver()
        print("Reading checkpoints...")
        ckpt_state = tf.train.get_checkpoint_state(hypes["dirs"]["ckpt_dir"])
        if ckpt_state and ckpt_state.model_checkpoint_path:
            ckpt_name = ckpt_state.model_checkpoint_path.split('/')[-1]
            global_step = ckpt_name.split('-')[-1]
            ckpt_saver.restore(sess, ckpt_state.model_checkpoint_path)
            print('Loading success, global_step is %s' % global_step)

        # Everything is built: run the evaluation loop.
        run_evaling(hypes, modules, graph_ops, session_objs)

        # Shut the input threads down cleanly.
        session_objs['coord'].request_stop()
        session_objs['coord'].join(session_objs['threads'])
def do_finetuning(hypes):
    """
    Finetune model for a number of steps.

    This finetunes the model for at most hypes['solver']['max_steps'].
    It shows an update every utils.cfg.step_show steps and writes
    the model to hypes['dirs']['output_dir'] every utils.cfg.step_eval
    steps.

    Parameters
    ----------
    hypes : dict
        Hyperparameters
    """
    try:
        # core lives in a git submodule; fail early with a hint if missing.
        import tensorvision.core as core
    except ImportError:
        logging.error("Could not import the submodules.")
        logging.error("Please execute:"
                      "'git submodule update --init --recursive'")
        exit(1)

    modules = utils.load_modules_from_hypes(hypes)

    # Tell TensorFlow that the model will be built into the default Graph.
    with tf.Session() as sess:

        # build the graph based on the loaded modules
        with tf.name_scope("Queues"):
            queue = modules['input'].create_queues(hypes, 'train')

        tv_graph = core.build_training_graph(hypes, queue, modules)

        # Variables to restore from the pretrained model.
        vars_to_restore = restoring_vars(hypes, sess)
        restorer = tf.train.Saver(vars_to_restore)

        # load pre-trained model of hand segmentation
        logging.info("Loading pretrained model's weights")
        model_dir = hypes['transfer']['model_folder']
        # NOTE(review): model_file is only used by the commented-out debug
        # call below; confirm whether it can be dropped.
        model_file = hypes['transfer']['model_name']
        # DEBUG: check the model file
        # check_model(os.path.join(model_dir, model_file))

        """
        # Get a list of vars to restore
        vars_to_restore = restoring_vars(sess)
        print("vars to restore:", vars_to_restore)
        # Create another Saver for restoring pre-trained vars
        saver = tf.train.Saver(vars_to_restore)
        """

        core.load_weights(model_dir, sess, restorer)
        # load_trained_model(sess, hypes)

        # Separate saver for writing finetuning checkpoints.
        saver = tf.train.Saver(max_to_keep=int(utils.cfg.max_to_keep))

        # prepare the tv session
        tv_sess = prepare_tv_session(hypes, sess, saver)

        # DEBUG: print weights
        # check_weights(tv_sess['sess'])
        # check_graph(tv_sess['sess'])

        with tf.name_scope('Validation'):
            # Reuse the training variables for the inference graph.
            tf.get_variable_scope().reuse_variables()
            image_pl = tf.placeholder(tf.float32)
            image = tf.expand_dims(image_pl, 0)
            # One RGB image of arbitrary height/width.
            image.set_shape([1, None, None, 3])
            inf_out = core.build_inference_graph(hypes, modules,
                                                 image=image)
            tv_graph['image_pl'] = image_pl
            tv_graph['inf_out'] = inf_out

        # Start the data load
        modules['input'].start_enqueuing_threads(hypes, queue, 'train',
                                                 sess)

        # And then after everything is built, start the training loop.
        train.run_training(hypes, modules, tv_graph, tv_sess)

        # stopping input Threads
        tv_sess['coord'].request_stop()
        tv_sess['coord'].join(tv_sess['threads'])