def _create_image_encoder(preprocess_fn, factory_fn, image_shape, batch_size=32,
                          session=None, checkpoint_path=None,
                          loss_mode="cosine"):
    """Build a feature-extraction graph and return a callable that runs it.

    Args:
        preprocess_fn: Called per image as ``preprocess_fn(x, is_training=False)``
            on the float32-cast input.
        factory_fn: Called as ``factory_fn(batch, l2_normalize=..., reuse=None)``;
            the first element of its return value is used as the feature tensor.
        image_shape: Tuple appended to ``(None,)`` to form the uint8 placeholder
            shape.
        batch_size: Forwarded to ``_run_in_batches``.
        session: Session to run in; a new ``tf.Session`` is created when None.
        checkpoint_path: If given, variables are restored from this checkpoint.
        loss_mode: Features are requested L2-normalized only when this equals
            ``"cosine"``.

    Returns:
        ``encoder(data_x)``, which returns a float32 array of shape
        ``(len(data_x), feature_dim)``.
    """
    images_in = tf.placeholder(tf.uint8, (None, ) + image_shape)

    def _preprocess_single(image):
        return preprocess_fn(image, is_training=False)

    preprocessed = tf.map_fn(
        _preprocess_single, tf.cast(images_in, tf.float32))

    features, _ = factory_fn(
        preprocessed, l2_normalize=(loss_mode == "cosine"), reuse=None)
    num_features = features.get_shape().as_list()[-1]

    if session is None:
        session = tf.Session()
    if checkpoint_path is not None:
        slim.get_or_create_global_step()
        restore_op, restore_feed = slim.assign_from_checkpoint(
            checkpoint_path, slim.get_variables_to_restore())
        session.run(restore_op, feed_dict=restore_feed)

    def _extract(feed):
        return session.run(features, feed_dict=feed)

    def encoder(data_x):
        out = np.zeros((len(data_x), num_features), np.float32)
        _run_in_batches(_extract, {images_in: data_x}, out, batch_size)
        return out

    return encoder
def load_ckpt(self, sess, ckpt='ckpts/vgg_16.ckpt'):
    """Restore the ``vgg_16`` weights and biases from ``ckpt`` into ``sess``.

    Only variables under scope ``vgg_16`` whose names end in ``weights`` or
    ``biases`` are restored.
    """
    weight_vars = slim.get_variables(scope='vgg_16', suffix="weights")
    bias_vars = slim.get_variables(scope='vgg_16', suffix="biases")
    restore_op, restore_feed = slim.assign_from_checkpoint(
        ckpt, weight_vars + bias_vars)
    sess.run(restore_op, restore_feed)
def load_ckpt_path(sess, model_path, variables_to_restore=None):
    """Restore variables from the checkpoint at ``model_path`` into ``sess``.

    When ``variables_to_restore`` is None, the result of
    ``slim.get_variables_to_restore()`` is used. Prints a confirmation once
    the assignment has run.
    """
    targets = variables_to_restore
    if targets is None:
        targets = slim.get_variables_to_restore()
    assign_op, assign_feed = slim.assign_from_checkpoint(model_path, targets)
    sess.run(assign_op, feed_dict=assign_feed)
    print(f'{model_path} loaded')
def _create_image_encoder(preprocess_fn, factory_fn, image_shape, batch_size=32,
                          session=None, checkpoint_path=None,
                          loss_mode="cosine"):
    """Create the encoder graph and return a function that extracts features.

    The input is a uint8 placeholder of shape ``(None,) + image_shape``. Each
    image is cast to float32 and passed through
    ``preprocess_fn(x, is_training=False)``; the batch then goes to
    ``factory_fn`` with ``l2_normalize`` set iff ``loss_mode == "cosine"``.
    If ``checkpoint_path`` is given, the variables returned by
    ``slim.get_variables_to_restore()`` are restored from it.

    Returns:
        ``encoder(data_x)`` producing a float32 array of shape
        ``(len(data_x), feature_dim)``, filled via ``_run_in_batches`` with
        ``batch_size``.
    """
    placeholder = tf.placeholder(tf.uint8, (None, ) + image_shape)
    as_float = tf.cast(placeholder, tf.float32)
    preprocessed = tf.map_fn(
        lambda img: preprocess_fn(img, is_training=False), as_float)

    want_l2 = loss_mode == "cosine"
    network_out = factory_fn(preprocessed, l2_normalize=want_l2, reuse=None)
    embedding, _ = network_out
    embedding_size = embedding.get_shape().as_list()[-1]

    if session is None:
        session = tf.Session()

    if checkpoint_path is not None:
        tf.train.get_or_create_global_step()
        to_restore = slim.get_variables_to_restore()
        assign_op, assign_feed = slim.assign_from_checkpoint(
            checkpoint_path, to_restore)
        session.run(assign_op, feed_dict=assign_feed)

    def encoder(data_x):
        out = np.zeros((len(data_x), embedding_size), np.float32)
        _run_in_batches(
            lambda feed: session.run(embedding, feed_dict=feed),
            {placeholder: data_x}, out, batch_size)
        return out

    return encoder
def assign_from_checkpoint(variables, checkpoint):
    """Queue a restore of ``variables`` from ``checkpoint``.

    The assign op is appended to ``all_assign_ops`` and its feed dict merged
    into ``all_feed_dict``; both names come from an enclosing scope that is
    not visible here.

    Exits the process with status 1 when ``variables`` is empty.
    """
    # Lazy %-style arguments: the message is only formatted if it is emitted.
    logging.info('Request to re-store %d weights from %s',
                 len(variables), checkpoint)
    if not variables:
        logging.error('can\'t find any variables to restore.')
        sys.exit(1)
    assign_op, feed_dict = slim.assign_from_checkpoint(checkpoint, variables)
    all_assign_ops.append(assign_op)
    all_feed_dict.update(feed_dict)
def assign_from_checkpoint(variables, checkpoint):
    """Register a checkpoint restore for ``variables``.

    Logs the request, then records the slim assign op in ``all_assign_ops``
    and merges its feed dict into ``all_feed_dict`` (both defined outside
    this block). An empty ``variables`` is logged as an error and terminates
    the process with exit status 1.
    """
    logging.info('Request to re-store %d weights from %s',
                 len(variables), checkpoint)
    if variables:
        restore_op, restore_feed = slim.assign_from_checkpoint(
            checkpoint, variables)
        all_assign_ops.append(restore_op)
        all_feed_dict.update(restore_feed)
        return
    logging.error("Can't find any variables to restore.")
    sys.exit(1)
def load_ckpt(sess, model_dir, variables_to_restore=None):
    """Restore variables from the latest checkpoint recorded in ``model_dir``.

    Args:
        sess: Session in which the assign op is run.
        model_dir: Directory passed to ``tf.train.get_checkpoint_state``.
        variables_to_restore: Variables to restore; defaults to
            ``slim.get_variables_to_restore()`` when None.

    Raises:
        FileNotFoundError: If ``model_dir`` has no checkpoint state.
    """
    ckpt = tf.train.get_checkpoint_state(model_dir)
    # get_checkpoint_state returns None when no valid checkpoint state exists;
    # fail with a clear message instead of an AttributeError on None.
    if ckpt is None or not ckpt.model_checkpoint_path:
        raise FileNotFoundError(f'no checkpoint found in {model_dir}')
    model_path = ckpt.model_checkpoint_path
    if variables_to_restore is None:
        variables_to_restore = slim.get_variables_to_restore()
    restore_op, restore_fd = slim.assign_from_checkpoint(
        model_path, variables_to_restore)
    sess.run(restore_op, feed_dict=restore_fd)
    print(f'{model_path} loaded')
def restore_discriminator(self, params):
    """Restore the variables listed in ``self.dsc_vars`` from a checkpoint.

    The checkpoint path is ``params.encoder_checkpoint_name`` unless
    ``self.isIdeRun`` is truthy, in which case a fixed path is used.
    """
    dsc_names = [v.name for v in self.dsc_vars]
    variables = slim.get_variables_to_restore(include=dsc_names)
    print("variables_dsc: ", variables)

    if self.isIdeRun:
        path = "../checkpoints/exp70/checkpoint/DCGAN.model-50"
    else:
        path = params.encoder_checkpoint_name
    print('restoring discriminator to [%s]...' % path)

    restore_op, restore_feed = slim.assign_from_checkpoint(
        model_path=path, var_list=variables)
    self.sess.run(restore_op, feed_dict=restore_feed)
    print('discriminator restored.')
def restore_encoder(self, params):
    """Restore the ``self.gen_vars`` entries whose name contains ``'g_1'``.

    The checkpoint path is ``params.encoder_checkpoint_name`` unless
    ``self.isIdeRun`` is truthy, in which case a fixed path is used.
    """
    encoder_names = []
    for candidate in self.gen_vars:
        if 'g_1' in candidate.name:
            encoder_names.append(candidate.name)
    variables = slim.get_variables_to_restore(include=encoder_names)

    if self.isIdeRun:
        path = "../checkpoints/exp70/checkpoint/DCGAN.model-50"
    else:
        path = params.encoder_checkpoint_name
    print('restoring encoder to [%s]...' % path)

    restore_op, restore_feed = slim.assign_from_checkpoint(
        model_path=path, var_list=variables)
    self.sess.run(restore_op, feed_dict=restore_feed)
    print('encoder restored.')
def restore_alexnet(self, chkp_name):
    """Restore the variables listed in ``self.an_vars`` from a checkpoint.

    ``chkp_name`` is used as the checkpoint path unless ``self.isIdeRun`` is
    truthy, in which case a fixed path is used.
    """
    alexnet_names = [v.name for v in self.an_vars]
    variables = slim.get_variables_to_restore(include=alexnet_names)

    path = "../checkpoints/exp70/checkpoint/DCGAN.model-50"
    if not self.isIdeRun:
        path = chkp_name
    print('restoring alexnet from [%s]...' % path)

    restore_op, restore_feed = slim.assign_from_checkpoint(
        model_path=path, var_list=variables)
    self.sess.run(restore_op, feed_dict=restore_feed)
    print('alexnet restored.')
def get_init_fn():
    """Build an init function that restores two variable sets from checkpoints.

    Two lists of model variables are selected by excluding scope prefixes:

    * the first excludes ``InceptionV1/Logits``, ``InceptionV1/AuxLogits`` and
      everything under ``InceptionV2``; it is restored from
      ``satelite_inception_network_dir``;
    * the second excludes ``InceptionV2/Logits``, ``InceptionV2/AuxLogits``
      and everything under ``InceptionV1``; it is restored from
      ``report_inception_network_dir``.

    Both restores read a file named ``inception_v1.ckpt`` and ignore variables
    missing from the checkpoint.

    Returns:
        ``init_fn(sess)`` that runs the report restore, then the satellite one.
    """
    variables_to_restore = _model_variables_outside(
        ["InceptionV1/Logits", "InceptionV1/AuxLogits", "InceptionV2"])
    variables_to_restore_2 = _model_variables_outside(
        ["InceptionV2/Logits", "InceptionV2/AuxLogits", "InceptionV1"])

    report_init_assign_op, report_init_feed_dict = slim.assign_from_checkpoint(
        os.path.join(report_inception_network_dir, 'inception_v1.ckpt'),
        variables_to_restore_2,
        ignore_missing_vars=True)
    satelite_init_assign_op, satelite_init_feed_dict = slim.assign_from_checkpoint(
        os.path.join(satelite_inception_network_dir, 'inception_v1.ckpt'),
        variables_to_restore,
        ignore_missing_vars=True)

    def init_fn(sess):
        sess.run(report_init_assign_op, report_init_feed_dict)
        sess.run(satelite_init_assign_op, satelite_init_feed_dict)

    return init_fn


def _model_variables_outside(checkpoint_exclude_scopes):
    """Return model variables whose op name starts with none of the scopes."""
    exclusions = [scope.strip() for scope in checkpoint_exclude_scopes]
    print(exclusions)
    # str.startswith accepts a tuple, replacing the manual inner loop.
    prefixes = tuple(exclusions)
    return [var for var in slim.get_model_variables()
            if not var.op.name.startswith(prefixes)]
def main(_):
    """Load a CMAP model from ``FLAGS.modeldir`` and run the DAGGER step loop.

    Builds the game environment, expert and network, collects per-level map
    tensors for visualisation, and hands everything to
    ``slim.learning.train`` with a no-op train op and ``DAGGER_train_step``
    as the step function. The global step is initialised fresh; every other
    restorable variable is loaded from the checkpoint.
    """
    tf.reset_default_graph()

    env = environment.get_game_environment(
        FLAGS.maps,
        multiproc=FLAGS.multiproc,
        random_goal=FLAGS.random_goal,
        random_spawn=FLAGS.random_spawn,
        apple_prob=FLAGS.apple_prob)
    exp = expert.Expert()
    net = CMAP(
        num_iterations=FLAGS.vin_iterations,
        estimate_scale=FLAGS.estimate_scale,
        unified_fuser=FLAGS.unified_fuser,
        unified_vin=FLAGS.unified_vin,
        biased_fuser=FLAGS.biased_fuser,
        biased_vin=FLAGS.biased_vin,
        regularization=FLAGS.reg)

    def _map_slices(key):
        # Takes index 0 on the first axis, the last index on the second axis
        # and index 0 on the final axis of every tensor stored under `key`.
        return [tensor[0, -1, :, :, 0]
                for tensor in net.intermediate_tensors[key]]

    estimate_images = _map_slices('estimate_map_list')
    goal_images = _map_slices('goal_map_list')
    reward_images = _map_slices('reward_map_list')
    value_images = _map_slices('value_map_list')
    action_images = _map_slices('action_map_list')

    step_history = tf.placeholder(tf.string, name='step_history')
    step_history_op = tf.summary.text(
        'game/step_history', step_history, collections=['game'])

    global_step = slim.get_or_create_global_step()
    update_global_step_op = tf.assign_add(global_step, 1)

    # The global step is (re)initialised; all other variables come from the
    # checkpoint.
    reset_step_op = tf.variables_initializer([global_step])
    load_op, load_feed_dict = slim.assign_from_checkpoint(
        FLAGS.modeldir,
        slim.get_variables_to_restore(exclude=[global_step.name]))
    init_op = tf.group(reset_step_op, load_op)

    step_kwargs = {
        'env': env,
        'exp': exp,
        'net': net,
        'update_global_step_op': update_global_step_op,
        'step_history': step_history,
        'step_history_op': step_history_op,
        'estimate_maps': estimate_images,
        'goal_maps': goal_images,
        'reward_maps': reward_images,
        'value_maps': value_images,
        'action_maps': action_images,
    }
    interval_secs = 60 if FLAGS.debug else 300

    slim.learning.train(
        train_op=tf.no_op('train'),
        logdir=FLAGS.logdir,
        init_op=init_op,
        init_feed_dict=load_feed_dict,
        global_step=global_step,
        train_step_fn=DAGGER_train_step,
        train_step_kwargs=step_kwargs,
        number_of_steps=FLAGS.num_games,
        save_interval_secs=interval_secs,
        save_summaries_secs=interval_secs)
def initialize(self, checkpoint_path=None):
    """Initialize variables and optionally restore them from a checkpoint.

    Runs an initializer over ``self.get_init_variables()``. If
    ``checkpoint_path`` is given, the global step is set to the value parsed
    from the path and ``self.get_restore_variables()`` are restored from it,
    ignoring variables missing from the checkpoint.
    """
    self.session.run(tf.variables_initializer(self.get_init_variables()))
    if checkpoint_path is None:
        return

    restored_step = RestoreTFModelHook.parse_global_step(checkpoint_path)
    self.set_global_step(restored_step)

    restore_op, restore_feed = slim.assign_from_checkpoint(
        checkpoint_path,
        self.get_restore_variables(),
        ignore_missing_vars=True)
    self.session.run(restore_op, feed_dict=restore_feed)
    self.logger.info("Lazily restored from {}".format(checkpoint_path))
def load(self, sess):
    """Restore ``self.vars()`` from the latest checkpoint in ``self.exp_dir``.

    The latest checkpoint is looked up through the state file named
    ``'<scope>_ckpt'``. If none is found, a message is printed and nothing is
    restored. Variables missing from the checkpoint are ignored.
    """
    # The saver is still requested, as the original did, although the restore
    # itself goes through slim.
    self.get_saver()

    state_file = '%s_ckpt' % self.scope
    ckpt = tf.train.latest_checkpoint(
        str(self.exp_dir), latest_filename=state_file)
    if ckpt is None:
        print('[ %s ] No ckpt found...' % self.scope)
        return

    print('Loading %s' % str(ckpt))
    restore_op, restore_feed = slim.assign_from_checkpoint(
        model_path=ckpt, var_list=self.vars(), ignore_missing_vars=True)
    sess.run(restore_op, restore_feed)
def init_points(sess):
    """Load pretrained backbone weights into ``sess``.

    When ``Flags.net == 'D'`` the ``train/densenet161`` model variables are
    restored from ``tf-densenet161.ckpt``; otherwise the ``train/vgg_16``
    variables are restored from ``vgg_16.ckpt``. Checkpoint names are derived
    from the graph variables with the matching ``change_checkpoint_name*``
    helper.
    """
    if Flags.net == 'D':
        scope = 'train/densenet161'
        to_ckpt_name = change_checkpoint_name
        ckpt_file = 'tf-densenet161.ckpt'
    else:
        scope = 'train/vgg_16'
        to_ckpt_name = change_checkpoint_name3
        ckpt_file = 'vgg_16.ckpt'

    name_to_var = {}
    for var in slim.get_model_variables(scope):
        name_to_var[to_ckpt_name(var)] = var

    restore_op, restore_feed = slim.assign_from_checkpoint(
        os.path.join(Flags.checkpoint_dir, ckpt_file), name_to_var)
    sess.run(restore_op, restore_feed)
def testTrainWithInitFromCheckpoint(self):
    """Training resumed from an assigned checkpoint keeps the loss low."""
    logdir1 = os.path.join(self.get_temp_dir(), 'tmp_logs1/')
    logdir2 = os.path.join(self.get_temp_dir(), 'tmp_logs2/')

    # Start from clean directories (for running on jenkins).
    for stale_dir in (logdir1, logdir2):
        if tf.gfile.Exists(stale_dir):
            tf.gfile.DeleteRecursively(stale_dir)

    # Step 1: a single training step leaves the error high.
    with tf.Graph().as_default():
        tf.set_random_seed(0)
        train_op = self.create_train_op()
        loss = slim.learning.train(train_op, logdir1, number_of_steps=1)
        self.assertGreater(loss, .5)

    # Step 2: train to convergence in the same log directory.
    with tf.Graph().as_default():
        tf.set_random_seed(1)
        train_op = self.create_train_op()
        loss = slim.learning.train(train_op, logdir1, number_of_steps=300)
        self.assertLess(loss, .02)

    # Step 3: initialise from the step-300 checkpoint, advance one step and
    # check that the loss is still low.
    with tf.Graph().as_default():
        tf.set_random_seed(2)
        train_op = self.create_train_op()

        all_vars = tf.all_variables()
        ckpt_path = os.path.join(logdir1, 'model.ckpt-300')
        init_op = tf.initialize_all_variables()
        assign_op, assign_feed = slim.assign_from_checkpoint(
            ckpt_path, all_vars)

        def InitAssignFn(sess):
            sess.run(assign_op, assign_feed)

        loss = slim.learning.train(
            train_op,
            logdir2,
            number_of_steps=1,
            init_op=init_op,
            init_fn=InitAssignFn)
        self.assertLess(loss, .02)
def make_init_fn(self, chpt_path):
    """Return a function that restores variables from ``chpt_path``.

    Returns None when ``chpt_path`` is None. Otherwise the variables outside
    ``self.exclude_scopes`` are listed, filtered through ``remove_missing``
    against the checkpoint, and wrapped in an ``init_fn(sess)`` that runs the
    resulting assign op.
    """
    # No prior checkpoint: nothing to initialise from.
    if chpt_path is None:
        return None

    candidates = slim.get_variables_to_restore(exclude=self.exclude_scopes)
    candidate_names = [v.op.name for v in candidates]
    print('Variables to restore: {}'.format(candidate_names))

    restorable = remove_missing(candidates, chpt_path)
    assign_op, assign_feed = slim.assign_from_checkpoint(chpt_path, restorable)
    sys.stdout.flush()

    def init_fn(sess):
        print('Restoring from: {}'.format(chpt_path))
        sess.run(assign_op, assign_feed)

    return init_fn
def main(unused_argv):
    """Build the s2v training graph and train it from the latest checkpoint.

    Reads the model configuration from ``FLAGS.model_config``, builds the
    model and an Adam train op, restores the model variables from the latest
    checkpoint under ``model_config.checkpoint_path`` and runs
    ``slim.learning.train`` for ``nepochs * num_train_inst / batch_size``
    steps.

    Raises:
        ValueError: If ``--input_file_pattern`` or ``--train_dir`` is missing,
            or if no checkpoint is found under the configured path.
    """
    if not FLAGS.input_file_pattern:
        raise ValueError("--input_file_pattern is required.")
    if not FLAGS.train_dir:
        raise ValueError("--train_dir is required.")

    with open(FLAGS.model_config) as json_config_file:
        model_config = json.load(json_config_file)
    model_config = configuration.model_config(model_config, mode="train")

    tf.logging.info("Building training graph.")
    g = tf.Graph()
    with g.as_default():
        model = s2v_model.s2v(model_config, mode="train")
        model.build()

        optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
        train_tensor = tf.contrib.slim.learning.create_train_op(
            total_loss=model.total_loss,
            optimizer=optimizer,
            clip_gradient_norm=FLAGS.clip_gradient_norm)
        saver = tf.train.Saver(max_to_keep=FLAGS.max_ckpts)

        variables_to_restore = slim.get_model_variables()
        checkpoint_path = tf.train.latest_checkpoint(
            model_config.checkpoint_path)
        # latest_checkpoint returns None when nothing is found; fail here with
        # a clear message rather than inside the checkpoint reader.
        if checkpoint_path is None:
            raise ValueError(
                "No checkpoint found in %s" % model_config.checkpoint_path)
        init_assign_op, init_feed_dict = slim.assign_from_checkpoint(
            checkpoint_path, variables_to_restore)

        def InitAssignFn(sess):
            sess.run(init_assign_op, init_feed_dict)

    nsteps = int(FLAGS.nepochs * (FLAGS.num_train_inst / FLAGS.batch_size))
    slim.learning.train(
        train_op=train_tensor,
        logdir=FLAGS.train_dir,
        graph=g,
        number_of_steps=nsteps,
        save_summaries_secs=FLAGS.save_summaries_secs,
        saver=saver,
        save_interval_secs=FLAGS.save_model_secs,
        init_fn=InitAssignFn)
def _create_encoder(preprocess_fn, network_factory, image_shape, batch_size=32,
                    session=None, checkpoint_path=None, read_from_file=False):
    """Build an encoder graph and return a function that extracts features.

    Args:
        preprocess_fn: Called per image as ``preprocess_fn(x, is_training=False)``.
        network_factory: Called with the preprocessed batch; the first element
            of its return value is used as the feature tensor.
        image_shape: Image shape; its first two entries are the resize target
            when reading from file, and the full tuple is the placeholder
            shape otherwise.
        batch_size: Forwarded to ``queued_trainer.run_in_batches``.
        session: Session to run in; a new ``tf.Session`` is created when None.
        checkpoint_path: If given, model variables are restored from it.
        read_from_file: When true the input is a batch of file names that are
            read, JPEG-decoded and resized; otherwise it is a uint8 image batch.

    Returns:
        ``encoder(data_x)`` producing a float32 array of shape
        ``(len(data_x), feature_dim)``.
    """
    if read_from_file:
        # One channel unless the shape explicitly carries a channel axis.
        num_channels = image_shape[-1] if len(image_shape) == 3 else 1
        input_var = tf.placeholder(tf.string, (None, ))

        def _load_jpeg(filename):
            return tf.image.decode_jpeg(
                tf.read_file(filename), channels=num_channels)

        decoded = tf.map_fn(
            _load_jpeg, input_var, back_prop=False, dtype=tf.uint8)
        image_var = tf.image.resize_images(decoded, image_shape[:2])
    else:
        input_var = tf.placeholder(tf.uint8, (None, ) + image_shape)
        image_var = input_var

    def _preprocess_single(image):
        return preprocess_fn(image, is_training=False)

    preprocessed = tf.map_fn(
        _preprocess_single, image_var, back_prop=False, dtype=tf.float32)

    features, _ = network_factory(preprocessed)
    num_features = features.get_shape().as_list()[-1]

    if session is None:
        session = tf.Session()
    if checkpoint_path is not None:
        tf.train.get_or_create_global_step()
        restore_op, restore_feed = slim.assign_from_checkpoint(
            checkpoint_path, slim.get_model_variables())
        session.run(restore_op, feed_dict=restore_feed)

    def _extract(feed):
        return session.run(features, feed_dict=feed)

    def encoder(data_x):
        out = np.zeros((len(data_x), num_features), np.float32)
        queued_trainer.run_in_batches(
            _extract, {input_var: data_x}, out, batch_size)
        return out

    return encoder
def testTrainWithInitFromCheckpoint(self):
    """A model initialised from a converged checkpoint stays converged."""
    first_logdir = os.path.join(self.get_temp_dir(), "tmp_logs1/")
    second_logdir = os.path.join(self.get_temp_dir(), "tmp_logs2/")

    # Remove leftovers from earlier runs (for running on jenkins).
    if tf.gfile.Exists(first_logdir):
        tf.gfile.DeleteRecursively(first_logdir)
    if tf.gfile.Exists(second_logdir):
        tf.gfile.DeleteRecursively(second_logdir)

    # After one step the error must still be high.
    graph = tf.Graph()
    with graph.as_default():
        tf.set_random_seed(0)
        one_step_loss = slim.learning.train(
            self.create_train_op(), first_logdir, number_of_steps=1)
        self.assertGreater(one_step_loss, 0.5)

    # After 300 steps the model has converged.
    graph = tf.Graph()
    with graph.as_default():
        tf.set_random_seed(1)
        converged_loss = slim.learning.train(
            self.create_train_op(), first_logdir, number_of_steps=300)
        self.assertLess(converged_loss, 0.02)

    # Assign the converged weights into a fresh graph, take one more step and
    # validate that the loss is still low.
    graph = tf.Graph()
    with graph.as_default():
        tf.set_random_seed(2)
        train_op = self.create_train_op()

        variables = tf.all_variables()
        checkpoint = os.path.join(first_logdir, "model.ckpt-300")
        init_op = tf.initialize_all_variables()
        restore_op, restore_feed = slim.assign_from_checkpoint(
            checkpoint, variables)

        def InitAssignFn(sess):
            sess.run(restore_op, restore_feed)

        resumed_loss = slim.learning.train(
            train_op,
            second_logdir,
            number_of_steps=1,
            init_op=init_op,
            init_fn=InitAssignFn)
        self.assertLess(resumed_loss, 0.02)
def val(config):
    """Evaluate a restored model on the validation set and report averages.

    Builds the validation graph, restores all restorable variables from
    ``config['model']['checkpoint']``, then runs
    ``num_images // batch_size + 1`` batches and reports the mean loss and
    mean accuracy over those batches.

    Args:
        config: Nested mapping; this function reads ``['input']['path']``,
            ``['model']['architecture']``, ``['model']['checkpoint']`` and
            ``['parameters']['batch_size']``.
    """
    val_dataset = Dataset(os.path.join(config['input']['path']))
    with tf.Graph().as_default():
        with tf.name_scope('val') as scope:
            val_loss, val_accuracy, val_summary = build_val_graph(
                config, val_dataset)

        # NOTE(review): the result is not used below; the call is kept in case
        # it validates the architecture name - confirm and remove if not.
        exclude = cnn_architectures.model_weight_excludes(
            config['model']['architecture'])
        variables_to_restore = slim.get_variables_to_restore()
        init_assign_op, init_feed_dict = slim.assign_from_checkpoint(
            config['model']['checkpoint'], variables_to_restore)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(init_assign_op, init_feed_dict)

            # Start the queue runners.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            print('graph built')

            com_acc = 0.0
            com_loss = 0.0
            count = 0
            num_batches = (val_dataset.num_images()
                           // config['parameters']['batch_size'] + 1)
            try:
                for _ in range(num_batches):
                    acc_v, loss_v = sess.run([val_accuracy, val_loss])
                    com_acc += acc_v
                    com_loss += loss_v
                    count += 1
            finally:
                # Stop and join the input threads so they do not outlive the
                # session.
                coord.request_stop()
                coord.join(threads)

            mean_loss = com_loss / count
            mean_acc = com_acc / count
            print('validation loss: {} validation_accuracy: {}'.format(
                mean_loss, mean_acc))
            # Log the same averaged figures that are printed, not just the
            # last batch.
            logging.info('accuracy = {}, loss = {}'.format(mean_acc, mean_loss))
def load_model(self, dirname):
    """Load model parameters from the latest checkpoint under ``dirname``.

    The checkpoint is looked up in ``dirname`` first and, failing that, in
    ``dirname/<self.scoped_name>``. Variables missing from the checkpoint
    are ignored.

    Raises:
        RuntimeError: If neither location contains a checkpoint.
    """
    self.init()

    checkpoint = tf.train.latest_checkpoint(dirname)
    if checkpoint is None:
        # Fall back to the sub-directory named after the model's scope.
        dirname = os.path.join(dirname, self.scoped_name)
        checkpoint = tf.train.latest_checkpoint(dirname)
        if checkpoint is None:
            raise RuntimeError(
                'Model checkpoint not found at {}'.format(dirname))

    with self.graph.as_default():
        # slim is used for loading so that missing variables can be ignored.
        restore_op, restore_feed = slim.assign_from_checkpoint(
            checkpoint, self.parameters, ignore_missing_vars=True)
        self.sess.run(restore_op, feed_dict=restore_feed)

    self.is_initialized = True
    self.reset_performance_stats()
def train(args):
    """Train and validate the appearance network, logging losses per epoch.

    Resumes from the latest checkpoint in ``./save`` when one exists.
    Otherwise all variables are initialised and, if
    ``args.pretrained_ckpt_path`` is set, pretrained weights are assigned for
    every variable outside the excluded scopes. After each epoch a checkpoint
    is saved, a validation pass is run, and the summed training and validation
    losses plus the best validation epoch so far are appended to
    ``./training.log``.

    Args:
        args: Options object; this function reads ``batch_size``,
            ``base_learning_rate``, ``decay_rate``, ``num_epoches`` and
            ``pretrained_ckpt_path``.
    """
    model = AppearanceNetwork(args)
    save_directory = './save/'
    log_file_path = './training.log'

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # Context managers close the log file and the session on errors too.
    with open(log_file_path, 'w') as log_file, tf.Graph().as_default():
        global_step = tf.Variable(0, name='global_step', trainable=False)
        image_patches_placeholder = tf.placeholder(
            tf.float32, shape=[args.batch_size, 7, 128, 64, 3])
        labels_placeholder = tf.placeholder(
            tf.float32, shape=[args.batch_size])
        lr = tf.Variable(args.base_learning_rate, trainable=False,
                         name="learning_rate")
        _, logits = model.inference(image_patches_placeholder)
        loss = model.cross_entropy_loss(logits, labels_placeholder)
        train_op = build_graph(args, global_step, lr, loss)

        # The session config was previously built but never passed in.
        with tf.Session(config=config) as sess:
            saver = tf.train.Saver(max_to_keep=100)
            ckpt = tf.train.get_checkpoint_state('./save')
            if ckpt is None:
                sess.run(tf.global_variables_initializer())
                if args.pretrained_ckpt_path is not None:
                    init_assign_op, init_feed_dict = slim.assign_from_checkpoint(
                        args.pretrained_ckpt_path,
                        slim.get_variables_to_restore(exclude=[
                            "lstm", "fc_layer", "loss", "learning_rate",
                            "softmax", "global_step"
                        ]))
                    sess.run(init_assign_op, feed_dict=init_feed_dict)
            else:
                print('Loading Model from ' + ckpt.model_checkpoint_path)
                saver.restore(sess, ckpt.model_checkpoint_path)

            best_epoch = -1
            best_loss_epoch = 0.0
            for curr_epoch in range(args.num_epoches):
                training_loss_epoch = 0.0
                valid_loss_epoch = 0.0

                # ---------------- Training process ----------------
                print('Training epoch ' + str(curr_epoch + 1) +
                      '........................')
                training_data_loader = DataLoader(is_valid=False)
                if curr_epoch % 10 == 0:
                    # Step decay: one factor of decay_rate per 10 epochs.
                    # The original `decay_rate**curr_epoch / 10` bound as
                    # `(decay_rate**curr_epoch) / 10`.
                    decayed_lr = args.base_learning_rate * (
                        args.decay_rate ** (curr_epoch // 10))
                    sess.run(tf.assign(lr, decayed_lr))
                training_data_loader.shuffle()
                training_data_loader.reset_pointer()
                for step in range(training_data_loader.num_batches):
                    start_time = time.time()
                    image_patches, labels = training_data_loader.next_batch()
                    _, loss_batch = sess.run(
                        [train_op, loss],
                        feed_dict={
                            image_patches_placeholder: image_patches,
                            labels_placeholder: labels
                        })
                    end_time = time.time()
                    training_loss_epoch += loss_batch
                    print(
                        "Training {}/{} (epoch {}), train_loss = {:.8f}, time/batch = {:.3f}"
                        .format(step + 1, training_data_loader.num_batches,
                                curr_epoch + 1, loss_batch,
                                end_time - start_time))

                print('Epoch ' + str(curr_epoch + 1) +
                      ' training is done! Saving model...')
                checkpoint_path = os.path.join(save_directory, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=global_step)

                # ---------------- Validating process ----------------
                print('Validating epoch ' + str(curr_epoch + 1) +
                      '...........................')
                valid_data_loader = DataLoader(is_valid=True)
                valid_data_loader.shuffle()
                valid_data_loader.reset_pointer()
                for step in range(valid_data_loader.num_batches):
                    start_time = time.time()
                    image_patches, labels = valid_data_loader.next_batch()
                    loss_batch = sess.run(
                        loss,
                        feed_dict={
                            image_patches_placeholder: image_patches,
                            labels_placeholder: labels
                        })
                    end_time = time.time()
                    valid_loss_epoch += loss_batch
                    print(
                        "Validating {}/{} (epoch {}), valid_loss = {:.8f}, time/batch = {:.3f}"
                        .format(step + 1, valid_data_loader.num_batches,
                                curr_epoch + 1, loss_batch,
                                end_time - start_time))

                # Update best valid epoch
                if best_epoch == -1 or best_loss_epoch > valid_loss_epoch:
                    best_epoch = curr_epoch + 1
                    best_loss_epoch = valid_loss_epoch

                log_file.write('epoch ' + str(curr_epoch + 1) + '\n')
                log_file.write(
                    str(curr_epoch + 1) + ',' + str(training_loss_epoch) + '\n')
                log_file.write(
                    str(curr_epoch + 1) + ',' + str(valid_loss_epoch) + '\n')
                log_file.write(
                    str(best_epoch) + ',' + str(best_loss_epoch) + '\n')
def main():
    """Build the detection training graph from ``config``/``args`` and train.

    Reads model geometry and class names from the cache directory, builds a
    shuffled input batch from the selected ``.tfrecord`` profiles, constructs
    the model and its total loss, creates the optimizer (with exponential
    learning-rate decay when the ``exponential_decay`` config section is
    present, a constant rate otherwise) and runs ``slim.learning.train``.
    With ``args.transfer`` set, the variables outside ``args.exclude`` are
    first restored from that checkpoint.
    """
    model = config.get('config', 'model')
    logdir = utils.get_logdir(config)
    if args.delete:
        tf.logging.warn('delete logging directory: ' + logdir)
        shutil.rmtree(logdir, ignore_errors=True)
    cachedir = utils.get_cachedir(config)
    with open(os.path.join(cachedir, 'names'), 'r') as f:
        names = [line.strip() for line in f]
    width = config.getint(model, 'width')
    height = config.getint(model, 'height')
    cell_width, cell_height = utils.calc_cell_width_height(config, width, height)
    tf.logging.warn('(width, height)=(%d, %d), (cell_width, cell_height)=(%d, %d)' % (width, height, cell_width, cell_height))
    yolo = importlib.import_module('model.' + model)
    paths = [os.path.join(cachedir, profile + '.tfrecord') for profile in args.profile]
    num_examples = sum(sum(1 for _ in tf.python_io.tf_record_iterator(path)) for path in paths)
    tf.logging.warn('num_examples=%d' % num_examples)

    with tf.name_scope('batch'):
        image_rgb, labels = utils.data.load_image_labels(paths, len(names), width, height, cell_width, cell_height, config)
        with tf.name_scope('per_image_standardization'):
            image_std = tf.image.per_image_standardization(image_rgb)
        batch = tf.train.shuffle_batch(
            (image_std,) + labels,
            batch_size=args.batch_size,
            capacity=config.getint('queue', 'capacity'),
            min_after_dequeue=config.getint('queue', 'min_after_dequeue'),
            num_threads=multiprocessing.cpu_count())

    global_step = tf.contrib.framework.get_or_create_global_step()
    builder = yolo.Builder(args, config)
    builder(batch[0], training=True)
    with tf.name_scope('total_loss') as name:
        builder.create_objectives(batch[1:])
        total_loss = tf.losses.get_total_loss(name=name)
    variables_to_restore = slim.get_variables_to_restore(exclude=args.exclude)

    with tf.name_scope('optimizer'):
        try:
            decay_steps = config.getint('exponential_decay', 'decay_steps')
            decay_rate = config.getfloat('exponential_decay', 'decay_rate')
            staircase = config.getboolean('exponential_decay', 'staircase')
            learning_rate = tf.train.exponential_decay(args.learning_rate, global_step, decay_steps, decay_rate, staircase=staircase)
            tf.logging.warn('using a learning rate start from %f with exponential decay (decay_steps=%d, decay_rate=%f, staircase=%d)' % (args.learning_rate, decay_steps, decay_rate, staircase))
        except (configparser.NoSectionError, configparser.NoOptionError):
            learning_rate = args.learning_rate
            tf.logging.warn('using a staionary learning rate %f' % args.learning_rate)
        optimizer = get_optimizer(config, args.optimizer)(learning_rate)
        tf.logging.warn('optimizer=' + args.optimizer)
        train_op = slim.learning.create_train_op(
            total_loss, optimizer, global_step,
            clip_gradient_norm=args.gradient_clip,
            summarize_gradients=config.getboolean('summary', 'gradients'))

    def log_training_state(sess, prefix):
        # In the stationary case learning_rate is a plain Python float, which
        # Session.run rejects as a fetch; only graph elements are fetched.
        if isinstance(learning_rate, (tf.Tensor, tf.Variable)):
            step_value, rate_value = sess.run((global_step, learning_rate))
        else:
            step_value, rate_value = sess.run(global_step), learning_rate
        tf.logging.warn(prefix + 'global_step=%d, learning_rate=%f' % (step_value, rate_value))

    if args.transfer:
        path = os.path.expanduser(os.path.expandvars(args.transfer))
        tf.logging.warn('transferring from ' + path)
        init_assign_op, init_feed_dict = slim.assign_from_checkpoint(path, variables_to_restore)

        def init_fn(sess):
            sess.run(init_assign_op, init_feed_dict)
            log_training_state(sess, 'transferring from ')
    else:
        def init_fn(sess):
            log_training_state(sess, '')

    summary(config)
    tf.logging.warn('tensorboard --logdir ' + logdir)
    slim.learning.train(
        train_op, logdir,
        master=args.master,
        is_chief=(args.task == 0),
        global_step=global_step,
        number_of_steps=args.steps,
        init_fn=init_fn,
        summary_writer=tf.summary.FileWriter(os.path.join(logdir, args.logname)),
        save_summaries_secs=args.summary_secs,
        save_interval_secs=args.save_secs)
def train():
    """Build the model graph from ``opt`` and run the supervised training loop.

    Seeds the TensorFlow, NumPy and Python RNGs, builds the data loader and
    model, registers image and scalar summaries, optionally prepares a
    restore from ``opt.init_ckpt_file``, and trains for ``opt.max_steps - 1``
    steps. Summaries are written every 10 steps, progress is printed every
    100 steps and a checkpoint is saved every ``opt.save_ckpt_freq`` steps.
    """
    seed = 8964
    tf.set_random_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    pp = pprint.PrettyPrinter()
    pp.pprint(flags.FLAGS.__flags)

    if not os.path.exists(opt.checkpoint_dir):
        os.makedirs(opt.checkpoint_dir)

    with tf.Graph().as_default():
        # Data Loader
        loader = DataLoader(opt)
        tgt_image, src_image_stack, intrinsics = loader.load_train_batch()

        # Build Model
        model = GeoNetModel(opt, tgt_image, src_image_stack, intrinsics)
        loss = model.total_loss

        # Summaries. The calls register themselves in the summary collection
        # picked up by merge_all(), so the return values are not kept.
        tf.summary.image('v1_mask', model.dispnet_inputs_mask[:1, :, :, :], 1)
        tf.summary.image(
            'v1_fwd_tgt_ignore', model.fwd_tgt_ignore[0][:1, :, :, :], 1)
        tf.summary.image(
            'v1_bwd_src_ignore', model.bwd_src_ignore[0][:1, :, :, :], 1)
        tf.summary.image(
            'v1_fwd_tgt_ignore_full',
            model.fwd_tgt_ignore_full[0][:1, :, :, :], 1)
        tf.summary.image(
            'v1_bwd_src_ignore_full',
            model.bwd_src_ignore_full[0][:1, :, :, :], 1)
        tf.summary.image('v1_warp_img', model.tgt_image[:1, :, :, :3], 3)
        tf.summary.image('v1_depth', model.pred_depth[0][:1], 1)
        # The flow tensors are concatenated with a zero channel on axis 3
        # before being passed to the image summary.
        tf.summary.image(
            'v1_rigid_fwd',
            tf.concat([
                model.fwd_rigid_flow_origin_pyramid[0][4:5],
                tf.zeros((1, 128, 416, 1))
            ], 3), 3)
        tf.summary.image(
            'v1_rigid_bwd',
            tf.concat([
                model.bwd_rigid_flow_origin_pyramid[0][4:5],
                tf.zeros((1, 128, 416, 1))
            ], 3), 3)
        tf.summary.image(
            'v1_rflow_fwd',
            tf.concat([
                model.fwd_rigid_flow_pyramid[0][4:5],
                tf.zeros((1, 128, 416, 1))
            ], 3), 3)
        tf.summary.image(
            'v1_rflow_bwd',
            tf.concat([
                model.bwd_rigid_flow_pyramid[0][4:5],
                tf.zeros((1, 128, 416, 1))
            ], 3), 3)
        tf.summary.image(
            'v1_res_fwd',
            tf.concat([
                model.fwd_res_flow_pyramid[0][4:5],
                tf.zeros((1, 128, 416, 1))
            ], 3), 3)
        tf.summary.image(
            'v1_res_bwd',
            tf.concat([
                model.bwd_res_flow_pyramid[0][4:5],
                tf.zeros((1, 128, 416, 1))
            ], 3), 3)
        tf.summary.image(
            'v1_warp_fwd', model.fwd_rigid_warp_pyramid[0][4:5, :, :, :3], 3)
        # NOTE(review): 'v1_warp_bwd' also reads fwd_rigid_warp_pyramid -
        # confirm whether the backward pyramid was intended.
        tf.summary.image(
            'v1_warp_bwd', model.fwd_rigid_warp_pyramid[0][:1, :, :, :3], 3)
        tf.summary.scalar('v1_rigid_warp_loss', model.rigid_warp_loss)
        tf.summary.scalar('v1_disp_smooth_loss', model.disp_smooth_loss)
        tf.summary.scalar('v1_flow_warp_loss', model.flow_warp_loss)
        tf.summary.scalar('v1_flow_smooth_loss', model.flow_smooth_loss)
        tf.summary.scalar('v1_rigid_smooth_loss', model.rigid_smooth_loss)
        tf.summary.scalar(
            'v1_flow_consistency_loss', model.flow_consistency_loss)
        tf.summary.scalar(
            'v1_rigid_consistency_loss', model.rigid_consistency_loss)
        merged_summary = tf.summary.merge_all()

        # Train Op
        if opt.mode == 'train_flow' and opt.flownet_type == "residual":
            # we pretrain DepthNet & PoseNet, then finetune ResFlowNetS
            train_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, "flow_net")
            vars_to_restore = slim.get_variables_to_restore(
                include=["depth_net", "pose_net"])
        else:
            train_vars = [var for var in tf.trainable_variables()]
            vars_to_restore = slim.get_model_variables()

        restore_from_ckpt = opt.init_ckpt_file is not None
        if restore_from_ckpt:
            init_assign_op, init_feed_dict = slim.assign_from_checkpoint(
                opt.init_ckpt_file, vars_to_restore)

        optim = tf.train.AdamOptimizer(opt.learning_rate, 0.9)
        train_op = slim.learning.create_train_op(
            loss, optim, variables_to_train=train_vars)

        # Global Step
        global_step = tf.Variable(0, name='global_step', trainable=False)
        incr_global_step = tf.assign(global_step, global_step + 1)

        # Parameter Count
        parameter_count = tf.reduce_sum(
            [tf.reduce_prod(tf.shape(v)) for v in train_vars])

        # Saver
        saver = tf.train.Saver(
            [var for var in tf.model_variables()] + [global_step],
            max_to_keep=opt.max_to_keep)

        # Session
        sv = tf.train.Supervisor(
            logdir=opt.checkpoint_dir, save_summaries_secs=0, saver=None)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        with sv.managed_session(config=config) as sess:
            writer = tf.summary.FileWriter('../logs_v1', sess.graph)
            try:
                print('Trainable variables: ')
                for var in train_vars:
                    print(var.name)
                    print(var.shape)
                print("parameter_count =", sess.run(parameter_count))

                if restore_from_ckpt:
                    sess.run(init_assign_op, init_feed_dict)

                start_time = time.time()
                for step in range(1, opt.max_steps):
                    fetches = {
                        "train": train_op,
                        "global_step": global_step,
                        "incr_global_step": incr_global_step,
                    }
                    if step % 10 == 0:
                        mysum = sess.run(merged_summary)
                        writer.add_summary(mysum, step)
                        writer.flush()
                    if step % 100 == 0:
                        fetches["loss"] = loss
                    results = sess.run(fetches)
                    if step % 100 == 0:
                        time_per_iter = (time.time() - start_time) / 100
                        start_time = time.time()
                        print('Iteration: [%7d] | Time: %4.4fs/iter | Loss: %.3f'
                              % (step, time_per_iter, results["loss"]))
                    if step % opt.save_ckpt_freq == 0:
                        saver.save(
                            sess,
                            os.path.join(opt.checkpoint_dir, 'model'),
                            global_step=step)
            finally:
                # Release the event file even if training is interrupted.
                writer.close()
def _mean_over_towers(tower_losses):
    """Average a list of per-GPU scalar loss tensors into one scalar tensor."""
    return tf.reduce_mean(tf.stack(axis=0, values=tower_losses), 0)


def _summarize_mean_loss(tag, tower_losses, weight=None):
    """Average per-GPU losses, log the mean under `tag`, and return it.

    When `weight` is given, a second scalar `unit_<tag>` is logged: the mean
    divided by `weight + 1e-8`.
    """
    mean_loss = _mean_over_towers(tower_losses)
    tf.summary.scalar(tag, mean_loss)
    if weight is not None:
        tf.summary.scalar(
            'unit_' + tag,
            mean_loss /
            (weight + tf.convert_to_tensor(1e-8, dtype=tf.float32)))
    return mean_loss


def train():
    """Build one SIGNet tower per GPU and run the training loop.

    Reads its whole configuration from the module-level `opt`. Per-tower
    losses are averaged across GPUs; the averaged total loss drives the train
    op. Summaries are written to `opt.summary_dir` and checkpoints to
    `opt.checkpoint_dir`. If `opt.init_ckpt_file` is set, the selected network
    scopes are initialised from that checkpoint before training starts.
    """
    seed = 8964
    tf.set_random_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    if not os.path.exists(opt.checkpoint_dir):
        os.makedirs(opt.checkpoint_dir)

    restore_weights = opt.init_ckpt_file is not None
    # The segmentation losses only exist when both flags are set, so the same
    # condition must guard every place that touches them.
    use_segnet = opt.sem_assist and opt.add_segnet

    with tf.Graph().as_default():
        # tf.set_random_seed only affects the graph that is the default at
        # call time, so it has to be repeated for the graph built here.
        tf.set_random_seed(seed)

        global_step = tf.Variable(0, name='global_step', trainable=False)
        incr_global_step = tf.assign(global_step, global_step + 1)
        optim = tf.train.AdamOptimizer(opt.learning_rate, 0.9)
        loader = DataLoader(opt)

        losses = []
        img_losses = []
        rigid_warp_losses = []
        disp_smooth_losses = []
        sem_losses = []
        sem_warp_losses = []
        sem_mask_losses = []
        sem_edge_losses = []
        sem_seg_losses = []
        ins0_seg_losses = []
        ins1_edge_seg_losses = []
        ins_losses = []

        # NOTE(review): the source had lost its indentation. The
        # sem_*_explore flags are treated as sub-options of opt.sem_as_loss,
        # and the restore bookkeeping as part of the `i == 0` branch --
        # confirm against the upstream file.
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(opt.num_gpus):
                with tf.device('/gpu:{:d}'.format(i)):
                    with tf.name_scope('gpu{:d}'.format(i)):
                        # Get images batch from data loader
                        (tgt_image, src_image_stack, intrinsics,
                         tgt_sem_tuple, src_sem_stack_tuple, tgt_ins_tuple,
                         src_ins_stack_tuple) = loader.load_train_batch()

                        # Build Model
                        model = SIGNetModel(opt, tgt_image, src_image_stack,
                                            intrinsics, tgt_sem_tuple,
                                            src_sem_stack_tuple,
                                            tgt_ins_tuple,
                                            src_ins_stack_tuple)

                        # Handle losses
                        losses.append(model.total_loss)
                        # Every later tower shares the first tower's weights.
                        tf.get_variable_scope().reuse_variables()

                        img_losses.append(model.img_loss)
                        rigid_warp_losses.append(model.rigid_warp_loss)
                        disp_smooth_losses.append(model.disp_smooth_loss)
                        if opt.sem_as_loss:
                            sem_losses.append(model.sem_loss)
                            if opt.sem_warp_explore:
                                sem_warp_losses.append(model.sem_warp_loss)
                            if opt.sem_mask_explore:
                                sem_mask_losses.append(model.sem_mask_loss)
                            if opt.sem_edge_explore:
                                sem_edge_losses.append(model.sem_edge_loss)
                        if opt.ins_as_loss:
                            ins_losses.append(model.ins_loss)
                        if use_segnet:
                            sem_seg_losses.append(model.sem_seg_loss)
                            ins0_seg_losses.append(model.ins0_seg_loss)
                            ins1_edge_seg_losses.append(
                                model.ins1_edge_seg_loss)

                        # Per-GPU tensorboard summaries
                        tf.summary.image('tgt_image_g%02d' % (i),
                                         tgt_image,
                                         max_outputs=opt.max_outputs)
                        tf.summary.image('src_image_prev_g%02d' % (i),
                                         src_image_stack[:, :, :, :3],
                                         max_outputs=opt.max_outputs)
                        tf.summary.image('src_image_next_g%02d' % (i),
                                         src_image_stack[:, :, :, 3:],
                                         max_outputs=opt.max_outputs)
                        tf.summary.scalar('loss_g%02d' % (i),
                                          model.total_loss)
                        tf.summary.scalar('img_loss_g%02d' % (i),
                                          model.img_loss)
                        tf.summary.scalar('rigid_warp_loss_g%02d' % (i),
                                          model.rigid_warp_loss)
                        tf.summary.scalar('disp_smooth_loss_g%02d' % (i),
                                          model.disp_smooth_loss)
                        if opt.sem_as_loss:
                            tf.summary.scalar('sem_loss_g%02d' % (i),
                                              model.sem_loss)
                            if opt.sem_warp_explore:
                                tf.summary.scalar(
                                    'sem_warp_loss_g%02d' % (i),
                                    model.sem_warp_loss)
                        if opt.ins_as_loss:
                            tf.summary.scalar('ins_loss_g%02d' % (i),
                                              model.ins_loss)
                        if use_segnet:
                            tf.summary.scalar('sem_seg_loss_g%02d' % (i),
                                              model.sem_seg_loss)
                            tf.summary.scalar('ins0_seg_loss_g%02d' % (i),
                                              model.ins0_seg_loss)
                            tf.summary.scalar(
                                'ins1_edge_seg_loss_g%02d' % (i),
                                model.ins1_edge_seg_loss)

                        # Bookkeeping ops are built once, from the first tower.
                        if i == 0:
                            if (opt.mode == 'train_flow'
                                    and opt.flownet_type == "residual"):
                                train_vars = tf.get_collection(
                                    tf.GraphKeys.TRAINABLE_VARIABLES,
                                    "flow_net")
                            elif (opt.mode == 'train_rigid'
                                  and opt.fixed_posenet):
                                # Keep the pose net frozen: train only the
                                # depth network scope.
                                depth_scope = ("depth_sem_net"
                                               if opt.new_sem_dispnet else
                                               "depth_net")
                                train_vars = tf.get_collection(
                                    tf.GraphKeys.TRAINABLE_VARIABLES,
                                    depth_scope)
                            else:
                                train_vars = list(tf.trainable_variables())

                            loading_net = ["depth_net", "pose_net"]
                            if opt.new_sem_dispnet:
                                loading_net.append("depth_sem_net")
                            if opt.new_sem_posenet:
                                loading_net.append("pose_sem_net")
                            vars_to_restore = slim.get_variables_to_restore(
                                include=loading_net)

                            if restore_weights:
                                init_assign_op, init_feed_dict = \
                                    slim.assign_from_checkpoint(
                                        opt.init_ckpt_file, vars_to_restore)

        # Mean losses across GPUs. Every summary below logs the averaged
        # tensor, never a single tower's value.
        loss = _summarize_mean_loss('loss', losses)
        _summarize_mean_loss('rigid_warp_loss', rigid_warp_losses,
                             opt.rigid_warp_weight)
        _summarize_mean_loss('disp_smooth_loss', disp_smooth_losses,
                             opt.disp_smooth_weight)
        img_loss = _summarize_mean_loss('img_loss', img_losses)

        if opt.sem_as_loss:
            sem_loss = _summarize_mean_loss('sem_loss', sem_losses)
            if opt.sem_warp_explore:
                _summarize_mean_loss('sem_warp_loss', sem_warp_losses,
                                     opt.sem_warp_weight)
            if opt.sem_mask_explore:
                _summarize_mean_loss('sem_mask_loss', sem_mask_losses,
                                     opt.sem_mask_weight)
            if opt.sem_edge_explore:
                _summarize_mean_loss('sem_edge_loss', sem_edge_losses,
                                     opt.sem_edge_weight)

        if use_segnet:
            sem_seg_loss = _summarize_mean_loss('sem_seg_loss',
                                                sem_seg_losses,
                                                opt.sem_seg_weight)
            ins0_seg_loss = _summarize_mean_loss('ins0_seg_loss',
                                                 ins0_seg_losses,
                                                 opt.ins0_seg_weight)
            ins1_edge_seg_loss = _summarize_mean_loss(
                'ins1_edge_seg_loss', ins1_edge_seg_losses,
                opt.ins1_edge_seg_weight)

        if opt.ins_as_loss:
            ins_loss = _summarize_mean_loss('ins_loss', ins_losses)

        train_op = slim.learning.create_train_op(
            loss,
            optim,
            variables_to_train=train_vars,
            colocate_gradients_with_ops=True)

        # Saver
        saver = tf.train.Saver(list(tf.model_variables()) + [global_step],
                               max_to_keep=opt.max_to_keep)
        merged_summary = tf.summary.merge_all()

        # Session
        sv = tf.train.Supervisor(logdir=opt.checkpoint_dir,
                                 save_summaries_secs=0,
                                 saver=None)
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        with sv.managed_session(config=config) as sess:
            train_writer = tf.summary.FileWriter(opt.summary_dir, sess.graph)
            if restore_weights:
                sess.run(init_assign_op, init_feed_dict)

            start_time = time.time()
            for step in range(1, opt.max_steps):
                fetches = {
                    "train": train_op,
                    "global_step": global_step,
                    "incr_global_step": incr_global_step
                }
                report_step = step % opt.print_interval == 0
                if report_step:
                    fetches["loss"] = loss
                    fetches["img_loss"] = img_loss
                    if opt.sem_as_loss:
                        fetches["sem_loss"] = sem_loss
                    if opt.ins_as_loss:
                        fetches["ins_loss"] = ins_loss
                    if use_segnet:
                        fetches["sem_seg_loss"] = sem_seg_loss
                        fetches["ins0_seg_loss"] = ins0_seg_loss
                        fetches["ins1_edge_seg_loss"] = ins1_edge_seg_loss

                results = sess.run(fetches)

                # Write TF summary to file.
                if step % opt.save_summ_freq == 0:
                    step_summary = sess.run(merged_summary)
                    train_writer.add_summary(step_summary, step)

                if report_step:
                    time_per_iter = (time.time() -
                                     start_time) / opt.print_interval
                    start_time = time.time()
                    if opt.sem_as_loss:
                        print('Iteration: [%7d] | Time: %4.4fs/iter | Loss: %.3f ImgLoss: %.3f SemLoss: %.3f' \
                              % (step, time_per_iter, results["loss"], results["img_loss"], results["sem_loss"]))
                    elif opt.ins_as_loss:
                        print('Iteration: [%7d] | Time: %4.4fs/iter | Loss: %.3f ImgLoss: %.3f InsLoss: %.3f' \
                              % (step, time_per_iter, results["loss"], results["img_loss"], results["ins_loss"]))
                    else:
                        # Report both values under their real names.
                        print('Iteration: [%7d] | Time: %4.4fs/iter | Loss: %.3f ImgLoss: %.3f' \
                              % (step, time_per_iter, results["loss"], results["img_loss"]))

                if step % opt.save_ckpt_freq == 0:
                    saver.save(sess,
                               os.path.join(opt.checkpoint_dir, 'model'),
                               global_step=step)

            # Flush any buffered summary events before the session goes away.
            train_writer.close()
def train():
    """Build the GeoNet training graph and run the optimisation loop.

    All settings come from the module-level `opt`. In `train_flow` mode with
    a residual flow net only the `flow_net` scope is trained and only
    `depth_net` / `pose_net` are restored; otherwise every trainable variable
    is trained and all model variables are restored. Restoring happens only
    when `opt.init_ckpt_file` is set. Progress is printed every 100 steps and
    a checkpoint is written every `opt.save_ckpt_freq` steps.
    """
    seed = 8964
    tf.set_random_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    pp = pprint.PrettyPrinter()
    pp.pprint(flags.FLAGS.__flags)

    if not os.path.exists(opt.checkpoint_dir):
        os.makedirs(opt.checkpoint_dir)

    restore_weights = opt.init_ckpt_file is not None

    with tf.Graph().as_default():
        # tf.set_random_seed only affects the graph that is the default at
        # call time, so it has to be repeated for the graph built here.
        tf.set_random_seed(seed)

        # Data Loader
        loader = DataLoader(opt)
        tgt_image, src_image_stack, intrinsics = loader.load_train_batch()

        # Build Model
        model = GeoNetModel(opt, tgt_image, src_image_stack, intrinsics)
        loss = model.total_loss

        # Train Op
        if opt.mode == 'train_flow' and opt.flownet_type == "residual":
            # we pretrain DepthNet & PoseNet, then finetune ResFlowNetS
            train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           "flow_net")
            vars_to_restore = slim.get_variables_to_restore(
                include=["depth_net", "pose_net"])
        else:
            train_vars = list(tf.trainable_variables())
            vars_to_restore = slim.get_model_variables()

        if restore_weights:
            init_assign_op, init_feed_dict = slim.assign_from_checkpoint(
                opt.init_ckpt_file, vars_to_restore)

        optim = tf.train.AdamOptimizer(opt.learning_rate, 0.9)
        train_op = slim.learning.create_train_op(
            loss, optim, variables_to_train=train_vars)

        # Global Step
        global_step = tf.Variable(0, name='global_step', trainable=False)
        incr_global_step = tf.assign(global_step, global_step + 1)

        # Parameter Count
        parameter_count = tf.reduce_sum(
            [tf.reduce_prod(tf.shape(v)) for v in train_vars])

        # Saver
        saver = tf.train.Saver(list(tf.model_variables()) + [global_step],
                               max_to_keep=opt.max_to_keep)

        # Session. saver=None is passed to the Supervisor; checkpoints are
        # written explicitly in the loop below.
        sv = tf.train.Supervisor(logdir=opt.checkpoint_dir,
                                 save_summaries_secs=0,
                                 saver=None)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        with sv.managed_session(config=config) as sess:
            print('Trainable variables: ')
            for var in train_vars:
                print(var.name)
            print("parameter_count =", sess.run(parameter_count))

            if restore_weights:
                sess.run(init_assign_op, init_feed_dict)

            start_time = time.time()
            for step in range(1, opt.max_steps):
                fetches = {
                    "train": train_op,
                    "global_step": global_step,
                    "incr_global_step": incr_global_step
                }
                report_step = step % 100 == 0
                if report_step:
                    fetches["loss"] = loss

                results = sess.run(fetches)

                if report_step:
                    time_per_iter = (time.time() - start_time) / 100
                    start_time = time.time()
                    print('Iteration: [%7d] | Time: %4.4fs/iter | Loss: %.3f' \
                          % (step, time_per_iter, results["loss"]))

                if step % opt.save_ckpt_freq == 0:
                    saver.save(sess,
                               os.path.join(opt.checkpoint_dir, 'model'),
                               global_step=step)
def run(self,
        feed_generator,
        train_op,
        log_dir="/tmp/slim_trainer/",
        restore_path=None,
        variables_to_restore=None,
        run_id=None,
        max_checkpoints_to_keep=0,
        **kwargs):
    """Run training until `slim.learning.train` returns.

    Parameters
    ----------
    feed_generator : Iterator[ndarray, ...]
        Iterator or generator yielding batches of training data. It is
        wrapped in a `ThreadSafeIterator` before use.
    train_op : tf.Tensor
        Training operation handed to `slim.learning.train`.
    log_dir : Optional[str]
        Base log directory; the run writes to `log_dir/run_id`.
        Defaults to '/tmp/slim_trainer/'.
    restore_path : Optional[str]
        Checkpoint path. When not None, an init function that assigns the
        checkpoint values is appended to `self._init_fns`.
    variables_to_restore : Optional
        Passed unchanged to `slim.assign_from_checkpoint`. When None,
        `slim.get_variables_to_restore()` is used. Ignored if
        `restore_path` is None.
    run_id : Optional[str]
        Identifier of this run. When None, one is created with
        `_generate_run_id(6)`.
    max_checkpoints_to_keep : int
        Forwarded to the saver as `max_to_keep`.
    kwargs :
        Extra named arguments forwarded to `slim.learning.train`, e.g.
        `number_of_steps=100`.
    """
    if restore_path is not None:
        restore_vars = variables_to_restore
        if restore_vars is None:
            restore_vars = slim.get_variables_to_restore()
        assign_op, assign_feed = slim.assign_from_checkpoint(
            restore_path, restore_vars)

        def _restore_from_checkpoint(sess):
            # Run the checkpoint assignment in the session given by the caller.
            return sess.run(assign_op, assign_feed)

        self._init_fns.append(_restore_from_checkpoint)

    self._feed_generator = ThreadSafeIterator(feed_generator)
    self._coordinator = tf.train.Coordinator()

    run_id = _generate_run_id(6) if run_id is None else run_id
    log_dir = os.path.join(log_dir, run_id)

    print("---------------------------------------")
    print("Run ID: ", run_id)
    print("Log directory: ", log_dir)
    print("---------------------------------------")

    checkpoint_saver = tf.compat.v1.train.Saver(
        max_to_keep=max_checkpoints_to_keep)
    try:
        slim.learning.train(train_op,
                            log_dir,
                            self._train_step_fn,
                            saver=checkpoint_saver,
                            **kwargs)
    except UnboundLocalError:
        # NOTE(nwojke): Due to a bug in slim, a local variable 'total_loss'
        # is referenced when an exception is raised during training. The
        # exception is swallowed here because it occurs whenever the queue
        # is closed with self._stop_all_threads().
        pass
    self._wait_for_threads()
def __init__(self, FLAGS, session, prefix='model'):
    """Build the train/eval networks, losses and summaries, then initialise.

    Args:
        FLAGS: configuration object. It is read throughout and also mutated:
            `checkpoint_path` may be rewritten to the resolved checkpoint
            directory and `scratch` is set to True when loading fails.
        session: TensorFlow session used for all initialisation runs.
        prefix: stored on the instance as `self.prefix`.

    Raises:
        NotImplementedError: if `FLAGS.network` is not a known network family.
    """
    self.sess = session
    self.action_dim = FLAGS.action_dim
    # self.output_size = FLAGS.output_size
    self.prefix = prefix
    self.device = FLAGS.device
    self.FLAGS = FLAGS
    self.lr = self.FLAGS.learning_rate
    self.global_step = tf.Variable(0, name='global_step', trainable=False)

    # define the input size of the network input
    if self.FLAGS.network == 'mobile':
        self.input_size = [
            mobile_net.default_image_size[FLAGS.depth_multiplier],
            mobile_net.default_image_size[FLAGS.depth_multiplier], 3
        ]
    elif self.FLAGS.network == 'mobile_nfc':
        self.input_size = [
            mobile_nfc_net.default_image_size[FLAGS.depth_multiplier],
            mobile_nfc_net.default_image_size[FLAGS.depth_multiplier],
            3 * self.FLAGS.n_frames
        ]
    elif self.FLAGS.network.startswith(('alex', 'squeeze', 'tiny')):
        versions = {
            'alex': alex_net,
            'alex_v1': alex_net_v1,
            'alex_v2': alex_net_v2,
            'alex_v3': alex_net_v3,
            'alex_v4': alex_net_v4,
            'squeeze': squeeze_net,
            'squeeze_v1': squeeze_net_v1,
            'squeeze_v2': squeeze_net_v2,
            'squeeze_v3': squeeze_net_v3,
            'tiny': tiny_net,
            'tiny_v1': tiny_net_v1,
            'tiny_v2': tiny_net_v2,
            'tiny_v2r': tiny_net_v2_r,
            'tiny_v3': tiny_net_v3,
            'tiny_v4': tiny_net_v4,
            'tiny_CAM': tiny_CAM_net
        }
        self.input_size = versions[self.FLAGS.network].default_image_size
    else:
        raise NotImplementedError('Network is unknown: {0}'.format(
            self.FLAGS.network))
    # Leading None is the variable batch dimension.
    self.input_size = [None] + self.input_size
    self.output_size = int(
        self.action_dim if not self.FLAGS.discrete else self.action_dim *
        self.FLAGS.action_quantity)

    # define a network for training and for evaluation
    self.inputs = tf.placeholder(tf.float32,
                                 shape=self.input_size,
                                 name='Inputs')
    self.endpoints = {}
    for mode in ['train', 'eval']:
        self.define_network(mode)
        # The initial value 1 keeps reduce() valid for scalar variables,
        # whose shape list is empty.
        params = sum([
            reduce(lambda x, y: x * y, v.get_shape().as_list(), 1)
            for v in tf.trainable_variables()
        ])
        print("total number of parameters: {0}".format(params))

    if self.FLAGS.discrete:
        self.define_discrete_bins(FLAGS.action_bound, FLAGS.action_quantity)
        self.add_discrete_control_layers(self.endpoints['train'])
        self.add_discrete_control_layers(self.endpoints['eval'])

    # Only feature extracting part is initialized from pretrained model
    if not self.FLAGS.continue_training:
        # make sure you exclude the prediction layers of the model
        list_to_exclude = [
            "global_step",
            "MobilenetV1/control",
            "MobilenetV1/aux_depth",
            "H_fc_control",
            "outputs",
            "MobilenetV1/q_depth",
            "Omega",
        ]
        print("[model.py]: only load feature extracting part in network.")
    else:  # If continue training
        print("[model.py]: continue training of total network.")
        # list_to_exclude = ["Omega"]
        list_to_exclude = []

    # In case of lifelonglearning and continue learning:
    # add variables for importance weights of previous domain and keep
    # optimal variables for previous domain.
    # NOTE(review): the source had lost its indentation; this check is placed
    # outside the else-branch above -- confirm against the upstream file.
    if self.FLAGS.lifelonglearning or self.FLAGS.update_importance_weights:
        self.define_importance_weights(self.endpoints['train'])

    variables_to_restore = slim.get_variables_to_restore(
        exclude=list_to_exclude)

    # get latest folder out of training directory if there is no checkpoint file
    # startswith() also copes with an empty path, where indexing [0] raised.
    if not self.FLAGS.checkpoint_path.startswith('/'):
        self.FLAGS.checkpoint_path = (self.FLAGS.summary_dir +
                                      self.FLAGS.checkpoint_path)
    if not os.path.isfile(self.FLAGS.checkpoint_path + '/checkpoint'):
        try:
            run_dirs = [
                mpath
                for mpath in sorted(os.listdir(self.FLAGS.checkpoint_path))
                if os.path.isdir(self.FLAGS.checkpoint_path + '/' + mpath)
                and not mpath.endswith('val') and os.path.isfile(
                    self.FLAGS.checkpoint_path + '/' + mpath + '/checkpoint')
            ]
            self.FLAGS.checkpoint_path = (self.FLAGS.checkpoint_path + '/' +
                                          run_dirs[-1])
        except (OSError, IndexError):
            # Best effort: the directory is missing/unreadable or holds no
            # run folder with a checkpoint. Keep the path as it is; loading
            # below then falls back to training from scratch.
            pass

    if not self.FLAGS.scratch:
        print('checkpoint: {}'.format(self.FLAGS.checkpoint_path))
        try:
            init_assign_op, init_feed_dict = slim.assign_from_checkpoint(
                tf.train.latest_checkpoint(self.FLAGS.checkpoint_path),
                variables_to_restore)
        except Exception as e:
            # Format the exception itself: `e.message` does not exist on
            # Python 3 exceptions and would crash this fallback.
            print(
                "Failed to initialize network {0} with checkpoint {1} so training from scratch: {2}"
                .format(FLAGS.network, FLAGS.checkpoint_path, e))
            FLAGS.scratch = True

    # create saver for checkpoints
    self.saver = tf.train.Saver(max_to_keep=10,
                                keep_checkpoint_every_n_hours=1)

    # Add the loss and metric functions to the graph for both endpoints of
    # train and eval.
    target_dtype = tf.int32 if FLAGS.discrete else tf.float32
    self.targets = tf.placeholder(target_dtype, [None, self.action_dim])
    self.depth_targets = tf.placeholder(tf.float32, [None, 55, 74])

    self.define_metrics(self.endpoints['eval'])

    if self.FLAGS.continue_training and self.FLAGS.lifelonglearning:
        self.define_star_variables(self.endpoints['train'])

    # Define the training op based on the total loss
    self.define_loss(self.endpoints['train'])
    self.define_train()

    # Define summaries
    self.build_summaries()

    init_all = tf_variables.global_variables_initializer()
    self.sess.run([init_all])
    if not self.FLAGS.scratch:
        self.sess.run([init_assign_op], init_feed_dict)
        print('Successfully loaded model from:{}'.format(
            self.FLAGS.checkpoint_path))
    else:
        print('Training model from scratch so no initialization.')

    # FOR DEBUGGING:
    # for v in tf.trainable_variables():
    #   # assign importance weights 1 to everything ==> keep everything as close as possible
    #   # self.sess.run([tf.assign(self.importance_weights[v.name], np.ones((v.get_shape().as_list()))) for i,v in enumerate(tf.trainable_variables())])
    #   # assign importance weights 1 to everything > 10**-5, else 0 ==> binary importance over the weights ==> freeze some part and leave other
    #   old_weight = self.sess.run(self.importance_weights[v.name])
    #   new_weight = old_weight > 10**-2
    #   new_weight = new_weight.astype(np.float32)
    #   self.sess.run(tf.assign(self.importance_weights[v.name], new_weight))

    if self.FLAGS.continue_training and self.FLAGS.lifelonglearning:
        # print info on loaded importance weights
        for v in tf.trainable_variables():
            weights = self.sess.run(self.importance_weights[v.name])
            weights = weights.flatten()
            # print("{0}: {1} ({2}) min: {3} max: {4}".format(v.name, np.mean(weights), np.var(weights), np.amin(weights), np.amax(weights)))
            print("| {0} | {1} | {2} | {3} | ".format(
                v.name, np.percentile(weights, 1),
                np.percentile(weights, 50), np.percentile(weights, 100)))

        # assign star_variables after initialization
        self.sess.run([
            tf.assign(self.star_variables[v.name], v)
            for v in tf.trainable_variables()
        ])
def unpickle(file):
    """Load a pickled image batch and return `(labels, images)`.

    The pickled object must be a mapping with the keys 'data' and 'labels'.
    'data' is reshaped in Fortran order to (-1, 32, 32, 3) and the two spatial
    axes are swapped, giving images ordered batch, x, y, color. Labels are
    returned as a uint8 array.

    NOTE: unpickling executes code embedded in the file; only use this on
    trusted dataset files.
    """
    # The context manager closes the file even if unpickling fails.
    with open(file, 'rb') as fo:
        batch = cPickle.load(fo)
    data = batch['data']
    imgs = np.transpose(np.reshape(data, (-1, 32, 32, 3), order='F'),
                        axes=(0, 2, 1, 3))  # order batch,x,y,color
    y = np.asarray(batch['labels'], dtype='uint8')
    return y, imgs


y, imgs = unpickle(
    '/Users/oli/Dropbox/data/CIFAR-10/cifar-10-batches-py/test_batch')
y.shape, imgs.shape

# Build VGG-16 on a fresh default graph; inputs are resized to 224x224.
tf.reset_default_graph()
images = tf.placeholder(tf.float32, [None, None, None, 3])
imgs_scaled = tf.image.resize_images(images, (224, 224))
slim.nets.vgg.vgg_16(imgs_scaled, is_training=False)
variables_to_restore = slim.get_variables_to_restore()
print('Number of variables to restore {}'.format(len(variables_to_restore)))
init_assign_op, init_feed_dict = slim.assign_from_checkpoint(
    '/Users/oli/Dropbox/server_sync/tf_slim_models/vgg_16.ckpt',
    variables_to_restore)
sess = tf.Session()
sess.run(init_assign_op, init_feed_dict)

# Look the input and the fc6 output up by tensor name in the default graph.
g = tf.get_default_graph()
feed = g.get_tensor_by_name('Placeholder:0')
fetch = g.get_tensor_by_name('vgg_16/fc6/BiasAdd:0')

# Feeding 3 images through the net just for testing
feed_vals = imgs[0:3]
res = sess.run(fetch, feed_dict={feed: feed_vals})
np.shape(feed_vals), res.shape
def create_image_trainer(image_shape,
                         num_classes,
                         epochs,
                         batch_size=32,
                         learning_rate_base=0.01,
                         learning_rate_decay_interval=7500,
                         learning_rate_decay=0.99,
                         checkpoint_path=None,
                         model_save_path='model',
                         max_to_keep=100,
                         log_file_path='log/train.log'):
    """Build a classification training graph and return a `trainer` closure.

    Args:
        image_shape: tuple appended to the batch dimension of the uint8 image
            placeholder; `image_shape[1]` and `image_shape[0]` are used as the
            cv2 resize width and height.
        num_classes: width of the float label placeholder.
        epochs: number of passes over the data made by `trainer`.
        batch_size: number of samples per training step.
        learning_rate_base, learning_rate_decay_interval, learning_rate_decay:
            arguments of `tf.train.exponential_decay`.
        checkpoint_path: if not None, all variables are assigned from this
            checkpoint; otherwise the global initializer is run.
        model_save_path: directory that receives 'model.ckpt' checkpoints.
        max_to_keep: `max_to_keep` of the saver used while training.
        log_file_path: file that receives one log line per training step.

    Returns:
        `trainer(data_x_paths, data_y)`: reads the images at `data_x_paths`
        with OpenCV, trains on them with labels `data_y`, and saves
        checkpoints.
    """
    image_var = tf.placeholder(tf.uint8, (None, ) + image_shape)
    labels_ = tf.placeholder(tf.float32, (None, num_classes))

    preprocessed_image_var = tf.map_fn(
        lambda x: _preprocess(x, is_training=True),
        tf.cast(image_var, tf.float32))

    l2_normalize = True
    factory_fn = _network_factory(num_classes=num_classes,
                                  is_training=True,
                                  weight_decay=1e-8)
    _, logits_var = factory_fn(preprocessed_image_var,
                               l2_normalize=l2_normalize,
                               reuse=None)

    # The return value is not needed; the total loss is read back from slim.
    slim.losses.softmax_cross_entropy(logits_var, labels_)
    total_loss = slim.losses.get_total_loss(add_regularization_losses=True)

    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(learning_rate_base,
                                               global_step,
                                               learning_rate_decay_interval,
                                               learning_rate_decay)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_step = optimizer.minimize(total_loss, global_step=global_step)

    session = tf.Session()
    #from tensorflow.python import debug as tf_debug
    #session = tf_debug.LocalCLIDebugWrapperSession(session)
    if checkpoint_path is not None:
        init_assign_op, init_feed_dict = slim.assign_from_checkpoint(
            checkpoint_path, slim.get_variables_to_restore())
        session.run(init_assign_op, feed_dict=init_feed_dict)
    else:
        session.run(tf.global_variables_initializer())

    def _load_images(paths):
        """Read the images at `paths` and resize them to the input shape."""
        return np.asarray([
            cv2.resize(cv2.imread(path, cv2.IMREAD_COLOR),
                       (image_shape[1], image_shape[0])) for path in paths
        ])

    def trainer(data_x_paths, data_y):
        data_y = np.asarray(data_y)
        saver = tf.train.Saver(max_to_keep=max_to_keep)
        logging.basicConfig(filename=log_file_path,
                            filemode="w",
                            level=logging.DEBUG)
        data_len = len(data_x_paths)
        save_path = os.path.join(model_save_path, 'model.ckpt')

        for epoch in range(epochs):
            indexs = np.arange(data_len)
            np.random.shuffle(indexs)
            global_step_value = None

            # One pass covers the full batches and the shorter tail batch.
            # Images and labels are always taken from the same index slice,
            # so the tail batch can no longer be paired with the labels of
            # the previous batch.
            for i, start in enumerate(range(0, data_len, batch_size)):
                batch_indexs = indexs[start:start + batch_size]
                batch_data_dict = {
                    image_var:
                    _load_images(data_x_paths[index]
                                 for index in batch_indexs),
                    labels_:
                    data_y[batch_indexs]
                }
                _, loss_value, global_step_value = session.run(
                    [train_step, total_loss, global_step],
                    feed_dict=batch_data_dict)
                print(global_step_value, loss_value)
                logging.info("%d: %g", global_step_value, loss_value)

                if (i + 1) % 1000 == 0:
                    print("save model checkpoint for %d..." %
                          global_step_value)
                    saver.save(session, save_path, global_step=global_step)

            # NOTE(review): the source had lost its indentation; the final
            # save is done once per epoch here -- confirm the intended
            # placement.
            if global_step_value is not None:
                print("save model checkpoint for %d..." % global_step_value)
                saver.save(session, save_path, global_step=global_step)

    return trainer
name='Anchor_Placeholder') positive_inputs = tf.placeholder(tf.float32, [None, 224, 224, 3], name='Positive_Placeholder') with slim.arg_scope(vgg_arg_scope()): positive, net1 = vgg_16(positive_inputs) with slim.arg_scope(vgg_arg_scope(reuse=True)): anchor, net2 = vgg_16(anchor_inputs) #print net2 #for key,value in net2.items(): # print '{}\t : {}'.format(key,value.shape) variables_to_restore = slim.get_variables_to_restore( exclude=['vgg_16/fc8']) #variables_to_restore = slim.get_variables_to_restore() init_assign_op, init_feed_dict = slim.assign_from_checkpoint( './vgg_16.ckpt', variables_to_restore) print 'LOSS.........................................................' with tf.name_scope('contrastive-loss'): loss = tf.contrib.losses.metric_learning.contrastive_loss( labels=label, embeddings_anchor=anchor, embeddings_positive=positive, margin=margin) print 'contrastive-loss::', loss.op.name with tf.name_scope('Prediction'): pred = tf.norm(positive - anchor, ord='euclidean') #pred = tf.sqrt(tf.reduce_mean(tf.square(positive-anchor),1)) with tf.Session(config=tf.ConfigProto(
def _create_image_encoder(preprocess_fn,
                          factory_fn,
                          image_shape,
                          batch_size=32,
                          session=None,
                          checkpoint_path=None,
                          loss_mode="cosine"):
    # Debugging variant of the image-encoder factory (Python 2 print
    # statements). It builds the feature network and optionally loads a
    # checkpoint. The returned `encoder` does NOT compute features: it prints
    # intermediate tensors, rebuilds one conv/batch-norm/ELU step from the
    # fetched values, and returns an all-zero array. `batch_size` is unused
    # because the batched run is commented out.
    image_var = tf.placeholder(tf.uint8, (None, ) + image_shape)
    # From here on `image_var` is the tf.Print output, not the placeholder;
    # the feed_dicts below feed this tensor.
    image_var = tf.Print(image_var, [image_var], message="placeholder")
    preprocessed_image_var = tf.map_fn(
        lambda x: preprocess_fn(x, is_training=False),
        tf.cast(image_var, tf.float32))
    l2_normalize = loss_mode == "cosine"
    feature_var, _ = factory_fn(preprocessed_image_var,
                                l2_normalize=l2_normalize,
                                reuse=None)
    feature_dim = feature_var.get_shape().as_list()[-1]
    if session is None:
        session = tf.Session()
    if checkpoint_path is not None:
        slim.get_or_create_global_step()
        init_assign_op, init_feed_dict = slim.assign_from_checkpoint(
            checkpoint_path, slim.get_variables_to_restore())
        session.run(init_assign_op, feed_dict=init_feed_dict)

    def get_print_val(sess, name, feed_dict=None, doprint=True):
        # Fetch tensor `name` in `sess`, optionally print it, and return it.
        # With a feed_dict the value is printed transposed with axes
        # (0, 3, 1, 2), so it must be 4-D in that case.
        if feed_dict:
            output = sess.run(name, feed_dict=feed_dict)
            if doprint:
                print name, ' transposed ', output.transpose(0, 3, 1, 2).shape
                print output.transpose(0, 3, 1, 2)[0, 0, 0, 0:20]
                print output.transpose(0, 3, 1, 2)[0, 0, 1, 0:20]
        else:
            output = sess.run(name)
            if doprint:
                print name, output.shape
                print output
        return output

    def encoder(data_x):
        # `out` is never written below, so the caller receives zeros.
        out = np.zeros((len(data_x), feature_dim), np.float32)
        print 'datax:', data_x.shape
        print(data_x[0, 0, 0:100, 0])
        print(data_x[0, 0, 0:100, 1])
        print(data_x[0, 0, 0:100, 2])
        #print(data_x[1,0,0:100,0])
        #print(data_x[1,0,0:100,1])
        #print(data_x[1,0,0:100,2])
        #_run_in_batches(
        #    lambda x: session.run(feature_var, feed_dict=x),
        #    #lambda x: session.run('conv1_1/Elu:0', feed_dict=x),
        #    {image_var: data_x}, out, batch_size)
        if 1:
            # Fetch two activations, then the weights and batch-norm
            # statistics of the conv3_1/1 layer, all by tensor name.
            elu_1_data = get_print_val(session,
                                       'Elu_1:0',
                                       feed_dict={image_var: data_x})
            get_print_val(session,
                          'conv3_1/1/Elu:0',
                          feed_dict={image_var: data_x})
            #else:
            conv3_1_weights = get_print_val(session,
                                            'conv3_1/1/weights:0',
                                            doprint=False)
            mean = get_print_val(session,
                                 'conv3_1/1/conv3_1/1/bn/moving_mean:0')
            var = get_print_val(session,
                                'conv3_1/1/conv3_1/1/bn/moving_variance:0')
            offset = get_print_val(session, 'conv3_1/1/conv3_1/1/bn/beta:0')
            print 'offset ', offset.shape, type(offset[0])
            # The shared session is closed here, so a second call to
            # encoder() cannot run these fetches again.
            session.close()
            scale = np.ones(offset.shape, dtype=np.float32)
            print 'scale ', scale.shape, type(scale[0])
            ####### new graph
            # NOTE(review): no tf.Graph is created here, so these ops are
            # added to whatever graph is the default when encoder() runs.
            elu_1 = tf.placeholder(tf.float32, (None, ) + (64, 32, 32))
            print 'elu_1 shape', elu_1.get_shape()
            dd = tf.nn.conv2d(elu_1,
                              conv3_1_weights, [1, 2, 2, 1],
                              'SAME',
                              name='conv3_1')
            dd = tf.nn.batch_normalization(dd, mean, var, offset, scale, 1e-3)
            dd = tf.nn.elu(dd, name='conv3_1/elu')
            #cn = elu_1.get_shape().as_list()[-1]*2
            #scope = 'conv3_1'
            #conv_weight_init = tf.truncated_normal_initializer(stddev=1e-3)
            #conv_bias_init = tf.zeros_initializer()
            #conv_regularizer = slim.l2_regularizer(1e-8)
            #elu_1_conv = slim.conv2d(
            #    elu_1, cn, [3, 3], stride=2, activation_fn=tf.nn.elu, padding="SAME",
            #    normalizer_fn=_batch_norm_fn, weights_initializer=conv_weight_init,
            #    biases_initializer=conv_bias_init, weights_regularizer=conv_regularizer,
            #    scope=scope + "/1")
            # A second session evaluates the rebuilt convolution on the
            # activation fetched above.
            sess = tf.Session()
            get_print_val(sess, 'conv3_1:0', feed_dict={elu_1: elu_1_data})
            ############
        return out

    return encoder
def train():
    """Train GeoNet using the settings in the module-level `opt`.

    Steps: seed the RNGs, build the data loader and model in a fresh graph,
    choose which variables to train and which to restore, then run
    `opt.max_steps - 1` optimisation steps. The loss is printed every 100
    steps and a checkpoint is saved every `opt.save_ckpt_freq` steps under
    `opt.checkpoint_dir`. Weights are restored from `opt.init_ckpt_file`
    only when that option is set.
    """
    seed = 8964
    tf.set_random_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    pp = pprint.PrettyPrinter()
    pp.pprint(flags.FLAGS.__flags)

    if not os.path.exists(opt.checkpoint_dir):
        os.makedirs(opt.checkpoint_dir)

    log_every = 100
    has_init_ckpt = opt.init_ckpt_file is not None
    finetune_flow = (opt.mode == 'train_flow'
                     and opt.flownet_type == "residual")

    with tf.Graph().as_default():
        # The graph-level seed belongs to the default graph at call time, so
        # the call above did not reach this new graph; set it again here.
        tf.set_random_seed(seed)

        # Data Loader
        loader = DataLoader(opt)
        tgt_image, src_image_stack, intrinsics = loader.load_train_batch()

        # Build Model
        model = GeoNetModel(opt, tgt_image, src_image_stack, intrinsics)
        loss = model.total_loss

        # Train Op
        if finetune_flow:
            # we pretrain DepthNet & PoseNet, then finetune ResFlowNetS
            train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           "flow_net")
            vars_to_restore = slim.get_variables_to_restore(
                include=["depth_net", "pose_net"])
        else:
            train_vars = list(tf.trainable_variables())
            vars_to_restore = slim.get_model_variables()

        if has_init_ckpt:
            init_assign_op, init_feed_dict = slim.assign_from_checkpoint(
                opt.init_ckpt_file, vars_to_restore)

        optim = tf.train.AdamOptimizer(opt.learning_rate, 0.9)
        train_op = slim.learning.create_train_op(
            loss, optim, variables_to_train=train_vars)

        # Global Step
        global_step = tf.Variable(0, name='global_step', trainable=False)
        incr_global_step = tf.assign(global_step, global_step + 1)

        # Parameter Count
        parameter_count = tf.reduce_sum(
            [tf.reduce_prod(tf.shape(v)) for v in train_vars])

        # Saver
        saver = tf.train.Saver(list(tf.model_variables()) + [global_step],
                               max_to_keep=opt.max_to_keep)

        # Session. saver=None is passed to the Supervisor; checkpoints are
        # written explicitly in the loop below.
        sv = tf.train.Supervisor(logdir=opt.checkpoint_dir,
                                 save_summaries_secs=0,
                                 saver=None)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        with sv.managed_session(config=config) as sess:
            print('Trainable variables: ')
            for var in train_vars:
                print(var.name)
            print("parameter_count =", sess.run(parameter_count))

            if has_init_ckpt:
                sess.run(init_assign_op, init_feed_dict)

            start_time = time.time()
            for step in range(1, opt.max_steps):
                fetches = {
                    "train": train_op,
                    "global_step": global_step,
                    "incr_global_step": incr_global_step
                }
                should_log = step % log_every == 0
                if should_log:
                    # Fetch the loss only on the steps that print it.
                    fetches["loss"] = loss

                results = sess.run(fetches)

                if should_log:
                    time_per_iter = (time.time() - start_time) / log_every
                    start_time = time.time()
                    print('Iteration: [%7d] | Time: %4.4fs/iter | Loss: %.3f' \
                          % (step, time_per_iter, results["loss"]))

                if step % opt.save_ckpt_freq == 0:
                    saver.save(sess,
                               os.path.join(opt.checkpoint_dir, 'model'),
                               global_step=step)
def train(finetune):
    """Train ``SenseClsNet`` on ``cfg.train.num_gpus`` GPU towers.

    The input batch is split evenly across the towers, each tower computes
    its own gradients with shared variables, and the averaged gradients are
    applied with Adam under an exponentially decayed learning rate. A
    checkpoint is written each time the batch counter reaches a multiple of
    the number of batches per epoch.

    Args:
        finetune: If truthy, variables are initialised from
            ``./pre_train.ckpt`` (missing variables are ignored) after the
            global initializer has run.
    """
    is_training = True

    # data pipeline
    imgs, true_boxes = gen_data_batch(
        cfg.data_path, cfg.batch_size*cfg.train.num_gpus)
    imgs_split = tf.split(imgs, cfg.train.num_gpus)
    true_boxes_split = tf.split(true_boxes, cfg.train.num_gpus)

    global_step = tf.get_variable(
        'global_step', [],
        initializer=tf.constant_initializer(0.), trainable=False)
    learn_rate_decay_step = int(
        cfg.train.num_samples / cfg.batch_size / cfg.train.num_gpus
        * cfg.train.learn_rate_decay_epoch)
    learning_rate = tf.train.exponential_decay(
        cfg.train.learn_rate, global_step, learn_rate_decay_step,
        cfg.train.learn_rate_decay, staircase=True)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

    # Calculate the gradients for each model tower.
    tower_grads = []
    with tf.variable_scope(tf.get_variable_scope()):
        for i in range(cfg.train.num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' % (cfg.train.tower, i)):
                    loss = SenseClsNet(
                        imgs_split[i], true_boxes_split[i],
                        is_training).compute_loss()
                    # Later towers reuse the variables of the first one.
                    tf.get_variable_scope().reuse_variables()
                    tower_grads.append(optimizer.compute_gradients(loss))
                    if i == 0:
                        current_loss = loss
                        # NOTE(review): update ops are collected for the
                        # first tower only -- confirm this is intended.
                        update_op = tf.get_collection(
                            tf.GraphKeys.UPDATE_OPS)

    grads = average_gradients(tower_grads)
    with tf.control_dependencies(update_op):
        apply_gradient_op = optimizer.apply_gradients(
            grads, global_step=global_step)
        train_op = tf.group(apply_gradient_op, *update_op)

    # GPU config
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = True

    # Batches per epoch; loop-invariant, so compute it once. Clamp to 1 so a
    # dataset smaller than one global batch cannot cause a modulo-by-zero.
    steps_per_epoch = max(
        1, int(cfg.train.num_samples / cfg.train.num_gpus / cfg.batch_size))

    # The context manager releases the session even if training raises.
    with tf.Session(config=config) as sess:
        # Create a saver
        saver = tf.train.Saver(max_to_keep=1000)
        ckpt_dir = cfg.ckpt_path
        os.makedirs(ckpt_dir, exist_ok=True)

        # init
        sess.run(tf.global_variables_initializer())
        if finetune:
            checkpoint = './pre_train.ckpt'
            variables_to_restore = get_variables_to_restore(
                exclude_global_pool=True)
            init_assign_op, init_feed_dict = slim.assign_from_checkpoint(
                checkpoint, variables_to_restore, ignore_missing_vars=True)
            sess.run(init_assign_op, init_feed_dict)

        # running
        cnt_epoch = 0
        for i in range(1, cfg.train.max_batches):
            _, loss_, lr_ = sess.run([train_op, current_loss, learning_rate])
            if i % 5 == 0:
                print(i,': ', loss_, ' lr: ', lr_)
            if i % steps_per_epoch == 0:
                cnt_epoch += 1
                # NOTE(review): the prefix is plain string concatenation, so
                # cfg.ckpt_path presumably ends with a path separator --
                # confirm against the config before switching to os.path.join.
                saver.save(sess, ckpt_dir+'senceCls',
                           global_step=cnt_epoch, write_meta_graph=True)